Daily bump.
[official-gcc.git] / gcc / tree-vect-transform.c
bloba33cbaa6d30f23fd3eb498b97c4901dc60388179
1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
19 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
20 02110-1301, USA. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "ggc.h"
27 #include "tree.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "timevar.h"
35 #include "cfgloop.h"
36 #include "expr.h"
37 #include "optabs.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
48 /* Utility functions for the code transformation. */
/* Forward declarations for the file-local (static) helpers defined
   later in this file.  */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
53 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
54 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void update_vuses_to_preheader (tree, struct loop*);
62 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
63 static tree get_initial_def_for_reduction (tree, tree, tree *);
65 /* Utility functions dealing with loop peeling (not peeling itself). */
66 static void vect_generate_tmps_on_preheader
67 (loop_vec_info, tree *, tree *, tree *);
68 static tree vect_build_loop_niters (loop_vec_info);
69 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
70 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
71 static void vect_update_init_of_dr (struct data_reference *, tree niters);
72 static void vect_update_inits_of_drs (loop_vec_info, tree);
73 static int vect_min_worthwhile_factor (enum tree_code);
76 /* Function vect_get_new_vect_var.
78 Returns a name for a new variable. The current naming scheme appends the
79 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
80 the name of vectorizer generated variables, and appends that to NAME if
81 provided. */
83 static tree
84 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
86 const char *prefix;
87 tree new_vect_var;
89 switch (var_kind)
91 case vect_simple_var:
92 prefix = "vect_";
93 break;
94 case vect_scalar_var:
95 prefix = "stmp_";
96 break;
97 case vect_pointer_var:
98 prefix = "vect_p";
99 break;
100 default:
101 gcc_unreachable ();
104 if (name)
105 new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
106 else
107 new_vect_var = create_tmp_var (type, prefix);
109 return new_vect_var;
113 /* Function vect_create_addr_base_for_vector_ref.
115 Create an expression that computes the address of the first memory location
116 that will be accessed for a data reference.
118 Input:
119 STMT: The statement containing the data reference.
120 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
121 OFFSET: Optional. If supplied, it is added to the initial address.
123 Output:
124 1. Return an SSA_NAME whose value is the address of the memory location of
125 the first vector of the data reference.
126 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
127 these statement(s) which define the returned SSA_NAME.
129 FORNOW: We are only handling array accesses with step 1. */
131 static tree
132 vect_create_addr_base_for_vector_ref (tree stmt,
133 tree *new_stmt_list,
134 tree offset)
136 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
137 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
138 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
139 tree base_name = build_fold_indirect_ref (data_ref_base);
140 tree vec_stmt;
141 tree addr_base, addr_expr;
142 tree dest, new_stmt;
143 tree base_offset = unshare_expr (DR_OFFSET (dr));
144 tree init = unshare_expr (DR_INIT (dr));
145 tree vect_ptr_type, addr_expr2;
147 /* Create base_offset = DR_OFFSET + DR_INIT, gimplified into
148 NEW_STMT_LIST so the caller can emit the defining stmts.  */
148 base_offset = size_binop (PLUS_EXPR, base_offset, init);
149 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
150 add_referenced_var (dest);
151 base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
152 append_to_statement_list_force (new_stmt, new_stmt_list);
154 if (offset)
156 tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
157 tree step;
159 /* For interleaved access step we divide STEP by the size of the
160 interleaving group. */
161 if (DR_GROUP_SIZE (stmt_info))
162 step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
163 build_int_cst (TREE_TYPE (offset),
164 DR_GROUP_SIZE (stmt_info)));
165 else
166 step = DR_STEP (dr);
/* Scale OFFSET by the (possibly group-adjusted) step and fold it into
   base_offset.  NOTE(review): OFFSET appears to be given in elements
   and STEP in bytes -- confirm the units against the callers.  */
168 add_referenced_var (tmp);
169 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
170 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
171 base_offset, offset);
172 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
173 append_to_statement_list_force (new_stmt, new_stmt_list);
176 /* base + base_offset */
177 addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
178 base_offset);
180 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
182 /* addr_expr = addr_base, converted to a vector-pointer type and
183 gimplified into NEW_STMT_LIST; the SSA_NAME result is returned.  */
183 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
184 get_name (base_name));
185 add_referenced_var (addr_expr);
186 vec_stmt = fold_convert (vect_ptr_type, addr_base);
187 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
188 get_name (base_name));
189 add_referenced_var (addr_expr2);
190 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
191 append_to_statement_list_force (new_stmt, new_stmt_list);
193 if (vect_print_dump_info (REPORT_DETAILS))
195 fprintf (vect_dump, "created ");
196 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
198 return vec_stmt;
202 /* Function vect_create_data_ref_ptr.
204 Create a new pointer to vector type (vp), that points to the first location
205 accessed in the loop by STMT, along with the def-use update chain to
206 appropriately advance the pointer through the loop iterations. Also set
207 aliasing information for the pointer. This vector pointer is used by the
208 callers to this function to create a memory reference expression for vector
209 load/store access.
211 Input:
212 1. STMT: a stmt that references memory. Expected to be of the form
213 MODIFY_EXPR <name, data-ref> or MODIFY_EXPR <data-ref, name>.
214 2. BSI: block_stmt_iterator where new stmts can be added.
215 3. OFFSET (optional): an offset to be added to the initial address accessed
216 by the data-ref in STMT.
217 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
218 pointing to the initial address.
219 5. TYPE: if not NULL indicates the required type of the data-ref
221 Output:
222 1. Declare a new ptr to vector_type, and have it point to the base of the
223 data reference (initial address accessed by the data reference).
224 For example, for vector of type V8HI, the following code is generated:
226 v8hi *vp;
227 vp = (v8hi *)initial_address;
229 if OFFSET is not supplied:
230 initial_address = &a[init];
231 if OFFSET is supplied:
232 initial_address = &a[init + OFFSET];
234 Return the initial_address in INITIAL_ADDRESS.
236 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
237 update the pointer in each iteration of the loop.
239 Return the increment stmt that updates the pointer in PTR_INCR.
241 3. Return the pointer. */
243 static tree
244 vect_create_data_ref_ptr (tree stmt,
245 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
246 tree offset, tree *initial_address, tree *ptr_incr,
247 bool only_init, tree type)
249 tree base_name;
250 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
251 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
252 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
254 tree vect_ptr_type;
255 tree vect_ptr;
256 tree tag;
257 tree new_temp;
258 tree vec_stmt;
259 tree new_stmt_list = NULL_TREE;
260 edge pe = loop_preheader_edge (loop);
261 basic_block new_bb;
262 tree vect_ptr_init;
263 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
265 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
267 if (vect_print_dump_info (REPORT_DETAILS))
269 tree data_ref_base = base_name;
270 fprintf (vect_dump, "create vector-pointer variable to type: ");
271 print_generic_expr (vect_dump, vectype, TDF_SLIM);
272 if (TREE_CODE (data_ref_base) == VAR_DECL)
273 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
274 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
275 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
276 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
277 fprintf (vect_dump, " vectorizing a record based array ref: ");
278 else if (TREE_CODE (data_ref_base) == SSA_NAME)
279 fprintf (vect_dump, " vectorizing a pointer ref: ");
280 print_generic_expr (vect_dump, base_name, TDF_SLIM);
283 /** (1) Create the new vector-pointer variable: **/
284 if (type)
285 vect_ptr_type = build_pointer_type (type);
286 else
287 vect_ptr_type = build_pointer_type (vectype);
288 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
289 get_name (base_name));
290 add_referenced_var (vect_ptr);
292 /** (2) Add aliasing information to the new vector-pointer:
293 (The points-to info (DR_PTR_INFO) may be defined later.) **/
295 tag = DR_MEMTAG (dr);
296 gcc_assert (tag);
298 /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
299 tag must be created with tag added to its may alias list. */
300 if (!MTAG_P (tag))
301 new_type_alias (vect_ptr, tag, DR_REF (dr));
302 else
303 var_ann (vect_ptr)->symbol_mem_tag = tag;
305 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
307 /** (3) Calculate the initial address of the vector-pointer, and set
308 the vector-pointer to point to it before the loop: **/
310 /* Create: (&(base[init_val+offset]) in the loop preheader. */
311 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
312 offset);
313 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge must not split it (new_bb == NULL),
   i.e. the stmts go at the end of the existing preheader block.  */
314 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
315 gcc_assert (!new_bb);
316 *initial_address = new_temp;
318 /* Create: p = (vectype *) initial_base */
319 vec_stmt = fold_convert (vect_ptr_type, new_temp);
320 vec_stmt = build2 (MODIFY_EXPR, void_type_node, vect_ptr, vec_stmt);
321 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
322 TREE_OPERAND (vec_stmt, 0) = vect_ptr_init;
323 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
324 gcc_assert (!new_bb);
327 /** (4) Handle the updating of the vector-pointer inside the loop: **/
329 if (only_init) /* No update in loop is required. */
331 /* Copy the points-to information if it exists. */
332 if (DR_PTR_INFO (dr))
333 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
334 return vect_ptr_init;
336 else
338 block_stmt_iterator incr_bsi;
339 bool insert_after;
340 tree indx_before_incr, indx_after_incr;
341 tree incr;
/* Create an IV that bumps the pointer by the vector size (in bytes,
   TYPE_SIZE_UNIT) on every loop iteration.  */
343 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
344 create_iv (vect_ptr_init,
345 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
346 NULL_TREE, loop, &incr_bsi, insert_after,
347 &indx_before_incr, &indx_after_incr);
348 incr = bsi_stmt (incr_bsi);
349 set_stmt_info (stmt_ann (incr),
350 new_stmt_vec_info (incr, loop_vinfo));
352 /* Copy the points-to information if it exists. */
353 if (DR_PTR_INFO (dr))
355 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
356 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
358 merge_alias_info (vect_ptr_init, indx_before_incr);
359 merge_alias_info (vect_ptr_init, indx_after_incr);
360 if (ptr_incr)
361 *ptr_incr = incr;
363 return indx_before_incr;
368 /* Function bump_vector_ptr
370 Increment a pointer (to a vector type) by vector-size. Connect the new
371 increment stmt to the existing def-use update-chain of the pointer.
373 The pointer def-use update-chain before this function:
374 DATAREF_PTR = phi (p_0, p_2)
375 ....
376 PTR_INCR: p_2 = DATAREF_PTR + step
378 The pointer def-use update-chain after this function:
379 DATAREF_PTR = phi (p_0, p_2)
380 ....
381 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
382 ....
383 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
385 Input:
386 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
387 in the loop.
388 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
389 The increment amount across iterations is also expected to be
390 vector_size.
391 BSI - location where the new update stmt is to be placed.
392 STMT - the original scalar memory-access stmt that is being vectorized.
394 Output: Return NEW_DATAREF_PTR as illustrated above.
398 static tree
399 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
400 tree stmt)
402 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
403 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
404 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
405 tree vptr_type = TREE_TYPE (dataref_ptr);
406 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
407 tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
408 tree incr_stmt;
409 ssa_op_iter iter;
410 use_operand_p use_p;
411 tree new_dataref_ptr;
/* Build NEW_DATAREF_PTR = DATAREF_PTR + vector_size and insert it
   before BSI.  */
413 incr_stmt = build2 (MODIFY_EXPR, void_type_node, ptr_var,
414 build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
415 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
416 TREE_OPERAND (incr_stmt, 0) = new_dataref_ptr;
417 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
419 /* Update the vector-pointer's cross-iteration increment: rewire
420 PTR_INCR so it consumes NEW_DATAREF_PTR instead of DATAREF_PTR.
421 Any other use operand of PTR_INCR must be the constant step,
422 which is asserted to equal the vector size. */
420 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
422 tree use = USE_FROM_PTR (use_p);
424 if (use == dataref_ptr)
425 SET_USE (use_p, new_dataref_ptr);
426 else
427 gcc_assert (tree_int_cst_compare (use, update) == 0);
430 /* Copy the points-to information if it exists. */
431 if (DR_PTR_INFO (dr))
432 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
433 merge_alias_info (new_dataref_ptr, dataref_ptr);
435 return new_dataref_ptr;
439 /* Function vect_create_destination_var.
441 Create a new temporary of type VECTYPE. */
443 static tree
444 vect_create_destination_var (tree scalar_dest, tree vectype)
446 tree vec_dest;
447 const char *new_name;
448 tree type;
449 enum vect_var_kind kind;
451 kind = vectype ? vect_simple_var : vect_scalar_var;
452 type = vectype ? vectype : TREE_TYPE (scalar_dest);
454 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
456 new_name = get_name (scalar_dest);
457 if (!new_name)
458 new_name = "var_";
459 vec_dest = vect_get_new_vect_var (type, vect_simple_var, new_name);
460 add_referenced_var (vec_dest);
462 return vec_dest;
466 /* Function vect_init_vector.
468 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
469 the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be
470 used in the vectorization of STMT. The init stmt is placed on the loop
471 preheader edge, so it executes once before the loop. */
472 static tree
473 vect_init_vector (tree stmt, tree vector_var, tree vector_type)
475 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
476 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
477 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
478 tree new_var;
479 tree init_stmt;
480 tree vec_oprnd;
481 edge pe;
482 tree new_temp;
483 basic_block new_bb;
485 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
486 add_referenced_var (new_var);
488 init_stmt = build2 (MODIFY_EXPR, void_type_node, new_var, vector_var);
489 new_temp = make_ssa_name (new_var, init_stmt);
490 TREE_OPERAND (init_stmt, 0) = new_temp;
/* Insertion must not split the preheader edge (new_bb == NULL).  */
492 pe = loop_preheader_edge (loop);
493 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
494 gcc_assert (!new_bb);
496 if (vect_print_dump_info (REPORT_DETAILS))
498 fprintf (vect_dump, "created new init_stmt: ");
499 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
502 vec_oprnd = TREE_OPERAND (init_stmt, 0);
503 return vec_oprnd;
507 /* Function vect_get_vec_def_for_operand.
509 OP is an operand in STMT. This function returns a (vector) def that will be
510 used in the vectorized stmt for STMT.
512 In the case that OP is an SSA_NAME which is defined in the loop, then
513 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
515 In case OP is an invariant or constant, a new stmt that creates a vector def
516 needs to be introduced. */
518 static tree
519 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
521 tree vec_oprnd;
522 tree vec_stmt;
523 tree def_stmt;
524 stmt_vec_info def_stmt_info = NULL;
525 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
526 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
527 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
528 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
529 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
530 tree vec_inv;
531 tree vec_cst;
532 tree t = NULL_TREE;
533 tree def;
534 int i;
535 enum vect_def_type dt;
536 bool is_simple_use;
537 tree vector_type;
539 if (vect_print_dump_info (REPORT_DETAILS))
541 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
542 print_generic_expr (vect_dump, op, TDF_SLIM);
/* Classify OP (constant / invariant / loop def / reduction /
   induction); the vectorizer's analysis phase guarantees this
   succeeds, hence the assert.  */
545 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
546 gcc_assert (is_simple_use);
547 if (vect_print_dump_info (REPORT_DETAILS))
549 if (def)
551 fprintf (vect_dump, "def = ");
552 print_generic_expr (vect_dump, def, TDF_SLIM);
554 if (def_stmt)
556 fprintf (vect_dump, " def_stmt = ");
557 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
561 switch (dt)
563 /* Case 1: operand is a constant. */
564 case vect_constant_def:
566 if (scalar_def)
567 *scalar_def = op;
569 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
570 if (vect_print_dump_info (REPORT_DETAILS))
571 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
573 for (i = nunits - 1; i >= 0; --i)
575 t = tree_cons (NULL_TREE, op, t);
577 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
578 vec_cst = build_vector (vector_type, t);
580 return vect_init_vector (stmt, vec_cst, vector_type);
583 /* Case 2: operand is defined outside the loop - loop invariant. */
584 case vect_invariant_def:
586 if (scalar_def)
587 *scalar_def = def;
589 /* Create 'vec_inv = {inv,inv,..,inv}' */
590 if (vect_print_dump_info (REPORT_DETAILS))
591 fprintf (vect_dump, "Create vector_inv.");
593 for (i = nunits - 1; i >= 0; --i)
595 t = tree_cons (NULL_TREE, def, t);
598 /* FIXME: use build_constructor directly. */
599 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
600 vec_inv = build_constructor_from_list (vector_type, t);
601 return vect_init_vector (stmt, vec_inv, vector_type);
604 /* Case 3: operand is defined inside the loop. */
605 case vect_loop_def:
607 if (scalar_def)
608 *scalar_def = def_stmt;
610 /* Get the def from the vectorized stmt. */
611 def_stmt_info = vinfo_for_stmt (def_stmt);
612 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
613 gcc_assert (vec_stmt);
614 vec_oprnd = TREE_OPERAND (vec_stmt, 0);
615 return vec_oprnd;
618 /* Case 4: operand is defined by a loop header phi - reduction */
619 case vect_reduction_def:
621 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
623 /* Get the def before the loop */
624 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
625 return get_initial_def_for_reduction (stmt, op, scalar_def);
628 /* Case 5: operand is defined by loop-header phi - induction. */
629 case vect_induction_def:
631 if (vect_print_dump_info (REPORT_DETAILS))
632 fprintf (vect_dump, "induction - unsupported.");
633 internal_error ("no support for induction"); /* FORNOW */
636 default:
637 gcc_unreachable ();
642 /* Function vect_get_vec_def_for_stmt_copy
644 Return a vector-def for an operand. This function is used when the
645 vectorized stmt to be created (by the caller to this function) is a "copy"
646 created in case the vectorized result cannot fit in one vector, and several
647 copies of the vector-stmt are required. In this case the vector-def is
648 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
649 of the stmt that defines VEC_OPRND.
650 DT is the type of the vector def VEC_OPRND.
652 Context:
653 In case the vectorization factor (VF) is bigger than the number
654 of elements that can fit in a vectype (nunits), we have to generate
655 more than one vector stmt to vectorize the scalar stmt. This situation
656 arises when there are multiple data-types operated upon in the loop; the
657 smallest data-type determines the VF, and as a result, when vectorizing
658 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
659 vector stmt (each computing a vector of 'nunits' results, and together
660 computing 'VF' results in each iteration). This function is called when
661 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
662 which VF=16 and nunits=4, so the number of copies required is 4):
664 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
666 S1: x = load VS1.0: vx.0 = memref0 VS1.1
667 VS1.1: vx.1 = memref1 VS1.2
668 VS1.2: vx.2 = memref2 VS1.3
669 VS1.3: vx.3 = memref3
671 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
672 VSnew.1: vz1 = vx.1 + ... VSnew.2
673 VSnew.2: vz2 = vx.2 + ... VSnew.3
674 VSnew.3: vz3 = vx.3 + ...
676 The vectorization of S1 is explained in vectorizable_load.
677 The vectorization of S2:
678 To create the first vector-stmt out of the 4 copies - VSnew.0 -
679 the function 'vect_get_vec_def_for_operand' is called to
680 get the relevant vector-def for each operand of S2. For operand x it
681 returns the vector-def 'vx.0'.
683 To create the remaining copies of the vector-stmt (VSnew.j), this
684 function is called to get the relevant vector-def for each operand. It is
685 obtained from the respective VS1.j stmt, which is recorded in the
686 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
688 For example, to obtain the vector-def 'vx.1' in order to create the
689 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
690 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
691 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
692 and return its def ('vx.1').
693 Overall, to create the above sequence this function will be called 3 times:
694 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
695 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
696 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
698 static tree
699 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
701 tree vec_stmt_for_operand;
702 stmt_vec_info def_stmt_info;
/* Constants and invariants are the same in every copy.  */
704 if (dt == vect_invariant_def || dt == vect_constant_def)
706 /* Do nothing; can reuse same def. */ ;
707 return vec_oprnd;
/* Otherwise follow the RELATED_STMT chain to the next copy's def.  */
710 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
711 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
712 gcc_assert (def_stmt_info);
713 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
714 gcc_assert (vec_stmt_for_operand);
715 vec_oprnd = TREE_OPERAND (vec_stmt_for_operand, 0);
717 return vec_oprnd;
721 /* Function vect_finish_stmt_generation.
723 Insert the new stmt VEC_STMT before BSI (leaving BSI on the scalar
724 stmt STMT being vectorized), attach a fresh stmt_vec_info to it,
725 and copy STMT's source location onto it. */
725 static void
726 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
727 block_stmt_iterator *bsi)
729 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
730 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
732 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
733 set_stmt_info (get_stmt_ann (vec_stmt),
734 new_stmt_vec_info (vec_stmt, loop_vinfo));
736 if (vect_print_dump_info (REPORT_DETAILS))
738 fprintf (vect_dump, "add new stmt: ");
739 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
742 /* Make sure bsi points to the stmt that is being vectorized. */
743 gcc_assert (stmt == bsi_stmt (*bsi));
745 #ifdef USE_MAPPED_LOCATION
746 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
747 #else
748 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
749 #endif
/* When defined, use the "ADJUST_IN_EPILOG" scheme (Option1 below):
   the initial vector is filled with the operation's identity and the
   caller adjusts the final result by init_val in the loop epilog.  */
753 #define ADJUST_IN_EPILOG 1
755 /* Function get_initial_def_for_reduction
757 Input:
758 STMT - a stmt that performs a reduction operation in the loop.
759 INIT_VAL - the initial value of the reduction variable
761 Output:
762 SCALAR_DEF - a tree that holds a value to be added to the final result
763 of the reduction (used for "ADJUST_IN_EPILOG" - see below).
764 Return a vector variable, initialized according to the operation that STMT
765 performs. This vector will be used as the initial value of the
766 vector of partial results.
768 Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
769 add: [0,0,...,0,0]
770 mult: [1,1,...,1,1]
771 min/max: [init_val,init_val,..,init_val,init_val]
772 bit and/or: [init_val,init_val,..,init_val,init_val]
773 and when necessary (e.g. add/mult case) let the caller know
774 that it needs to adjust the result by init_val.
776 Option2: Initialize the vector as follows:
777 add: [0,0,...,0,init_val]
778 mult: [1,1,...,1,init_val]
779 min/max: [init_val,init_val,...,init_val]
780 bit and/or: [init_val,init_val,...,init_val]
781 and no adjustments are needed.
783 For example, for the following code:
785 s = init_val;
786 for (i=0;i<n;i++)
787 s = s + a[i];
789 STMT is 's = s + a[i]', and the reduction variable is 's'.
790 For a vector of 4 units, we want to return either [0,0,0,init_val],
791 or [0,0,0,0] and let the caller know that it needs to adjust
792 the result at the end by 'init_val'.
794 FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
795 TODO: Use some cost-model to estimate which scheme is more profitable.
798 static tree
799 get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
801 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
802 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
803 int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
804 int nelements;
805 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
806 tree type = TREE_TYPE (init_val);
807 tree def;
808 tree vec, t = NULL_TREE;
809 bool need_epilog_adjust;
810 int i;
811 tree vector_type;
813 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
/* Pick the per-element value DEF (the operation's identity for
   sums, init_val itself for min/max) and decide whether the epilog
   must add init_val back into the result.  */
815 switch (code)
817 case WIDEN_SUM_EXPR:
818 case DOT_PROD_EXPR:
819 case PLUS_EXPR:
820 if (INTEGRAL_TYPE_P (type))
821 def = build_int_cst (type, 0);
822 else
823 def = build_real (type, dconst0);
825 #ifdef ADJUST_IN_EPILOG
826 /* All the 'nunits' elements are set to 0. The final result will be
827 adjusted by 'init_val' at the loop epilog. */
828 nelements = nunits;
829 need_epilog_adjust = true;
830 #else
831 /* 'nunits - 1' elements are set to 0; The last element is set to
832 'init_val'. No further adjustments at the epilog are needed. */
833 nelements = nunits - 1;
834 need_epilog_adjust = false;
835 #endif
836 break;
838 case MIN_EXPR:
839 case MAX_EXPR:
840 def = init_val;
841 nelements = nunits;
842 need_epilog_adjust = false;
843 break;
845 default:
846 gcc_unreachable ();
849 for (i = nelements - 1; i >= 0; --i)
850 t = tree_cons (NULL_TREE, def, t);
852 if (nelements == nunits - 1)
854 /* Set the last element of the vector. */
855 t = tree_cons (NULL_TREE, init_val, t);
856 nelements += 1;
858 gcc_assert (nelements == nunits);
/* Constants can be folded into a VECTOR_CST; otherwise a runtime
   CONSTRUCTOR is needed.  */
860 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
861 if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
862 vec = build_vector (vector_type, t);
863 else
864 vec = build_constructor_from_list (vector_type, t);
866 if (!need_epilog_adjust)
867 *scalar_def = NULL_TREE;
868 else
869 *scalar_def = init_val;
871 return vect_init_vector (stmt, vec, vector_type);
875 /* Function vect_create_epilog_for_reduction
877 Create code at the loop-epilog to finalize the result of a reduction
878 computation.
880 VECT_DEF is a vector of partial results.
881 REDUC_CODE is the tree-code for the epilog reduction.
882 STMT is the scalar reduction stmt that is being vectorized.
883 REDUCTION_PHI is the phi-node that carries the reduction computation.
885 This function:
886 1. Creates the reduction def-use cycle: sets the the arguments for
887 REDUCTION_PHI:
888 The loop-entry argument is the vectorized initial-value of the reduction.
889 The loop-latch argument is VECT_DEF - the vector of partial sums.
890 2. "Reduces" the vector of partial results VECT_DEF into a single result,
891 by applying the operation specified by REDUC_CODE if available, or by
892 other means (whole-vector shifts or a scalar loop).
893 The function also creates a new phi node at the loop exit to preserve
894 loop-closed form, as illustrated below.
896 The flow at the entry to this function:
898 loop:
899 vec_def = phi <null, null> # REDUCTION_PHI
900 VECT_DEF = vector_stmt # vectorized form of STMT
901 s_loop = scalar_stmt # (scalar) STMT
902 loop_exit:
903 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
904 use <s_out0>
905 use <s_out0>
907 The above is transformed by this function into:
909 loop:
910 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
911 VECT_DEF = vector_stmt # vectorized form of STMT
912 s_loop = scalar_stmt # (scalar) STMT
913 loop_exit:
914 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
915 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
916 v_out2 = reduce <v_out1>
917 s_out3 = extract_field <v_out2, 0>
918 s_out4 = adjust_result <s_out3>
919 use <s_out4>
920 use <s_out4>
923 static void
924 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
925 enum tree_code reduc_code, tree reduction_phi)
/* VECT_DEF is the vectorized def of the partial results computed in the loop;
   STMT is the (scalar) reduction stmt; REDUC_CODE is the tree-code of a direct
   whole-vector reduction, or NUM_TREE_CODES when the target provides none
   (set by vectorizable_reduction); REDUCTION_PHI is the loop-header phi of
   the reduction that still needs its arguments filled in.  */
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 tree vectype;
929 enum machine_mode mode;
930 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
932 basic_block exit_bb;
933 tree scalar_dest;
934 tree scalar_type;
935 tree new_phi;
936 block_stmt_iterator exit_bsi;
937 tree vec_dest;
938 tree new_temp;
939 tree new_name;
940 tree epilog_stmt;
941 tree new_scalar_dest, exit_phi;
942 tree bitsize, bitpos, bytesize;
943 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
944 tree scalar_initial_def;
945 tree vec_initial_def;
946 tree orig_name;
947 imm_use_iterator imm_iter;
948 use_operand_p use_p;
949 bool extract_scalar_result;
950 tree reduction_op;
951 tree orig_stmt;
952 tree use_stmt;
953 tree operation = TREE_OPERAND (stmt, 1);
954 int op_type;
/* The reduction variable is the last operand of the (binary or ternary)
   reduction operation; its type determines the vectype used below.  */
956 op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
957 reduction_op = TREE_OPERAND (operation, op_type-1);
958 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
959 mode = TYPE_MODE (vectype);
961 /*** 1. Create the reduction def-use cycle ***/
963 /* 1.1 set the loop-entry arg of the reduction-phi: */
964 /* For the case of reduction, vect_get_vec_def_for_operand returns
965 the scalar def before the loop, that defines the initial value
966 of the reduction variable. */
967 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
968 &scalar_initial_def);
969 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
971 /* 1.2 set the loop-latch arg for the reduction-phi: */
972 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
974 if (vect_print_dump_info (REPORT_DETAILS))
976 fprintf (vect_dump, "transform reduction: created def-use cycle:");
977 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
978 fprintf (vect_dump, "\n");
979 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
983 /*** 2. Create epilog code
984 The reduction epilog code operates across the elements of the vector
985 of partial results computed by the vectorized loop.
986 The reduction epilog code consists of:
987 step 1: compute the scalar result in a vector (v_out2)
988 step 2: extract the scalar result (s_out3) from the vector (v_out2)
989 step 3: adjust the scalar result (s_out3) if needed.
991 Step 1 can be accomplished using one of the following three schemes:
992 (scheme 1) using reduc_code, if available.
993 (scheme 2) using whole-vector shifts, if available.
994 (scheme 3) using a scalar loop. In this case steps 1+2 above are
995 combined.
997 The overall epilog code looks like this:
999 s_out0 = phi <s_loop> # original EXIT_PHI
1000 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1001 v_out2 = reduce <v_out1> # step 1
1002 s_out3 = extract_field <v_out2, 0> # step 2
1003 s_out4 = adjust_result <s_out3> # step 3
1005 (step 3 is optional, and steps 1 and 2 may be combined).
1006 Lastly, the uses of s_out0 are replaced by s_out4.
1008 ***/
1010 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1011 v_out1 = phi <v_loop> */
1013 exit_bb = single_exit (loop)->dest;
1014 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1015 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1016 exit_bsi = bsi_start (exit_bb);
/* All epilog statements created below are inserted in order at the start
   of the loop's single exit block, via exit_bsi.  */
1018 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1019 (i.e. when reduc_code is not available) and in the final adjustment code
1020 (if needed). Also get the original scalar reduction variable as
1021 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1022 represents a reduction pattern), the tree-code and scalar-def are
1023 taken from the original stmt that the pattern-stmt (STMT) replaces.
1024 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1025 are taken from STMT. */
1027 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1028 if (!orig_stmt)
1030 /* Regular reduction */
1031 orig_stmt = stmt;
1033 else
1035 /* Reduction pattern */
1036 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1037 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1038 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
1040 code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1041 scalar_dest = TREE_OPERAND (orig_stmt, 0);
1042 scalar_type = TREE_TYPE (scalar_dest);
1043 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1044 bitsize = TYPE_SIZE (scalar_type);
1045 bytesize = TYPE_SIZE_UNIT (scalar_type);
1047 /* 2.3 Create the reduction code, using one of the three schemes described
1048 above. */
/* reduc_code < NUM_TREE_CODES means the caller found a direct vector
   reduction operation for this target; NUM_TREE_CODES is the "none
   available" sentinel set in vectorizable_reduction.  */
1050 if (reduc_code < NUM_TREE_CODES)
1052 /*** Case 1: Create:
1053 v_out2 = reduc_expr <v_out1> */
1055 if (vect_print_dump_info (REPORT_DETAILS))
1056 fprintf (vect_dump, "Reduce using direct vector reduction.");
1058 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1059 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1060 build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
1061 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1062 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1063 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1065 extract_scalar_result = true;
1067 else
/* shift_code stays 0 (unused) unless a whole-vector shift exists.  */
1069 enum tree_code shift_code = 0;
1070 bool have_whole_vector_shift = true;
1071 int bit_offset;
1072 int element_bitsize = tree_low_cst (bitsize, 1);
1073 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1074 tree vec_temp;
1076 if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
1077 shift_code = VEC_RSHIFT_EXPR;
1078 else
1079 have_whole_vector_shift = false;
1081 /* Regardless of whether we have a whole vector shift, if we're
1082 emulating the operation via tree-vect-generic, we don't want
1083 to use it. Only the first round of the reduction is likely
1084 to still be profitable via emulation. */
1085 /* ??? It might be better to emit a reduction tree code here, so that
1086 tree-vect-generic can expand the first round via bit tricks. */
1087 if (!VECTOR_MODE_P (mode))
1088 have_whole_vector_shift = false;
1089 else
1091 optab optab = optab_for_tree_code (code, vectype);
1092 if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
1093 have_whole_vector_shift = false;
1096 if (have_whole_vector_shift)
1098 /*** Case 2: Create:
1099 for (offset = VS/2; offset >= element_size; offset/=2)
1101 Create: va' = vec_shift <va, offset>
1102 Create: va = vop <va, va'>
1103 } */
1105 if (vect_print_dump_info (REPORT_DETAILS))
1106 fprintf (vect_dump, "Reduce using vector shifts");
1108 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1109 new_temp = PHI_RESULT (new_phi);
1111 for (bit_offset = vec_size_in_bits/2;
1112 bit_offset >= element_bitsize;
1113 bit_offset /= 2)
1115 tree bitpos = size_int (bit_offset);
1117 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1118 build2 (shift_code, vectype,
1119 new_temp, bitpos));
1120 new_name = make_ssa_name (vec_dest, epilog_stmt);
1121 TREE_OPERAND (epilog_stmt, 0) = new_name;
1122 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1124 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1125 build2 (code, vectype,
1126 new_name, new_temp));
1127 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1128 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1129 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1132 extract_scalar_result = true;
1134 else
1136 tree rhs;
1138 /*** Case 3: Create:
1139 s = extract_field <v_out2, 0>
1140 for (offset = element_size;
1141 offset < vector_size;
1142 offset += element_size;)
1144 Create: s' = extract_field <v_out2, offset>
1145 Create: s = op <s, s'>
1146 } */
1148 if (vect_print_dump_info (REPORT_DETAILS))
1149 fprintf (vect_dump, "Reduce using scalar code. ");
1151 vec_temp = PHI_RESULT (new_phi);
1152 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1153 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1154 bitsize_zero_node);
1155 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1156 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest, rhs);
1157 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1158 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1159 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1161 for (bit_offset = element_bitsize;
1162 bit_offset < vec_size_in_bits;
1163 bit_offset += element_bitsize)
1165 tree bitpos = bitsize_int (bit_offset);
1166 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1167 bitpos);
1169 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1170 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest,
1171 rhs);
1172 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
1173 TREE_OPERAND (epilog_stmt, 0) = new_name;
1174 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1176 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest,
1177 build2 (code, scalar_type, new_name, new_temp));
1178 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1179 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1180 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1183 extract_scalar_result = false;
1187 /* 2.4 Extract the final scalar result. Create:
1188 s_out3 = extract_field <v_out2, bitpos> */
1190 if (extract_scalar_result)
1192 tree rhs;
1194 if (vect_print_dump_info (REPORT_DETAILS))
1195 fprintf (vect_dump, "extract scalar result");
/* On big-endian targets the scalar result lives in the last element
   of the vector; on little-endian, in the first.  */
1197 if (BYTES_BIG_ENDIAN)
1198 bitpos = size_binop (MULT_EXPR,
1199 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
1200 TYPE_SIZE (scalar_type));
1201 else
1202 bitpos = bitsize_zero_node;
1204 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
1205 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1206 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest, rhs);
1207 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1208 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1209 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1212 /* 2.5 Adjust the final result by the initial value of the reduction
1213 variable. (When such adjustment is not needed, then
1214 'scalar_initial_def' is zero).
1216 Create:
1217 s_out4 = scalar_expr <s_out3, scalar_initial_def> */
1219 if (scalar_initial_def)
1221 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest,
1222 build2 (code, scalar_type, new_temp, scalar_initial_def));
1223 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1224 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1225 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1228 /* 2.6 Replace uses of s_out0 with uses of s_out4 */
1230 /* Find the loop-closed-use at the loop exit of the original scalar result.
1231 (The reduction result is expected to have two immediate uses - one at the
1232 latch block, and one at the loop exit). */
1233 exit_phi = NULL;
1234 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
1236 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
1238 exit_phi = USE_STMT (use_p);
1239 break;
1242 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
1243 gcc_assert (exit_phi);
1244 /* Replace the uses: */
1245 orig_name = PHI_RESULT (exit_phi);
1246 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
1247 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1248 SET_USE (use_p, new_temp);
1252 /* Function vectorizable_reduction.
1254 Check if STMT performs a reduction operation that can be vectorized.
1255 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1256 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1257 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1259 This function also handles reduction idioms (patterns) that have been
1260 recognized in advance during vect_pattern_recog. In this case, STMT may be
1261 of this form:
1262 X = pattern_expr (arg0, arg1, ..., X)
1263 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
1264 sequence that had been detected and replaced by the pattern-stmt (STMT).
1266 In some cases of reduction patterns, the type of the reduction variable X is
1267 different than the type of the other arguments of STMT.
1268 In such cases, the vectype that is used when transforming STMT into a vector
1269 stmt is different than the vectype that is used to determine the
1270 vectorization factor, because it consists of a different number of elements
1271 than the actual number of elements that are being operated upon in parallel.
1273 For example, consider an accumulation of shorts into an int accumulator.
1274 On some targets it's possible to vectorize this pattern operating on 8
1275 shorts at a time (hence, the vectype for purposes of determining the
1276 vectorization factor should be V8HI); on the other hand, the vectype that
1277 is used to create the vector form is actually V4SI (the type of the result).
1279 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1280 indicates what is the actual level of parallelism (V8HI in the example), so
1281 that the right vectorization factor would be derived. This vectype
1282 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1283 be used to create the vectorized stmt. The right vectype for the vectorized
1284 stmt is obtained from the type of the result X:
1285 get_vectype_for_scalar_type (TREE_TYPE (X))
1287 This means that, contrary to "regular" reductions (or "regular" stmts in
1288 general), the following equation:
1289 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1290 does *NOT* necessarily hold for reduction patterns. */
1292 bool
1293 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1295 tree vec_dest;
1296 tree scalar_dest;
1297 tree op;
1298 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1299 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1300 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1301 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1302 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1303 tree operation;
1304 enum tree_code code, orig_code, epilog_reduc_code = 0;
1305 enum machine_mode vec_mode;
1306 int op_type;
1307 optab optab, reduc_optab;
1308 tree new_temp = NULL_TREE;
1309 tree def, def_stmt;
1310 enum vect_def_type dt;
1311 tree new_phi;
1312 tree scalar_type;
1313 bool is_simple_use;
1314 tree orig_stmt;
1315 stmt_vec_info orig_stmt_info;
1316 tree expr = NULL_TREE;
1317 int i;
1318 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1319 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1320 stmt_vec_info prev_stmt_info;
1321 tree reduc_def;
1322 tree new_stmt = NULL_TREE;
1323 int j;
1325 gcc_assert (ncopies >= 1);
1327 /* 1. Is vectorizable reduction? */
1329 /* Not supportable if the reduction variable is used in the loop. */
1330 if (STMT_VINFO_RELEVANT_P (stmt_info))
1331 return false;
/* A reduction result must be live (used after the loop); otherwise this
   stmt is not a reduction we handle.  */
1333 if (!STMT_VINFO_LIVE_P (stmt_info))
1334 return false;
1336 /* Make sure it was already recognized as a reduction computation. */
1337 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1338 return false;
1340 /* 2. Has this been recognized as a reduction pattern?
1342 Check if STMT represents a pattern that has been recognized
1343 in earlier analysis stages. For stmts that represent a pattern,
1344 the STMT_VINFO_RELATED_STMT field records the last stmt in
1345 the original sequence that constitutes the pattern. */
1347 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1348 if (orig_stmt)
1350 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1351 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1352 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1353 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1356 /* 3. Check the operands of the operation. The first operands are defined
1357 inside the loop body. The last operand is the reduction variable,
1358 which is defined by the loop-header-phi. */
1360 gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR);
1362 operation = TREE_OPERAND (stmt, 1);
1363 code = TREE_CODE (operation);
1364 op_type = TREE_CODE_LENGTH (code);
1365 if (op_type != binary_op && op_type != ternary_op)
1366 return false;
1367 scalar_dest = TREE_OPERAND (stmt, 0);
1368 scalar_type = TREE_TYPE (scalar_dest);
1370 /* All uses but the last are expected to be defined in the loop.
1371 The last use is the reduction variable. */
1372 for (i = 0; i < op_type-1; i++)
1374 op = TREE_OPERAND (operation, i);
1375 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1376 gcc_assert (is_simple_use);
1377 gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
1378 dt == vect_constant_def);
/* After the loop, i == op_type-1: check the reduction operand itself,
   which must be defined by the loop-header phi.  */
1381 op = TREE_OPERAND (operation, i);
1382 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1383 gcc_assert (is_simple_use);
1384 gcc_assert (dt == vect_reduction_def);
1385 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1386 if (orig_stmt)
1387 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1388 else
1389 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1391 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1392 return false;
1394 /* 4. Supportable by target? */
1396 /* 4.1. check support for the operation in the loop */
1397 optab = optab_for_tree_code (code, vectype);
1398 if (!optab)
1400 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "no optab.");
1402 return false;
1404 vec_mode = TYPE_MODE (vectype);
1405 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1407 if (vect_print_dump_info (REPORT_DETAILS))
1408 fprintf (vect_dump, "op not supported by target.");
1409 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1410 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1411 < vect_min_worthwhile_factor (code))
1412 return false;
1413 if (vect_print_dump_info (REPORT_DETAILS))
1414 fprintf (vect_dump, "proceeding using word mode.");
1417 /* Worthwhile without SIMD support? */
1418 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1419 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1420 < vect_min_worthwhile_factor (code))
1422 if (vect_print_dump_info (REPORT_DETAILS))
1423 fprintf (vect_dump, "not worthwhile without SIMD support.");
1424 return false;
1427 /* 4.2. Check support for the epilog operation.
1429 If STMT represents a reduction pattern, then the type of the
1430 reduction variable may be different than the type of the rest
1431 of the arguments. For example, consider the case of accumulation
1432 of shorts into an int accumulator; The original code:
1433 S1: int_a = (int) short_a;
1434 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1436 was replaced with:
1437 STMT: int_acc = widen_sum <short_a, int_acc>
1439 This means that:
1440 1. The tree-code that is used to create the vector operation in the
1441 epilog code (that reduces the partial results) is not the
1442 tree-code of STMT, but is rather the tree-code of the original
1443 stmt from the pattern that STMT is replacing. I.e, in the example
1444 above we want to use 'widen_sum' in the loop, but 'plus' in the
1445 epilog.
1446 2. The type (mode) we use to check available target support
1447 for the vector operation to be created in the *epilog*, is
1448 determined by the type of the reduction variable (in the example
1449 above we'd check this: plus_optab[vect_int_mode]).
1450 However the type (mode) we use to check available target support
1451 for the vector operation to be created *inside the loop*, is
1452 determined by the type of the other arguments to STMT (in the
1453 example we'd check this: widen_sum_optab[vect_short_mode]).
1455 This is contrary to "regular" reductions, in which the types of all
1456 the arguments are the same as the type of the reduction variable.
1457 For "regular" reductions we can therefore use the same vector type
1458 (and also the same tree-code) when generating the epilog code and
1459 when generating the code inside the loop. */
1461 if (orig_stmt)
1463 /* This is a reduction pattern: get the vectype from the type of the
1464 reduction variable, and get the tree-code from orig_stmt. */
1465 orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1466 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1467 vec_mode = TYPE_MODE (vectype);
1469 else
1471 /* Regular reduction: use the same vectype and tree-code as used for
1472 the vector code inside the loop can be used for the epilog code. */
1473 orig_code = code;
1476 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1477 return false;
/* NUM_TREE_CODES below is a sentinel telling vect_create_epilog_for_reduction
   that no direct vector reduction is available, so it must fall back to
   vector shifts or scalar extraction.  */
1478 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1479 if (!reduc_optab)
1481 if (vect_print_dump_info (REPORT_DETAILS))
1482 fprintf (vect_dump, "no optab for reduction.");
1483 epilog_reduc_code = NUM_TREE_CODES;
1485 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1487 if (vect_print_dump_info (REPORT_DETAILS))
1488 fprintf (vect_dump, "reduc op not supported by target.");
1489 epilog_reduc_code = NUM_TREE_CODES;
1492 if (!vec_stmt) /* transformation not required. */
1494 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1495 return true;
1498 /** Transform. **/
1500 if (vect_print_dump_info (REPORT_DETAILS))
1501 fprintf (vect_dump, "transform reduction.");
1503 /* Create the destination vector */
1504 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1506 /* Create the reduction-phi that defines the reduction-operand. */
1507 new_phi = create_phi_node (vec_dest, loop->header);
1509 /* In case the vectorization factor (VF) is bigger than the number
1510 of elements that we can fit in a vectype (nunits), we have to generate
1511 more than one vector stmt - i.e - we need to "unroll" the
1512 vector stmt by a factor VF/nunits. For more details see documentation
1513 in vectorizable_operation. */
1515 prev_stmt_info = NULL;
1516 for (j = 0; j < ncopies; j++)
1518 /* Handle uses. */
1519 if (j == 0)
1521 op = TREE_OPERAND (operation, 0);
1522 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1523 if (op_type == ternary_op)
1525 op = TREE_OPERAND (operation, 1);
1526 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1529 /* Get the vector def for the reduction variable from the phi node */
1530 reduc_def = PHI_RESULT (new_phi);
1532 else
1534 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1535 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1536 if (op_type == ternary_op)
1537 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1539 /* Get the vector def for the reduction variable from the vectorized
1540 reduction operation generated in the previous iteration (j-1) */
1541 reduc_def = TREE_OPERAND (new_stmt ,0);
1544 /* Arguments are ready. create the new vector stmt. */
1546 if (op_type == binary_op)
1547 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1548 else
1549 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1550 reduc_def);
1551 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
1552 new_temp = make_ssa_name (vec_dest, new_stmt);
1553 TREE_OPERAND (new_stmt, 0) = new_temp;
1554 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1556 if (j == 0)
1557 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1558 else
1559 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1560 prev_stmt_info = vinfo_for_stmt (new_stmt);
1563 /* Finalize the reduction-phi (set its arguments) and create the
1564 epilog reduction code. */
1565 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1566 return true;
1569 /* Checks if CALL can be vectorized in type VECTYPE. Returns
1570 true if the target has a vectorized version of the function,
1571 or false if the function cannot be vectorized. */
1573 bool
1574 vectorizable_function (tree call, tree vectype)
1576 tree fndecl = get_callee_fndecl (call);
1578 /* We only handle functions that do not read or clobber memory -- i.e.
1579 const or novops ones. */
1580 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1581 return false;
1583 if (!fndecl
1584 || TREE_CODE (fndecl) != FUNCTION_DECL
1585 || !DECL_BUILT_IN (fndecl))
1586 return false;
1588 if (targetm.vectorize.builtin_vectorized_function (DECL_FUNCTION_CODE (fndecl), vectype))
1589 return true;
1591 return false;
1594 /* Returns an expression that performs a call to vectorized version
1595 of FNDECL in type VECTYPE, with the arguments given by ARGS.
1596 If extra statements need to be generated, they are inserted
1597 before BSI. */
1599 static tree
1600 build_vectorized_function_call (tree fndecl,
1601 tree vectype, tree args)
1603 tree vfndecl;
1604 enum built_in_function code = DECL_FUNCTION_CODE (fndecl);
1606 /* The target specific builtin should be available. */
1607 vfndecl = targetm.vectorize.builtin_vectorized_function (code, vectype);
1608 gcc_assert (vfndecl != NULL_TREE);
1610 return build_function_call_expr (vfndecl, args);
1613 /* Function vectorizable_call.
1615 Check if STMT performs a function call that can be vectorized.
1616 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1617 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1618 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1620 bool
1621 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1623 tree vec_dest;
1624 tree scalar_dest;
1625 tree operation;
1626 tree op, args, type;
1627 tree vec_oprnd, vargs, *pvargs_end;
1628 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1629 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1630 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1631 tree fndecl, rhs, new_temp, def, def_stmt;
1632 enum vect_def_type dt;
1634 /* Is STMT a vectorizable call? */
1635 if (TREE_CODE (stmt) != MODIFY_EXPR)
1636 return false;
1638 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
1639 return false;
1641 operation = TREE_OPERAND (stmt, 1);
1642 if (TREE_CODE (operation) != CALL_EXPR)
1643 return false;
1645 /* For now, we only vectorize functions if a target specific builtin
1646 is available. TODO -- in some cases, it might be profitable to
1647 insert the calls for pieces of the vector, in order to be able
1648 to vectorize other operations in the loop. */
1649 if (!vectorizable_function (operation, vectype))
1651 if (vect_print_dump_info (REPORT_DETAILS))
1652 fprintf (vect_dump, "function is not vectorizable.");
1654 return false;
1656 gcc_assert (!stmt_references_memory_p (stmt));
1658 for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN (args))
1660 op = TREE_VALUE (args);
1662 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1664 if (vect_print_dump_info (REPORT_DETAILS))
1665 fprintf (vect_dump, "use not simple.");
1666 return false;
1670 if (!vec_stmt) /* transformation not required. */
1672 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1673 return true;
1676 /** Transform. **/
1678 if (vect_print_dump_info (REPORT_DETAILS))
1679 fprintf (vect_dump, "transform operation.");
1681 /* Handle def. */
1682 scalar_dest = TREE_OPERAND (stmt, 0);
1683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1685 /* Handle uses. */
1686 vargs = NULL_TREE;
1687 pvargs_end = &vargs;
1688 for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN (args))
1690 op = TREE_VALUE (args);
1691 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1693 *pvargs_end = tree_cons (NULL_TREE, vec_oprnd, NULL_TREE);
1694 pvargs_end = &TREE_CHAIN (*pvargs_end);
1697 fndecl = get_callee_fndecl (operation);
1698 rhs = build_vectorized_function_call (fndecl, vectype, vargs);
1699 *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, rhs);
1700 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1701 TREE_OPERAND (*vec_stmt, 0) = new_temp;
1703 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1705 /* The call in STMT might prevent it from being removed in dce. We however
1706 cannot remove it here, due to the way the ssa name it defines is mapped
1707 to the new definition. So just replace rhs of the statement with something
1708 harmless. */
1709 type = TREE_TYPE (scalar_dest);
1710 TREE_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1712 return true;
1716 /* Function vectorizable_assignment.
1718 Check if STMT performs an assignment (copy) that can be vectorized.
1719 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1720 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1721 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1723 bool
1724 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1726 tree vec_dest;
1727 tree scalar_dest;
1728 tree op;
1729 tree vec_oprnd;
1730 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1731 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1732 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1733 tree new_temp;
1734 tree def, def_stmt;
1735 enum vect_def_type dt;
1736 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1737 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1739 gcc_assert (ncopies >= 1);
1740 if (ncopies > 1)
1741 return false; /* FORNOW */
1743 /* Is vectorizable assignment? */
1744 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1745 return false;
1747 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1749 if (TREE_CODE (stmt) != MODIFY_EXPR)
1750 return false;
1752 scalar_dest = TREE_OPERAND (stmt, 0);
1753 if (TREE_CODE (scalar_dest) != SSA_NAME)
1754 return false;
1756 op = TREE_OPERAND (stmt, 1);
1757 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1759 if (vect_print_dump_info (REPORT_DETAILS))
1760 fprintf (vect_dump, "use not simple.");
1761 return false;
1764 if (!vec_stmt) /* transformation not required. */
1766 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
1767 return true;
1770 /** Transform. **/
1771 if (vect_print_dump_info (REPORT_DETAILS))
1772 fprintf (vect_dump, "transform assignment.");
1774 /* Handle def. */
1775 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1777 /* Handle use. */
1778 op = TREE_OPERAND (stmt, 1);
1779 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1781 /* Arguments are ready. create the new vector stmt. */
1782 *vec_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, vec_oprnd);
1783 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1784 TREE_OPERAND (*vec_stmt, 0) = new_temp;
1785 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1787 return true;
1791 /* Function vect_min_worthwhile_factor.
1793 For a loop where we could vectorize the operation indicated by CODE,
1794 return the minimum vectorization factor that makes it worthwhile
1795 to use generic vectors. */
1796 static int
1797 vect_min_worthwhile_factor (enum tree_code code)
1799 switch (code)
1801 case PLUS_EXPR:
1802 case MINUS_EXPR:
1803 case NEGATE_EXPR:
1804 return 4;
1806 case BIT_AND_EXPR:
1807 case BIT_IOR_EXPR:
1808 case BIT_XOR_EXPR:
1809 case BIT_NOT_EXPR:
1810 return 2;
1812 default:
1813 return INT_MAX;
1818 /* Function vectorizable_operation.
1820 Check if STMT performs a binary or unary operation that can be vectorized.
1821 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1822 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1823 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1825 bool
1826 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1828 tree vec_dest;
1829 tree scalar_dest;
1830 tree operation;
1831 tree op0, op1 = NULL;
1832 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
1833 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1836 enum tree_code code;
1837 enum machine_mode vec_mode;
1838 tree new_temp;
1839 int op_type;
1840 optab optab;
1841 int icode;
1842 enum machine_mode optab_op2_mode;
1843 tree def, def_stmt;
1844 enum vect_def_type dt0, dt1;
1845 tree new_stmt;
1846 stmt_vec_info prev_stmt_info;
1847 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
1848 int nunits_out;
1849 tree vectype_out;
1850 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
1851 int j;
1853 gcc_assert (ncopies >= 1);
1855 /* Is STMT a vectorizable binary/unary operation? */
1856 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1857 return false;
1859 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1861 if (STMT_VINFO_LIVE_P (stmt_info))
1863 /* FORNOW: not yet supported. */
1864 if (vect_print_dump_info (REPORT_DETAILS))
1865 fprintf (vect_dump, "value used after loop.");
1866 return false;
1869 if (TREE_CODE (stmt) != MODIFY_EXPR)
1870 return false;
1872 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
1873 return false;
1875 scalar_dest = TREE_OPERAND (stmt, 0);
1876 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
1877 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
1878 if (nunits_out != nunits_in)
1879 return false;
1881 operation = TREE_OPERAND (stmt, 1);
1882 code = TREE_CODE (operation);
1883 optab = optab_for_tree_code (code, vectype);
1885 /* Support only unary or binary operations. */
1886 op_type = TREE_CODE_LENGTH (code);
1887 if (op_type != unary_op && op_type != binary_op)
1889 if (vect_print_dump_info (REPORT_DETAILS))
1890 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
1891 return false;
1894 op0 = TREE_OPERAND (operation, 0);
1895 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
1897 if (vect_print_dump_info (REPORT_DETAILS))
1898 fprintf (vect_dump, "use not simple.");
1899 return false;
1902 if (op_type == binary_op)
1904 op1 = TREE_OPERAND (operation, 1);
1905 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
1907 if (vect_print_dump_info (REPORT_DETAILS))
1908 fprintf (vect_dump, "use not simple.");
1909 return false;
1913 /* Supportable by target? */
1914 if (!optab)
1916 if (vect_print_dump_info (REPORT_DETAILS))
1917 fprintf (vect_dump, "no optab.");
1918 return false;
1920 vec_mode = TYPE_MODE (vectype);
1921 icode = (int) optab->handlers[(int) vec_mode].insn_code;
1922 if (icode == CODE_FOR_nothing)
1924 if (vect_print_dump_info (REPORT_DETAILS))
1925 fprintf (vect_dump, "op not supported by target.");
1926 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1927 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1928 < vect_min_worthwhile_factor (code))
1929 return false;
1930 if (vect_print_dump_info (REPORT_DETAILS))
1931 fprintf (vect_dump, "proceeding using word mode.");
1934 /* Worthwhile without SIMD support? */
1935 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1936 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1937 < vect_min_worthwhile_factor (code))
1939 if (vect_print_dump_info (REPORT_DETAILS))
1940 fprintf (vect_dump, "not worthwhile without SIMD support.");
1941 return false;
1944 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1946 /* FORNOW: not yet supported. */
1947 if (!VECTOR_MODE_P (vec_mode))
1948 return false;
1950 /* Invariant argument is needed for a vector shift
1951 by a scalar shift operand. */
1952 optab_op2_mode = insn_data[icode].operand[2].mode;
1953 if (! (VECTOR_MODE_P (optab_op2_mode)
1954 || dt1 == vect_constant_def
1955 || dt1 == vect_invariant_def))
1957 if (vect_print_dump_info (REPORT_DETAILS))
1958 fprintf (vect_dump, "operand mode requires invariant argument.");
1959 return false;
1963 if (!vec_stmt) /* transformation not required. */
1965 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
1966 return true;
1969 /** Transform. **/
1971 if (vect_print_dump_info (REPORT_DETAILS))
1972 fprintf (vect_dump, "transform binary/unary operation.");
1974 /* Handle def. */
1975 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1977 /* In case the vectorization factor (VF) is bigger than the number
1978 of elements that we can fit in a vectype (nunits), we have to generate
1979 more than one vector stmt - i.e - we need to "unroll" the
1980 vector stmt by a factor VF/nunits. In doing so, we record a pointer
1981 from one copy of the vector stmt to the next, in the field
1982 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
1983 stages to find the correct vector defs to be used when vectorizing
1984 stmts that use the defs of the current stmt. The example below illustrates
1985 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
1986 4 vectorized stmts):
1988 before vectorization:
1989 RELATED_STMT VEC_STMT
1990 S1: x = memref - -
1991 S2: z = x + 1 - -
1993 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
1994 there):
1995 RELATED_STMT VEC_STMT
1996 VS1_0: vx0 = memref0 VS1_1 -
1997 VS1_1: vx1 = memref1 VS1_2 -
1998 VS1_2: vx2 = memref2 VS1_3 -
1999 VS1_3: vx3 = memref3 - -
2000 S1: x = load - VS1_0
2001 S2: z = x + 1 - -
2003 step2: vectorize stmt S2 (done here):
2004 To vectorize stmt S2 we first need to find the relevant vector
2005 def for the first operand 'x'. This is, as usual, obtained from
2006 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2007 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2008 relevant vector def 'vx0'. Having found 'vx0' we can generate
2009 the vector stmt VS2_0, and as usual, record it in the
2010 STMT_VINFO_VEC_STMT of stmt S2.
2011 When creating the second copy (VS2_1), we obtain the relevant vector
2012 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2013 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2014 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2015 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2016 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2017 chain of stmts and pointers:
2018 RELATED_STMT VEC_STMT
2019 VS1_0: vx0 = memref0 VS1_1 -
2020 VS1_1: vx1 = memref1 VS1_2 -
2021 VS1_2: vx2 = memref2 VS1_3 -
2022 VS1_3: vx3 = memref3 - -
2023 S1: x = load - VS1_0
2024 VS2_0: vz0 = vx0 + v1 VS2_1 -
2025 VS2_1: vz1 = vx1 + v1 VS2_2 -
2026 VS2_2: vz2 = vx2 + v1 VS2_3 -
2027 VS2_3: vz3 = vx3 + v1 - -
2028 S2: z = x + 1 - VS2_0 */
2030 prev_stmt_info = NULL;
2031 for (j = 0; j < ncopies; j++)
2033 /* Handle uses. */
2034 if (j == 0)
2036 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2037 if (op_type == binary_op)
2039 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2041 /* Vector shl and shr insn patterns can be defined with
2042 scalar operand 2 (shift operand). In this case, use
2043 constant or loop invariant op1 directly, without
2044 extending it to vector mode first. */
2045 optab_op2_mode = insn_data[icode].operand[2].mode;
2046 if (!VECTOR_MODE_P (optab_op2_mode))
2048 if (vect_print_dump_info (REPORT_DETAILS))
2049 fprintf (vect_dump, "operand 1 using scalar mode.");
2050 vec_oprnd1 = op1;
2053 if (!vec_oprnd1)
2054 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2057 else
2059 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2060 if (op_type == binary_op)
2061 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2064 /* Arguments are ready. create the new vector stmt. */
2066 if (op_type == binary_op)
2067 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
2068 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2069 else
2070 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
2071 build1 (code, vectype, vec_oprnd0));
2072 new_temp = make_ssa_name (vec_dest, new_stmt);
2073 TREE_OPERAND (new_stmt, 0) = new_temp;
2074 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2076 if (j == 0)
2077 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2078 else
2079 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2080 prev_stmt_info = vinfo_for_stmt (new_stmt);
2083 return true;
2087 /* Function vectorizable_type_demotion
2089 Check if STMT performs a binary or unary operation that involves
2090 type demotion, and if it can be vectorized.
2091 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2092 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2093 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2095 bool
2096 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2097 tree *vec_stmt)
2099 tree vec_dest;
2100 tree scalar_dest;
2101 tree operation;
2102 tree op0;
2103 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2104 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2105 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2106 enum tree_code code;
2107 tree new_temp;
2108 tree def, def_stmt;
2109 enum vect_def_type dt0;
2110 tree new_stmt;
2111 stmt_vec_info prev_stmt_info;
2112 int nunits_in;
2113 int nunits_out;
2114 tree vectype_out;
2115 int ncopies;
2116 int j;
2117 tree expr;
2118 tree vectype_in;
2119 tree scalar_type;
2120 optab optab;
2121 enum machine_mode vec_mode;
2123 /* Is STMT a vectorizable type-demotion operation? */
2125 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2126 return false;
2128 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2130 if (STMT_VINFO_LIVE_P (stmt_info))
2132 /* FORNOW: not yet supported. */
2133 if (vect_print_dump_info (REPORT_DETAILS))
2134 fprintf (vect_dump, "value used after loop.");
2135 return false;
2138 if (TREE_CODE (stmt) != MODIFY_EXPR)
2139 return false;
2141 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
2142 return false;
2144 operation = TREE_OPERAND (stmt, 1);
2145 code = TREE_CODE (operation);
2146 if (code != NOP_EXPR && code != CONVERT_EXPR)
2147 return false;
2149 op0 = TREE_OPERAND (operation, 0);
2150 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2151 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2153 scalar_dest = TREE_OPERAND (stmt, 0);
2154 scalar_type = TREE_TYPE (scalar_dest);
2155 vectype_out = get_vectype_for_scalar_type (scalar_type);
2156 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2157 if (nunits_in != nunits_out / 2) /* FORNOW */
2158 return false;
2160 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2161 gcc_assert (ncopies >= 1);
2163 /* Check the operands of the operation. */
2164 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2166 if (vect_print_dump_info (REPORT_DETAILS))
2167 fprintf (vect_dump, "use not simple.");
2168 return false;
2171 /* Supportable by target? */
2172 code = VEC_PACK_MOD_EXPR;
2173 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2174 if (!optab)
2175 return false;
2177 vec_mode = TYPE_MODE (vectype_in);
2178 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2179 return false;
2181 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2183 if (!vec_stmt) /* transformation not required. */
2185 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2186 return true;
2189 /** Transform. **/
2191 if (vect_print_dump_info (REPORT_DETAILS))
2192 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2193 ncopies);
2195 /* Handle def. */
2196 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2198 /* In case the vectorization factor (VF) is bigger than the number
2199 of elements that we can fit in a vectype (nunits), we have to generate
2200 more than one vector stmt - i.e - we need to "unroll" the
2201 vector stmt by a factor VF/nunits. */
2202 prev_stmt_info = NULL;
2203 for (j = 0; j < ncopies; j++)
2205 /* Handle uses. */
2206 if (j == 0)
2208 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2209 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2210 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0);
2212 else
2214 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2215 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2218 /* Arguments are ready. Create the new vector stmt. */
2219 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2220 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2221 new_temp = make_ssa_name (vec_dest, new_stmt);
2222 TREE_OPERAND (new_stmt, 0) = new_temp;
2223 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2225 if (j == 0)
2226 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2227 else
2228 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2230 prev_stmt_info = vinfo_for_stmt (new_stmt);
2233 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2234 return true;
2238 /* Function vect_gen_widened_results_half
2240 Create a vector stmt whose code, type, number of arguments, and result
2241 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2242 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2243 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2244 needs to be created (DECL is a function-decl of a target-builtin).
2245 STMT is the original scalar stmt that we are vectorizing. */
2247 static tree
2248 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2249 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2250 tree vec_dest, block_stmt_iterator *bsi,
2251 tree stmt)
2253 tree vec_params;
2254 tree expr;
2255 tree new_stmt;
2256 tree new_temp;
2257 tree sym;
2258 ssa_op_iter iter;
2260 /* Generate half of the widened result: */
2261 if (code == CALL_EXPR)
2263 /* Target specific support */
2264 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2265 if (op_type == binary_op)
2266 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2267 expr = build_function_call_expr (decl, vec_params);
2269 else
2271 /* Generic support */
2272 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2273 if (op_type == binary_op)
2274 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2275 else
2276 expr = build1 (code, vectype, vec_oprnd0);
2278 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2279 new_temp = make_ssa_name (vec_dest, new_stmt);
2280 TREE_OPERAND (new_stmt, 0) = new_temp;
2281 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2283 if (code == CALL_EXPR)
2285 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2287 if (TREE_CODE (sym) == SSA_NAME)
2288 sym = SSA_NAME_VAR (sym);
2289 mark_sym_for_renaming (sym);
2293 return new_stmt;
2297 /* Function vectorizable_type_promotion
2299 Check if STMT performs a binary or unary operation that involves
2300 type promotion, and if it can be vectorized.
2301 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2302 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2303 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2305 bool
2306 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2307 tree *vec_stmt)
2309 tree vec_dest;
2310 tree scalar_dest;
2311 tree operation;
2312 tree op0, op1 = NULL;
2313 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2314 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2316 enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
2317 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2318 int op_type;
2319 tree def, def_stmt;
2320 enum vect_def_type dt0, dt1;
2321 tree new_stmt;
2322 stmt_vec_info prev_stmt_info;
2323 int nunits_in;
2324 int nunits_out;
2325 tree vectype_out;
2326 int ncopies;
2327 int j;
2328 tree vectype_in;
2330 /* Is STMT a vectorizable type-promotion operation? */
2332 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2333 return false;
2335 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2337 if (STMT_VINFO_LIVE_P (stmt_info))
2339 /* FORNOW: not yet supported. */
2340 if (vect_print_dump_info (REPORT_DETAILS))
2341 fprintf (vect_dump, "value used after loop.");
2342 return false;
2345 if (TREE_CODE (stmt) != MODIFY_EXPR)
2346 return false;
2348 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
2349 return false;
2351 operation = TREE_OPERAND (stmt, 1);
2352 code = TREE_CODE (operation);
2353 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2354 return false;
2356 op0 = TREE_OPERAND (operation, 0);
2357 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2358 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2359 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2360 gcc_assert (ncopies >= 1);
2362 scalar_dest = TREE_OPERAND (stmt, 0);
2363 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2364 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2365 if (nunits_out != nunits_in / 2) /* FORNOW */
2366 return false;
2368 /* Check the operands of the operation. */
2369 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2371 if (vect_print_dump_info (REPORT_DETAILS))
2372 fprintf (vect_dump, "use not simple.");
2373 return false;
2376 op_type = TREE_CODE_LENGTH (code);
2377 if (op_type == binary_op)
2379 op1 = TREE_OPERAND (operation, 1);
2380 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2382 if (vect_print_dump_info (REPORT_DETAILS))
2383 fprintf (vect_dump, "use not simple.");
2384 return false;
2388 /* Supportable by target? */
2389 if (!supportable_widening_operation (code, stmt, vectype_in,
2390 &decl1, &decl2, &code1, &code2))
2391 return false;
2393 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2395 if (!vec_stmt) /* transformation not required. */
2397 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2398 return true;
2401 /** Transform. **/
2403 if (vect_print_dump_info (REPORT_DETAILS))
2404 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2405 ncopies);
2407 /* Handle def. */
2408 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2410 /* In case the vectorization factor (VF) is bigger than the number
2411 of elements that we can fit in a vectype (nunits), we have to generate
2412 more than one vector stmt - i.e - we need to "unroll" the
2413 vector stmt by a factor VF/nunits. */
2415 prev_stmt_info = NULL;
2416 for (j = 0; j < ncopies; j++)
2418 /* Handle uses. */
2419 if (j == 0)
2421 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2422 if (op_type == binary_op)
2423 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2425 else
2427 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2428 if (op_type == binary_op)
2429 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2432 /* Arguments are ready. Create the new vector stmt. We are creating
2433 two vector defs because the widened result does not fit in one vector.
2434 The vectorized stmt can be expressed as a call to a taregt builtin,
2435 or a using a tree-code. */
2436 /* Generate first half of the widened result: */
2437 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2438 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2439 if (j == 0)
2440 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2441 else
2442 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2443 prev_stmt_info = vinfo_for_stmt (new_stmt);
2445 /* Generate second half of the widened result: */
2446 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2447 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2448 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2449 prev_stmt_info = vinfo_for_stmt (new_stmt);
2453 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2454 return true;
2458 /* Function vect_strided_store_supported.
2460 Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2461 and FALSE otherwise. */
2463 static bool
2464 vect_strided_store_supported (tree vectype)
2466 optab interleave_high_optab, interleave_low_optab;
2467 int mode;
2469 mode = (int) TYPE_MODE (vectype);
2471 /* Check that the operation is supported. */
2472 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2473 vectype);
2474 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2475 vectype);
2476 if (!interleave_high_optab || !interleave_low_optab)
2478 if (vect_print_dump_info (REPORT_DETAILS))
2479 fprintf (vect_dump, "no optab for interleave.");
2480 return false;
2483 if (interleave_high_optab->handlers[(int) mode].insn_code
2484 == CODE_FOR_nothing
2485 || interleave_low_optab->handlers[(int) mode].insn_code
2486 == CODE_FOR_nothing)
2488 if (vect_print_dump_info (REPORT_DETAILS))
2489 fprintf (vect_dump, "interleave op not supported by target.");
2490 return false;
2492 return true;
2496 /* Function vect_permute_store_chain.
2498 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2499 a power of 2, generate interleave_high/low stmts to reorder the data
2500 correctly for the stores. Return the final references for stores in
2501 RESULT_CHAIN.
2503 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2504 The input is 4 vectors each containing 8 elements. We assign a number to each
2505 element, the input sequence is:
2507 1st vec: 0 1 2 3 4 5 6 7
2508 2nd vec: 8 9 10 11 12 13 14 15
2509 3rd vec: 16 17 18 19 20 21 22 23
2510 4th vec: 24 25 26 27 28 29 30 31
2512 The output sequence should be:
2514 1st vec: 0 8 16 24 1 9 17 25
2515 2nd vec: 2 10 18 26 3 11 19 27
        3rd vec:   4 12 20 28  5 13 21 29
        4th vec:   6 14 22 30  7 15 23 31
2519 i.e., we interleave the contents of the four vectors in their order.
2521 We use interleave_high/low instructions to create such output. The input of
2522 each interleave_high/low operation is two vectors:
2523 1st vec 2nd vec
2524 0 1 2 3 4 5 6 7
2525 the even elements of the result vector are obtained left-to-right from the
2526 high/low elements of the first vector. The odd elements of the result are
2527 obtained left-to-right from the high/low elements of the second vector.
2528 The output of interleave_high will be: 0 4 1 5
2529 and of interleave_low: 2 6 3 7
2532 The permutation is done in log LENGTH stages. In each stage interleave_high
2533 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2534 where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
2536 In our example,
2538 I1: interleave_high (1st vec, 3rd vec)
2539 I2: interleave_low (1st vec, 3rd vec)
2540 I3: interleave_high (2nd vec, 4th vec)
2541 I4: interleave_low (2nd vec, 4th vec)
2543 The output for the first stage is:
2545 I1: 0 16 1 17 2 18 3 19
2546 I2: 4 20 5 21 6 22 7 23
2547 I3: 8 24 9 25 10 26 11 27
2548 I4: 12 28 13 29 14 30 15 31
2550 The output of the second stage, i.e. the final result is:
2552 I1: 0 8 16 24 1 9 17 25
2553 I2: 2 10 18 26 3 11 19 27
        I3:  4 12 20 28  5 13 21 29
        I4:  6 14 22 30  7 15 23 31.  */
2557 static bool
2558 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2559 unsigned int length,
2560 tree stmt,
2561 block_stmt_iterator *bsi,
2562 VEC(tree,heap) **result_chain)
2564 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2565 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2566 tree scalar_dest;
2567 int i;
2568 unsigned int j;
2569 VEC(tree,heap) *first, *second;
2571 scalar_dest = TREE_OPERAND (stmt, 0);
2572 first = VEC_alloc (tree, heap, length/2);
2573 second = VEC_alloc (tree, heap, length/2);
2575 /* Check that the operation is supported. */
2576 if (!vect_strided_store_supported (vectype))
2577 return false;
2579 *result_chain = VEC_copy (tree, heap, dr_chain);
2581 for (i = 0; i < exact_log2 (length); i++)
2583 for (j = 0; j < length/2; j++)
2585 vect1 = VEC_index (tree, dr_chain, j);
2586 vect2 = VEC_index (tree, dr_chain, j+length/2);
2588 /* high = interleave_high (vect1, vect2); */
2589 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2590 add_referenced_var (perm_dest);
2591 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
2592 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1,
2593 vect2));
2594 high = make_ssa_name (perm_dest, perm_stmt);
2595 TREE_OPERAND (perm_stmt, 0) = high;
2596 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2597 VEC_replace (tree, *result_chain, 2*j, high);
2599 /* low = interleave_low (vect1, vect2); */
2600 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2601 add_referenced_var (perm_dest);
2602 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
2603 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1,
2604 vect2));
2605 low = make_ssa_name (perm_dest, perm_stmt);
2606 TREE_OPERAND (perm_stmt, 0) = low;
2607 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2608 VEC_replace (tree, *result_chain, 2*j+1, low);
2610 dr_chain = VEC_copy (tree, heap, *result_chain);
2612 return true;
2616 /* Function vectorizable_store.
2618 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
2619 can be vectorized.
2620 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2621 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2622 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2624 bool
2625 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2627 tree scalar_dest;
2628 tree data_ref;
2629 tree op;
2630 tree vec_oprnd = NULL_TREE;
2631 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2632 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2634 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2635 enum machine_mode vec_mode;
2636 tree dummy;
2637 enum dr_alignment_support alignment_support_cheme;
2638 ssa_op_iter iter;
2639 def_operand_p def_p;
2640 tree def, def_stmt;
2641 enum vect_def_type dt;
2642 stmt_vec_info prev_stmt_info = NULL;
2643 tree dataref_ptr = NULL_TREE;
2644 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2645 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2646 int j;
2647 tree next_stmt, first_stmt;
2648 bool strided_store = false;
2649 unsigned int group_size, i;
2650 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2651 gcc_assert (ncopies >= 1);
2653 /* Is vectorizable store? */
2655 if (TREE_CODE (stmt) != MODIFY_EXPR)
2656 return false;
2658 scalar_dest = TREE_OPERAND (stmt, 0);
2659 if (TREE_CODE (scalar_dest) != ARRAY_REF
2660 && TREE_CODE (scalar_dest) != INDIRECT_REF
2661 && !DR_GROUP_FIRST_DR (stmt_info))
2662 return false;
2664 op = TREE_OPERAND (stmt, 1);
2665 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2667 if (vect_print_dump_info (REPORT_DETAILS))
2668 fprintf (vect_dump, "use not simple.");
2669 return false;
2672 vec_mode = TYPE_MODE (vectype);
2673 /* FORNOW. In some cases can vectorize even if data-type not supported
2674 (e.g. - array initialization with 0). */
2675 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
2676 return false;
2678 if (!STMT_VINFO_DATA_REF (stmt_info))
2679 return false;
2681 if (DR_GROUP_FIRST_DR (stmt_info))
2683 strided_store = true;
2684 if (!vect_strided_store_supported (vectype))
2685 return false;
2688 if (!vec_stmt) /* transformation not required. */
2690 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2691 return true;
2694 /** Transform. **/
2696 if (vect_print_dump_info (REPORT_DETAILS))
2697 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
2699 if (strided_store)
2701 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2702 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2703 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2705 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2707 /* We vectorize all the stmts of the interleaving group when we
2708 reach the last stmt in the group. */
2709 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2710 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2712 *vec_stmt = NULL_TREE;
2713 return true;
2716 else
2718 first_stmt = stmt;
2719 first_dr = dr;
2720 group_size = 1;
2723 dr_chain = VEC_alloc (tree, heap, group_size);
2724 oprnds = VEC_alloc (tree, heap, group_size);
2726 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
2727 gcc_assert (alignment_support_cheme);
2728 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
2730 /* In case the vectorization factor (VF) is bigger than the number
2731 of elements that we can fit in a vectype (nunits), we have to generate
2732 more than one vector stmt - i.e - we need to "unroll" the
2733 vector stmt by a factor VF/nunits. For more details see documentation in
2734 vect_get_vec_def_for_copy_stmt. */
2736 /* In case of interleaving (non-unit strided access):
2738 S1: &base + 2 = x2
2739 S2: &base = x0
2740 S3: &base + 1 = x1
2741 S4: &base + 3 = x3
2743 We create vectorized storess starting from base address (the access of the
2744 first stmt in the chain (S2 in the above example), when the last store stmt
2745 of the chain (S4) is reached:
2747 VS1: &base = vx2
2748 VS2: &base + vec_size*1 = vx0
2749 VS3: &base + vec_size*2 = vx1
2750 VS4: &base + vec_size*3 = vx3
2752 Then permutation statements are generated:
2754 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
2755 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
2758 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
2759 (the order of the data-refs in the output of vect_permute_store_chain
2760 corresponds to the order of scalar stmts in the interleaving chain - see
2761 the documentation of vect_permute_store_chain()).
2763 In case of both multiple types and interleaving, above vector stores and
2764 permutation stmts are created for every copy. The result vector stmts are
2765 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
2766 STMT_VINFO_RELATED_STMT for the next copies.
2769 prev_stmt_info = NULL;
2770 for (j = 0; j < ncopies; j++)
2772 tree new_stmt;
2773 tree ptr_incr;
2775 if (j == 0)
2777 /* For interleaved stores we collect vectorized defs for all the
2778 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
2779 as an input to vect_permute_store_chain(), and OPRNDS as an input
2780 to vect_get_vec_def_for_stmt_copy() for the next copy.
2781 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2782 OPRNDS are of size 1.
2784 next_stmt = first_stmt;
2785 for (i = 0; i < group_size; i++)
2787 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
2788 is the exact number of stmts in the chain. Therefore, NEXT_STMT
2789 can't be NULL_TREE. In case that there is no interleaving,
2790 GROUP_SIZE is 1, and only one iteration of the loop will be
2791 executed.
2793 gcc_assert (next_stmt);
2794 op = TREE_OPERAND (next_stmt, 1);
2795 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
2796 VEC_quick_push(tree, dr_chain, vec_oprnd);
2797 VEC_quick_push(tree, oprnds, vec_oprnd);
2798 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2800 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
2801 &dummy, &ptr_incr, false,
2802 TREE_TYPE (vec_oprnd));
2804 else
2806 /* For interleaved stores we created vectorized defs for all the
2807 defs stored in OPRNDS in the previous iteration (previous copy).
2808 DR_CHAIN is then used as an input to vect_permute_store_chain(),
2809 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
2810 next copy.
2811 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2812 OPRNDS are of size 1.
2814 for (i = 0; i < group_size; i++)
2816 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
2817 VEC_index (tree, oprnds, i));
2818 VEC_replace(tree, dr_chain, i, vec_oprnd);
2819 VEC_replace(tree, oprnds, i, vec_oprnd);
2821 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2824 if (strided_store)
2826 result_chain = VEC_alloc (tree, heap, group_size);
2827 /* Permute. */
2828 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
2829 &result_chain))
2830 return false;
2833 next_stmt = first_stmt;
2834 for (i = 0; i < group_size; i++)
2836 /* For strided stores vectorized defs are interleaved in
2837 vect_permute_store_chain(). */
2838 if (strided_store)
2839 vec_oprnd = VEC_index(tree, result_chain, i);
2841 data_ref = build_fold_indirect_ref (dataref_ptr);
2842 /* Arguments are ready. Create the new vector stmt. */
2843 new_stmt = build2 (MODIFY_EXPR, void_type_node, data_ref,
2844 vec_oprnd);
2845 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2847 /* Set the V_MAY_DEFS for the vector pointer. If this virtual def has a
2848 use outside the loop and a loop peel is performed then the def may be
2849 renamed by the peel. Mark it for renaming so the later use will also
2850 be renamed. */
2851 copy_virtual_operands (new_stmt, next_stmt);
2852 if (j == 0)
2854 /* The original store is deleted so the same SSA_NAMEs can be used.
2856 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VMAYDEF)
2858 SSA_NAME_DEF_STMT (def) = new_stmt;
2859 mark_sym_for_renaming (SSA_NAME_VAR (def));
2862 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2864 else
2866 /* Create new names for all the definitions created by COPY and
2867 add replacement mappings for each new name. */
2868 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VMAYDEF)
2870 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
2871 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
2874 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2877 prev_stmt_info = vinfo_for_stmt (new_stmt);
2878 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2879 if (!next_stmt)
2880 break;
2881 /* Bump the vector pointer. */
2882 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2886 return true;
2890 /* Function vect_setup_realignment
2892 This function is called when vectorizing an unaligned load using
2893 the dr_unaligned_software_pipeline scheme.
2894 This function generates the following code at the loop prolog:
2896 p = initial_addr;
2897 msq_init = *(floor(p)); # prolog load
2898 realignment_token = call target_builtin;
2899 loop:
2900 msq = phi (msq_init, ---)
2902 The code above sets up a new (vector) pointer, pointing to the first
2903 location accessed by STMT, and a "floor-aligned" load using that pointer.
2904 It also generates code to compute the "realignment-token" (if the relevant
2905 target hook was defined), and creates a phi-node at the loop-header bb
2906 whose arguments are the result of the prolog-load (created by this
2907 function) and the result of a load that takes place in the loop (to be
2908 created by the caller to this function).
2909 The caller to this function uses the phi-result (msq) to create the
2910 realignment code inside the loop, and sets up the missing phi argument,
2911 as follows:
2913 loop:
2914 msq = phi (msq_init, lsq)
2915 lsq = *(floor(p')); # load in loop
2916 result = realign_load (msq, lsq, realignment_token);
2918 Input:
2919 STMT - (scalar) load stmt to be vectorized. This load accesses
2920 a memory location that may be unaligned.
2921 BSI - place where new code is to be inserted.
2923 Output:
2924 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
2925 target hook, if defined.
2926 Return value - the result of the loop-header phi node. */
2928 static tree
2929 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
2930 tree *realignment_token)
2932 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2933 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2934 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2935 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2936 edge pe = loop_preheader_edge (loop);
2937 tree scalar_dest = TREE_OPERAND (stmt, 0);
2938 tree vec_dest;
2939 tree init_addr;
2940 tree inc;
2941 tree ptr;
2942 tree data_ref;
2943 tree new_stmt;
2944 basic_block new_bb;
2945 tree msq_init;
2946 tree new_temp;
2947 tree phi_stmt;
2948 tree msq;
2950 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
2951 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2952 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
2953 NULL_TREE);
/* ALIGN_INDIRECT_REF expresses the "floor"-aligned access: the low address
   bits are masked off by the target when expanding this load. */
2954 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
2955 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, data_ref);
2956 new_temp = make_ssa_name (vec_dest, new_stmt);
2957 TREE_OPERAND (new_stmt, 0) = new_temp;
2958 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
/* Inserting on the preheader edge must never require splitting it.  */
2959 gcc_assert (!new_bb);
2960 msq_init = TREE_OPERAND (new_stmt, 0);
2961 copy_virtual_operands (new_stmt, stmt);
2962 update_vuses_to_preheader (new_stmt, loop);
2964 /* 2. Create permutation mask, if required, in loop preheader. */
2965 if (targetm.vectorize.builtin_mask_for_load)
2967 tree builtin_decl;
2968 tree params = build_tree_list (NULL_TREE, init_addr);
2970 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
2971 new_stmt = build_function_call_expr (builtin_decl, params);
2972 vec_dest = vect_create_destination_var (scalar_dest,
2973 TREE_TYPE (new_stmt));
2974 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, new_stmt);
2975 new_temp = make_ssa_name (vec_dest, new_stmt);
2976 TREE_OPERAND (new_stmt, 0) = new_temp;
2977 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2978 gcc_assert (!new_bb);
2979 *realignment_token = TREE_OPERAND (new_stmt, 0);
2981 /* The result of the CALL_EXPR to this builtin is determined from
2982 the value of the parameter and no global variables are touched
2983 which makes the builtin a "const" function. Requiring the
2984 builtin to have the "const" attribute makes it unnecessary
2985 to call mark_call_clobbered. */
2986 gcc_assert (TREE_READONLY (builtin_decl))
2989 /* 3. Create msq = phi <msq_init, lsq> in loop */
2990 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2991 msq = make_ssa_name (vec_dest, NULL_TREE);
2992 phi_stmt = create_phi_node (msq, loop->header);
2993 SSA_NAME_DEF_STMT (msq) = phi_stmt;
/* Only the preheader argument is set here; the caller is responsible for
   adding the latch argument (lsq) once the in-loop load exists (see the
   function header comment). */
2994 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
2996 return msq;
3000 /* Function vect_strided_load_supported.
3002 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3003 and FALSE otherwise. */
3005 static bool
3006 vect_strided_load_supported (tree vectype)
3008 optab perm_even_optab, perm_odd_optab;
3009 int mode;
3011 mode = (int) TYPE_MODE (vectype);
/* Interleaved loads are lowered with extract_even/extract_odd permutes,
   so both operations must have an optab entry AND a target insn for
   VECTYPE's machine mode. */
3013 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3014 if (!perm_even_optab)
3016 if (vect_print_dump_info (REPORT_DETAILS))
3017 fprintf (vect_dump, "no optab for perm_even.");
3018 return false;
3021 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3023 if (vect_print_dump_info (REPORT_DETAILS))
3024 fprintf (vect_dump, "perm_even op not supported by target.");
3025 return false;
3028 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3029 if (!perm_odd_optab)
3031 if (vect_print_dump_info (REPORT_DETAILS))
3032 fprintf (vect_dump, "no optab for perm_odd.");
3033 return false;
3036 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3038 if (vect_print_dump_info (REPORT_DETAILS))
3039 fprintf (vect_dump, "perm_odd op not supported by target.");
3040 return false;
3042 return true;
3046 /* Function vect_permute_load_chain.
3048 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3049 a power of 2, generate extract_even/odd stmts to reorder the input data
3050 correctly. Return the final references for loads in RESULT_CHAIN.
3052 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3053 The input is 4 vectors each containing 8 elements. We assign a number to each
3054 element, the input sequence is:
3056 1st vec: 0 1 2 3 4 5 6 7
3057 2nd vec: 8 9 10 11 12 13 14 15
3058 3rd vec: 16 17 18 19 20 21 22 23
3059 4th vec: 24 25 26 27 28 29 30 31
3061 The output sequence should be:
3063 1st vec: 0 4 8 12 16 20 24 28
3064 2nd vec: 1 5 9 13 17 21 25 29
3065 3rd vec: 2 6 10 14 18 22 26 30
3066 4th vec: 3 7 11 15 19 23 27 31
3068 i.e., the first output vector should contain the first elements of each
3069 interleaving group, etc.
3071 We use extract_even/odd instructions to create such output. The input of each
3072 extract_even/odd operation is two vectors
3073 1st vec 2nd vec
3074 0 1 2 3 4 5 6 7
3076 and the output is the vector of extracted even/odd elements. The output of
3077 extract_even will be: 0 2 4 6
3078 and of extract_odd: 1 3 5 7
3081 The permutation is done in log LENGTH stages. In each stage extract_even and
3082 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3083 order. In our example,
3085 E1: extract_even (1st vec, 2nd vec)
3086 E2: extract_odd (1st vec, 2nd vec)
3087 E3: extract_even (3rd vec, 4th vec)
3088 E4: extract_odd (3rd vec, 4th vec)
3090 The output for the first stage will be:
3092 E1: 0 2 4 6 8 10 12 14
3093 E2: 1 3 5 7 9 11 13 15
3094 E3: 16 18 20 22 24 26 28 30
3095 E4: 17 19 21 23 25 27 29 31
3097 In order to proceed and create the correct sequence for the next stage (or
3098 for the correct output, if the second stage is the last one, as in our
3099 example), we first put the output of extract_even operation and then the
3100 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3101 The input for the second stage is:
3103 1st vec (E1): 0 2 4 6 8 10 12 14
3104 2nd vec (E3): 16 18 20 22 24 26 28 30
3105 3rd vec (E2): 1 3 5 7 9 11 13 15
3106 4th vec (E4): 17 19 21 23 25 27 29 31
3108 The output of the second stage:
3110 E1: 0 4 8 12 16 20 24 28
3111 E2: 2 6 10 14 18 22 26 30
3112 E3: 1 5 9 13 17 21 25 29
3113 E4: 3 7 11 15 19 23 27 31
3115 And RESULT_CHAIN after reordering:
3117 1st vec (E1): 0 4 8 12 16 20 24 28
3118 2nd vec (E3): 1 5 9 13 17 21 25 29
3119 3rd vec (E2): 2 6 10 14 18 22 26 30
3120 4th vec (E4): 3 7 11 15 19 23 27 31. */
3122 static bool
3123 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3124 unsigned int length,
3125 tree stmt,
3126 block_stmt_iterator *bsi,
3127 VEC(tree,heap) **result_chain)
3129 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3130 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3131 int i;
3132 unsigned int j;
3134 /* Check that the operation is supported. */
3135 if (!vect_strided_load_supported (vectype))
3136 return false;
3138 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; LENGTH is a power of 2 (see function comment). */
3139 for (i = 0; i < exact_log2 (length); i++)
3141 for (j = 0; j < length; j +=2)
3143 first_vect = VEC_index (tree, dr_chain, j);
3144 second_vect = VEC_index (tree, dr_chain, j+1);
3146 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3147 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3148 add_referenced_var (perm_dest);
3150 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
3151 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3152 first_vect, second_vect));
3154 data_ref = make_ssa_name (perm_dest, perm_stmt);
3155 TREE_OPERAND (perm_stmt, 0) = data_ref;
3156 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3157 mark_new_vars_to_rename (perm_stmt);
/* Even-element results go to the first half of RESULT_CHAIN ... */
3159 VEC_replace (tree, *result_chain, j/2, data_ref);
3161 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3162 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3163 add_referenced_var (perm_dest);
3165 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
3166 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3167 first_vect, second_vect));
3168 data_ref = make_ssa_name (perm_dest, perm_stmt);
3169 TREE_OPERAND (perm_stmt, 0) = data_ref;
3170 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3171 mark_new_vars_to_rename (perm_stmt);
/* ... and odd-element results to the second half, which yields the
   reordering described in the function comment. */
3173 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* Feed this stage's output back in as the next stage's input. */
3175 dr_chain = VEC_copy (tree, heap, *result_chain);
3177 return true;
3181 /* Function vect_transform_strided_load.
3183 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3184 to perform their permutation and ascribe the result vectorized statements to
3185 the scalar statements.
3188 static bool
3189 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3190 block_stmt_iterator *bsi)
3192 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3193 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3194 tree next_stmt, new_stmt;
3195 VEC(tree,heap) *result_chain = NULL;
3196 unsigned int i, gap_count;
3197 tree tmp_data_ref;
3199 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3200 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3201 vectors, that are ready for vector computation. */
3202 result_chain = VEC_alloc (tree, heap, size);
3203 /* Permute. */
3204 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3205 return false;
3207 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3208 Since we scan the chain starting from its first node, their order
3209 corresponds the order of data-refs in RESULT_CHAIN. */
3210 next_stmt = first_stmt;
3211 gap_count = 1;
3212 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3214 if (!next_stmt)
3215 break;
3217 /* Skip the gaps. Loads created for the gaps will be removed by dead
3218 code elimination pass later.
3219 DR_GROUP_GAP is the number of steps in elements from the previous
3220 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3221 correspond to the gaps.
3223 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3225 gap_count++;
3226 continue;
3229 while (next_stmt)
3231 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3232 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3233 copies, and we put the new vector statement in the first available
3234 RELATED_STMT. */
3235 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3236 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3237 else
3239 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3240 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3241 vinfo_for_stmt (prev_stmt));
/* Walk to the end of the RELATED_STMT chain and append there. */
3242 while (rel_stmt)
3244 prev_stmt = rel_stmt;
3245 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3247 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3249 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3250 gap_count = 1;
3251 /* If NEXT_STMT accesses the same DR as the previous statement,
3252 put the same TMP_DATA_REF as its vectorized statement; otherwise
3253 get the next data-ref from RESULT_CHAIN. */
3254 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3255 break;
3258 return true;
3262 /* vectorizable_load.
3264 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
3265 can be vectorized.
3266 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3267 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3268 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3270 bool
3271 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3273 tree scalar_dest;
3274 tree vec_dest = NULL;
3275 tree data_ref = NULL;
3276 tree op;
3277 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3278 stmt_vec_info prev_stmt_info;
3279 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3280 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3281 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3282 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3283 tree new_temp;
3284 int mode;
3285 tree new_stmt = NULL_TREE;
3286 tree dummy;
3287 enum dr_alignment_support alignment_support_cheme;
3288 tree dataref_ptr = NULL_TREE;
3289 tree ptr_incr;
3290 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3291 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3292 int i, j, group_size;
3293 tree msq = NULL_TREE, lsq;
3294 tree offset = NULL_TREE;
3295 tree realignment_token = NULL_TREE;
3296 tree phi_stmt = NULL_TREE;
3297 VEC(tree,heap) *dr_chain = NULL;
3298 bool strided_load = false;
3299 tree first_stmt;
3301 /* Is vectorizable load? */
3302 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3303 return false;
3305 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3307 if (STMT_VINFO_LIVE_P (stmt_info))
3309 /* FORNOW: not yet supported. */
3310 if (vect_print_dump_info (REPORT_DETAILS))
3311 fprintf (vect_dump, "value used after loop.");
3312 return false;
3315 if (TREE_CODE (stmt) != MODIFY_EXPR)
3316 return false;
3318 scalar_dest = TREE_OPERAND (stmt, 0);
3319 if (TREE_CODE (scalar_dest) != SSA_NAME)
3320 return false;
3322 op = TREE_OPERAND (stmt, 1);
3323 if (TREE_CODE (op) != ARRAY_REF
3324 && TREE_CODE (op) != INDIRECT_REF
3325 && !DR_GROUP_FIRST_DR (stmt_info))
3326 return false;
3328 if (!STMT_VINFO_DATA_REF (stmt_info))
3329 return false;
3331 mode = (int) TYPE_MODE (vectype);
3333 /* FORNOW. In some cases can vectorize even if data-type not supported
3334 (e.g. - data copies). */
3335 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3337 if (vect_print_dump_info (REPORT_DETAILS))
3338 fprintf (vect_dump, "Aligned load, but unsupported type.");
3339 return false;
3342 /* Check if the load is a part of an interleaving chain. */
3343 if (DR_GROUP_FIRST_DR (stmt_info))
3345 strided_load = true;
3347 /* Check if interleaving is supported. */
3348 if (!vect_strided_load_supported (vectype))
3349 return false;
3352 if (!vec_stmt) /* transformation not required. */
3354 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3355 return true;
3358 /** Transform. **/
3360 if (vect_print_dump_info (REPORT_DETAILS))
3361 fprintf (vect_dump, "transform load.");
3363 if (strided_load)
3365 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3366 /* Check if the chain of loads is already vectorized. */
3367 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3369 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3370 return true;
3372 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3373 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3374 dr_chain = VEC_alloc (tree, heap, group_size);
3376 else
3378 first_stmt = stmt;
3379 first_dr = dr;
3380 group_size = 1;
3383 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3384 gcc_assert (alignment_support_cheme);
3387 /* In case the vectorization factor (VF) is bigger than the number
3388 of elements that we can fit in a vectype (nunits), we have to generate
3389 more than one vector stmt - i.e - we need to "unroll" the
3390 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3391 from one copy of the vector stmt to the next, in the field
3392 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3393 stages to find the correct vector defs to be used when vectorizing
3394 stmts that use the defs of the current stmt. The example below illustrates
3395 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3396 4 vectorized stmts):
3398 before vectorization:
3399 RELATED_STMT VEC_STMT
3400 S1: x = memref - -
3401 S2: z = x + 1 - -
3403 step 1: vectorize stmt S1:
3404 We first create the vector stmt VS1_0, and, as usual, record a
3405 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3406 Next, we create the vector stmt VS1_1, and record a pointer to
3407 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3408 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3409 stmts and pointers:
3410 RELATED_STMT VEC_STMT
3411 VS1_0: vx0 = memref0 VS1_1 -
3412 VS1_1: vx1 = memref1 VS1_2 -
3413 VS1_2: vx2 = memref2 VS1_3 -
3414 VS1_3: vx3 = memref3 - -
3415 S1: x = load - VS1_0
3416 S2: z = x + 1 - -
3418 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3419 information we recorded in RELATED_STMT field is used to vectorize
3420 stmt S2. */
3422 /* In case of interleaving (non-unit strided access):
3424 S1: x2 = &base + 2
3425 S2: x0 = &base
3426 S3: x1 = &base + 1
3427 S4: x3 = &base + 3
3429 Vectorized loads are created in the order of memory accesses
3430 starting from the access of the first stmt of the chain:
3432 VS1: vx0 = &base
3433 VS2: vx1 = &base + vec_size*1
3434 VS3: vx3 = &base + vec_size*2
3435 VS4: vx4 = &base + vec_size*3
3437 Then permutation statements are generated:
3439 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3440 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3443 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3444 (the order of the data-refs in the output of vect_permute_load_chain
3445 corresponds to the order of scalar stmts in the interleaving chain - see
3446 the documentation of vect_permute_load_chain()).
3447 The generation of permutation stmts and recording them in
3448 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3450 In case of both multiple types and interleaving, the vector loads and
3451 permutation stmts above are created for every copy. The result vector stmts
3452 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3453 STMT_VINFO_RELATED_STMT for the next copies. */
3455 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3456 on a target that supports unaligned accesses (dr_unaligned_supported)
3457 we generate the following code:
3458 p = initial_addr;
3459 indx = 0;
3460 loop {
3461 p = p + indx * vectype_size;
3462 vec_dest = *(p);
3463 indx = indx + 1;
3466 Otherwise, the data reference is potentially unaligned on a target that
3467 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3468 then generate the following code, in which the data in each iteration is
3469 obtained by two vector loads, one from the previous iteration, and one
3470 from the current iteration:
3471 p1 = initial_addr;
3472 msq_init = *(floor(p1))
3473 p2 = initial_addr + VS - 1;
3474 realignment_token = call target_builtin;
3475 indx = 0;
3476 loop {
3477 p2 = p2 + indx * vectype_size
3478 lsq = *(floor(p2))
3479 vec_dest = realign_load (msq, lsq, realignment_token)
3480 indx = indx + 1;
3481 msq = lsq;
3482 } */
3484 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3486 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3487 phi_stmt = SSA_NAME_DEF_STMT (msq);
/* OFFSET makes the in-loop pointer address the last element of the
   vector, so that its "floor" is the next aligned chunk. */
3488 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3491 prev_stmt_info = NULL;
/* Outer loop: one iteration per unrolled copy (multiple types);
   inner loop: one vector load per member of the interleaving group. */
3492 for (j = 0; j < ncopies; j++)
3494 /* 1. Create the vector pointer update chain. */
3495 if (j == 0)
3496 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3497 &ptr_incr, false, NULL_TREE);
3498 else
3499 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3501 for (i = 0; i < group_size; i++)
3503 /* 2. Create the vector-load in the loop. */
3504 switch (alignment_support_cheme)
3506 case dr_aligned:
3507 gcc_assert (aligned_access_p (first_dr));
3508 data_ref = build_fold_indirect_ref (dataref_ptr);
3509 break;
3510 case dr_unaligned_supported:
3512 int mis = DR_MISALIGNMENT (first_dr);
3513 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3515 gcc_assert (!aligned_access_p (first_dr));
3516 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3517 data_ref =
3518 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3519 break;
3521 case dr_unaligned_software_pipeline:
3522 gcc_assert (!aligned_access_p (first_dr));
3523 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3524 break;
3525 default:
3526 gcc_unreachable ();
3528 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3529 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, data_ref);
3530 new_temp = make_ssa_name (vec_dest, new_stmt);
3531 TREE_OPERAND (new_stmt, 0) = new_temp;
3532 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3533 copy_virtual_operands (new_stmt, stmt);
3534 mark_new_vars_to_rename (new_stmt);
3536 /* 3. Handle explicit realignment if necessary/supported. */
3537 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3539 /* Create in loop:
3540 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3541 lsq = TREE_OPERAND (new_stmt, 0);
3542 if (!realignment_token)
3543 realignment_token = dataref_ptr;
3544 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3545 new_stmt =
3546 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3547 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, new_stmt);
3548 new_temp = make_ssa_name (vec_dest, new_stmt);
3549 TREE_OPERAND (new_stmt, 0) = new_temp;
3550 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Close the msq phi with the very last lsq (see
   vect_setup_realignment, which left the latch argument unset). */
3551 if (i == group_size - 1 && j == ncopies - 1)
3552 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3553 msq = lsq;
3555 if (strided_load)
3556 VEC_quick_push (tree, dr_chain, new_temp);
3557 if (i < group_size - 1)
3558 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3561 if (strided_load)
3563 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3564 return false;
3565 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
/* Fresh chain for the next copy's group of loads. */
3566 dr_chain = VEC_alloc (tree, heap, group_size);
3568 else
3570 if (j == 0)
3571 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3572 else
3573 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3574 prev_stmt_info = vinfo_for_stmt (new_stmt);
3578 return true;
3582 /* Function vectorizable_live_operation.
3584 STMT computes a value that is used outside the loop. Check if
3585 it can be supported. */
3587 bool
3588 vectorizable_live_operation (tree stmt,
3589 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3590 tree *vec_stmt ATTRIBUTE_UNUSED)
3592 tree operation;
3593 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3594 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3595 int i;
3596 enum tree_code code;
3597 int op_type;
3598 tree op;
3599 tree def, def_stmt;
3600 enum vect_def_type dt;
3602 if (!STMT_VINFO_LIVE_P (stmt_info))
3603 return false;
3605 if (TREE_CODE (stmt) != MODIFY_EXPR)
3606 return false;
3608 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
3609 return false;
3611 operation = TREE_OPERAND (stmt, 1);
3612 code = TREE_CODE (operation);
3614 op_type = TREE_CODE_LENGTH (code);
3616 /* FORNOW: support only if all uses are invariant. This means
3617 that the scalar operations can remain in place, unvectorized.
3618 The original last scalar value that they compute will be used. */
3620 for (i = 0; i < op_type; i++)
3622 op = TREE_OPERAND (operation, i);
3623 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3625 if (vect_print_dump_info (REPORT_DETAILS))
3626 fprintf (vect_dump, "use not simple.");
3627 return false;
3630 if (dt != vect_invariant_def && dt != vect_constant_def)
3631 return false;
3634 /* No transformation is required for the cases we currently support. */
3635 return true;
3639 /* Function vect_is_simple_cond.
3641 Input:
3642 LOOP - the loop that is being vectorized.
3643 COND - Condition that is checked for simple use.
3645 Returns whether a COND can be vectorized. Checks whether
3646 condition operands are supportable using vec_is_simple_use. */
3648 static bool
3649 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3651 tree lhs, rhs;
3652 tree def;
3653 enum vect_def_type dt;
3655 if (!COMPARISON_CLASS_P (cond))
3656 return false;
3658 lhs = TREE_OPERAND (cond, 0);
3659 rhs = TREE_OPERAND (cond, 1);
/* Each comparison operand must be either an SSA name with a
   vectorizable def, or an integer/real constant. */
3661 if (TREE_CODE (lhs) == SSA_NAME)
3663 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3664 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3665 return false;
3667 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3668 return false;
3670 if (TREE_CODE (rhs) == SSA_NAME)
3672 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3673 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3674 return false;
3676 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3677 return false;
3679 return true;
3682 /* vectorizable_condition.
3684 Check if STMT is conditional modify expression that can be vectorized.
3685 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3686 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
3687 at BSI.
3689 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3691 bool
3692 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3694 tree scalar_dest = NULL_TREE;
3695 tree vec_dest = NULL_TREE;
3696 tree op = NULL_TREE;
3697 tree cond_expr, then_clause, else_clause;
3698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3700 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3701 tree vec_compare, vec_cond_expr;
3702 tree new_temp;
3703 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3704 enum machine_mode vec_mode;
3705 tree def;
3706 enum vect_def_type dt;
3707 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3708 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3710 gcc_assert (ncopies >= 1);
/* FORNOW: multiple-types (unrolled copies) not handled for COND_EXPR. */
3711 if (ncopies > 1)
3712 return false; /* FORNOW */
3714 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3715 return false;
3717 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3719 if (STMT_VINFO_LIVE_P (stmt_info))
3721 /* FORNOW: not yet supported. */
3722 if (vect_print_dump_info (REPORT_DETAILS))
3723 fprintf (vect_dump, "value used after loop.");
3724 return false;
3727 if (TREE_CODE (stmt) != MODIFY_EXPR)
3728 return false;
3730 op = TREE_OPERAND (stmt, 1);
3732 if (TREE_CODE (op) != COND_EXPR)
3733 return false;
3735 cond_expr = TREE_OPERAND (op, 0);
3736 then_clause = TREE_OPERAND (op, 1);
3737 else_clause = TREE_OPERAND (op, 2);
3739 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
3740 return false;
3742 /* We do not handle two different vector types for the condition
3743 and the values. */
3744 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
3745 return false;
/* THEN/ELSE clauses must be SSA names with vectorizable defs, or
   integer/real constants (same constraint as the condition operands). */
3747 if (TREE_CODE (then_clause) == SSA_NAME)
3749 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
3750 if (!vect_is_simple_use (then_clause, loop_vinfo,
3751 &then_def_stmt, &def, &dt))
3752 return false;
3754 else if (TREE_CODE (then_clause) != INTEGER_CST
3755 && TREE_CODE (then_clause) != REAL_CST)
3756 return false;
3758 if (TREE_CODE (else_clause) == SSA_NAME)
3760 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
3761 if (!vect_is_simple_use (else_clause, loop_vinfo,
3762 &else_def_stmt, &def, &dt))
3763 return false;
3765 else if (TREE_CODE (else_clause) != INTEGER_CST
3766 && TREE_CODE (else_clause) != REAL_CST)
3767 return false;
3770 vec_mode = TYPE_MODE (vectype);
3772 if (!vec_stmt)
3774 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
/* Analysis-only call: succeed only if the target can expand a
   VEC_COND_EXPR in this mode. */
3775 return expand_vec_cond_expr_p (op, vec_mode);
3778 /* Transform */
3780 /* Handle def. */
3781 scalar_dest = TREE_OPERAND (stmt, 0);
3782 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3784 /* Handle cond expr. */
3785 vec_cond_lhs =
3786 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
3787 vec_cond_rhs =
3788 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
3789 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
3790 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
3792 /* Arguments are ready. create the new vector stmt. */
3793 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
3794 vec_cond_lhs, vec_cond_rhs);
3795 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
3796 vec_compare, vec_then_clause, vec_else_clause);
3798 *vec_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, vec_cond_expr);
3799 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3800 TREE_OPERAND (*vec_stmt, 0) = new_temp;
3801 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3803 return true;
3806 /* Function vect_transform_stmt.
3808 Create a vectorized stmt to replace STMT, and insert it at BSI. */
3810 bool
3811 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
3813 bool is_store = false;
3814 tree vec_stmt = NULL_TREE;
3815 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3816 tree orig_stmt_in_pattern;
3817 bool done;
3819 if (STMT_VINFO_RELEVANT_P (stmt_info))
/* Dispatch on the vectorization kind decided during analysis. */
3821 switch (STMT_VINFO_TYPE (stmt_info))
3823 case type_demotion_vec_info_type:
3824 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
3825 gcc_assert (done);
3826 break;
3828 case type_promotion_vec_info_type:
3829 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
3830 gcc_assert (done);
3831 break;
3833 case op_vec_info_type:
3834 done = vectorizable_operation (stmt, bsi, &vec_stmt);
3835 gcc_assert (done);
3836 break;
3838 case assignment_vec_info_type:
3839 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
3840 gcc_assert (done);
3841 break;
3843 case load_vec_info_type:
3844 done = vectorizable_load (stmt, bsi, &vec_stmt);
3845 gcc_assert (done);
3846 break;
3848 case store_vec_info_type:
3849 done = vectorizable_store (stmt, bsi, &vec_stmt);
3850 gcc_assert (done);
3851 if (DR_GROUP_FIRST_DR (stmt_info))
3853 /* In case of interleaving, the whole chain is vectorized when the
3854 last store in the chain is reached. Store stmts before the last
3855 one are skipped, and their vec_stmt_info shouldn't be freed
3856 meanwhile. */
3857 *strided_store = true;
3858 if (STMT_VINFO_VEC_STMT (stmt_info))
3859 is_store = true;
3861 else
3862 is_store = true;
3863 break;
3865 case condition_vec_info_type:
3866 done = vectorizable_condition (stmt, bsi, &vec_stmt);
3867 gcc_assert (done);
3868 break;
3870 case call_vec_info_type:
3871 done = vectorizable_call (stmt, bsi, &vec_stmt);
3872 break;
3874 default:
3875 if (vect_print_dump_info (REPORT_DETAILS))
3876 fprintf (vect_dump, "stmt not supported.");
3877 gcc_unreachable ();
/* A skipped intermediate store of an interleaved chain is the only
   legitimate way to end up without a vector stmt here. */
3880 gcc_assert (vec_stmt || *strided_store);
3881 if (vec_stmt)
3883 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
3884 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
3885 if (orig_stmt_in_pattern)
3887 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
3888 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
3890 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3892 /* STMT was inserted by the vectorizer to replace a
3893 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
3894 original sequence that computed this idiom. We need to
3895 record a pointer to VEC_STMT in the stmt_info of
3896 ORIG_STMT_IN_PATTERN. See more details in the
3897 documentation of vect_pattern_recog. */
3899 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
3905 if (STMT_VINFO_LIVE_P (stmt_info))
3907 switch (STMT_VINFO_TYPE (stmt_info))
3909 case reduc_vec_info_type:
3910 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
3911 gcc_assert (done);
3912 break;
3914 default:
3915 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
3916 gcc_assert (done);
3920 return is_store;
3924 /* This function builds ni_name = number of iterations loop executes
3925 on the loop preheader. */
3927 static tree
3928 vect_build_loop_niters (loop_vec_info loop_vinfo)
3930 tree ni_name, stmt, var;
3931 edge pe;
3932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3933 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
3935 var = create_tmp_var (TREE_TYPE (ni), "niters");
3936 add_referenced_var (var);
3937 ni_name = force_gimple_operand (ni, &stmt, false, var);
3939 pe = loop_preheader_edge (loop);
3940 if (stmt)
3942 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3943 gcc_assert (!new_bb);
3946 return ni_name;
/* This function generates the following statements:

 ni_name = number of iterations loop executes
 ratio = ni_name / vf
 ratio_mult_vf_name = ratio * vf

 and places them at the loop preheader edge.  The three values are
 returned through *NI_NAME_PTR, *RATIO_NAME_PTR and
 *RATIO_MULT_VF_NAME_PTR respectively.  */

static void
vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
				 tree *ni_name_ptr,
				 tree *ratio_mult_vf_name_ptr,
				 tree *ratio_name_ptr)
{
  edge pe;
  basic_block new_bb;
  tree stmt, ni_name;
  tree var;
  tree ratio_name;
  tree ratio_mult_vf_name;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree ni = LOOP_VINFO_NITERS (loop_vinfo);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree log_vf;

  pe = loop_preheader_edge (loop);

  /* Generate temporary variable that contains
     number of iterations loop executes.  */

  ni_name = vect_build_loop_niters (loop_vinfo);
  /* Division/multiplication by VF is done as shifts; assumes VF is a
     power of two (exact_log2 would return -1 otherwise).  */
  log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));

  /* Create: ratio = ni >> log2(vf) */

  ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
  if (!is_gimple_val (ratio_name))
    {
      /* The shift didn't fold to a constant/SSA name; materialize it on
	 the preheader edge.  */
      var = create_tmp_var (TREE_TYPE (ni), "bnd");
      add_referenced_var (var);

      ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  /* Create: ratio_mult_vf = ratio << log2 (vf).  */

  ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
				    ratio_name, log_vf);
  if (!is_gimple_val (ratio_mult_vf_name))
    {
      /* Likewise, materialize the product on the preheader edge.  */
      var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
      add_referenced_var (var);

      ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
						 true, var);
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  *ni_name_ptr = ni_name;
  *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
  *ratio_name_ptr = ratio_name;

  return;
}
4022 /* Function update_vuses_to_preheader.
4024 Input:
4025 STMT - a statement with potential VUSEs.
4026 LOOP - the loop whose preheader will contain STMT.
4028 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4029 appears to be defined in a V_MAY_DEF in another statement in a loop.
4030 One such case is when the VUSE is at the dereference of a __restricted__
4031 pointer in a load and the V_MAY_DEF is at the dereference of a different
4032 __restricted__ pointer in a store. Vectorization may result in
4033 copy_virtual_uses being called to copy the problematic VUSE to a new
4034 statement that is being inserted in the loop preheader. This procedure
4035 is called to change the SSA_NAME in the new statement's VUSE from the
4036 SSA_NAME updated in the loop to the related SSA_NAME available on the
4037 path entering the loop.
4039 When this function is called, we have the following situation:
4041 # vuse <name1>
4042 S1: vload
4043 do {
4044 # name1 = phi < name0 , name2>
4046 # vuse <name1>
4047 S2: vload
4049 # name2 = vdef <name1>
4050 S3: vstore
4052 }while...
4054 Stmt S1 was created in the loop preheader block as part of misaligned-load
4055 handling. This function fixes the name of the vuse of S1 from 'name1' to
4056 'name0'. */
4058 static void
4059 update_vuses_to_preheader (tree stmt, struct loop *loop)
4061 basic_block header_bb = loop->header;
4062 edge preheader_e = loop_preheader_edge (loop);
4063 ssa_op_iter iter;
4064 use_operand_p use_p;
4066 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4068 tree ssa_name = USE_FROM_PTR (use_p);
4069 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4070 tree name_var = SSA_NAME_VAR (ssa_name);
4071 basic_block bb = bb_for_stmt (def_stmt);
4073 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4074 if (!IS_EMPTY_STMT (def_stmt)
4075 && flow_bb_inside_loop_p (loop, bb))
4077 /* If the block containing the statement defining the SSA_NAME
4078 is in the loop then it's necessary to find the definition
4079 outside the loop using the PHI nodes of the header. */
4080 tree phi;
4081 bool updated = false;
4083 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4085 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4087 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4088 updated = true;
4089 break;
4092 gcc_assert (updated);
/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized. The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized). i.e, the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

     The new definitions of the ivs are placed in LOOP->exit.
     The phi args associated with the edge UPDATE_E in the bb
     UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */

static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
				  edge update_e)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb = single_exit (loop)->dest;
  tree phi, phi1;
  basic_block update_bb = update_e->dest;

  /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));

  /* Walk the header phis and the update_bb phis in lockstep (they are
     assumed to be in the same order -- Assumption 2 above).  */
  for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
       phi && phi1;
       phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
    {
      tree access_fn = NULL;
      tree evolution_part;
      tree init_expr;
      tree step_expr;
      tree var, stmt, ni, ni_name;
      block_stmt_iterator last_bsi;

      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
	  print_generic_expr (vect_dump, phi, TDF_SLIM);
	}

      /* Skip virtual phi's.  */
      if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "virtual phi. skip.");
	  continue;
	}

      /* Skip reduction phis.  */
      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduc phi. skip.");
	  continue;
	}

      access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
      gcc_assert (access_fn);
      evolution_part =
	unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
      gcc_assert (evolution_part != NULL_TREE);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
	 of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (evolution_part));

      step_expr = evolution_part;
      init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
							       loop->num));

      /* Compute ni = init + niters * step, i.e. the IV value after NITERS
	 iterations.  */
      ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
			fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
				     fold_convert (TREE_TYPE (init_expr),
						   niters),
				     step_expr),
			init_expr);

      var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
      add_referenced_var (var);

      ni_name = force_gimple_operand (ni, &stmt, false, var);

      /* Insert stmt into exit_bb.  */
      last_bsi = bsi_last (exit_bb);
      if (stmt)
	bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
    }
}
4223 /* Function vect_do_peeling_for_loop_bound
4225 Peel the last iterations of the loop represented by LOOP_VINFO.
4226 The peeled iterations form a new epilog loop. Given that the loop now
4227 iterates NITERS times, the new epilog loop iterates
4228 NITERS % VECTORIZATION_FACTOR times.
4230 The original loop will later be made to iterate
4231 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4233 static void
4234 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4236 tree ni_name, ratio_mult_vf_name;
4237 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4238 struct loop *new_loop;
4239 edge update_e;
4240 basic_block preheader;
4241 int loop_num;
4243 if (vect_print_dump_info (REPORT_DETAILS))
4244 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4246 initialize_original_copy_tables ();
4248 /* Generate the following variables on the preheader of original loop:
4250 ni_name = number of iteration the original loop executes
4251 ratio = ni_name / vf
4252 ratio_mult_vf_name = ratio * vf */
4253 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4254 &ratio_mult_vf_name, ratio);
4256 loop_num = loop->num;
4257 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4258 ratio_mult_vf_name, ni_name, false);
4259 gcc_assert (new_loop);
4260 gcc_assert (loop_num == loop->num);
4261 #ifdef ENABLE_CHECKING
4262 slpeel_verify_cfg_after_peeling (loop, new_loop);
4263 #endif
4265 /* A guard that controls whether the new_loop is to be executed or skipped
4266 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4267 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4268 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4269 is on the path where the LOOP IVs are used and need to be updated. */
4271 preheader = loop_preheader_edge (new_loop)->src;
4272 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4273 update_e = EDGE_PRED (preheader, 0);
4274 else
4275 update_e = EDGE_PRED (preheader, 1);
4277 /* Update IVs of original loop as if they were advanced
4278 by ratio_mult_vf_name steps. */
4279 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4281 /* After peeling we have to reset scalar evolution analyzer. */
4282 scev_reset ();
4284 free_original_copy_tables ();
/* Function vect_gen_niters_for_prolog_loop

   Set the number of iterations for the loop represented by LOOP_VINFO
   to the minimum between LOOP_NITERS (the original iteration count of the loop)
   and the misalignment of DR - the data reference recorded in
   LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).  As a result, after the execution of
   this loop, the data reference DR will refer to an aligned location.

   The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = int mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (vectype_size - 1)

   prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )

   (elem_size = element type size; an element is the scalar element
	whose type is the inner type of the vectype)

   For interleaving,

   prolog_niters = min ( LOOP_NITERS ,
                        (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
	 where group_size is the size of the interleaved group.  */

static tree
vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
{
  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree var, stmt;
  tree iters, iters_name;
  edge pe;
  basic_block new_bb;
  tree dr_stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
  tree niters_type = TREE_TYPE (loop_niters);
  int group_size = 1;
  int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));

  if (DR_GROUP_FIRST_DR (stmt_info))
    {
      /* For interleaved access element size must be multiplied by the size of
	 the interleaved group.  */
      group_size = DR_GROUP_SIZE (vinfo_for_stmt (
					DR_GROUP_FIRST_DR (stmt_info)));
      element_size *= group_size;
    }

  pe = loop_preheader_edge (loop);

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* Misalignment was known at compile time: the prolog iteration
	 count is a constant.  */
      int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
      int elem_misalign = byte_misalign / element_size;

      if (vect_print_dump_info (REPORT_DETAILS))
	fprintf (vect_dump, "known alignment = %d.", byte_misalign);
      /* NOTE(review): the minuend uses VF while the mask uses
	 VF/group_size; for group_size > 1 these differ from the formula
	 in the function comment -- confirm this is intended.  */
      iters = build_int_cst (niters_type,
			     (vf - elem_misalign)&(vf/group_size-1));
    }
  else
    {
      /* Misalignment is unknown: compute it at runtime from the start
	 address of the data reference.  */
      tree new_stmts = NULL_TREE;
      tree start_addr =
	vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
      tree ptr_type = TREE_TYPE (start_addr);
      tree size = TYPE_SIZE (ptr_type);
      tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
      tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
      tree elem_size_log =
	build_int_cst (type, exact_log2 (vectype_align/vf));
      tree vf_minus_1 = build_int_cst (type, vf - 1);
      tree vf_tree = build_int_cst (type, vf);
      tree byte_misalign;
      tree elem_misalign;

      new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
      gcc_assert (!new_bb);

      /* Create:  byte_misalign = addr & (vectype_size - 1) */
      byte_misalign =
	fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);

      /* Create:  elem_misalign = byte_misalign / element_size */
      elem_misalign =
	fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);

      /* Create:  (niters_type) (VF - elem_misalign)&(VF - 1) */
      iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
      iters = fold_convert (niters_type, iters);
    }

  /* Create:  prolog_loop_niters = min (iters, loop_niters) */
  /* If the loop bound is known at compile time we already verified that it is
     greater than vf; since the misalignment ('iters') is at most vf, there's
     no need to generate the MIN_EXPR in this case.  */
  if (TREE_CODE (loop_niters) != INTEGER_CST)
    iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "niters for prolog loop: ");
      print_generic_expr (vect_dump, iters, TDF_SLIM);
    }

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  add_referenced_var (var);
  iters_name = force_gimple_operand (iters, &stmt, false, var);

  /* Insert stmt on loop preheader edge.  */
  if (stmt)
    {
      basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  return iters_name;
}
4415 /* Function vect_update_init_of_dr
4417 NITERS iterations were peeled from LOOP. DR represents a data reference
4418 in LOOP. This function updates the information recorded in DR to
4419 account for the fact that the first NITERS iterations had already been
4420 executed. Specifically, it updates the OFFSET field of DR. */
4422 static void
4423 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4425 tree offset = DR_OFFSET (dr);
4427 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4428 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4429 DR_OFFSET (dr) = offset;
4433 /* Function vect_update_inits_of_drs
4435 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4436 This function updates the information recorded for the data references in
4437 the loop to account for the fact that the first NITERS iterations had
4438 already been executed. Specifically, it updates the initial_condition of the
4439 access_function of all the data_references in the loop. */
4441 static void
4442 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4444 unsigned int i;
4445 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4446 struct data_reference *dr;
4448 if (vect_dump && (dump_flags & TDF_DETAILS))
4449 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
4451 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4452 vect_update_init_of_dr (dr, niters);
4456 /* Function vect_do_peeling_for_alignment
4458 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4459 'niters' is set to the misalignment of one of the data references in the
4460 loop, thereby forcing it to refer to an aligned location at the beginning
4461 of the execution of this loop. The data reference for which we are
4462 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4464 static void
4465 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4467 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4468 tree niters_of_prolog_loop, ni_name;
4469 tree n_iters;
4470 struct loop *new_loop;
4472 if (vect_print_dump_info (REPORT_DETAILS))
4473 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4475 initialize_original_copy_tables ();
4477 ni_name = vect_build_loop_niters (loop_vinfo);
4478 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4480 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
4481 new_loop =
4482 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4483 niters_of_prolog_loop, ni_name, true);
4484 gcc_assert (new_loop);
4485 #ifdef ENABLE_CHECKING
4486 slpeel_verify_cfg_after_peeling (new_loop, loop);
4487 #endif
4489 /* Update number of times loop executes. */
4490 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4491 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4492 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4494 /* Update the init conditions of the access functions of all data refs. */
4495 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4497 /* After peeling we have to reset scalar evolution analyzer. */
4498 scev_reset ();
4500 free_original_copy_tables ();
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of data references (array element references) whose alignment must be
   checked at runtime.

   Input:
   LOOP_VINFO - two fields of the loop information are used.
               LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
               LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero and that this
        test can be done as a&(n-1) == 0.  For example, for 16
        byte vectors the test is a&0xf == 0.  */

static tree
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr_stmt_list)
{
  VEC(tree,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  tree ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name, and_stmt;
  tree ptrsize_zero;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));

  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
    {
      tree new_stmt_list = NULL_TREE;
      tree addr_base;
      tree addr_tmp, addr_tmp_name, addr_stmt;
      tree or_tmp, new_or_tmp_name, or_stmt;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
							&new_stmt_list,
							NULL_TREE);

      if (new_stmt_list != NULL_TREE)
        append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
      addr_stmt = fold_convert (int_ptrsize_type, addr_base);
      addr_stmt = build2 (MODIFY_EXPR, void_type_node,
                          addr_tmp_name, addr_stmt);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);

      /* The addresses are OR together.  */

      if (or_tmp_name != NULL_TREE)
        {
          /* create: or_tmp = or_tmp | addr_tmp */
          sprintf (tmp_name, "%s%d", "orptrs", i);
          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
          add_referenced_var (or_tmp);
          new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
          or_stmt = build2 (MODIFY_EXPR, void_type_node, new_or_tmp_name,
                            build2 (BIT_IOR_EXPR, int_ptrsize_type,
                                    or_tmp_name,
                                    addr_tmp_name));
          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
          append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
          or_tmp_name = new_or_tmp_name;
        }
      else
        /* First address: nothing to OR with yet.  */
        or_tmp_name = addr_tmp_name;

    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);

  and_stmt = build2 (MODIFY_EXPR, void_type_node,
                     and_tmp_name,
                     build2 (BIT_AND_EXPR, int_ptrsize_type,
                             or_tmp_name, mask_cst));
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  append_to_statement_list_force (and_stmt, cond_expr_stmt_list);

  /* Make and_tmp the left operand of the conditional test against zero.
     if and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  return build2 (EQ_EXPR, boolean_type_node,
                 and_tmp_name, ptrsize_zero);
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - created vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  block_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bitmap_iterator bi;
  unsigned int j;
  bool strided_store;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  /* If the loop has data references that may or may not be aligned then
     two versions of the loop need to be generated, one which is vectorized
     and one which isn't.  A test is then generated to control which of the
     loops is executed.  The test checks for the alignment of all of the
     data references that may or may not be aligned.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      struct loop *nloop;
      tree cond_expr;
      tree cond_expr_stmt_list = NULL_TREE;
      basic_block condition_bb;
      block_stmt_iterator cond_exp_bsi;
      basic_block merge_bb;
      basic_block new_exit_bb;
      edge new_exit_e, e;
      tree orig_phi, new_phi, arg;

      cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
						     &cond_expr_stmt_list);
      initialize_original_copy_tables ();
      nloop = loop_version (loop, cond_expr, &condition_bb, true);
      free_original_copy_tables();

      /** Loop versioning violates an assumption we try to maintain during
	 vectorization - that the loop exit block has a single predecessor.
	 After versioning, the exit block of both loop versions is the same
	 basic block (i.e. it has two predecessors).  Just in order to simplify
	 following transformations in the vectorizer, we fix this situation
	 here by adding a new (empty) block on the exit-edge of the loop,
	 with the proper loop-exit phis to maintain loop-closed-form.  **/

      merge_bb = single_exit (loop)->dest;
      gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
      new_exit_bb = split_edge (single_exit (loop));
      new_exit_e = single_exit (loop);
      e = EDGE_SUCC (new_exit_bb, 0);

      /* Re-create loop-closed-form: one phi in the new exit block per
	 phi in the merge block, rewired through the new block.  */
      for (orig_phi = phi_nodes (merge_bb); orig_phi;
	   orig_phi = PHI_CHAIN (orig_phi))
	{
	  new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
				     new_exit_bb);
	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	  add_phi_arg (new_phi, arg, new_exit_e);
	  SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
	}

      /** end loop-exit-fixes after versioning  **/

      update_ssa (TODO_update_ssa);
      cond_exp_bsi = bsi_last (condition_bb);
      bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
    }

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_vnames_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown store is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the loop
     will remain scalar and will compute the remaining (n%VF) iterations.
     (VF is the vectorization factor).  */

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops which body consist
     of one basic block (header + empty latch).  When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = bsi_start (bb); !bsi_end_p (si);)
	{
	  tree stmt = bsi_stmt (si);
	  stmt_vec_info stmt_info;
	  bool is_store;

	  if (vect_print_dump_info (REPORT_DETAILS))
	    {
	      fprintf (vect_dump, "------>vectorizing statement: ");
	      print_generic_expr (vect_dump, stmt, TDF_SLIM);
	    }
	  stmt_info = vinfo_for_stmt (stmt);
	  gcc_assert (stmt_info);
	  /* Irrelevant, not-live stmts are left in place untouched.  */
	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    {
	      bsi_next (&si);
	      continue;
	    }

	  if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
	       != (unsigned HOST_WIDE_INT) vectorization_factor)
	      && vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "multiple-types.");

	  /* -------- vectorize statement ------------ */
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "transform statement.");

	  strided_store = false;
	  is_store = vect_transform_stmt (stmt, &si, &strided_store);
	  if (is_store)
	    {
	      stmt_ann_t ann;
	      if (DR_GROUP_FIRST_DR (stmt_info))
		{
		  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
		     the interleaving chain was completed - free all the
		     stores in the chain.  */
		  tree next = DR_GROUP_FIRST_DR (stmt_info);
		  tree tmp;
		  stmt_vec_info next_stmt_info;

		  while (next)
		    {
		      next_stmt_info = vinfo_for_stmt (next);
		      /* Free the attached stmt_vec_info and remove the
			 stmt.  */
		      ann = stmt_ann (next);
		      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
		      free (next_stmt_info);
		      set_stmt_info (ann, NULL);
		      next = tmp;
		    }
		  bsi_remove (&si, true);
		  continue;
		}
	      else
		{
		  /* Free the attached stmt_vec_info and remove the stmt.  */
		  ann = stmt_ann (stmt);
		  free (stmt_info);
		  set_stmt_info (ann, NULL);
		  bsi_remove (&si, true);
		  continue;
		}
	    }
	  else
	    {
	      if (strided_store)
		{
		  /* This is case of skipped interleaved store.  We don't free
		     its stmt_vec_info.  */
		  bsi_remove (&si, true);
		  continue;
		}
	    }
	  bsi_next (&si);
	}		        /* stmts in BB */
    }				/* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  EXECUTE_IF_SET_IN_BITMAP (vect_vnames_to_rename, 0, j, bi)
    mark_sym_for_renaming (SSA_NAME_VAR (ssa_name (j)));

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
}