/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends
   the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
   VAR_KIND) to NAME if provided, and uses the prefix alone otherwise.  */
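/* For example (assuming NAME is "in"): vect_simple_var yields a variable
   named "vect_in", vect_scalar_var yields "stmp_in", and vect_pointer_var
   yields "vect_pin"; create_tmp_var then appends a unique suffix.  */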
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
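/* For example (a sketch of the computation below), for a data reference
   A[init] with DR_BASE_ADDRESS &A, DR_OFFSET 0 and DR_INIT init, and no
   OFFSET argument, the statements appended to NEW_STMT_LIST compute roughly:

        base_off = 0 + init;
        addr_base = (vectype *) (&A + base_off);

   When OFFSET is supplied, OFFSET * DR_STEP is first added to base_off.  */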
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access step we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
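/* A sketch of typical use by a caller (see vectorizable_load/store for the
   real sequences): create the pointer, dereference it to build the vector
   memory reference, and bump it between vector copies:

        vect_ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE,
                                             &init_addr, &ptr_incr,
                                             false, NULL_TREE);
        data_ref = build_fold_indirect_ref (vect_ptr);
        ...
        vect_ptr = bump_vector_ptr (vect_ptr, ptr_incr, bsi, stmt);  */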
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop: **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, ptr_var,
                      build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */
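/* For example (a sketch), for a V4SI constant vector {3,3,3,3} this emits
   in the loop preheader:

        vect_cst_.N = { 3, 3, 3, 3 };

   and returns the SSA_NAME defined by that statement.  */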
static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
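/* A concrete instance of the above: for X = 0, S = 1 and a V4SI vectype
   the returned vector is [0, 1, 2, 3], and the step vector built below
   is [VF*1, VF*1, VF*1, VF*1].  */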
static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }

  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_name = init_expr;
  t = NULL_TREE;
  t = tree_cons (NULL_TREE, init_expr, t);
  for (i = 1; i < nunits; i++)
    {
      /* Create: new_name = new_name + step_expr  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);
      init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var,
                          fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr));
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                     build2 (PLUS_EXPR, vectype, induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
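  /* For example (assuming VF = 8 and a V4SI vectype, so ncopies = 2), the
     second copy below is bumped by a step vector of [4*S, 4*S, 4*S, 4*S],
     i.e. nunits * S rather than VF * S.  */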
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_{step*nunits}  */

          new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                             build2 (PLUS_EXPR, vectype, vec_def, vec_step));
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
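/* For example (a sketch): for a constant operand 3 and a V4SI vectype this
   returns the def of a new stmt 'vect_cst_ = {3,3,3,3}' inserted on the
   preheader edge; for an operand defined inside the loop it simply returns
   the def of the already-vectorized defining stmt.  */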
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0: vx.0 = memref0   VS1.1
                        VS1.1: vx.1 = memref1   VS1.2
                        VS1.2: vx.2 = memref2   VS1.3
                        VS1.3: vx.3 = memref3

   S2: z = x + ...      VSnew.0: vz0 = vx.0 + ...  VSnew.1
                        VSnew.1: vz1 = vx.1 + ...  VSnew.2
                        VSnew.2: vz2 = vx.2 + ...  VSnew.3
                        VSnew.3: vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  if (dt == vect_induction_def)
    gcc_assert (TREE_CODE (vec_stmt_for_operand) == PHI_NODE);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}


#define ADJUST_IN_EPILOG 1
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
                of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
   performs.  This vector will be used as the initial value of the
   vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.
*/
static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  int nelements;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; The last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>
*/
static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  reduction_op = TREE_OPERAND (operation, op_type - 1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }

  /*** 2. Create epilog code
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

          ***/
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                            build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
               for (offset = VS/2; offset >= element_size; offset/=2)
                 {
                   Create:  va' = vec_shift <va, offset>
                   Create:  va = vop <va, va'>
                 }  */
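          /* For example (assuming a V4SI vector, i.e. vec_size_in_bits = 128
             and element_bitsize = 32), the loop below emits two rounds:
                 va' = vec_shift <va, 64>;  va = vop <va, va'>;
                 va' = vec_shift <va, 32>;  va = vop <va, va'>;
             after which the reduced result resides in a single element,
             extracted by the code in section 2.4 below.  */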
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits / 2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (shift_code, vectype,
                                            new_temp, bitpos));
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (code, vectype,
                                            new_name, new_temp));
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
               s = extract_field <v_out2, 0>
               for (offset = element_size;
                    offset < vector_size;
                    offset += element_size;)
                 {
                   Create:  s' = extract_field <v_out2, offset>
                   Create:  s = op <s, s'>
                 }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest,
                                    build2 (code, scalar_type, new_name, new_temp));
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest,
                            build2 (code, scalar_type, new_temp, scalar_initial_def));
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.6 Replace uses of s_out0 with uses of s_out4  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
     get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
     STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
1488 bool
1489 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1491 tree vec_dest;
1492 tree scalar_dest;
1493 tree op;
1494 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1495 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1496 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1497 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1498 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1499 tree operation;
1500 enum tree_code code, orig_code, epilog_reduc_code = 0;
1501 enum machine_mode vec_mode;
1502 int op_type;
1503 optab optab, reduc_optab;
1504 tree new_temp = NULL_TREE;
1505 tree def, def_stmt;
1506 enum vect_def_type dt;
1507 tree new_phi;
1508 tree scalar_type;
1509 bool is_simple_use;
1510 tree orig_stmt;
1511 stmt_vec_info orig_stmt_info;
1512 tree expr = NULL_TREE;
1513 int i;
1514 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1515 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1516 stmt_vec_info prev_stmt_info;
1517 tree reduc_def;
1518 tree new_stmt = NULL_TREE;
1519 int j;
1521 gcc_assert (ncopies >= 1);
1523 /* 1. Is vectorizable reduction? */
1525 /* Not supportable if the reduction variable is used in the loop. */
1526 if (STMT_VINFO_RELEVANT_P (stmt_info))
1527 return false;
1529 if (!STMT_VINFO_LIVE_P (stmt_info))
1530 return false;
1532 /* Make sure it was already recognized as a reduction computation. */
1533 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1534 return false;
1536 /* 2. Has this been recognized as a reduction pattern?
1538 Check if STMT represents a pattern that has been recognized
1539 in earlier analysis stages. For stmts that represent a pattern,
1540 the STMT_VINFO_RELATED_STMT field records the last stmt in
1541 the original sequence that constitutes the pattern. */
1543 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1544 if (orig_stmt)
1546 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1547 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1548 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1549 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1552 /* 3. Check the operands of the operation. The first operands are defined
1553 inside the loop body. The last operand is the reduction variable,
1554 which is defined by the loop-header-phi. */
1556 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
1558 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1559 code = TREE_CODE (operation);
1560 op_type = TREE_CODE_LENGTH (code);
1561 if (op_type != binary_op && op_type != ternary_op)
1562 return false;
1563 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1564 scalar_type = TREE_TYPE (scalar_dest);
1566 /* All uses but the last are expected to be defined in the loop.
1567 The last use is the reduction variable. */
1568 for (i = 0; i < op_type-1; i++)
1570 op = TREE_OPERAND (operation, i);
1571 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1572 gcc_assert (is_simple_use);
1573 if (dt != vect_loop_def
1574 && dt != vect_invariant_def
1575 && dt != vect_constant_def
1576 && dt != vect_induction_def)
1577 return false;
1580 op = TREE_OPERAND (operation, i);
1581 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1582 gcc_assert (is_simple_use);
1583 gcc_assert (dt == vect_reduction_def);
1584 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1585 if (orig_stmt)
1586 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1587 else
1588 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1590 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1591 return false;
1593 /* 4. Supportable by target? */
1595 /* 4.1. check support for the operation in the loop */
1596 optab = optab_for_tree_code (code, vectype);
1597 if (!optab)
1599 if (vect_print_dump_info (REPORT_DETAILS))
1600 fprintf (vect_dump, "no optab.");
1601 return false;
1603 vec_mode = TYPE_MODE (vectype);
1604 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1606 if (vect_print_dump_info (REPORT_DETAILS))
1607 fprintf (vect_dump, "op not supported by target.");
1608 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1609 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1610 < vect_min_worthwhile_factor (code))
1611 return false;
1612 if (vect_print_dump_info (REPORT_DETAILS))
1613 fprintf (vect_dump, "proceeding using word mode.");
1616 /* Worthwhile without SIMD support? */
1617 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1618 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1619 < vect_min_worthwhile_factor (code))
1621 if (vect_print_dump_info (REPORT_DETAILS))
1622 fprintf (vect_dump, "not worthwhile without SIMD support.");
1623 return false;
1626 /* 4.2. Check support for the epilog operation.
1628 If STMT represents a reduction pattern, then the type of the
1629 reduction variable may be different than the type of the rest
1630 of the arguments. For example, consider the case of accumulation
1631 of shorts into an int accumulator; The original code:
1632 S1: int_a = (int) short_a;
1633 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1635 was replaced with:
1636 STMT: int_acc = widen_sum <short_a, int_acc>
1638 This means that:
1639 1. The tree-code that is used to create the vector operation in the
1640 epilog code (that reduces the partial results) is not the
1641 tree-code of STMT, but is rather the tree-code of the original
1642 stmt from the pattern that STMT is replacing. I.e, in the example
1643 above we want to use 'widen_sum' in the loop, but 'plus' in the
1644 epilog.
1645 2. The type (mode) we use to check available target support
1646 for the vector operation to be created in the *epilog*, is
1647 determined by the type of the reduction variable (in the example
1648 above we'd check this: plus_optab[vect_int_mode]).
1649 However the type (mode) we use to check available target support
1650 for the vector operation to be created *inside the loop*, is
1651 determined by the type of the other arguments to STMT (in the
1652 example we'd check this: widen_sum_optab[vect_short_mode]).
1654 This is contrary to "regular" reductions, in which the types of all
1655 the arguments are the same as the type of the reduction variable.
1656 For "regular" reductions we can therefore use the same vector type
1657 (and also the same tree-code) when generating the epilog code and
1658 when generating the code inside the loop. */
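/* For example (an illustrative sketch, not taken from the code below;
   the array name is hypothetical): for the widen_sum pattern above,
   vectorizing

         short a[N]; int sum = 0;
         loop:  sum += a[i];

   with a vectorization factor of 8 checks widen_sum_optab on the mode
   of the vector-of-shorts type for the code inside the loop, but
   plus_optab on the mode of the vector-of-ints type for the epilog
   that reduces the partial sums to a single scalar. */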
1660 if (orig_stmt)
1662 /* This is a reduction pattern: get the vectype from the type of the
1663 reduction variable, and get the tree-code from orig_stmt. */
1664 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1665 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1666 vec_mode = TYPE_MODE (vectype);
1668 else
1670 /* Regular reduction: the same vectype and tree-code that are used for
1671 the vector code inside the loop can be used for the epilog code. */
1672 orig_code = code;
1675 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1676 return false;
1677 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1678 if (!reduc_optab)
1680 if (vect_print_dump_info (REPORT_DETAILS))
1681 fprintf (vect_dump, "no optab for reduction.");
1682 epilog_reduc_code = NUM_TREE_CODES;
1684 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1686 if (vect_print_dump_info (REPORT_DETAILS))
1687 fprintf (vect_dump, "reduc op not supported by target.");
1688 epilog_reduc_code = NUM_TREE_CODES;
1691 if (!vec_stmt) /* transformation not required. */
1693 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1694 return true;
1697 /** Transform. **/
1699 if (vect_print_dump_info (REPORT_DETAILS))
1700 fprintf (vect_dump, "transform reduction.");
1702 /* Create the destination vector. */
1703 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1705 /* Create the reduction-phi that defines the reduction-operand. */
1706 new_phi = create_phi_node (vec_dest, loop->header);
1708 /* In case the vectorization factor (VF) is bigger than the number
1709 of elements that we can fit in a vectype (nunits), we have to generate
1710 more than one vector stmt - i.e - we need to "unroll" the
1711 vector stmt by a factor VF/nunits. For more details see documentation
1712 in vectorizable_operation. */
1714 prev_stmt_info = NULL;
1715 for (j = 0; j < ncopies; j++)
1717 /* Handle uses. */
1718 if (j == 0)
1720 op = TREE_OPERAND (operation, 0);
1721 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1722 if (op_type == ternary_op)
1724 op = TREE_OPERAND (operation, 1);
1725 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1728 /* Get the vector def for the reduction variable from the phi node. */
1729 reduc_def = PHI_RESULT (new_phi);
1731 else
1733 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1734 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1735 if (op_type == ternary_op)
1736 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1738 /* Get the vector def for the reduction variable from the vectorized
1739 reduction operation generated in the previous iteration (j-1). */
1740 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
1743 /* Arguments are ready. Create the new vector stmt. */
1745 if (op_type == binary_op)
1746 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1747 else
1748 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1749 reduc_def);
1750 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
1751 new_temp = make_ssa_name (vec_dest, new_stmt);
1752 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1753 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1755 if (j == 0)
1756 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1757 else
1758 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1759 prev_stmt_info = vinfo_for_stmt (new_stmt);
1762 /* Finalize the reduction-phi (set its arguments) and create the
1763 epilog reduction code. */
1764 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1765 return true;
1768 /* Checks if CALL can be vectorized with vector types VECTYPE_OUT and
1769 VECTYPE_IN. Returns a function declaration if the target has a vectorized
1770 version of the function, or NULL_TREE if the function cannot be vectorized. */
1772 tree
1773 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
1775 tree fndecl = get_callee_fndecl (call);
1776 enum built_in_function code;
1778 /* We only handle functions that do not read or clobber memory -- i.e.
1779 const or novops ones. */
1780 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1781 return NULL_TREE;
1783 if (!fndecl
1784 || TREE_CODE (fndecl) != FUNCTION_DECL
1785 || !DECL_BUILT_IN (fndecl))
1786 return NULL_TREE;
1788 code = DECL_FUNCTION_CODE (fndecl);
1789 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
1790 vectype_in);
1793 /* Function vectorizable_call.
1795 Check if STMT performs a function call that can be vectorized.
1796 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1797 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1798 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
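/* For example (an illustrative sketch; the loop and array names are
   hypothetical): a statement such as

         y[i] = sinf (x[i]);

   is vectorizable here only if the target advertises a vector version
   of BUILT_IN_SINF via targetm.vectorize.builtin_vectorized_function;
   the scalar call is then replaced by one call to that target builtin
   per vector copy. */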
1800 bool
1801 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1803 tree vec_dest;
1804 tree scalar_dest;
1805 tree operation;
1806 tree args, type;
1807 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1808 tree vectype_out, vectype_in;
1809 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1810 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1811 enum vect_def_type dt[2];
1812 int ncopies, j, nargs;
1814 /* Is STMT a vectorizable call? */
1815 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1816 return false;
1818 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1819 return false;
1821 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1822 if (TREE_CODE (operation) != CALL_EXPR)
1823 return false;
1825 /* Process function arguments. */
1826 rhs_type = NULL_TREE;
1827 for (args = TREE_OPERAND (operation, 1), nargs = 0;
1828 args; args = TREE_CHAIN (args), ++nargs)
1830 tree op = TREE_VALUE (args);
1832 /* Bail out if the function has more than two arguments; we
1833 do not have interesting builtin functions to vectorize with
1834 more than two arguments. */
1835 if (nargs >= 2)
1836 return false;
1838 /* We can only handle calls with arguments of the same type. */
1839 if (rhs_type
1840 && rhs_type != TREE_TYPE (op))
1842 if (vect_print_dump_info (REPORT_DETAILS))
1843 fprintf (vect_dump, "argument types differ.");
1844 return false;
1846 rhs_type = TREE_TYPE (op);
1848 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
1850 if (vect_print_dump_info (REPORT_DETAILS))
1851 fprintf (vect_dump, "use not simple.");
1852 return false;
1856 /* No arguments is also not good. */
1857 if (nargs == 0)
1858 return false;
1860 vectype_in = get_vectype_for_scalar_type (rhs_type);
1862 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1863 vectype_out = get_vectype_for_scalar_type (lhs_type);
1865 /* Only handle the case of vectors with the same number of elements.
1866 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1867 instruction which converts V2DFmode to V4SImode but only
1868 using the lower half of the V4SImode result. */
1869 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1870 return false;
1872 /* For now, we only vectorize functions if a target-specific builtin
1873 is available. TODO -- in some cases, it might be profitable to
1874 insert the calls for pieces of the vector, in order to be able
1875 to vectorize other operations in the loop. */
1876 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1877 if (fndecl == NULL_TREE)
1879 if (vect_print_dump_info (REPORT_DETAILS))
1880 fprintf (vect_dump, "function is not vectorizable.");
1882 return false;
1885 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1887 if (!vec_stmt) /* transformation not required. */
1889 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1890 return true;
1893 /** Transform. **/
1895 if (vect_print_dump_info (REPORT_DETAILS))
1896 fprintf (vect_dump, "transform operation.");
1898 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1899 / TYPE_VECTOR_SUBPARTS (vectype_out));
1900 gcc_assert (ncopies >= 1);
1902 /* Handle def. */
1903 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1904 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1906 prev_stmt_info = NULL;
1907 for (j = 0; j < ncopies; ++j)
1909 tree new_stmt, vargs;
1910 tree vec_oprnd[2];
1911 int n;
1913 /* Build argument list for the vectorized call. */
1914 vargs = NULL_TREE;
1915 for (args = TREE_OPERAND (operation, 1), n = 0;
1916 args; args = TREE_CHAIN (args), ++n)
1918 tree op = TREE_VALUE (args);
1920 if (j == 0)
1921 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1922 else
1923 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1925 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1927 vargs = nreverse (vargs);
1929 rhs = build_function_call_expr (fndecl, vargs);
1930 new_stmt = build2 (GIMPLE_MODIFY_STMT, NULL_TREE, vec_dest, rhs);
1931 new_temp = make_ssa_name (vec_dest, new_stmt);
1932 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1934 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1936 if (j == 0)
1937 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1938 else
1939 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1940 prev_stmt_info = vinfo_for_stmt (new_stmt);
1943 /* The call in STMT might prevent it from being removed in DCE. However,
1944 we cannot remove it here, because of the way the SSA name it defines is
1945 mapped to the new definition. So just replace the rhs of the statement
1946 with something harmless. */
1947 type = TREE_TYPE (scalar_dest);
1948 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1950 return true;
1954 /* Function vectorizable_conversion.
1956 Check if STMT performs a conversion operation, that can be vectorized.
1957 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1958 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1959 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
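/* For example (an illustrative sketch; the array names are
   hypothetical): a FLOAT_EXPR statement such as

         float_b[i] = (float) int_a[i];

   is vectorized by one call per copy to the builtin that the target
   supplies via targetm.vectorize.builtin_conversion, provided the two
   vector types have the same number of units (e.g. a vector of 4 ints
   converted to a vector of 4 floats). */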
1961 bool
1962 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
1963 tree * vec_stmt)
1965 tree vec_dest;
1966 tree scalar_dest;
1967 tree operation;
1968 tree op0;
1969 tree vec_oprnd0 = NULL_TREE;
1970 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1971 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1972 enum tree_code code;
1973 tree new_temp;
1974 tree def, def_stmt;
1975 enum vect_def_type dt0;
1976 tree new_stmt;
1977 int nunits_in;
1978 int nunits_out;
1979 int ncopies, j;
1980 tree vectype_out, vectype_in;
1981 tree rhs_type, lhs_type;
1982 tree builtin_decl, params;
1983 stmt_vec_info prev_stmt_info;
1985 /* Is STMT a vectorizable conversion? */
1987 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1988 return false;
1990 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1992 if (STMT_VINFO_LIVE_P (stmt_info))
1994 /* FORNOW: not yet supported. */
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 fprintf (vect_dump, "value used after loop.");
1997 return false;
2000 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2001 return false;
2003 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2004 return false;
2006 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2007 code = TREE_CODE (operation);
2008 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2009 return false;
2011 /* Check types of lhs and rhs */
2012 op0 = TREE_OPERAND (operation, 0);
2013 rhs_type = TREE_TYPE (op0);
2014 vectype_in = get_vectype_for_scalar_type (rhs_type);
2015 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2017 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2018 lhs_type = TREE_TYPE (scalar_dest);
2019 vectype_out = get_vectype_for_scalar_type (lhs_type);
2020 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2021 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2023 /* FORNOW: need to extend to support short<->float conversions as well. */
2024 if (nunits_out != nunits_in)
2025 return false;
2027 /* Bail out if the types are both integral or non-integral. */
2028 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2029 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2030 return false;
2032 /* Sanity check: make sure that at least one copy of the vectorized stmt
2033 needs to be generated. */
2034 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2035 gcc_assert (ncopies >= 1);
2037 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2039 if (vect_print_dump_info (REPORT_DETAILS))
2040 fprintf (vect_dump, "use not simple.");
2041 return false;
2044 /* Supportable by target? */
2045 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2047 if (vect_print_dump_info (REPORT_DETAILS))
2048 fprintf (vect_dump, "op not supported by target.");
2049 return false;
2052 if (!vec_stmt) /* transformation not required. */
2054 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2055 return true;
2058 /** Transform. **/
2060 if (vect_print_dump_info (REPORT_DETAILS))
2061 fprintf (vect_dump, "transform conversion.");
2063 /* Handle def. */
2064 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2066 prev_stmt_info = NULL;
2067 for (j = 0; j < ncopies; j++)
2069 tree sym;
2070 ssa_op_iter iter;
2072 if (j == 0)
2073 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2074 else
2075 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2076 params = build_tree_list (NULL_TREE, vec_oprnd0);
2078 builtin_decl =
2079 targetm.vectorize.builtin_conversion (code, vectype_in);
2080 new_stmt = build_function_call_expr (builtin_decl, params);
2082 /* Arguments are ready. Create the new vector stmt. */
2083 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2084 new_stmt);
2085 new_temp = make_ssa_name (vec_dest, new_stmt);
2086 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2087 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2088 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2090 if (TREE_CODE (sym) == SSA_NAME)
2091 sym = SSA_NAME_VAR (sym);
2092 mark_sym_for_renaming (sym);
2095 if (j == 0)
2096 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2097 else
2098 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2099 prev_stmt_info = vinfo_for_stmt (new_stmt);
2101 return true;
2105 /* Function vectorizable_assignment.
2107 Check if STMT performs an assignment (copy) that can be vectorized.
2108 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2109 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2110 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
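/* For example (an illustrative sketch): a loop-body copy "b_1 = a_2"
   is replaced by a single vector-to-vector copy of the corresponding
   vector def; no target support needs to be checked beyond the use
   being a simple (constant/invariant/loop) def. */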
2112 bool
2113 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2115 tree vec_dest;
2116 tree scalar_dest;
2117 tree op;
2118 tree vec_oprnd;
2119 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2120 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2122 tree new_temp;
2123 tree def, def_stmt;
2124 enum vect_def_type dt;
2125 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2126 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2128 gcc_assert (ncopies >= 1);
2129 if (ncopies > 1)
2130 return false; /* FORNOW */
2132 /* Is vectorizable assignment? */
2133 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2134 return false;
2136 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2138 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2139 return false;
2141 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2142 if (TREE_CODE (scalar_dest) != SSA_NAME)
2143 return false;
2145 op = GIMPLE_STMT_OPERAND (stmt, 1);
2146 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2148 if (vect_print_dump_info (REPORT_DETAILS))
2149 fprintf (vect_dump, "use not simple.");
2150 return false;
2153 if (!vec_stmt) /* transformation not required. */
2155 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2156 return true;
2159 /** Transform. **/
2160 if (vect_print_dump_info (REPORT_DETAILS))
2161 fprintf (vect_dump, "transform assignment.");
2163 /* Handle def. */
2164 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2166 /* Handle use. */
2167 op = GIMPLE_STMT_OPERAND (stmt, 1);
2168 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2170 /* Arguments are ready. Create the new vector stmt. */
2171 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, vec_oprnd);
2172 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2173 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2174 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2176 return true;
2180 /* Function vect_min_worthwhile_factor.
2182 For a loop where we could vectorize the operation indicated by CODE,
2183 return the minimum vectorization factor that makes it worthwhile
2184 to use generic vectors. */
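/* E.g. (illustrative): an addition emulated piecewise in word mode is
   only considered worthwhile for a vectorization factor of at least 4,
   a bitwise operation already for a factor of 2, and any tree-code not
   listed below never (INT_MAX). */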
2185 static int
2186 vect_min_worthwhile_factor (enum tree_code code)
2188 switch (code)
2190 case PLUS_EXPR:
2191 case MINUS_EXPR:
2192 case NEGATE_EXPR:
2193 return 4;
2195 case BIT_AND_EXPR:
2196 case BIT_IOR_EXPR:
2197 case BIT_XOR_EXPR:
2198 case BIT_NOT_EXPR:
2199 return 2;
2201 default:
2202 return INT_MAX;
2207 /* Function vectorizable_operation.
2209 Check if STMT performs a binary or unary operation that can be vectorized.
2210 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2211 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2212 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2214 bool
2215 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2217 tree vec_dest;
2218 tree scalar_dest;
2219 tree operation;
2220 tree op0, op1 = NULL;
2221 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2222 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2223 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2224 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2225 enum tree_code code;
2226 enum machine_mode vec_mode;
2227 tree new_temp;
2228 int op_type;
2229 optab optab;
2230 int icode;
2231 enum machine_mode optab_op2_mode;
2232 tree def, def_stmt;
2233 enum vect_def_type dt0, dt1;
2234 tree new_stmt;
2235 stmt_vec_info prev_stmt_info;
2236 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2237 int nunits_out;
2238 tree vectype_out;
2239 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2240 int j;
2242 gcc_assert (ncopies >= 1);
2244 /* Is STMT a vectorizable binary/unary operation? */
2245 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2246 return false;
2248 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2250 if (STMT_VINFO_LIVE_P (stmt_info))
2252 /* FORNOW: not yet supported. */
2253 if (vect_print_dump_info (REPORT_DETAILS))
2254 fprintf (vect_dump, "value used after loop.");
2255 return false;
2258 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2259 return false;
2261 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2262 return false;
2264 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2265 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2266 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2267 if (nunits_out != nunits_in)
2268 return false;
2270 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2271 code = TREE_CODE (operation);
2272 optab = optab_for_tree_code (code, vectype);
2274 /* Support only unary or binary operations. */
2275 op_type = TREE_CODE_LENGTH (code);
2276 if (op_type != unary_op && op_type != binary_op)
2278 if (vect_print_dump_info (REPORT_DETAILS))
2279 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2280 return false;
2283 op0 = TREE_OPERAND (operation, 0);
2284 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2286 if (vect_print_dump_info (REPORT_DETAILS))
2287 fprintf (vect_dump, "use not simple.");
2288 return false;
2291 if (op_type == binary_op)
2293 op1 = TREE_OPERAND (operation, 1);
2294 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2296 if (vect_print_dump_info (REPORT_DETAILS))
2297 fprintf (vect_dump, "use not simple.");
2298 return false;
2302 /* Supportable by target? */
2303 if (!optab)
2305 if (vect_print_dump_info (REPORT_DETAILS))
2306 fprintf (vect_dump, "no optab.");
2307 return false;
2309 vec_mode = TYPE_MODE (vectype);
2310 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2311 if (icode == CODE_FOR_nothing)
2313 if (vect_print_dump_info (REPORT_DETAILS))
2314 fprintf (vect_dump, "op not supported by target.");
2315 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2316 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2317 < vect_min_worthwhile_factor (code))
2318 return false;
2319 if (vect_print_dump_info (REPORT_DETAILS))
2320 fprintf (vect_dump, "proceeding using word mode.");
2323 /* Worthwhile without SIMD support? */
2324 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2325 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2326 < vect_min_worthwhile_factor (code))
2328 if (vect_print_dump_info (REPORT_DETAILS))
2329 fprintf (vect_dump, "not worthwhile without SIMD support.");
2330 return false;
2333 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2335 /* FORNOW: not yet supported. */
2336 if (!VECTOR_MODE_P (vec_mode))
2337 return false;
2339 /* Invariant argument is needed for a vector shift
2340 by a scalar shift operand. */
2341 optab_op2_mode = insn_data[icode].operand[2].mode;
2342 if (! (VECTOR_MODE_P (optab_op2_mode)
2343 || dt1 == vect_constant_def
2344 || dt1 == vect_invariant_def))
2346 if (vect_print_dump_info (REPORT_DETAILS))
2347 fprintf (vect_dump, "operand mode requires invariant argument.");
2348 return false;
2352 if (!vec_stmt) /* transformation not required. */
2354 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2355 return true;
2358 /** Transform. **/
2360 if (vect_print_dump_info (REPORT_DETAILS))
2361 fprintf (vect_dump, "transform binary/unary operation.");
2363 /* Handle def. */
2364 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2366 /* In case the vectorization factor (VF) is bigger than the number
2367 of elements that we can fit in a vectype (nunits), we have to generate
2368 more than one vector stmt - i.e - we need to "unroll" the
2369 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2370 from one copy of the vector stmt to the next, in the field
2371 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2372 stages to find the correct vector defs to be used when vectorizing
2373 stmts that use the defs of the current stmt. The example below illustrates
2374 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
2375 4 vectorized stmts):
2377 before vectorization:
2378 RELATED_STMT VEC_STMT
2379 S1: x = memref - -
2380 S2: z = x + 1 - -
2382 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2383 there):
2384 RELATED_STMT VEC_STMT
2385 VS1_0: vx0 = memref0 VS1_1 -
2386 VS1_1: vx1 = memref1 VS1_2 -
2387 VS1_2: vx2 = memref2 VS1_3 -
2388 VS1_3: vx3 = memref3 - -
2389 S1: x = load - VS1_0
2390 S2: z = x + 1 - -
2392 step2: vectorize stmt S2 (done here):
2393 To vectorize stmt S2 we first need to find the relevant vector
2394 def for the first operand 'x'. This is, as usual, obtained from
2395 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2396 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2397 relevant vector def 'vx0'. Having found 'vx0' we can generate
2398 the vector stmt VS2_0, and as usual, record it in the
2399 STMT_VINFO_VEC_STMT of stmt S2.
2400 When creating the second copy (VS2_1), we obtain the relevant vector
2401 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2402 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2403 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2404 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2405 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2406 chain of stmts and pointers:
2407 RELATED_STMT VEC_STMT
2408 VS1_0: vx0 = memref0 VS1_1 -
2409 VS1_1: vx1 = memref1 VS1_2 -
2410 VS1_2: vx2 = memref2 VS1_3 -
2411 VS1_3: vx3 = memref3 - -
2412 S1: x = load - VS1_0
2413 VS2_0: vz0 = vx0 + v1 VS2_1 -
2414 VS2_1: vz1 = vx1 + v1 VS2_2 -
2415 VS2_2: vz2 = vx2 + v1 VS2_3 -
2416 VS2_3: vz3 = vx3 + v1 - -
2417 S2: z = x + 1 - VS2_0 */
2419 prev_stmt_info = NULL;
2420 for (j = 0; j < ncopies; j++)
2422 /* Handle uses. */
2423 if (j == 0)
2425 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2426 if (op_type == binary_op)
2428 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2430 /* Vector shl and shr insn patterns can be defined with
2431 scalar operand 2 (shift operand). In this case, use
2432 constant or loop invariant op1 directly, without
2433 extending it to vector mode first. */
2434 optab_op2_mode = insn_data[icode].operand[2].mode;
2435 if (!VECTOR_MODE_P (optab_op2_mode))
2437 if (vect_print_dump_info (REPORT_DETAILS))
2438 fprintf (vect_dump, "operand 1 using scalar mode.");
2439 vec_oprnd1 = op1;
2442 if (!vec_oprnd1)
2443 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2446 else
2448 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2449 if (op_type == binary_op)
2450 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2453 /* Arguments are ready. Create the new vector stmt. */
2455 if (op_type == binary_op)
2456 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2457 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2458 else
2459 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2460 build1 (code, vectype, vec_oprnd0));
2461 new_temp = make_ssa_name (vec_dest, new_stmt);
2462 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2463 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2465 if (j == 0)
2466 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2467 else
2468 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2469 prev_stmt_info = vinfo_for_stmt (new_stmt);
2472 return true;
2476 /* Function vectorizable_type_demotion
2478 Check if STMT performs a binary or unary operation that involves
2479 type demotion, and if it can be vectorized.
2480 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2481 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2482 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
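/* For example (an illustrative sketch; the array names are
   hypothetical): a demotion statement such as

         short_b[i] = (short) int_a[i];

   is vectorized using VEC_PACK_MOD_EXPR, which narrows and
   concatenates two consecutive vectors of ints into one vector of
   shorts; this is why each stmt generated below consumes two input
   vector defs. */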
2484 bool
2485 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2486 tree *vec_stmt)
2488 tree vec_dest;
2489 tree scalar_dest;
2490 tree operation;
2491 tree op0;
2492 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2495 enum tree_code code;
2496 tree new_temp;
2497 tree def, def_stmt;
2498 enum vect_def_type dt0;
2499 tree new_stmt;
2500 stmt_vec_info prev_stmt_info;
2501 int nunits_in;
2502 int nunits_out;
2503 tree vectype_out;
2504 int ncopies;
2505 int j;
2506 tree expr;
2507 tree vectype_in;
2508 tree scalar_type;
2509 optab optab;
2510 enum machine_mode vec_mode;
2512 /* Is STMT a vectorizable type-demotion operation? */
2514 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2515 return false;
2517 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2519 if (STMT_VINFO_LIVE_P (stmt_info))
2521 /* FORNOW: not yet supported. */
2522 if (vect_print_dump_info (REPORT_DETAILS))
2523 fprintf (vect_dump, "value used after loop.");
2524 return false;
2527 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2528 return false;
2530 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2531 return false;
2533 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2534 code = TREE_CODE (operation);
2535 if (code != NOP_EXPR && code != CONVERT_EXPR)
2536 return false;
2538 op0 = TREE_OPERAND (operation, 0);
2539 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2540 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2542 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2543 scalar_type = TREE_TYPE (scalar_dest);
2544 vectype_out = get_vectype_for_scalar_type (scalar_type);
2545 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2546 if (nunits_in != nunits_out / 2) /* FORNOW */
2547 return false;
2549 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2550 gcc_assert (ncopies >= 1);
2552 if (! INTEGRAL_TYPE_P (scalar_type)
2553 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2554 return false;
2556 /* Check the operands of the operation. */
2557 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2559 if (vect_print_dump_info (REPORT_DETAILS))
2560 fprintf (vect_dump, "use not simple.");
2561 return false;
2564 /* Supportable by target? */
2565 code = VEC_PACK_MOD_EXPR;
2566 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2567 if (!optab)
2568 return false;
2570 vec_mode = TYPE_MODE (vectype_in);
2571 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2572 return false;
2574 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2576 if (!vec_stmt) /* transformation not required. */
2578 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2579 return true;
2582 /** Transform. **/
2584 if (vect_print_dump_info (REPORT_DETAILS))
2585 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2586 ncopies);
2588 /* Handle def. */
2589 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2591 /* In case the vectorization factor (VF) is bigger than the number
2592 of elements that we can fit in a vectype (nunits), we have to generate
2593 more than one vector stmt - i.e - we need to "unroll" the
2594 vector stmt by a factor VF/nunits. */
2595 prev_stmt_info = NULL;
2596 for (j = 0; j < ncopies; j++)
2598 /* Handle uses. */
2599 if (j == 0)
2601 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2602 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2604 else
2606 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2607 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2610 /* Arguments are ready. Create the new vector stmt. */
2611 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2612 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2613 new_temp = make_ssa_name (vec_dest, new_stmt);
2614 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2615 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2617 if (j == 0)
2618 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2619 else
2620 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2622 prev_stmt_info = vinfo_for_stmt (new_stmt);
2625 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2626 return true;
2630 /* Function vect_gen_widened_results_half
2632 Create a vector stmt whose code, type, number of arguments, and result
2633 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2634 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2635 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2636 needs to be created (DECL is a function-decl of a target-builtin).
2637 STMT is the original scalar stmt that we are vectorizing. */
2639 static tree
2640 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2641 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2642 tree vec_dest, block_stmt_iterator *bsi,
2643 tree stmt)
2645 tree vec_params;
2646 tree expr;
2647 tree new_stmt;
2648 tree new_temp;
2649 tree sym;
2650 ssa_op_iter iter;
2652 /* Generate half of the widened result: */
2653 if (code == CALL_EXPR)
2655 /* Target-specific support. */
2656 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2657 if (op_type == binary_op)
2658 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2659 expr = build_function_call_expr (decl, vec_params);
2661 else
2663 /* Generic support. */
2664 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2665 if (op_type == binary_op)
2666 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2667 else
2668 expr = build1 (code, vectype, vec_oprnd0);
2670 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2671 new_temp = make_ssa_name (vec_dest, new_stmt);
2672 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2673 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2675 if (code == CALL_EXPR)
2677 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2679 if (TREE_CODE (sym) == SSA_NAME)
2680 sym = SSA_NAME_VAR (sym);
2681 mark_sym_for_renaming (sym);
2685 return new_stmt;
2689 /* Function vectorizable_type_promotion
2691 Check if STMT performs a binary or unary operation that involves
2692 type promotion, and if it can be vectorized.
2693 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2694 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2695 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
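/* For example (an illustrative sketch; the array names are
   hypothetical): a promotion statement such as

         int_b[i] = (int) short_a[i];

   widens one vector of shorts into two vectors of ints, generated as
   a "high" half and a "low" half (either via unpacking tree-codes or
   via two target builtins, as chosen by
   supportable_widening_operation). */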
2697 bool
2698 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2699 tree *vec_stmt)
2701 tree vec_dest;
2702 tree scalar_dest;
2703 tree operation;
2704 tree op0, op1 = NULL;
2705 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2706 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2707 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2708 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2709 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2710 int op_type;
2711 tree def, def_stmt;
2712 enum vect_def_type dt0, dt1;
2713 tree new_stmt;
2714 stmt_vec_info prev_stmt_info;
2715 int nunits_in;
2716 int nunits_out;
2717 tree vectype_out;
2718 int ncopies;
2719 int j;
2720 tree vectype_in;
2722 /* Is STMT a vectorizable type-promotion operation? */
2724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2725 return false;
2727 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2729 if (STMT_VINFO_LIVE_P (stmt_info))
2731 /* FORNOW: not yet supported. */
2732 if (vect_print_dump_info (REPORT_DETAILS))
2733 fprintf (vect_dump, "value used after loop.");
2734 return false;
2737 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2738 return false;
2740 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2741 return false;
2743 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2744 code = TREE_CODE (operation);
2745 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2746 return false;
2748 op0 = TREE_OPERAND (operation, 0);
2749 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2750 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2751 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2752 gcc_assert (ncopies >= 1);
2754 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2755 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2756 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2757 if (nunits_out != nunits_in / 2) /* FORNOW */
2758 return false;
2760 if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2761 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2762 return false;
2764 /* Check the operands of the operation. */
2765 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2767 if (vect_print_dump_info (REPORT_DETAILS))
2768 fprintf (vect_dump, "use not simple.");
2769 return false;
2772 op_type = TREE_CODE_LENGTH (code);
2773 if (op_type == binary_op)
2775 op1 = TREE_OPERAND (operation, 1);
2776 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2778 if (vect_print_dump_info (REPORT_DETAILS))
2779 fprintf (vect_dump, "use not simple.");
2780 return false;
2784 /* Supportable by target? */
2785 if (!supportable_widening_operation (code, stmt, vectype_in,
2786 &decl1, &decl2, &code1, &code2))
2787 return false;
2789 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2791 if (!vec_stmt) /* transformation not required. */
2793 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2794 return true;
2797 /** Transform. **/
2799 if (vect_print_dump_info (REPORT_DETAILS))
2800 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2801 ncopies);
2803 /* Handle def. */
2804 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2806 /* In case the vectorization factor (VF) is bigger than the number
2807 of elements that we can fit in a vectype (nunits), we have to generate
2808 more than one vector stmt - i.e - we need to "unroll" the
2809 vector stmt by a factor VF/nunits. */
2811 prev_stmt_info = NULL;
2812 for (j = 0; j < ncopies; j++)
2814 /* Handle uses. */
2815 if (j == 0)
2817 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2818 if (op_type == binary_op)
2819 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2821 else
2823 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2824 if (op_type == binary_op)
2825 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2828 /* Arguments are ready. Create the new vector stmt. We are creating
2829 two vector defs because the widened result does not fit in one vector.
2830 The vectorized stmt can be expressed as a call to a target builtin,
2831 or by using a tree-code. */
2832 /* Generate first half of the widened result: */
2833 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2834 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2835 if (j == 0)
2836 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2837 else
2838 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2839 prev_stmt_info = vinfo_for_stmt (new_stmt);
2841 /* Generate second half of the widened result: */
2842 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2843 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2844 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2845 prev_stmt_info = vinfo_for_stmt (new_stmt);
2849 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2850 return true;
2854 /* Function vect_strided_store_supported.
2856 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2857 and FALSE otherwise. */
2859 static bool
2860 vect_strided_store_supported (tree vectype)
2862 optab interleave_high_optab, interleave_low_optab;
2863 int mode;
2865 mode = (int) TYPE_MODE (vectype);
2867 /* Check that the operation is supported. */
2868 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2869 vectype);
2870 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2871 vectype);
2872 if (!interleave_high_optab || !interleave_low_optab)
2874 if (vect_print_dump_info (REPORT_DETAILS))
2875 fprintf (vect_dump, "no optab for interleave.");
2876 return false;
2879 if (interleave_high_optab->handlers[(int) mode].insn_code
2880 == CODE_FOR_nothing
2881 || interleave_low_optab->handlers[(int) mode].insn_code
2882 == CODE_FOR_nothing)
2884 if (vect_print_dump_info (REPORT_DETAILS))
2885 fprintf (vect_dump, "interleave op not supported by target.");
2886 return false;
2888 return true;
2892 /* Function vect_permute_store_chain.
2894 Given a chain of interleaved stores in DR_CHAIN, whose length LENGTH must
2895 be a power of 2, generate interleave_high/low stmts to reorder the data
2896 correctly for the stores. Return the final references for stores in
2897 RESULT_CHAIN.
2899 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2900 The input is 4 vectors each containing 8 elements. We assign a number to each
2901 element; the input sequence is:
2903 1st vec: 0 1 2 3 4 5 6 7
2904 2nd vec: 8 9 10 11 12 13 14 15
2905 3rd vec: 16 17 18 19 20 21 22 23
2906 4th vec: 24 25 26 27 28 29 30 31
2908 The output sequence should be:
2910 1st vec: 0 8 16 24 1 9 17 25
2911 2nd vec: 2 10 18 26 3 11 19 27
2912 3rd vec: 4 12 20 28 5 13 21 29
2913 4th vec: 6 14 22 30 7 15 23 31
2915 i.e., we interleave the contents of the four vectors in their order.
2917 We use interleave_high/low instructions to create such output. The input of
2918 each interleave_high/low operation is two vectors:
2919 1st vec 2nd vec
2920 0 1 2 3 4 5 6 7
2921 the even elements of the result vector are obtained left-to-right from the
2922 high/low elements of the first vector. The odd elements of the result are
2923 obtained left-to-right from the high/low elements of the second vector.
2924 The output of interleave_high will be: 0 4 1 5
2925 and of interleave_low: 2 6 3 7
2928 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
2929 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2930 where the first argument is taken from the first half of DR_CHAIN and the
2931 second argument from its second half.
2932 In our example,
2934 I1: interleave_high (1st vec, 3rd vec)
2935 I2: interleave_low (1st vec, 3rd vec)
2936 I3: interleave_high (2nd vec, 4th vec)
2937 I4: interleave_low (2nd vec, 4th vec)
2939 The output for the first stage is:
2941 I1: 0 16 1 17 2 18 3 19
2942 I2: 4 20 5 21 6 22 7 23
2943 I3: 8 24 9 25 10 26 11 27
2944 I4: 12 28 13 29 14 30 15 31
2946 The output of the second stage, i.e. the final result is:
2948 I1: 0 8 16 24 1 9 17 25
2949 I2: 2 10 18 26 3 11 19 27
2950 I3: 4 12 20 28 5 13 21 29
2951 I4: 6 14 22 30 7 15 23 31. */
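/* A scalar access pattern that gives rise to such a chain
   (illustrative sketch; the array names are hypothetical) is an
   interleaved store:

         for (i = 0; i < N; i++)
           {
             out[2*i] = x[i];       <-- first stmt of the chain
             out[2*i + 1] = y[i];   <-- second stmt of the chain
           }

   Here LENGTH is 2, so a single stage of one interleave_high and one
   interleave_low stmt produces the required element order. */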
2953 static bool
2954 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2955 unsigned int length,
2956 tree stmt,
2957 block_stmt_iterator *bsi,
2958 VEC(tree,heap) **result_chain)
2960 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2961 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2962 tree scalar_dest;
2963 int i;
2964 unsigned int j;
2965 VEC(tree,heap) *first, *second;
2967 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2968 first = VEC_alloc (tree, heap, length/2);
2969 second = VEC_alloc (tree, heap, length/2);
2971 /* Check that the operation is supported. */
2972 if (!vect_strided_store_supported (vectype))
2973 return false;
2975 *result_chain = VEC_copy (tree, heap, dr_chain);
2977 for (i = 0; i < exact_log2 (length); i++)
2979 for (j = 0; j < length/2; j++)
2981 vect1 = VEC_index (tree, dr_chain, j);
2982 vect2 = VEC_index (tree, dr_chain, j+length/2);
2984 /* Create interleaving stmt:
2985 in the case of big endian:
2986 high = interleave_high (vect1, vect2)
2987 and in the case of little endian:
2988 high = interleave_low (vect1, vect2). */
2989 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2990 DECL_GIMPLE_REG_P (perm_dest) = 1;
2991 add_referenced_var (perm_dest);
2992 if (BYTES_BIG_ENDIAN)
2993 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2994 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2995 vect1, vect2));
2996 else
2997 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2998 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
2999 vect1, vect2));
3000 high = make_ssa_name (perm_dest, perm_stmt);
3001 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3002 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3003 VEC_replace (tree, *result_chain, 2*j, high);
3005 /* Create interleaving stmt:
3006 in the case of big endian:
3007 low = interleave_low (vect1, vect2)
3008 and in the case of little endian:
3009 low = interleave_high (vect1, vect2). */
3010 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3011 DECL_GIMPLE_REG_P (perm_dest) = 1;
3012 add_referenced_var (perm_dest);
3013 if (BYTES_BIG_ENDIAN)
3014 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3015 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
3016 vect1, vect2));
3017 else
3018 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3019 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
3020 vect1, vect2));
3021 low = make_ssa_name (perm_dest, perm_stmt);
3022 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3023 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3024 VEC_replace (tree, *result_chain, 2*j+1, low);
3026 dr_chain = VEC_copy (tree, heap, *result_chain);
3028 return true;
3032 /* Function vectorizable_store.
3034 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3035 can be vectorized.
3036 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3037 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3038 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3040 bool
3041 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3043 tree scalar_dest;
3044 tree data_ref;
3045 tree op;
3046 tree vec_oprnd = NULL_TREE;
3047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3048 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3049 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3050 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3051 enum machine_mode vec_mode;
3052 tree dummy;
3053 enum dr_alignment_support alignment_support_scheme;
3054 ssa_op_iter iter;
3055 def_operand_p def_p;
3056 tree def, def_stmt;
3057 enum vect_def_type dt;
3058 stmt_vec_info prev_stmt_info = NULL;
3059 tree dataref_ptr = NULL_TREE;
3060 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3061 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3062 int j;
3063 tree next_stmt, first_stmt;
3064 bool strided_store = false;
3065 unsigned int group_size, i;
3066 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3067 gcc_assert (ncopies >= 1);
3069 /* Is vectorizable store? */
3071 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3072 return false;
3074 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3075 if (TREE_CODE (scalar_dest) != ARRAY_REF
3076 && TREE_CODE (scalar_dest) != INDIRECT_REF
3077 && !DR_GROUP_FIRST_DR (stmt_info))
3078 return false;
3080 op = GIMPLE_STMT_OPERAND (stmt, 1);
3081 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3083 if (vect_print_dump_info (REPORT_DETAILS))
3084 fprintf (vect_dump, "use not simple.");
3085 return false;
3088 vec_mode = TYPE_MODE (vectype);
3089 /* FORNOW. In some cases we can vectorize even if the data-type is not
3090 supported (e.g., array initialization with 0). */
3091 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3092 return false;
3094 if (!STMT_VINFO_DATA_REF (stmt_info))
3095 return false;
3097 if (DR_GROUP_FIRST_DR (stmt_info))
3099 strided_store = true;
3100 if (!vect_strided_store_supported (vectype))
3101 return false;
3104 if (!vec_stmt) /* transformation not required. */
3106 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3107 return true;
3110 /** Transform. **/
3112 if (vect_print_dump_info (REPORT_DETAILS))
3113 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
3115 if (strided_store)
3117 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3118 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3119 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3121 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3123 /* We vectorize all the stmts of the interleaving group when we
3124 reach the last stmt in the group. */
3125 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3126 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3128 *vec_stmt = NULL_TREE;
3129 return true;
3132 else
3134 first_stmt = stmt;
3135 first_dr = dr;
3136 group_size = 1;
3139 dr_chain = VEC_alloc (tree, heap, group_size);
3140 oprnds = VEC_alloc (tree, heap, group_size);
3142 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3143 gcc_assert (alignment_support_scheme);
3144 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
3146 /* In case the vectorization factor (VF) is bigger than the number
3147 of elements that we can fit in a vectype (nunits), we have to generate
3148 more than one vector stmt - i.e - we need to "unroll" the
3149 vector stmt by a factor VF/nunits. For more details see documentation in
3150 vect_get_vec_def_for_copy_stmt. */
3152 /* In case of interleaving (non-unit strided access):
3154 S1: &base + 2 = x2
3155 S2: &base = x0
3156 S3: &base + 1 = x1
3157 S4: &base + 3 = x3
3159 We create vectorized stores starting from base address (the access of the
3160 first stmt in the chain (S2 in the above example), when the last store stmt
3161 of the chain (S4) is reached:
3163 VS1: &base = vx2
3164 VS2: &base + vec_size*1 = vx0
3165 VS3: &base + vec_size*2 = vx1
3166 VS4: &base + vec_size*3 = vx3
3168 Then permutation statements are generated:
3170 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3171 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3174 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3175 (the order of the data-refs in the output of vect_permute_store_chain
3176 corresponds to the order of scalar stmts in the interleaving chain - see
3177 the documentation of vect_permute_store_chain()).
3179 In case of both multiple types and interleaving, the above vector stores and
3180 permutation stmts are created for every copy. The result vector stmts are
3181 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3182 STMT_VINFO_RELATED_STMT for the next copies. */
3185 prev_stmt_info = NULL;
3186 for (j = 0; j < ncopies; j++)
3188 tree new_stmt;
3189 tree ptr_incr;
3191 if (j == 0)
3193 /* For interleaved stores we collect vectorized defs for all the
3194 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3195 as an input to vect_permute_store_chain(), and OPRNDS as an input
3196 to vect_get_vec_def_for_stmt_copy() for the next copy.
3197 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3198 OPRNDS are of size 1. */
3199 next_stmt = first_stmt;
3200 for (i = 0; i < group_size; i++)
3202 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3203 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3204 can't be NULL_TREE. In case there is no interleaving,
3205 GROUP_SIZE is 1, and only one iteration of the loop will be
3206 executed. */
3207 gcc_assert (next_stmt);
3208 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3209 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3210 VEC_quick_push(tree, dr_chain, vec_oprnd);
3211 VEC_quick_push(tree, oprnds, vec_oprnd);
3212 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3214 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3215 &dummy, &ptr_incr, false,
3216 TREE_TYPE (vec_oprnd));
3218 else
3220 /* For interleaved stores we created vectorized defs for all the
3221 defs stored in OPRNDS in the previous iteration (previous copy).
3222 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3223 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3224 next copy.
3225 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3226 OPRNDS are of size 1. */
3227 for (i = 0; i < group_size; i++)
3229 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3230 VEC_index (tree, oprnds, i));
3231 VEC_replace(tree, dr_chain, i, vec_oprnd);
3232 VEC_replace(tree, oprnds, i, vec_oprnd);
3234 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3237 if (strided_store)
3239 result_chain = VEC_alloc (tree, heap, group_size);
3240 /* Permute. */
3241 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3242 &result_chain))
3243 return false;
3246 next_stmt = first_stmt;
3247 for (i = 0; i < group_size; i++)
3249 /* For strided stores vectorized defs are interleaved in
3250 vect_permute_store_chain(). */
3251 if (strided_store)
3252 vec_oprnd = VEC_index(tree, result_chain, i);
3254 data_ref = build_fold_indirect_ref (dataref_ptr);
3255 /* Arguments are ready. Create the new vector stmt. */
3256 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, data_ref,
3257 vec_oprnd);
3258 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3260 /* Set the VDEFs for the vector pointer. If this virtual def
3261 has a use outside the loop and a loop peel is performed
3262 then the def may be renamed by the peel. Mark it for
3263 renaming so the later use will also be renamed. */
3264 copy_virtual_operands (new_stmt, next_stmt);
3265 if (j == 0)
3267 /* The original store is deleted so the same SSA_NAMEs
3268 can be used. */
3269 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3271 SSA_NAME_DEF_STMT (def) = new_stmt;
3272 mark_sym_for_renaming (SSA_NAME_VAR (def));
3275 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3277 else
3279 /* Create new names for all the definitions created by COPY and
3280 add replacement mappings for each new name. */
3281 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3283 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3284 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3287 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3290 prev_stmt_info = vinfo_for_stmt (new_stmt);
3291 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3292 if (!next_stmt)
3293 break;
3294 /* Bump the vector pointer. */
3295 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3299 return true;
3303 /* Function vect_setup_realignment
3305 This function is called when vectorizing an unaligned load using
3306 the dr_unaligned_software_pipeline scheme.
3307 This function generates the following code at the loop prolog:
3309 p = initial_addr;
3310 msq_init = *(floor(p)); # prolog load
3311 realignment_token = call target_builtin;
3312 loop:
3313 msq = phi (msq_init, ---)
3315 The code above sets up a new (vector) pointer, pointing to the first
3316 location accessed by STMT, and a "floor-aligned" load using that pointer.
3317 It also generates code to compute the "realignment-token" (if the relevant
3318 target hook was defined), and creates a phi-node at the loop-header bb
3319 whose arguments are the result of the prolog-load (created by this
3320 function) and the result of a load that takes place in the loop (to be
3321 created by the caller to this function).
3322 The caller to this function uses the phi-result (msq) to create the
3323 realignment code inside the loop, and sets up the missing phi argument,
3324 as follows:
3326 loop:
3327 msq = phi (msq_init, lsq)
3328 lsq = *(floor(p')); # load in loop
3329 result = realign_load (msq, lsq, realignment_token);
3331 Input:
3332 STMT - (scalar) load stmt to be vectorized. This load accesses
3333 a memory location that may be unaligned.
3334 BSI - place where new code is to be inserted.
3336 Output:
3337 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3338 target hook, if defined.
3339 Return value - the result of the loop-header phi node. */
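/* For example (an illustrative sketch): for a 16-byte vector load
   from p = A + 4, where A is 16-byte aligned:

         msq = *(floor(p))  = {a0 a1 a2 a3}   # load from A
         lsq = *(floor(p')) = {a4 a5 a6 a7}   # load from A + 16
         realign_load (msq, lsq, realignment_token) = {a1 a2 a3 a4}

   i.e. the realignment token, computed from the low bits of the
   original address, selects the misaligned window that spans the two
   aligned loads. */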
3341 static tree
3342 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3343 tree *realignment_token)
3345 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3346 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3347 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3348 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3349 edge pe = loop_preheader_edge (loop);
3350 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3351 tree vec_dest;
3352 tree init_addr;
3353 tree inc;
3354 tree ptr;
3355 tree data_ref;
3356 tree new_stmt;
3357 basic_block new_bb;
3358 tree msq_init;
3359 tree new_temp;
3360 tree phi_stmt;
3361 tree msq;
3363 /* 1. Create msq_init = *(floor(p)) in the loop preheader. */
3364 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3365 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3366 NULL_TREE);
3367 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3368 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
3369 new_temp = make_ssa_name (vec_dest, new_stmt);
3370 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3371 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3372 gcc_assert (!new_bb);
3373 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3374 copy_virtual_operands (new_stmt, stmt);
3375 update_vuses_to_preheader (new_stmt, loop);
3377 /* 2. Create permutation mask, if required, in loop preheader. */
3378 if (targetm.vectorize.builtin_mask_for_load)
3380 tree builtin_decl;
3381 tree params = build_tree_list (NULL_TREE, init_addr);
3383 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3384 new_stmt = build_function_call_expr (builtin_decl, params);
3385 vec_dest = vect_create_destination_var (scalar_dest,
3386 TREE_TYPE (new_stmt));
3387 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3388 new_stmt);
3389 new_temp = make_ssa_name (vec_dest, new_stmt);
3390 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3391 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3392 gcc_assert (!new_bb);
3393 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3395 /* The result of the CALL_EXPR to this builtin is determined from
3396 the value of the parameter, and no global variables are touched,
3397 which makes the builtin a "const" function. Requiring the
3398 builtin to have the "const" attribute makes it unnecessary
3399 to call mark_call_clobbered. */
3400 gcc_assert (TREE_READONLY (builtin_decl));
3403 /* 3. Create msq = phi <msq_init, lsq> in loop */
3404 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3405 msq = make_ssa_name (vec_dest, NULL_TREE);
3406 phi_stmt = create_phi_node (msq, loop->header);
3407 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3408 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3410 return msq;
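/* For illustration only: a plain scalar C model of the code the
   dr_unaligned_software_pipeline scheme generates, using hypothetical
   4-int "vectors". floor(p) is modeled by masking the address down to a
   whole-vector boundary, and realign_load by selecting elements from the
   concatenation of the two aligned loads. Like the real scheme, this
   assumes reading whole aligned vectors around P is safe. The helper is
   not part of the vectorizer. */
static void
realign_load_sketch (const int *p, int nvec, int out[][4])
{
  unsigned long addr = (unsigned long) p;
  const int *fp = (const int *) (addr & ~(unsigned long) (4 * sizeof (int) - 1));
  int off = (int) (p - fp);            /* misalignment in elements */
  int msq[4], lsq[4], i, j;

  for (j = 0; j < 4; j++)
    msq[j] = fp[j];                    /* msq_init = *(floor(p)): prolog load */

  for (i = 0; i < nvec; i++)
    {
      for (j = 0; j < 4; j++)
        lsq[j] = fp[(i + 1) * 4 + j];  /* lsq = *(floor(p')): load in loop */
      for (j = 0; j < 4; j++)          /* realign_load (msq, lsq, token) */
        out[i][j] = off + j < 4 ? msq[off + j] : lsq[off + j - 4];
      for (j = 0; j < 4; j++)
        msq[j] = lsq[j];               /* msq = lsq */
    }
}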
3414 /* Function vect_strided_load_supported.
3416 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3417 and FALSE otherwise. */
3419 static bool
3420 vect_strided_load_supported (tree vectype)
3422 optab perm_even_optab, perm_odd_optab;
3423 int mode;
3425 mode = (int) TYPE_MODE (vectype);
3427 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3428 if (!perm_even_optab)
3430 if (vect_print_dump_info (REPORT_DETAILS))
3431 fprintf (vect_dump, "no optab for perm_even.");
3432 return false;
3435 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3437 if (vect_print_dump_info (REPORT_DETAILS))
3438 fprintf (vect_dump, "perm_even op not supported by target.");
3439 return false;
3442 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3443 if (!perm_odd_optab)
3445 if (vect_print_dump_info (REPORT_DETAILS))
3446 fprintf (vect_dump, "no optab for perm_odd.");
3447 return false;
3450 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3452 if (vect_print_dump_info (REPORT_DETAILS))
3453 fprintf (vect_dump, "perm_odd op not supported by target.");
3454 return false;
3456 return true;
3460 /* Function vect_permute_load_chain.
3462 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3463 a power of 2, generate extract_even/odd stmts to reorder the input data
3464 correctly. Return the final references for loads in RESULT_CHAIN.
3466 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3467 The input is 4 vectors each containing 8 elements. We assign a number to each
3468 element; the input sequence is:
3470 1st vec: 0 1 2 3 4 5 6 7
3471 2nd vec: 8 9 10 11 12 13 14 15
3472 3rd vec: 16 17 18 19 20 21 22 23
3473 4th vec: 24 25 26 27 28 29 30 31
3475 The output sequence should be:
3477 1st vec: 0 4 8 12 16 20 24 28
3478 2nd vec: 1 5 9 13 17 21 25 29
3479 3rd vec: 2 6 10 14 18 22 26 30
3480 4th vec: 3 7 11 15 19 23 27 31
3482 i.e., the first output vector should contain the first elements of each
3483 interleaving group, etc.
3485 We use extract_even/odd instructions to create such output. The input of each
3486 extract_even/odd operation is two vectors
3487 1st vec 2nd vec
3488 0 1 2 3 4 5 6 7
3490 and the output is the vector of extracted even/odd elements. The output of
3491 extract_even will be: 0 2 4 6
3492 and of extract_odd: 1 3 5 7
3495 The permutation is done in log2(LENGTH) stages. In each stage extract_even and
3496 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3497 order. In our example,
3499 E1: extract_even (1st vec, 2nd vec)
3500 E2: extract_odd (1st vec, 2nd vec)
3501 E3: extract_even (3rd vec, 4th vec)
3502 E4: extract_odd (3rd vec, 4th vec)
3504 The output for the first stage will be:
3506 E1: 0 2 4 6 8 10 12 14
3507 E2: 1 3 5 7 9 11 13 15
3508 E3: 16 18 20 22 24 26 28 30
3509 E4: 17 19 21 23 25 27 29 31
3511 In order to proceed and create the correct sequence for the next stage (or
3512 for the correct output, if the second stage is the last one, as in our
3513 example), we first put the output of extract_even operation and then the
3514 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3515 The input for the second stage is:
3517 1st vec (E1): 0 2 4 6 8 10 12 14
3518 2nd vec (E3): 16 18 20 22 24 26 28 30
3519 3rd vec (E2): 1 3 5 7 9 11 13 15
3520 4th vec (E4): 17 19 21 23 25 27 29 31
3522 The output of the second stage:
3524 E1: 0 4 8 12 16 20 24 28
3525 E2: 2 6 10 14 18 22 26 30
3526 E3: 1 5 9 13 17 21 25 29
3527 E4: 3 7 11 15 19 23 27 31
3529 And RESULT_CHAIN after reordering:
3531 1st vec (E1): 0 4 8 12 16 20 24 28
3532 2nd vec (E3): 1 5 9 13 17 21 25 29
3533 3rd vec (E2): 2 6 10 14 18 22 26 30
3534 4th vec (E4): 3 7 11 15 19 23 27 31. */
3536 static bool
3537 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3538 unsigned int length,
3539 tree stmt,
3540 block_stmt_iterator *bsi,
3541 VEC(tree,heap) **result_chain)
3543 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3544 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3545 int i;
3546 unsigned int j;
3548 /* Check that the operation is supported. */
3549 if (!vect_strided_load_supported (vectype))
3550 return false;
3552 *result_chain = VEC_copy (tree, heap, dr_chain);
3553 for (i = 0; i < exact_log2 (length); i++)
3555 for (j = 0; j < length; j +=2)
3557 first_vect = VEC_index (tree, dr_chain, j);
3558 second_vect = VEC_index (tree, dr_chain, j+1);
3560 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3561 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3562 DECL_GIMPLE_REG_P (perm_dest) = 1;
3563 add_referenced_var (perm_dest);
3565 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3566 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3567 first_vect, second_vect));
3569 data_ref = make_ssa_name (perm_dest, perm_stmt);
3570 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3571 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3572 mark_symbols_for_renaming (perm_stmt);
3574 VEC_replace (tree, *result_chain, j/2, data_ref);
3576 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3577 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3578 DECL_GIMPLE_REG_P (perm_dest) = 1;
3579 add_referenced_var (perm_dest);
3581 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3582 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3583 first_vect, second_vect));
3584 data_ref = make_ssa_name (perm_dest, perm_stmt);
3585 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3586 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3587 mark_symbols_for_renaming (perm_stmt);
3589 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3591 dr_chain = VEC_copy (tree, heap, *result_chain);
3593 return true;
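/* For illustration only: a hypothetical scalar helper modeling the
   permutation documented above. It deinterleaves LENGTH arrays of 8 ints
   in log2(LENGTH) extract_even/extract_odd stages, placing each even
   result at index j/2 and each odd result at index j/2 + length/2 --
   exactly the ordering vect_permute_load_chain uses for RESULT_CHAIN.
   LENGTH must be a power of 2, at most 8. */
static void
permute_load_chain_sketch (int vecs[][8], int length)
{
  int out[8][8];
  int s, j, k;

  for (s = 1; s < length; s *= 2)        /* log2(LENGTH) stages */
    {
      for (j = 0; j < length; j += 2)
        for (k = 0; k < 4; k++)
          {
            /* extract_even (vecs[j], vecs[j+1]) -> out[j/2] */
            out[j / 2][k]     = vecs[j][2 * k];
            out[j / 2][k + 4] = vecs[j + 1][2 * k];
            /* extract_odd (vecs[j], vecs[j+1]) -> out[j/2 + length/2] */
            out[j / 2 + length / 2][k]     = vecs[j][2 * k + 1];
            out[j / 2 + length / 2][k + 4] = vecs[j + 1][2 * k + 1];
          }
      for (j = 0; j < length; j++)       /* result becomes next stage's input */
        for (k = 0; k < 8; k++)
          vecs[j][k] = out[j][k];
    }
}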
3597 /* Function vect_transform_strided_load.
3599 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3600 to perform their permutation and attach the resulting vectorized statements to
3601 the scalar statements.
3604 static bool
3605 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3606 block_stmt_iterator *bsi)
3608 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3609 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3610 tree next_stmt, new_stmt;
3611 VEC(tree,heap) *result_chain = NULL;
3612 unsigned int i, gap_count;
3613 tree tmp_data_ref;
3615 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3616 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3617 vectors, that are ready for vector computation. */
3618 result_chain = VEC_alloc (tree, heap, size);
3619 /* Permute. */
3620 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3621 return false;
3623 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3624 Since we scan the chain starting from its first node, their order
3625 corresponds to the order of data-refs in RESULT_CHAIN. */
3626 next_stmt = first_stmt;
3627 gap_count = 1;
3628 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3630 if (!next_stmt)
3631 break;
3633 /* Skip the gaps. Loads created for the gaps will be removed by dead
3634 code elimination pass later.
3635 DR_GROUP_GAP is the number of steps in elements from the previous
3636 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3637 correspond to the gaps.
3639 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3641 gap_count++;
3642 continue;
3645 while (next_stmt)
3647 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3648 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3649 copies, and we put the new vector statement in the first available
3650 RELATED_STMT. */
3651 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3652 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3653 else
3655 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3656 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3657 vinfo_for_stmt (prev_stmt));
3658 while (rel_stmt)
3660 prev_stmt = rel_stmt;
3661 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3663 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3665 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3666 gap_count = 1;
3667 /* If NEXT_STMT accesses the same DR as the previous statement,
3668 put the same TMP_DATA_REF as its vectorized statement; otherwise
3669 get the next data-ref from RESULT_CHAIN. */
3670 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3671 break;
3674 return true;
3678 /* vectorizable_load.
3680 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3681 can be vectorized.
3682 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3683 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3684 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3686 bool
3687 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3689 tree scalar_dest;
3690 tree vec_dest = NULL;
3691 tree data_ref = NULL;
3692 tree op;
3693 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3694 stmt_vec_info prev_stmt_info;
3695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3696 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3697 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3698 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3699 tree new_temp;
3700 int mode;
3701 tree new_stmt = NULL_TREE;
3702 tree dummy;
3703 enum dr_alignment_support alignment_support_scheme;
3704 tree dataref_ptr = NULL_TREE;
3705 tree ptr_incr;
3706 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3707 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3708 int i, j, group_size;
3709 tree msq = NULL_TREE, lsq;
3710 tree offset = NULL_TREE;
3711 tree realignment_token = NULL_TREE;
3712 tree phi_stmt = NULL_TREE;
3713 VEC(tree,heap) *dr_chain = NULL;
3714 bool strided_load = false;
3715 tree first_stmt;
3717 /* Is vectorizable load? */
3718 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3719 return false;
3721 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3723 if (STMT_VINFO_LIVE_P (stmt_info))
3725 /* FORNOW: not yet supported. */
3726 if (vect_print_dump_info (REPORT_DETAILS))
3727 fprintf (vect_dump, "value used after loop.");
3728 return false;
3731 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3732 return false;
3734 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3735 if (TREE_CODE (scalar_dest) != SSA_NAME)
3736 return false;
3738 op = GIMPLE_STMT_OPERAND (stmt, 1);
3739 if (TREE_CODE (op) != ARRAY_REF
3740 && TREE_CODE (op) != INDIRECT_REF
3741 && !DR_GROUP_FIRST_DR (stmt_info))
3742 return false;
3744 if (!STMT_VINFO_DATA_REF (stmt_info))
3745 return false;
3747 mode = (int) TYPE_MODE (vectype);
3749 /* FORNOW. In some cases we can vectorize even if the data-type is not
3750 supported (e.g. data copies). */
3751 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3753 if (vect_print_dump_info (REPORT_DETAILS))
3754 fprintf (vect_dump, "Aligned load, but unsupported type.");
3755 return false;
3758 /* Check if the load is a part of an interleaving chain. */
3759 if (DR_GROUP_FIRST_DR (stmt_info))
3761 strided_load = true;
3763 /* Check if interleaving is supported. */
3764 if (!vect_strided_load_supported (vectype))
3765 return false;
3768 if (!vec_stmt) /* transformation not required. */
3770 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3771 return true;
3774 /** Transform. **/
3776 if (vect_print_dump_info (REPORT_DETAILS))
3777 fprintf (vect_dump, "transform load.");
3779 if (strided_load)
3781 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3782 /* Check if the chain of loads is already vectorized. */
3783 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3785 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3786 return true;
3788 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3789 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3790 dr_chain = VEC_alloc (tree, heap, group_size);
3792 else
3794 first_stmt = stmt;
3795 first_dr = dr;
3796 group_size = 1;
3799 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3800 gcc_assert (alignment_support_scheme);
3803 /* In case the vectorization factor (VF) is bigger than the number
3804 of elements that we can fit in a vectype (nunits), we have to generate
3805 more than one vector stmt - i.e., we need to "unroll" the
3806 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3807 from one copy of the vector stmt to the next, in the field
3808 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3809 stages to find the correct vector defs to be used when vectorizing
3810 stmts that use the defs of the current stmt. The example below illustrates
3811 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
3812 4 vectorized stmts):
3814 before vectorization:
3815 RELATED_STMT VEC_STMT
3816 S1: x = memref - -
3817 S2: z = x + 1 - -
3819 step 1: vectorize stmt S1:
3820 We first create the vector stmt VS1_0, and, as usual, record a
3821 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3822 Next, we create the vector stmt VS1_1, and record a pointer to
3823 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3824 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3825 stmts and pointers:
3826 RELATED_STMT VEC_STMT
3827 VS1_0: vx0 = memref0 VS1_1 -
3828 VS1_1: vx1 = memref1 VS1_2 -
3829 VS1_2: vx2 = memref2 VS1_3 -
3830 VS1_3: vx3 = memref3 - -
3831 S1: x = load - VS1_0
3832 S2: z = x + 1 - -
3834 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3835 information we recorded in RELATED_STMT field is used to vectorize
3836 stmt S2. */
3838 /* In case of interleaving (non-unit strided access):
3840 S1: x2 = &base + 2
3841 S2: x0 = &base
3842 S3: x1 = &base + 1
3843 S4: x3 = &base + 3
3845 Vectorized loads are created in the order of memory accesses
3846 starting from the access of the first stmt of the chain:
3848 VS1: vx0 = &base
3849 VS2: vx1 = &base + vec_size*1
3850 VS3: vx2 = &base + vec_size*2
3851 VS4: vx3 = &base + vec_size*3
3853 Then permutation statements are generated:
3855 VS5: vx4 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3856 VS6: vx5 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3859 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3860 (the order of the data-refs in the output of vect_permute_load_chain
3861 corresponds to the order of scalar stmts in the interleaving chain - see
3862 the documentation of vect_permute_load_chain()).
3863 The generation of permutation stmts and recording them in
3864 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3866 In case of both multiple types and interleaving, the vector loads and
3867 permutation stmts above are created for every copy. The result vector stmts
3868 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3869 STMT_VINFO_RELATED_STMT for the next copies. */
3871 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3872 on a target that supports unaligned accesses (dr_unaligned_supported)
3873 we generate the following code:
3874 p = initial_addr;
3875 indx = 0;
3876 loop {
3877 p = p + indx * vectype_size;
3878 vec_dest = *(p);
3879 indx = indx + 1;
3882 Otherwise, the data reference is potentially unaligned on a target that
3883 does not support unaligned accesses (dr_unaligned_software_pipeline);
3884 in that case generate the following code, in which the data in each iteration is
3885 obtained by two vector loads, one from the previous iteration, and one
3886 from the current iteration:
3887 p1 = initial_addr;
3888 msq_init = *(floor(p1))
3889 p2 = initial_addr + VS - 1;
3890 realignment_token = call target_builtin;
3891 indx = 0;
3892 loop {
3893 p2 = p2 + indx * vectype_size
3894 lsq = *(floor(p2))
3895 vec_dest = realign_load (msq, lsq, realignment_token)
3896 indx = indx + 1;
3897 msq = lsq;
3898 } */
3900 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3902 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3903 phi_stmt = SSA_NAME_DEF_STMT (msq);
3904 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3907 prev_stmt_info = NULL;
3908 for (j = 0; j < ncopies; j++)
3910 /* 1. Create the vector pointer update chain. */
3911 if (j == 0)
3912 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3913 &ptr_incr, false, NULL_TREE);
3914 else
3915 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3917 for (i = 0; i < group_size; i++)
3919 /* 2. Create the vector-load in the loop. */
3920 switch (alignment_support_scheme)
3922 case dr_aligned:
3923 gcc_assert (aligned_access_p (first_dr));
3924 data_ref = build_fold_indirect_ref (dataref_ptr);
3925 break;
3926 case dr_unaligned_supported:
3928 int mis = DR_MISALIGNMENT (first_dr);
3929 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3931 gcc_assert (!aligned_access_p (first_dr));
3932 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3933 data_ref =
3934 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3935 break;
3937 case dr_unaligned_software_pipeline:
3938 gcc_assert (!aligned_access_p (first_dr));
3939 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3940 break;
3941 default:
3942 gcc_unreachable ();
3944 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3945 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3946 data_ref);
3947 new_temp = make_ssa_name (vec_dest, new_stmt);
3948 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3949 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3950 copy_virtual_operands (new_stmt, stmt);
3951 mark_symbols_for_renaming (new_stmt);
3953 /* 3. Handle explicit realignment if necessary/supported. */
3954 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3956 /* Create in loop:
3957 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3958 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3959 if (!realignment_token)
3960 realignment_token = dataref_ptr;
3961 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3962 new_stmt =
3963 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3964 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3965 new_stmt);
3966 new_temp = make_ssa_name (vec_dest, new_stmt);
3967 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3968 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3969 if (i == group_size - 1 && j == ncopies - 1)
3970 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3971 msq = lsq;
3973 if (strided_load)
3974 VEC_quick_push (tree, dr_chain, new_temp);
3975 if (i < group_size - 1)
3976 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3979 if (strided_load)
3981 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3982 return false;
3983 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3984 dr_chain = VEC_alloc (tree, heap, group_size);
3986 else
3988 if (j == 0)
3989 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3990 else
3991 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3992 prev_stmt_info = vinfo_for_stmt (new_stmt);
3996 return true;
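/* For illustration only: a scalar model of the "multiple copies" case
   discussed in the comment above. With VF = 16 and nunits = 4 we get
   ncopies = 4, so one scalar load becomes four vector loads VS1_0..VS1_3,
   each advancing the data-ref pointer by one vector. The helper is
   hypothetical and not used by the vectorizer. */
static void
unrolled_vload_sketch (const int *p, int vx[4][4])
{
  int j, k;

  for (j = 0; j < 4; j++)        /* one copy per ncopies = VF/nunits */
    for (k = 0; k < 4; k++)
      vx[j][k] = p[j * 4 + k];   /* VS1_j: vxj = memref_j */
}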
4000 /* Function vectorizable_live_operation.
4002 STMT computes a value that is used outside the loop. Check if
4003 it can be supported. */
4005 bool
4006 vectorizable_live_operation (tree stmt,
4007 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
4008 tree *vec_stmt ATTRIBUTE_UNUSED)
4010 tree operation;
4011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4012 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4013 int i;
4014 enum tree_code code;
4015 int op_type;
4016 tree op;
4017 tree def, def_stmt;
4018 enum vect_def_type dt;
4020 if (!STMT_VINFO_LIVE_P (stmt_info))
4021 return false;
4023 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4024 return false;
4026 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4027 return false;
4029 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4030 code = TREE_CODE (operation);
4032 op_type = TREE_CODE_LENGTH (code);
4034 /* FORNOW: support only if all uses are invariant. This means
4035 that the scalar operations can remain in place, unvectorized.
4036 The original last scalar value that they compute will be used. */
4038 for (i = 0; i < op_type; i++)
4040 op = TREE_OPERAND (operation, i);
4041 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4043 if (vect_print_dump_info (REPORT_DETAILS))
4044 fprintf (vect_dump, "use not simple.");
4045 return false;
4048 if (dt != vect_invariant_def && dt != vect_constant_def)
4049 return false;
4052 /* No transformation is required for the cases we currently support. */
4053 return true;
4057 /* Function vect_is_simple_cond.
4059 Input:
4060 LOOP - the loop that is being vectorized.
4061 COND - Condition that is checked for simple use.
4063 Returns whether a COND can be vectorized. Checks whether
4064 condition operands are supportable using vect_is_simple_use.
4066 static bool
4067 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4069 tree lhs, rhs;
4070 tree def;
4071 enum vect_def_type dt;
4073 if (!COMPARISON_CLASS_P (cond))
4074 return false;
4076 lhs = TREE_OPERAND (cond, 0);
4077 rhs = TREE_OPERAND (cond, 1);
4079 if (TREE_CODE (lhs) == SSA_NAME)
4081 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4082 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4083 return false;
4085 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
4086 return false;
4088 if (TREE_CODE (rhs) == SSA_NAME)
4090 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4091 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4092 return false;
4094 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4095 return false;
4097 return true;
4100 /* vectorizable_condition.
4102 Check if STMT is conditional modify expression that can be vectorized.
4103 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4104 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
4105 at BSI.
4107 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4109 bool
4110 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4112 tree scalar_dest = NULL_TREE;
4113 tree vec_dest = NULL_TREE;
4114 tree op = NULL_TREE;
4115 tree cond_expr, then_clause, else_clause;
4116 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4117 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4118 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4119 tree vec_compare, vec_cond_expr;
4120 tree new_temp;
4121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4122 enum machine_mode vec_mode;
4123 tree def;
4124 enum vect_def_type dt;
4125 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4126 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4128 gcc_assert (ncopies >= 1);
4129 if (ncopies > 1)
4130 return false; /* FORNOW */
4132 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4133 return false;
4135 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
4137 if (STMT_VINFO_LIVE_P (stmt_info))
4139 /* FORNOW: not yet supported. */
4140 if (vect_print_dump_info (REPORT_DETAILS))
4141 fprintf (vect_dump, "value used after loop.");
4142 return false;
4145 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4146 return false;
4148 op = GIMPLE_STMT_OPERAND (stmt, 1);
4150 if (TREE_CODE (op) != COND_EXPR)
4151 return false;
4153 cond_expr = TREE_OPERAND (op, 0);
4154 then_clause = TREE_OPERAND (op, 1);
4155 else_clause = TREE_OPERAND (op, 2);
4157 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4158 return false;
4160 /* We do not handle two different vector types for the condition
4161 and the values. */
4162 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4163 return false;
4165 if (TREE_CODE (then_clause) == SSA_NAME)
4167 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4168 if (!vect_is_simple_use (then_clause, loop_vinfo,
4169 &then_def_stmt, &def, &dt))
4170 return false;
4172 else if (TREE_CODE (then_clause) != INTEGER_CST
4173 && TREE_CODE (then_clause) != REAL_CST)
4174 return false;
4176 if (TREE_CODE (else_clause) == SSA_NAME)
4178 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4179 if (!vect_is_simple_use (else_clause, loop_vinfo,
4180 &else_def_stmt, &def, &dt))
4181 return false;
4183 else if (TREE_CODE (else_clause) != INTEGER_CST
4184 && TREE_CODE (else_clause) != REAL_CST)
4185 return false;
4188 vec_mode = TYPE_MODE (vectype);
4190 if (!vec_stmt)
4192 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4193 return expand_vec_cond_expr_p (op, vec_mode);
4196 /* Transform */
4198 /* Handle def. */
4199 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4200 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4202 /* Handle cond expr. */
4203 vec_cond_lhs =
4204 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4205 vec_cond_rhs =
4206 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4207 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4208 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4210 /* Arguments are ready. Create the new vector stmt. */
4211 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4212 vec_cond_lhs, vec_cond_rhs);
4213 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4214 vec_compare, vec_then_clause, vec_else_clause);
4216 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
4217 vec_cond_expr);
4218 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4219 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4220 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4222 return true;
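/* For illustration only: the scalar semantics of the VEC_COND_EXPR built
   above. Each lane of the result independently selects the THEN or ELSE
   value according to the lane-wise comparison. The helper, its 4-lane
   width, and the fixed less-than comparison are hypothetical. */
static void
vec_cond_sketch (const int a[4], const int b[4],
                 const int t[4], const int e[4], int out[4])
{
  int k;

  for (k = 0; k < 4; k++)
    out[k] = a[k] < b[k] ? t[k] : e[k];  /* VEC_COND_EXPR <a < b, t, e> */
}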
4225 /* Function vect_transform_stmt.
4227 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4229 bool
4230 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4232 bool is_store = false;
4233 tree vec_stmt = NULL_TREE;
4234 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4235 tree orig_stmt_in_pattern;
4236 bool done;
4238 if (STMT_VINFO_RELEVANT_P (stmt_info))
4240 switch (STMT_VINFO_TYPE (stmt_info))
4242 case type_demotion_vec_info_type:
4243 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4244 gcc_assert (done);
4245 break;
4247 case type_promotion_vec_info_type:
4248 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4249 gcc_assert (done);
4250 break;
4252 case type_conversion_vec_info_type:
4253 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4254 gcc_assert (done);
4255 break;
4257 case op_vec_info_type:
4258 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4259 gcc_assert (done);
4260 break;
4262 case assignment_vec_info_type:
4263 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4264 gcc_assert (done);
4265 break;
4267 case load_vec_info_type:
4268 done = vectorizable_load (stmt, bsi, &vec_stmt);
4269 gcc_assert (done);
4270 break;
4272 case store_vec_info_type:
4273 done = vectorizable_store (stmt, bsi, &vec_stmt);
4274 gcc_assert (done);
4275 if (DR_GROUP_FIRST_DR (stmt_info))
4277 /* In case of interleaving, the whole chain is vectorized when the
4278 last store in the chain is reached. Store stmts before the last
4279 one are skipped, and their vec_stmt_info shouldn't be freed
4280 meanwhile. */
4281 *strided_store = true;
4282 if (STMT_VINFO_VEC_STMT (stmt_info))
4283 is_store = true;
4285 else
4286 is_store = true;
4287 break;
4289 case condition_vec_info_type:
4290 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4291 gcc_assert (done);
4292 break;
4294 case call_vec_info_type:
4295 done = vectorizable_call (stmt, bsi, &vec_stmt);
4296 break;
4298 default:
4299 if (vect_print_dump_info (REPORT_DETAILS))
4300 fprintf (vect_dump, "stmt not supported.");
4301 gcc_unreachable ();
4304 gcc_assert (vec_stmt || *strided_store);
4305 if (vec_stmt)
4307 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4308 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4309 if (orig_stmt_in_pattern)
4311 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4312 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4314 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4316 /* STMT was inserted by the vectorizer to replace a
4317 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
4318 original sequence that computed this idiom. We need to
4319 record a pointer to VEC_STMT in the stmt_info of
4320 ORIG_STMT_IN_PATTERN. See more details in the
4321 documentation of vect_pattern_recog. */
4323 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4329 if (STMT_VINFO_LIVE_P (stmt_info))
4331 switch (STMT_VINFO_TYPE (stmt_info))
4333 case reduc_vec_info_type:
4334 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4335 gcc_assert (done);
4336 break;
4338 default:
4339 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4340 gcc_assert (done);
4344 return is_store;
4348 /* This function builds, on the loop preheader, ni_name = the number of
4349 iterations the loop executes. */
4351 static tree
4352 vect_build_loop_niters (loop_vec_info loop_vinfo)
4354 tree ni_name, stmt, var;
4355 edge pe;
4356 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4357 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4359 var = create_tmp_var (TREE_TYPE (ni), "niters");
4360 add_referenced_var (var);
4361 ni_name = force_gimple_operand (ni, &stmt, false, var);
4363 pe = loop_preheader_edge (loop);
4364 if (stmt)
4366 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4367 gcc_assert (!new_bb);
4370 return ni_name;
4374 /* This function generates the following statements:
4376 ni_name = number of iterations loop executes
4377 ratio = ni_name / vf
4378 ratio_mult_vf_name = ratio * vf
4380 and places them at the loop preheader edge. */
4382 static void
4383 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4384 tree *ni_name_ptr,
4385 tree *ratio_mult_vf_name_ptr,
4386 tree *ratio_name_ptr)
4389 edge pe;
4390 basic_block new_bb;
4391 tree stmt, ni_name;
4392 tree var;
4393 tree ratio_name;
4394 tree ratio_mult_vf_name;
4395 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4396 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4397 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4398 tree log_vf;
4400 pe = loop_preheader_edge (loop);
4402 /* Generate a temporary variable that contains
4403 the number of iterations the loop executes. */
4405 ni_name = vect_build_loop_niters (loop_vinfo);
4406 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4408 /* Create: ratio = ni >> log2(vf) */
4410 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4411 if (!is_gimple_val (ratio_name))
4413 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4414 add_referenced_var (var);
4416 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4417 pe = loop_preheader_edge (loop);
4418 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4419 gcc_assert (!new_bb);
4422 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4424 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4425 ratio_name, log_vf);
4426 if (!is_gimple_val (ratio_mult_vf_name))
4428 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4429 add_referenced_var (var);
4431 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4432 true, var);
4433 pe = loop_preheader_edge (loop);
4434 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4435 gcc_assert (!new_bb);
4438 *ni_name_ptr = ni_name;
4439 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4440 *ratio_name_ptr = ratio_name;
4442 return;
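/* For illustration: since vf is a power of 2, both quantities generated
   above reduce to shifts by log2(vf). A hypothetical scalar model: */
static void
niters_tmps_sketch (unsigned long ni, int log_vf,
                    unsigned long *ratio, unsigned long *ratio_mult_vf)
{
  *ratio = ni >> log_vf;                /* ratio = ni / vf */
  *ratio_mult_vf = *ratio << log_vf;    /* ratio_mult_vf = ratio * vf */
}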
4446 /* Function update_vuses_to_preheader.
4448 Input:
4449 STMT - a statement with potential VUSEs.
4450 LOOP - the loop whose preheader will contain STMT.
4452 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4453 appears to be defined in a VDEF in another statement in a loop.
4454 One such case is when the VUSE is at the dereference of a __restricted__
4455 pointer in a load and the VDEF is at the dereference of a different
4456 __restricted__ pointer in a store. Vectorization may result in
4457 copy_virtual_operands being called to copy the problematic VUSE to a new
4458 statement that is being inserted in the loop preheader. This procedure
4459 is called to change the SSA_NAME in the new statement's VUSE from the
4460 SSA_NAME updated in the loop to the related SSA_NAME available on the
4461 path entering the loop.
4463 When this function is called, we have the following situation:
4465 # vuse <name1>
4466 S1: vload
4467 do {
4468 # name1 = phi < name0 , name2>
4470 # vuse <name1>
4471 S2: vload
4473 # name2 = vdef <name1>
4474 S3: vstore
4476 }while...
4478 Stmt S1 was created in the loop preheader block as part of misaligned-load
4479 handling. This function fixes the name of the vuse of S1 from 'name1' to
4480 'name0'. */
4482 static void
4483 update_vuses_to_preheader (tree stmt, struct loop *loop)
4485 basic_block header_bb = loop->header;
4486 edge preheader_e = loop_preheader_edge (loop);
4487 ssa_op_iter iter;
4488 use_operand_p use_p;
4490 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4492 tree ssa_name = USE_FROM_PTR (use_p);
4493 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4494 tree name_var = SSA_NAME_VAR (ssa_name);
4495 basic_block bb = bb_for_stmt (def_stmt);
4497 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4498 if (!IS_EMPTY_STMT (def_stmt)
4499 && flow_bb_inside_loop_p (loop, bb))
4501 /* If the block containing the statement defining the SSA_NAME
4502 is in the loop then it's necessary to find the definition
4503 outside the loop using the PHI nodes of the header. */
4504 tree phi;
4505 bool updated = false;
4507 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4509 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4511 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4512 updated = true;
4513 break;
4516 gcc_assert (updated);
4522 /* Function vect_update_ivs_after_vectorizer.
4524 "Advance" the induction variables of LOOP to the value they should take
4525 after the execution of LOOP. This is currently necessary because the
4526 vectorizer does not handle induction variables that are used after the
4527 loop. Such a situation occurs when the last iterations of LOOP are
4528 peeled, because:
4529 1. We introduced new uses after LOOP for IVs that were not originally used
4530 after LOOP: the IVs of LOOP are now used by an epilog loop.
4531 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4532 times, whereas the loop IVs should be bumped N times.
4534 Input:
4535 - LOOP - a loop that is going to be vectorized. The last few iterations
4536 of LOOP were peeled.
4537 - NITERS - the number of iterations that LOOP executes (before it is
4538 vectorized), i.e., the number of times the ivs should be bumped.
4539 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4540 coming out from LOOP on which there are uses of the LOOP ivs
4541 (this is the path from LOOP->exit to epilog_loop->preheader).
4543 The new definitions of the ivs are placed in LOOP->exit.
4544 The phi args associated with the edge UPDATE_E in the bb
4545 UPDATE_E->dest are updated accordingly.
4547 Assumption 1: Like the rest of the vectorizer, this function assumes
4548 a single loop exit that has a single predecessor.
4550 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4551 organized in the same order.
4553 Assumption 3: The access function of the ivs is simple enough (see
4554 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4556 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4557 coming out of LOOP on which the ivs of LOOP are used (this is the path
4558 that leads to the epilog loop; other paths skip the epilog loop). This
4559 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4560 needs to have its phis updated.
4563 static void
4564 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4565 edge update_e)
4567 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4568 basic_block exit_bb = single_exit (loop)->dest;
4569 tree phi, phi1;
4570 basic_block update_bb = update_e->dest;
4572 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4574 /* Make sure there exists a single-predecessor exit bb: */
4575 gcc_assert (single_pred_p (exit_bb));
4577 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4578 phi && phi1;
4579 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4581 tree access_fn = NULL;
4582 tree evolution_part;
4583 tree init_expr;
4584 tree step_expr;
4585 tree var, stmt, ni, ni_name;
4586 block_stmt_iterator last_bsi;
4588 if (vect_print_dump_info (REPORT_DETAILS))
4590 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4591 print_generic_expr (vect_dump, phi, TDF_SLIM);
4594 /* Skip virtual phis. */
4595 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4597 if (vect_print_dump_info (REPORT_DETAILS))
4598 fprintf (vect_dump, "virtual phi. skip.");
4599 continue;
4602 /* Skip reduction phis. */
4603 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4605 if (vect_print_dump_info (REPORT_DETAILS))
4606 fprintf (vect_dump, "reduc phi. skip.");
4607 continue;
4610 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4611 gcc_assert (access_fn);
4612 evolution_part =
4613 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4614 gcc_assert (evolution_part != NULL_TREE);
4616 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4617 of degree >= 2 or exponential. */
4618 gcc_assert (!tree_is_chrec (evolution_part));
4620 step_expr = evolution_part;
4621 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4622 loop->num));
4624 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4625 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4626 fold_convert (TREE_TYPE (init_expr),
4627 niters),
4628 step_expr),
4629 init_expr);
4631 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4632 add_referenced_var (var);
4634 ni_name = force_gimple_operand (ni, &stmt, false, var);
4636 /* Insert stmt into exit_bb. */
4637 last_bsi = bsi_last (exit_bb);
4638 if (stmt)
4639 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4641 /* Fix phi expressions in the successor bb. */
4642 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
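/* For illustration: the new definition created above for each IV is just
   its closed form after NITERS iterations. A hypothetical scalar model of
   the FORNOW-supported affine case: */
static long
advance_iv_sketch (long init_expr, long step_expr, long niters)
{
  /* ni = init + niters * step */
  return init_expr + niters * step_expr;
}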
4647 /* Function vect_do_peeling_for_loop_bound
4649 Peel the last iterations of the loop represented by LOOP_VINFO.
4650 The peeled iterations form a new epilog loop. Given that the loop now
4651 iterates NITERS times, the new epilog loop iterates
4652 NITERS % VECTORIZATION_FACTOR times.
4654 The original loop will later be made to iterate
4655 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4657 static void
4658 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4660 tree ni_name, ratio_mult_vf_name;
4661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4662 struct loop *new_loop;
4663 edge update_e;
4664 basic_block preheader;
4665 int loop_num;
4666 unsigned int th;
4668 if (vect_print_dump_info (REPORT_DETAILS))
4669 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4671 initialize_original_copy_tables ();
4673 /* Generate the following variables on the preheader of original loop:
4675 ni_name = number of iteration the original loop executes
4676 ratio = ni_name / vf
4677 ratio_mult_vf_name = ratio * vf */
4678 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4679 &ratio_mult_vf_name, ratio);
4681 loop_num = loop->num;
4682 /* Threshold for vectorized loop. */
4683 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4684 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4685 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4686 ratio_mult_vf_name, ni_name, false, th);
4687 gcc_assert (new_loop);
4688 gcc_assert (loop_num == loop->num);
4689 #ifdef ENABLE_CHECKING
4690 slpeel_verify_cfg_after_peeling (loop, new_loop);
4691 #endif
4693 /* A guard that controls whether the new_loop is to be executed or skipped
4694 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4695 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4696 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4697 is on the path where the LOOP IVs are used and need to be updated. */
4699 preheader = loop_preheader_edge (new_loop)->src;
4700 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4701 update_e = EDGE_PRED (preheader, 0);
4702 else
4703 update_e = EDGE_PRED (preheader, 1);
4705 /* Update IVs of original loop as if they were advanced
4706 by ratio_mult_vf_name steps. */
4707 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4709 /* After peeling we have to reset scalar evolution analyzer. */
4710 scev_reset ();
4712 free_original_copy_tables ();
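/* For illustration only: the shape of the loops after peeling for the
   loop bound. The vectorized loop executes ratio_mult_vf = (n/VF)*VF
   iterations, VF at a time; the scalar epilog picks up the remaining
   n%VF. The helper and its increment-by-one body are hypothetical. */
static void
peeling_for_bound_sketch (int *a, int n, int vf)
{
  int i, k;
  int ratio_mult_vf = (n / vf) * vf;

  for (i = 0; i < ratio_mult_vf; i += vf)   /* vectorized main loop */
    for (k = 0; k < vf; k++)
      a[i + k] += 1;
  for (; i < n; i++)                        /* scalar epilog: n % vf iters */
    a[i] += 1;
}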
4716 /* Function vect_gen_niters_for_prolog_loop
4718 Set the number of iterations for the loop represented by LOOP_VINFO
4719 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4720 and the misalignment of DR - the data reference recorded in
4721 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4722 this loop, the data reference DR will refer to an aligned location.
4724 The following computation is generated:
4726 If the misalignment of DR is known at compile time:
4727 addr_mis = int mis = DR_MISALIGNMENT (dr);
4728 Else, compute address misalignment in bytes:
4729 addr_mis = addr & (vectype_size - 1)
4731 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4733 (elem_size = element type size; an element is the scalar element
4734 whose type is the inner type of the vectype)
4736 For interleaving,
4738 prolog_niters = min ( LOOP_NITERS ,
4739 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4740 where group_size is the size of the interleaved group.
4743 static tree
4744 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4746 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4747 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4748 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4749 tree var, stmt;
4750 tree iters, iters_name;
4751 edge pe;
4752 basic_block new_bb;
4753 tree dr_stmt = DR_STMT (dr);
4754 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4755 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4756 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4757 tree niters_type = TREE_TYPE (loop_niters);
4758 int group_size = 1;
4759 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4761 if (DR_GROUP_FIRST_DR (stmt_info))
4763 /* For interleaved access, the element size must be multiplied by the size of
4764 the interleaved group. */
4765 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4766 DR_GROUP_FIRST_DR (stmt_info)));
4767 element_size *= group_size;
4770 pe = loop_preheader_edge (loop);
4772 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4774 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4775 int elem_misalign = byte_misalign / element_size;
4777 if (vect_print_dump_info (REPORT_DETAILS))
4778 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4779 iters = build_int_cst (niters_type,
4780 (vf - elem_misalign)&(vf/group_size-1));
4782 else
4784 tree new_stmts = NULL_TREE;
4785 tree start_addr =
4786 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4787 tree ptr_type = TREE_TYPE (start_addr);
4788 tree size = TYPE_SIZE (ptr_type);
4789 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4790 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4791 tree elem_size_log =
4792 build_int_cst (type, exact_log2 (vectype_align/vf));
4793 tree vf_minus_1 = build_int_cst (type, vf - 1);
4794 tree vf_tree = build_int_cst (type, vf);
4795 tree byte_misalign;
4796 tree elem_misalign;
4798 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4799 gcc_assert (!new_bb);
4801 /* Create: byte_misalign = addr & (vectype_size - 1) */
4802 byte_misalign =
4803 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4805 /* Create: elem_misalign = byte_misalign / element_size */
4806 elem_misalign =
4807 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4809 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4810 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4811 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4812 iters = fold_convert (niters_type, iters);
4815 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4816 /* If the loop bound is known at compile time we already verified that it is
4817 greater than vf; since the misalignment ('iters') is at most vf, there's
4818 no need to generate the MIN_EXPR in this case. */
4819 if (TREE_CODE (loop_niters) != INTEGER_CST)
4820 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4822 if (vect_print_dump_info (REPORT_DETAILS))
4824 fprintf (vect_dump, "niters for prolog loop: ");
4825 print_generic_expr (vect_dump, iters, TDF_SLIM);
4828 var = create_tmp_var (niters_type, "prolog_loop_niters");
4829 add_referenced_var (var);
4830 iters_name = force_gimple_operand (iters, &stmt, false, var);
4832 /* Insert stmt on loop preheader edge. */
4833 if (stmt)
4835 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4836 gcc_assert (!new_bb);
4839 return iters_name;
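/* For illustration only: the non-interleaved (group_size == 1) case of
   the computation above, with a misalignment unknown at compile time.
   Pointers are assumed to fit in unsigned long, modeling the generated
   pointer-to-integer cast; the helper itself is hypothetical. */
static unsigned long
prolog_niters_sketch (const void *addr, unsigned long loop_niters,
                      int vf, int elem_size)
{
  unsigned long vectype_align = (unsigned long) vf * elem_size;
  unsigned long byte_misalign = (unsigned long) addr & (vectype_align - 1);
  unsigned long elem_misalign = byte_misalign / elem_size;
  unsigned long iters = (vf - elem_misalign) & (vf - 1);

  /* prolog_niters = min (iters, loop_niters) */
  return iters < loop_niters ? iters : loop_niters;
}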
4843 /* Function vect_update_init_of_dr
4845 NITERS iterations were peeled from LOOP. DR represents a data reference
4846 in LOOP. This function updates the information recorded in DR to
4847 account for the fact that the first NITERS iterations had already been
4848 executed. Specifically, it updates the OFFSET field of DR. */
4850 static void
4851 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4853 tree offset = DR_OFFSET (dr);
4855 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4856 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4857 DR_OFFSET (dr) = offset;
4861 /* Function vect_update_inits_of_drs
4863 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4864 This function updates the information recorded for the data references in
4865 the loop to account for the fact that the first NITERS iterations had
4866 already been executed. Specifically, it updates the initial_condition of the
4867 access_function of all the data_references in the loop. */
4869 static void
4870 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4872 unsigned int i;
4873 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4874 struct data_reference *dr;
4876 if (vect_dump && (dump_flags & TDF_DETAILS))
4877 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4879 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4880 vect_update_init_of_dr (dr, niters);
4884 /* Function vect_do_peeling_for_alignment
4886 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4887 'niters' is set to the misalignment of one of the data references in the
4888 loop, thereby forcing it to refer to an aligned location at the beginning
4889 of the execution of this loop. The data reference for which we are
4890 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4892 static void
4893 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4895 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4896 tree niters_of_prolog_loop, ni_name;
4897 tree n_iters;
4898 struct loop *new_loop;
4900 if (vect_print_dump_info (REPORT_DETAILS))
4901 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4903 initialize_original_copy_tables ();
4905 ni_name = vect_build_loop_niters (loop_vinfo);
4906 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4908 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
4909 new_loop =
4910 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4911 niters_of_prolog_loop, ni_name, true, 0);
4912 gcc_assert (new_loop);
4913 #ifdef ENABLE_CHECKING
4914 slpeel_verify_cfg_after_peeling (new_loop, loop);
4915 #endif
4917 /* Update number of times loop executes. */
4918 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4919 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4920 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4922 /* Update the init conditions of the access functions of all data refs. */
4923 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4925 /* After peeling we have to reset scalar evolution analyzer. */
4926 scev_reset ();
4928 free_original_copy_tables ();
4932 /* Function vect_create_cond_for_align_checks.
4934 Create a conditional expression that represents the alignment checks for
4935 all of data references (array element references) whose alignment must be
4936 checked at runtime.
4938 Input:
4939 LOOP_VINFO - two fields of the loop information are used.
4940 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4941 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4943 Output:
4944 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4945 expression.
4946 The returned value is the conditional expression to be used in the if
4947 statement that controls which version of the loop gets executed at runtime.
4949 The algorithm makes two assumptions:
4950 1) The number of bytes "n" in a vector is a power of 2.
4951 2) An address "a" is aligned if a%n is zero and that this
4952 test can be done as a&(n-1) == 0. For example, for 16
4953 byte vectors the test is a&0xf == 0. */
4955 static tree
4956 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4957 tree *cond_expr_stmt_list)
4959 VEC(tree,heap) *may_misalign_stmts
4960 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4961 tree ref_stmt;
4962 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4963 tree mask_cst;
4964 unsigned int i;
4965 tree psize;
4966 tree int_ptrsize_type;
4967 char tmp_name[20];
4968 tree or_tmp_name = NULL_TREE;
4969 tree and_tmp, and_tmp_name, and_stmt;
4970 tree ptrsize_zero;
4972 /* Check that mask is one less than a power of 2, i.e., mask is
4973 all zeros followed by all ones. */
4974 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4976 /* CHECKME: what is the best integer or unsigned type to use to hold a
4977 cast from a pointer value? */
4978 psize = TYPE_SIZE (ptr_type_node);
4979 int_ptrsize_type
4980 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4982 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
4983 of the first vector of the i'th data reference. */
4985 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4987 tree new_stmt_list = NULL_TREE;
4988 tree addr_base;
4989 tree addr_tmp, addr_tmp_name, addr_stmt;
4990 tree or_tmp, new_or_tmp_name, or_stmt;
4992 /* create: addr_tmp = (int)(address_of_first_vector) */
4993 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4994 &new_stmt_list,
4995 NULL_TREE);
4997 if (new_stmt_list != NULL_TREE)
4998 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
5000 sprintf (tmp_name, "%s%d", "addr2int", i);
5001 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5002 add_referenced_var (addr_tmp);
5003 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
5004 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
5005 addr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5006 addr_tmp_name, addr_stmt);
5007 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
5008 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
5010 /* The addresses are ORed together. */
5012 if (or_tmp_name != NULL_TREE)
5014 /* create: or_tmp = or_tmp | addr_tmp */
5015 sprintf (tmp_name, "%s%d", "orptrs", i);
5016 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5017 add_referenced_var (or_tmp);
5018 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5019 or_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5020 new_or_tmp_name,
5021 build2 (BIT_IOR_EXPR, int_ptrsize_type,
5022 or_tmp_name,
5023 addr_tmp_name));
5024 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5025 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5026 or_tmp_name = new_or_tmp_name;
5028 else
5029 or_tmp_name = addr_tmp_name;
5031 } /* end for i */
5033 mask_cst = build_int_cst (int_ptrsize_type, mask);
5035 /* create: and_tmp = or_tmp & mask */
5036 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
5037 add_referenced_var (and_tmp);
5038 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5040 and_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5041 and_tmp_name,
5042 build2 (BIT_AND_EXPR, int_ptrsize_type,
5043 or_tmp_name, mask_cst));
5044 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5045 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5047 /* Make and_tmp the left operand of the conditional test against zero.
5048 If and_tmp has a nonzero bit then some address is unaligned. */
5049 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5050 return build2 (EQ_EXPR, boolean_type_node,
5051 and_tmp_name, ptrsize_zero);
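/* For illustration only: the runtime test built above, as plain C. All
   first-vector addresses are ORed together and masked; a zero result means
   every address is aligned, so the vectorized loop version may run.
   Pointers are assumed to fit in unsigned long (modeling int_ptrsize_type);
   the helper is hypothetical. */
static int
align_checks_sketch (void *addrs[], int n, unsigned long mask)
{
  unsigned long or_tmp = 0;
  int i;

  for (i = 0; i < n; i++)
    or_tmp |= (unsigned long) addrs[i];  /* or_tmp = dr_1 | ... | dr_n */
  return (or_tmp & mask) == 0;           /* nonzero bit => some addr unaligned */
}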
5055 /* Function vect_transform_loop.
5057 The analysis phase has determined that the loop is vectorizable.
5058 Vectorize the loop - created vectorized stmts to replace the scalar
5059 stmts in the loop, and update the loop exit condition. */
5061 void
5062 vect_transform_loop (loop_vec_info loop_vinfo)
5064 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5065 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5066 int nbbs = loop->num_nodes;
5067 block_stmt_iterator si;
5068 int i;
5069 tree ratio = NULL;
5070 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5071 bool strided_store;
5073 if (vect_print_dump_info (REPORT_DETAILS))
5074 fprintf (vect_dump, "=== vect_transform_loop ===");
5076 /* If the loop has data references that may or may not be aligned then
5077 two versions of the loop need to be generated, one which is vectorized
5078 and one which isn't. A test is then generated to control which of the
5079 loops is executed. The test checks for the alignment of all of the
5080 data references that may or may not be aligned. */
5082 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5084 struct loop *nloop;
5085 tree cond_expr;
5086 tree cond_expr_stmt_list = NULL_TREE;
5087 basic_block condition_bb;
5088 block_stmt_iterator cond_exp_bsi;
5089 basic_block merge_bb;
5090 basic_block new_exit_bb;
5091 edge new_exit_e, e;
5092 tree orig_phi, new_phi, arg;
5093 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
5095 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5096 &cond_expr_stmt_list);
5097 initialize_original_copy_tables ();
5098 nloop = loop_version (loop, cond_expr, &condition_bb,
5099 prob, prob, REG_BR_PROB_BASE - prob, true);
5100 free_original_copy_tables ();
5102 /** Loop versioning violates an assumption we try to maintain during
5103 vectorization - that the loop exit block has a single predecessor.
5104 After versioning, the exit block of both loop versions is the same
5105 basic block (i.e. it has two predecessors). Just in order to simplify
5106 following transformations in the vectorizer, we fix this situation
5107 here by adding a new (empty) block on the exit-edge of the loop,
5108 with the proper loop-exit phis to maintain loop-closed-form. **/
5110 merge_bb = single_exit (loop)->dest;
5111 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5112 new_exit_bb = split_edge (single_exit (loop));
5113 new_exit_e = single_exit (loop);
5114 e = EDGE_SUCC (new_exit_bb, 0);
5116 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5117 orig_phi = PHI_CHAIN (orig_phi))
5118 {
5119 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5120 new_exit_bb);
5121 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5122 add_phi_arg (new_phi, arg, new_exit_e);
5123 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
5124 }
5126 /** end loop-exit-fixes after versioning **/
5128 update_ssa (TODO_update_ssa);
5129 cond_exp_bsi = bsi_last (condition_bb);
5130 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
5131 }
5133 /* CHECKME: we wouldn't need this if we called update_ssa once
5134 for all loops. */
5135 bitmap_zero (vect_memsyms_to_rename);
5137 /* Peel the loop if there are data refs with unknown alignment.
5138 Only one store data-ref with unknown alignment is allowed. */
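/* Schematically (illustration only): if 3 scalar iterations bring the
   peeled-for data ref to an aligned address, the loop becomes

     for (i=0; i<3; i++)    <-- scalar prolog
       ...
     for (; i<n; i++)       <-- loop that will be vectorized
       ...
*/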
5140 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5141 vect_do_peeling_for_alignment (loop_vinfo);
5143 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5144 compile-time constant), or it is a constant that is not a multiple of the
5145 vectorization factor, then an epilog loop needs to be created.
5146 We therefore duplicate the loop: the original loop will be vectorized,
5147 and will compute the first (n/VF) iterations. The second copy of the loop
5148 will remain scalar and will compute the remaining (n%VF) iterations.
5149 (VF is the vectorization factor). */
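/* Schematic example (illustration only), with VF == 4:

     for (i=0; i<(n/4)*4; i+=4)   <-- vectorized loop: n/4 iterations
       ...
     for (; i<n; i++)             <-- scalar epilog: n%4 iterations
       ...
*/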
5151 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5152 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5153 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
5154 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
5155 else
5156 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5157 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
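/* E.g. for LOOP_VINFO_INT_NITERS == 100 and vectorization_factor == 4,
   ratio == 25, and slpeel_make_loop_iterate_ntimes below will make the
   vectorized loop iterate 25 times.  */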
5159 /* 1) Make sure the loop header has exactly two entries
5160 2) Make sure we have a preheader basic block. */
5162 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5164 split_edge (loop_preheader_edge (loop));
5166 /* FORNOW: the vectorizer supports only loops whose body consists
5167 of one basic block (header + empty latch). When the vectorizer
5168 supports more involved loop forms, the order in which the BBs are
5169 traversed will need to be reconsidered. */
5171 for (i = 0; i < nbbs; i++)
5172 {
5173 basic_block bb = bbs[i];
5175 for (si = bsi_start (bb); !bsi_end_p (si);)
5176 {
5177 tree stmt = bsi_stmt (si);
5178 stmt_vec_info stmt_info;
5179 bool is_store;
5181 if (vect_print_dump_info (REPORT_DETAILS))
5182 {
5183 fprintf (vect_dump, "------>vectorizing statement: ");
5184 print_generic_expr (vect_dump, stmt, TDF_SLIM);
5185 }
5186 stmt_info = vinfo_for_stmt (stmt);
5187 gcc_assert (stmt_info);
5188 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5189 && !STMT_VINFO_LIVE_P (stmt_info))
5190 {
5191 bsi_next (&si);
5192 continue;
5193 }
5195 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5196 != (unsigned HOST_WIDE_INT) vectorization_factor)
5197 && vect_print_dump_info (REPORT_DETAILS))
5198 fprintf (vect_dump, "multiple-types.");
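/* E.g. with vectorization_factor == 8 and a V4SI vectype (4 subparts),
   vect_transform_stmt must emit more than one vector stmt (here two)
   for this one scalar stmt - the "multiple-types" case reported
   above.  */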
5200 /* -------- vectorize statement ------------ */
5201 if (vect_print_dump_info (REPORT_DETAILS))
5202 fprintf (vect_dump, "transform statement.");
5204 strided_store = false;
5205 is_store = vect_transform_stmt (stmt, &si, &strided_store);
5206 if (is_store)
5207 {
5208 stmt_ann_t ann;
5209 if (DR_GROUP_FIRST_DR (stmt_info))
5210 {
5211 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5212 interleaving chain was completed - free all the stores in
5213 the chain. */
5214 tree next = DR_GROUP_FIRST_DR (stmt_info);
5215 tree tmp;
5216 stmt_vec_info next_stmt_info;
5218 while (next)
5219 {
5220 next_stmt_info = vinfo_for_stmt (next);
5221 /* Free the attached stmt_vec_info and remove the stmt. */
5222 ann = stmt_ann (next);
5223 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
5224 free (next_stmt_info);
5225 set_stmt_info (ann, NULL);
5226 next = tmp;
5227 }
5228 bsi_remove (&si, true);
5229 continue;
5230 }
5231 else
5232 {
5233 /* Free the attached stmt_vec_info and remove the stmt. */
5234 ann = stmt_ann (stmt);
5235 free (stmt_info);
5236 set_stmt_info (ann, NULL);
5237 bsi_remove (&si, true);
5238 continue;
5239 }
5240 }
5241 else
5242 {
5243 if (strided_store)
5244 {
5245 /* This is the case of a skipped interleaved store. We don't free
5246 its stmt_vec_info. */
5247 bsi_remove (&si, true);
5248 continue;
5249 }
5250 }
5251 bsi_next (&si);
5252 } /* stmts in BB */
5253 } /* BBs in loop */
5255 slpeel_make_loop_iterate_ntimes (loop, ratio);
5257 mark_set_for_renaming (vect_memsyms_to_rename);
5259 /* The memory tags and pointers in vectorized statements need to
5260 have their SSA forms updated. FIXME, why can't this be delayed
5261 until all the loops have been transformed? */
5262 update_ssa (TODO_update_ssa);
5264 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
5265 fprintf (vect_dump, "LOOP VECTORIZED.");
5266 }