/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"

/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);

/* Function vect_get_new_vect_var.

   Returns a new temporary variable.  The current naming scheme prepends
   the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
   VAR_KIND) to NAME if provided, and uses the prefix alone otherwise.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}

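/* For instance, with NAME "in" and VAR_KIND vect_pointer_var the result
   is named along the lines of "vect_pin.15": the prefix "vect_p"
   concatenated with NAME, plus the unique numeric suffix that
   create_tmp_var appends.  Without a NAME the prefix alone is used.
   The concrete names here are illustrative only.  */
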
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved accesses we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}

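/* As an illustration, for a V4SI access to a[init] with a nonzero OFFSET,
   the statements appended to NEW_STMT_LIST look roughly like:

       base_off.8 = DR_OFFSET + DR_INIT + OFFSET * DR_STEP;
       vect_pa.9 = (__vector signed int *) (&a + base_off.8);

   with vect_pa.9 returned.  The variable names are illustrative; the
   actual temporaries are created by the calls above.  */
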
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */

static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop:  **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}

/* Function bump_vector_ptr.

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, ptr_var,
                      build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}

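/* For a V4SI vector, for instance, the new stmt is simply
   "vect_p.7_24 = dataref_ptr + 16" (16 being TYPE_SIZE_UNIT of the
   vectype), after which PTR_INCR reads from vect_p.7_24 instead of
   DATAREF_PTR.  The SSA names here are illustrative.  */
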
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}

/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will
   be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}

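/* For instance, broadcasting the scalar constant 3 into a V4SI vector
   inserts, on the preheader edge, a statement along the lines of
   "vect_cst_.12 = { 3, 3, 3, 3 };" and returns its SSA def.  The name
   is illustrative; the "vect_cst_" prefix comes from the call to
   vect_get_new_vect_var above.  */
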
/* Function get_initial_def_for_induction.

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }

  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_name = init_expr;
  t = NULL_TREE;
  t = tree_cons (NULL_TREE, init_expr, t);
  for (i = 1; i < nunits; i++)
    {
      /* Create: new_name = new_name + step_expr.  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);
      init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var,
                          fold_build2 (PLUS_EXPR, scalar_type, new_name,
                                       step_expr));
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                     build2 (PLUS_EXPR, vectype, induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_{step*nunits}  */
          new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                             build2 (PLUS_EXPR, vectype, vec_def, vec_step));
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}

/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will
   be used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector
   def needs to be introduced.  */

static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop-header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}

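/* For example, when vectorizing "a[i] = b[i] + 3": the call for operand 3
   takes Case 1 and materializes the vector {3,3,3,3} in the preheader,
   while the call for the SSA name holding b[i] takes Case 3 and simply
   returns the def of the previously generated vector load.  */
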
/* Function vect_get_vec_def_for_stmt_copy.

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:          STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0:  vx.0 = memref0    VS1.1
                        VS1.1:  vx.1 = memref1    VS1.2
                        VS1.2:  vx.2 = memref2    VS1.3
                        VS1.3:  vx.3 = memref3

   S2: z = x + ...      VSnew.0:  vz0 = vx.0 + ...  VSnew.1
                        VSnew.1:  vz1 = vx.1 + ...  VSnew.2
                        VSnew.2:  vz2 = vx.2 + ...  VSnew.3
                        VSnew.3:  vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */

static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  if (dt == vect_induction_def)
    gcc_assert (TREE_CODE (vec_stmt_for_operand) == PHI_NODE);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}

/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}

#define ADJUST_IN_EPILOG 1

/* Function get_initial_def_for_reduction.

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */

static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  int nelements;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; the last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}

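/* Concretely, for a V4SI sum reduction with init_val 5, the
   ADJUST_IN_EPILOG scheme returns the vector [0,0,0,0] and sets
   *SCALAR_DEF to 5, so the epilog later adds 5 to the reduced scalar;
   a MIN/MAX reduction instead starts from [5,5,5,5] and sets
   *SCALAR_DEF to NULL_TREE.  */
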
/* Function vect_create_epilog_for_reduction.

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */

static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_OPERAND_LENGTH (operation);
  reduction_op = TREE_OPERAND (operation, op_type - 1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }

  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  ***/

  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                            build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2:  Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create:  va' = vec_shift <va, offset>
                  Create:  va = vop <va, va'>
                }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (shift_code, vectype,
                                            new_temp, bitpos));
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (code, vectype,
                                            new_name, new_temp));
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3:  Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest,
                                    build2 (code, scalar_type, new_name,
                                            new_temp));
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is NULL_TREE).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest,
                            build2 (code, scalar_type, new_temp,
                                    scalar_initial_def));
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.6 Replace uses of s_out0 with uses of s_out3/s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}

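/* As an illustration of scheme 2: summing a V4SI vector [a,b,c,d] takes
   two rounds, a shift right by 64 bits plus an add to combine the two
   halves, then a shift by 32 bits plus an add, after which the sum
   a+b+c+d sits in a single element and is extracted in step 2.4.  Which
   element holds the result depends on endianness, as the
   BYTES_BIG_ENDIAN check above shows.  */
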
1448 /* Function vectorizable_reduction.
1450 Check if STMT performs a reduction operation that can be vectorized.
1451 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1452 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1453 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1455 This function also handles reduction idioms (patterns) that have been
1456 recognized in advance during vect_pattern_recog. In this case, STMT may be
1457 of this form:
1458 X = pattern_expr (arg0, arg1, ..., X)
1459 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
1460 sequence that had been detected and replaced by the pattern-stmt (STMT).
1462 In some cases of reduction patterns, the type of the reduction variable X is
1463 different than the type of the other arguments of STMT.
1464 In such cases, the vectype that is used when transforming STMT into a vector
1465 stmt is different than the vectype that is used to determine the
1466 vectorization factor, because it consists of a different number of elements
1467 than the actual number of elements that are being operated upon in parallel.
1469 For example, consider an accumulation of shorts into an int accumulator.
1470 On some targets it's possible to vectorize this pattern operating on 8
1471 shorts at a time (hence, the vectype for purposes of determining the
1472 vectorization factor should be V8HI); on the other hand, the vectype that
1473 is used to create the vector form is actually V4SI (the type of the result).
1475 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1476 indicates what is the actual level of parallelism (V8HI in the example), so
1477 that the right vectorization factor would be derived. This vectype
1478 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1479 be used to create the vectorized stmt. The right vectype for the vectorized
1480 stmt is obtained from the type of the result X:
1481 get_vectype_for_scalar_type (TREE_TYPE (X))
1483 This means that, contrary to "regular" reductions (or "regular" stmts in
1484 general), the following equation:
1485 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1486 does *NOT* necessarily hold for reduction patterns. */
1488 bool
1489 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1491 tree vec_dest;
1492 tree scalar_dest;
1493 tree op;
1494 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1495 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1496 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1497 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1498 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1499 tree operation;
1500 enum tree_code code, orig_code, epilog_reduc_code = 0;
1501 enum machine_mode vec_mode;
1502 int op_type;
1503 optab optab, reduc_optab;
1504 tree new_temp = NULL_TREE;
1505 tree def, def_stmt;
1506 enum vect_def_type dt;
1507 tree new_phi;
1508 tree scalar_type;
1509 bool is_simple_use;
1510 tree orig_stmt;
1511 stmt_vec_info orig_stmt_info;
1512 tree expr = NULL_TREE;
1513 int i;
1514 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1515 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1516 stmt_vec_info prev_stmt_info;
1517 tree reduc_def;
1518 tree new_stmt = NULL_TREE;
1519 int j;
1521 gcc_assert (ncopies >= 1);
1523 /* 1. Is vectorizable reduction? */
1525 /* Not supportable if the reduction variable is used in the loop. */
1526 if (STMT_VINFO_RELEVANT_P (stmt_info))
1527 return false;
1529 if (!STMT_VINFO_LIVE_P (stmt_info))
1530 return false;
1532 /* Make sure it was already recognized as a reduction computation. */
1533 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1534 return false;
1536 /* 2. Has this been recognized as a reduction pattern?
1538 Check if STMT represents a pattern that has been recognized
1539 in earlier analysis stages. For stmts that represent a pattern,
1540 the STMT_VINFO_RELATED_STMT field records the last stmt in
1541 the original sequence that constitutes the pattern. */
1543 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1544 if (orig_stmt)
1546 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1547 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1548 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1549 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1552 /* 3. Check the operands of the operation. The first operands are defined
1553 inside the loop body. The last operand is the reduction variable,
1554 which is defined by the loop-header-phi. */
1556 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
1558 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1559 code = TREE_CODE (operation);
1560 op_type = TREE_OPERAND_LENGTH (operation);
1561 if (op_type != binary_op && op_type != ternary_op)
1562 return false;
1563 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1564 scalar_type = TREE_TYPE (scalar_dest);
1566 /* All uses but the last are expected to be defined in the loop.
1567 The last use is the reduction variable. */
1568 for (i = 0; i < op_type-1; i++)
1570 op = TREE_OPERAND (operation, i);
1571 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1572 gcc_assert (is_simple_use);
1573 if (dt != vect_loop_def
1574 && dt != vect_invariant_def
1575 && dt != vect_constant_def
1576 && dt != vect_induction_def)
1577 return false;
1580 op = TREE_OPERAND (operation, i);
1581 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1582 gcc_assert (is_simple_use);
1583 gcc_assert (dt == vect_reduction_def);
1584 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1585 if (orig_stmt)
1586 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1587 else
1588 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1590 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1591 return false;
1593 /* 4. Supportable by target? */
1595 /* 4.1. check support for the operation in the loop */
1596 optab = optab_for_tree_code (code, vectype);
1597 if (!optab)
1599 if (vect_print_dump_info (REPORT_DETAILS))
1600 fprintf (vect_dump, "no optab.");
1601 return false;
1603 vec_mode = TYPE_MODE (vectype);
1604 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1606 if (vect_print_dump_info (REPORT_DETAILS))
1607 fprintf (vect_dump, "op not supported by target.");
1608 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1609 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1610 < vect_min_worthwhile_factor (code))
1611 return false;
1612 if (vect_print_dump_info (REPORT_DETAILS))
1613 fprintf (vect_dump, "proceeding using word mode.");
1616 /* Worthwhile without SIMD support? */
1617 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1618 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1619 < vect_min_worthwhile_factor (code))
1621 if (vect_print_dump_info (REPORT_DETAILS))
1622 fprintf (vect_dump, "not worthwhile without SIMD support.");
1623 return false;
1626 /* 4.2. Check support for the epilog operation.
1628 If STMT represents a reduction pattern, then the type of the
1629 reduction variable may be different than the type of the rest
1630 of the arguments. For example, consider the case of accumulation
1631 of shorts into an int accumulator; The original code:
1632 S1: int_a = (int) short_a;
1633 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1635 was replaced with:
1636 STMT: int_acc = widen_sum <short_a, int_acc>
1638 This means that:
1639 1. The tree-code that is used to create the vector operation in the
1640 epilog code (that reduces the partial results) is not the
1641 tree-code of STMT, but is rather the tree-code of the original
1642 stmt from the pattern that STMT is replacing. I.e, in the example
1643 above we want to use 'widen_sum' in the loop, but 'plus' in the
1644 epilog.
1645 2. The type (mode) we use to check available target support
1646 for the vector operation to be created in the *epilog*, is
1647 determined by the type of the reduction variable (in the example
1648 above we'd check this: plus_optab[vect_int_mode]).
1649 However the type (mode) we use to check available target support
1650 for the vector operation to be created *inside the loop*, is
1651 determined by the type of the other arguments to STMT (in the
1652 example we'd check this: widen_sum_optab[vect_short_mode]).
1654 This is contrary to "regular" reductions, in which the types of all
1655 the arguments are the same as the type of the reduction variable.
1656 For "regular" reductions we can therefore use the same vector type
1657 (and also the same tree-code) when generating the epilog code and
1658 when generating the code inside the loop. */
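/* A concrete sketch of the pattern case above (hypothetical scalar
   source, added for illustration only):

     short a[N];
     int acc = 0;
     for (i = 0; i < N; i++)
       acc += (int) a[i];        (S1 and S2 from the example above)

   Inside the loop, target support is checked in the mode of the vector
   of shorts (widen_sum_optab[vect_short_mode]); the epilog code that
   sums the partial results is checked in the mode of the vector of
   ints (plus_optab[vect_int_mode]) -- exactly the two queries described
   above.  */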
1660 if (orig_stmt)
1662 /* This is a reduction pattern: get the vectype from the type of the
1663 reduction variable, and get the tree-code from orig_stmt. */
1664 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1665 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1666 vec_mode = TYPE_MODE (vectype);
1668 else
1670 /* Regular reduction: the same vectype and tree-code that are used for
1671 the vector code inside the loop can also be used for the epilog code. */
1672 orig_code = code;
1675 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1676 return false;
1677 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1678 if (!reduc_optab)
1680 if (vect_print_dump_info (REPORT_DETAILS))
1681 fprintf (vect_dump, "no optab for reduction.");
1682 epilog_reduc_code = NUM_TREE_CODES;
1684 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1686 if (vect_print_dump_info (REPORT_DETAILS))
1687 fprintf (vect_dump, "reduc op not supported by target.");
1688 epilog_reduc_code = NUM_TREE_CODES;
1691 if (!vec_stmt) /* transformation not required. */
1693 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1694 return true;
1697 /** Transform. **/
1699 if (vect_print_dump_info (REPORT_DETAILS))
1700 fprintf (vect_dump, "transform reduction.");
1702 /* Create the destination vector */
1703 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1705 /* Create the reduction-phi that defines the reduction-operand. */
1706 new_phi = create_phi_node (vec_dest, loop->header);
1708 /* In case the vectorization factor (VF) is bigger than the number
1709 of elements that we can fit in a vectype (nunits), we have to generate
1710 more than one vector stmt; i.e., we need to "unroll" the
1711 vector stmt by a factor VF/nunits. For more details see documentation
1712 in vectorizable_operation. */
1714 prev_stmt_info = NULL;
1715 for (j = 0; j < ncopies; j++)
1717 /* Handle uses. */
1718 if (j == 0)
1720 op = TREE_OPERAND (operation, 0);
1721 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1722 if (op_type == ternary_op)
1724 op = TREE_OPERAND (operation, 1);
1725 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1728 /* Get the vector def for the reduction variable from the phi node */
1729 reduc_def = PHI_RESULT (new_phi);
1731 else
1733 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1734 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1735 if (op_type == ternary_op)
1736 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1738 /* Get the vector def for the reduction variable from the vectorized
1739 reduction operation generated in the previous iteration (j-1) */
1740 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
1743 /* Arguments are ready. Create the new vector stmt. */
1745 if (op_type == binary_op)
1746 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1747 else
1748 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1749 reduc_def);
1750 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
1751 new_temp = make_ssa_name (vec_dest, new_stmt);
1752 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1753 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1755 if (j == 0)
1756 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1757 else
1758 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1759 prev_stmt_info = vinfo_for_stmt (new_stmt);
1762 /* Finalize the reduction-phi (set its arguments) and create the
1763 epilog reduction code. */
1764 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1765 return true;
1768 /* Checks if CALL can be vectorized in types VECTYPE_OUT/VECTYPE_IN. Returns
1769 a function declaration if the target has a vectorized version
1770 of the function, or NULL_TREE if the function cannot be vectorized. */
1772 tree
1773 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
1775 tree fndecl = get_callee_fndecl (call);
1776 enum built_in_function code;
1778 /* We only handle functions that do not read or clobber memory -- i.e.
1779 const or novops ones. */
1780 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1781 return NULL_TREE;
1783 if (!fndecl
1784 || TREE_CODE (fndecl) != FUNCTION_DECL
1785 || !DECL_BUILT_IN (fndecl))
1786 return NULL_TREE;
1788 code = DECL_FUNCTION_CODE (fndecl);
1789 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
1790 vectype_in);
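/* For illustration (hypothetical loop, not taken from the sources):
   given "for (i = 0; i < n; i++) b[i] = sinf (a[i]);", the call to
   sinf is vectorizable only if the target hook above returns a
   non-NULL FUNCTION_DECL for BUILT_IN_SINF with the requested vector
   types (e.g. a V4SF variant); otherwise NULL_TREE is returned and the
   call is not vectorized.  */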
1793 /* Function vectorizable_call.
1795 Check if STMT performs a function call that can be vectorized.
1796 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1797 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1798 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1800 bool
1801 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1803 tree vec_dest;
1804 tree scalar_dest;
1805 tree operation;
1806 tree op, type;
1807 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1808 tree vectype_out, vectype_in;
1809 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1810 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1811 enum vect_def_type dt[2];
1812 int ncopies, j, nargs;
1813 call_expr_arg_iterator iter;
1815 /* Is STMT a vectorizable call? */
1816 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1817 return false;
1819 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1820 return false;
1822 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1823 if (TREE_CODE (operation) != CALL_EXPR)
1824 return false;
1826 /* Process function arguments. */
1827 rhs_type = NULL_TREE;
1828 nargs = 0;
1829 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1831 ++nargs;
1833 /* Bail out if the function has more than two arguments; we
1834 do not have interesting builtin functions to vectorize with
1835 more than two arguments. */
1836 if (nargs > 2)
1837 return false;
1839 /* We can only handle calls with arguments of the same type. */
1840 if (rhs_type
1841 && rhs_type != TREE_TYPE (op))
1843 if (vect_print_dump_info (REPORT_DETAILS))
1844 fprintf (vect_dump, "argument types differ.");
1845 return false;
1847 rhs_type = TREE_TYPE (op);
1849 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
1851 if (vect_print_dump_info (REPORT_DETAILS))
1852 fprintf (vect_dump, "use not simple.");
1853 return false;
1857 /* A call with no arguments is not handled either. */
1858 if (nargs == 0)
1859 return false;
1861 vectype_in = get_vectype_for_scalar_type (rhs_type);
1863 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1864 vectype_out = get_vectype_for_scalar_type (lhs_type);
1866 /* Only handle the case of vectors with the same number of elements.
1867 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1868 instruction which converts V2DFmode to V4SImode but only
1869 using the lower half of the V4SImode result. */
1870 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1871 return false;
1873 /* For now, we only vectorize functions if a target specific builtin
1874 is available. TODO -- in some cases, it might be profitable to
1875 insert the calls for pieces of the vector, in order to be able
1876 to vectorize other operations in the loop. */
1877 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1878 if (fndecl == NULL_TREE)
1880 if (vect_print_dump_info (REPORT_DETAILS))
1881 fprintf (vect_dump, "function is not vectorizable.");
1883 return false;
1886 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1888 if (!vec_stmt) /* transformation not required. */
1890 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1891 return true;
1894 /** Transform. **/
1896 if (vect_print_dump_info (REPORT_DETAILS))
1897 fprintf (vect_dump, "transform operation.");
1899 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1900 / TYPE_VECTOR_SUBPARTS (vectype_out));
1901 gcc_assert (ncopies >= 1);
1903 /* Handle def. */
1904 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1905 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1907 prev_stmt_info = NULL;
1908 for (j = 0; j < ncopies; ++j)
1910 tree new_stmt, vargs;
1911 tree vec_oprnd[2];
1912 int n;
1914 /* Build argument list for the vectorized call. */
1915 /* FIXME: Rewrite this so that it doesn't construct a temporary
1916 list. */
1917 vargs = NULL_TREE;
1918 n = -1;
1919 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1921 ++n;
1923 if (j == 0)
1924 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1925 else
1926 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1928 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1930 vargs = nreverse (vargs);
1932 rhs = build_function_call_expr (fndecl, vargs);
1933 new_stmt = build2 (GIMPLE_MODIFY_STMT, NULL_TREE, vec_dest, rhs);
1934 new_temp = make_ssa_name (vec_dest, new_stmt);
1935 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1937 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1939 if (j == 0)
1940 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1941 else
1942 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1943 prev_stmt_info = vinfo_for_stmt (new_stmt);
1946 /* The call in STMT might prevent it from being removed in DCE. We however
1947 cannot remove it here, due to the way the SSA name it defines is mapped
1948 to the new definition. So just replace the rhs of the statement with
1949 something harmless. */
1950 type = TREE_TYPE (scalar_dest);
1951 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1953 return true;
1957 /* Function vectorizable_conversion.
1959 Check if STMT performs a conversion operation, that can be vectorized.
1960 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1961 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1962 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
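/* For illustration (hypothetical stmts, added here only as examples):
   "fb[i] = (float) ia[i]" is a FLOAT_EXPR (int -> float), while
   "ib[i] = (int) fa[i]" is a FIX_TRUNC_EXPR (float -> int). Both
   directions are supportable only through the
   targetm.vectorize.builtin_conversion hook checked below.  */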
1964 bool
1965 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
1966 tree * vec_stmt)
1968 tree vec_dest;
1969 tree scalar_dest;
1970 tree operation;
1971 tree op0;
1972 tree vec_oprnd0 = NULL_TREE;
1973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1974 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1975 enum tree_code code;
1976 tree new_temp;
1977 tree def, def_stmt;
1978 enum vect_def_type dt0;
1979 tree new_stmt;
1980 int nunits_in;
1981 int nunits_out;
1982 int ncopies, j;
1983 tree vectype_out, vectype_in;
1984 tree rhs_type, lhs_type;
1985 tree builtin_decl;
1986 stmt_vec_info prev_stmt_info;
1988 /* Is STMT a vectorizable conversion? */
1990 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1991 return false;
1993 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1995 if (STMT_VINFO_LIVE_P (stmt_info))
1997 /* FORNOW: not yet supported. */
1998 if (vect_print_dump_info (REPORT_DETAILS))
1999 fprintf (vect_dump, "value used after loop.");
2000 return false;
2003 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2004 return false;
2006 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2007 return false;
2009 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2010 code = TREE_CODE (operation);
2011 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2012 return false;
2014 /* Check types of lhs and rhs */
2015 op0 = TREE_OPERAND (operation, 0);
2016 rhs_type = TREE_TYPE (op0);
2017 vectype_in = get_vectype_for_scalar_type (rhs_type);
2018 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2020 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2021 lhs_type = TREE_TYPE (scalar_dest);
2022 vectype_out = get_vectype_for_scalar_type (lhs_type);
2023 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2024 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2026 /* FORNOW: need to extend to support short<->float conversions as well. */
2027 if (nunits_out != nunits_in)
2028 return false;
2030 /* Bail out if the types are both integral or both non-integral. */
2031 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2032 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2033 return false;
2035 /* Sanity check: make sure that at least one copy of the vectorized stmt
2036 needs to be generated. */
2037 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2038 gcc_assert (ncopies >= 1);
2040 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2042 if (vect_print_dump_info (REPORT_DETAILS))
2043 fprintf (vect_dump, "use not simple.");
2044 return false;
2047 /* Supportable by target? */
2048 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2050 if (vect_print_dump_info (REPORT_DETAILS))
2051 fprintf (vect_dump, "op not supported by target.");
2052 return false;
2055 if (!vec_stmt) /* transformation not required. */
2057 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2058 return true;
2061 /** Transform. **/
2063 if (vect_print_dump_info (REPORT_DETAILS))
2064 fprintf (vect_dump, "transform conversion.");
2066 /* Handle def. */
2067 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2069 prev_stmt_info = NULL;
2070 for (j = 0; j < ncopies; j++)
2072 tree sym;
2073 ssa_op_iter iter;
2075 if (j == 0)
2076 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2077 else
2078 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2080 builtin_decl =
2081 targetm.vectorize.builtin_conversion (code, vectype_in);
2082 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2084 /* Arguments are ready. Create the new vector stmt. */
2085 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2086 new_stmt);
2087 new_temp = make_ssa_name (vec_dest, new_stmt);
2088 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2089 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2090 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2092 if (TREE_CODE (sym) == SSA_NAME)
2093 sym = SSA_NAME_VAR (sym);
2094 mark_sym_for_renaming (sym);
2097 if (j == 0)
2098 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2099 else
2100 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2101 prev_stmt_info = vinfo_for_stmt (new_stmt);
2103 return true;
2107 /* Function vectorizable_assignment.
2109 Check if STMT performs an assignment (copy) that can be vectorized.
2110 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2111 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2112 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2114 bool
2115 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2117 tree vec_dest;
2118 tree scalar_dest;
2119 tree op;
2120 tree vec_oprnd;
2121 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2122 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2123 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2124 tree new_temp;
2125 tree def, def_stmt;
2126 enum vect_def_type dt;
2127 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2128 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2130 gcc_assert (ncopies >= 1);
2131 if (ncopies > 1)
2132 return false; /* FORNOW */
2134 /* Is vectorizable assignment? */
2135 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2136 return false;
2138 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2140 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2141 return false;
2143 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2144 if (TREE_CODE (scalar_dest) != SSA_NAME)
2145 return false;
2147 op = GIMPLE_STMT_OPERAND (stmt, 1);
2148 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2150 if (vect_print_dump_info (REPORT_DETAILS))
2151 fprintf (vect_dump, "use not simple.");
2152 return false;
2155 if (!vec_stmt) /* transformation not required. */
2157 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2158 return true;
2161 /** Transform. **/
2162 if (vect_print_dump_info (REPORT_DETAILS))
2163 fprintf (vect_dump, "transform assignment.");
2165 /* Handle def. */
2166 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2168 /* Handle use. */
2169 op = GIMPLE_STMT_OPERAND (stmt, 1);
2170 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2172 /* Arguments are ready. Create the new vector stmt. */
2173 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, vec_oprnd);
2174 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2175 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2176 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2178 return true;
2182 /* Function vect_min_worthwhile_factor.
2184 For a loop where we could vectorize the operation indicated by CODE,
2185 return the minimum vectorization factor that makes it worthwhile
2186 to use generic vectors. */
2187 static int
2188 vect_min_worthwhile_factor (enum tree_code code)
2190 switch (code)
2192 case PLUS_EXPR:
2193 case MINUS_EXPR:
2194 case NEGATE_EXPR:
2195 return 4;
2197 case BIT_AND_EXPR:
2198 case BIT_IOR_EXPR:
2199 case BIT_XOR_EXPR:
2200 case BIT_NOT_EXPR:
2201 return 2;
2203 default:
2204 return INT_MAX;
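/* Rationale for the factors above (an informal note, not taken from
   the original sources): with generic word-mode vectors, a PLUS_EXPR
   on packed QImode elements needs extra masking so that carries do not
   cross element boundaries, and fewer than 4 elements per stmt does
   not amortize that overhead; bitwise operations work on the whole
   word unchanged, hence the lower threshold of 2.  */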
2209 /* Function vectorizable_operation.
2211 Check if STMT performs a binary or unary operation that can be vectorized.
2212 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2213 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2214 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2216 bool
2217 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2219 tree vec_dest;
2220 tree scalar_dest;
2221 tree operation;
2222 tree op0, op1 = NULL;
2223 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2224 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2225 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2226 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2227 enum tree_code code;
2228 enum machine_mode vec_mode;
2229 tree new_temp;
2230 int op_type;
2231 optab optab;
2232 int icode;
2233 enum machine_mode optab_op2_mode;
2234 tree def, def_stmt;
2235 enum vect_def_type dt0, dt1;
2236 tree new_stmt;
2237 stmt_vec_info prev_stmt_info;
2238 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2239 int nunits_out;
2240 tree vectype_out;
2241 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2242 int j;
2244 gcc_assert (ncopies >= 1);
2246 /* Is STMT a vectorizable binary/unary operation? */
2247 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2248 return false;
2250 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2252 if (STMT_VINFO_LIVE_P (stmt_info))
2254 /* FORNOW: not yet supported. */
2255 if (vect_print_dump_info (REPORT_DETAILS))
2256 fprintf (vect_dump, "value used after loop.");
2257 return false;
2260 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2261 return false;
2263 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2264 return false;
2266 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2267 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2268 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2269 if (nunits_out != nunits_in)
2270 return false;
2272 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2273 code = TREE_CODE (operation);
2274 optab = optab_for_tree_code (code, vectype);
2276 /* Support only unary or binary operations. */
2277 op_type = TREE_OPERAND_LENGTH (operation);
2278 if (op_type != unary_op && op_type != binary_op)
2280 if (vect_print_dump_info (REPORT_DETAILS))
2281 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2282 return false;
2285 op0 = TREE_OPERAND (operation, 0);
2286 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2288 if (vect_print_dump_info (REPORT_DETAILS))
2289 fprintf (vect_dump, "use not simple.");
2290 return false;
2293 if (op_type == binary_op)
2295 op1 = TREE_OPERAND (operation, 1);
2296 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2298 if (vect_print_dump_info (REPORT_DETAILS))
2299 fprintf (vect_dump, "use not simple.");
2300 return false;
2304 /* Supportable by target? */
2305 if (!optab)
2307 if (vect_print_dump_info (REPORT_DETAILS))
2308 fprintf (vect_dump, "no optab.");
2309 return false;
2311 vec_mode = TYPE_MODE (vectype);
2312 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2313 if (icode == CODE_FOR_nothing)
2315 if (vect_print_dump_info (REPORT_DETAILS))
2316 fprintf (vect_dump, "op not supported by target.");
2317 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2318 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2319 < vect_min_worthwhile_factor (code))
2320 return false;
2321 if (vect_print_dump_info (REPORT_DETAILS))
2322 fprintf (vect_dump, "proceeding using word mode.");
2325 /* Worthwhile without SIMD support? */
2326 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2327 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2328 < vect_min_worthwhile_factor (code))
2330 if (vect_print_dump_info (REPORT_DETAILS))
2331 fprintf (vect_dump, "not worthwhile without SIMD support.");
2332 return false;
2335 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2337 /* FORNOW: not yet supported. */
2338 if (!VECTOR_MODE_P (vec_mode))
2339 return false;
2341 /* Invariant argument is needed for a vector shift
2342 by a scalar shift operand. */
2343 optab_op2_mode = insn_data[icode].operand[2].mode;
2344 if (! (VECTOR_MODE_P (optab_op2_mode)
2345 || dt1 == vect_constant_def
2346 || dt1 == vect_invariant_def))
2348 if (vect_print_dump_info (REPORT_DETAILS))
2349 fprintf (vect_dump, "operand mode requires invariant argument.");
2350 return false;
2354 if (!vec_stmt) /* transformation not required. */
2356 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2357 return true;
2360 /** Transform. **/
2362 if (vect_print_dump_info (REPORT_DETAILS))
2363 fprintf (vect_dump, "transform binary/unary operation.");
2365 /* Handle def. */
2366 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2368 /* In case the vectorization factor (VF) is bigger than the number
2369 of elements that we can fit in a vectype (nunits), we have to generate
2370 more than one vector stmt; i.e., we need to "unroll" the
2371 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2372 from one copy of the vector stmt to the next, in the field
2373 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2374 stages to find the correct vector defs to be used when vectorizing
2375 stmts that use the defs of the current stmt. The example below illustrates
2376 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
2377 4 vectorized stmts):
2379 before vectorization:
2380 RELATED_STMT VEC_STMT
2381 S1: x = memref - -
2382 S2: z = x + 1 - -
2384 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2385 there):
2386 RELATED_STMT VEC_STMT
2387 VS1_0: vx0 = memref0 VS1_1 -
2388 VS1_1: vx1 = memref1 VS1_2 -
2389 VS1_2: vx2 = memref2 VS1_3 -
2390 VS1_3: vx3 = memref3 - -
2391 S1: x = load - VS1_0
2392 S2: z = x + 1 - -
2394 step2: vectorize stmt S2 (done here):
2395 To vectorize stmt S2 we first need to find the relevant vector
2396 def for the first operand 'x'. This is, as usual, obtained from
2397 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2398 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2399 relevant vector def 'vx0'. Having found 'vx0' we can generate
2400 the vector stmt VS2_0, and as usual, record it in the
2401 STMT_VINFO_VEC_STMT of stmt S2.
2402 When creating the second copy (VS2_1), we obtain the relevant vector
2403 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2404 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2405 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2406 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2407 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2408 chain of stmts and pointers:
2409 RELATED_STMT VEC_STMT
2410 VS1_0: vx0 = memref0 VS1_1 -
2411 VS1_1: vx1 = memref1 VS1_2 -
2412 VS1_2: vx2 = memref2 VS1_3 -
2413 VS1_3: vx3 = memref3 - -
2414 S1: x = load - VS1_0
2415 VS2_0: vz0 = vx0 + v1 VS2_1 -
2416 VS2_1: vz1 = vx1 + v1 VS2_2 -
2417 VS2_2: vz2 = vx2 + v1 VS2_3 -
2418 VS2_3: vz3 = vx3 + v1 - -
2419 S2: z = x + 1 - VS2_0 */
2421 prev_stmt_info = NULL;
2422 for (j = 0; j < ncopies; j++)
2424 /* Handle uses. */
2425 if (j == 0)
2427 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2428 if (op_type == binary_op)
2430 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2432 /* Vector shl and shr insn patterns can be defined with
2433 scalar operand 2 (shift operand). In this case, use
2434 constant or loop invariant op1 directly, without
2435 extending it to vector mode first. */
2436 optab_op2_mode = insn_data[icode].operand[2].mode;
2437 if (!VECTOR_MODE_P (optab_op2_mode))
2439 if (vect_print_dump_info (REPORT_DETAILS))
2440 fprintf (vect_dump, "operand 1 using scalar mode.");
2441 vec_oprnd1 = op1;
2444 if (!vec_oprnd1)
2445 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2448 else
2450 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2451 if (op_type == binary_op)
2452 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2455 /* Arguments are ready. Create the new vector stmt. */
2457 if (op_type == binary_op)
2458 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2459 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2460 else
2461 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2462 build1 (code, vectype, vec_oprnd0));
2463 new_temp = make_ssa_name (vec_dest, new_stmt);
2464 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2465 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2467 if (j == 0)
2468 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2469 else
2470 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2471 prev_stmt_info = vinfo_for_stmt (new_stmt);
2474 return true;
2478 /* Function vectorizable_type_demotion
2480 Check if STMT performs a binary or unary operation that involves
2481 type demotion, and if it can be vectorized.
2482 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2483 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2484 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
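/* For illustration (hypothetical stmt): vectorizing
   "sb[i] = (short) ia[i]" with V4SI inputs and a V8HI output packs two
   input vectors into each result vector via VEC_PACK_MOD_EXPR, which
   is why the transformation below consumes two vector defs
   (vec_oprnd0, vec_oprnd1) per vector stmt it creates.  */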
2486 bool
2487 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2488 tree *vec_stmt)
2490 tree vec_dest;
2491 tree scalar_dest;
2492 tree operation;
2493 tree op0;
2494 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2495 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2497 enum tree_code code;
2498 tree new_temp;
2499 tree def, def_stmt;
2500 enum vect_def_type dt0;
2501 tree new_stmt;
2502 stmt_vec_info prev_stmt_info;
2503 int nunits_in;
2504 int nunits_out;
2505 tree vectype_out;
2506 int ncopies;
2507 int j;
2508 tree expr;
2509 tree vectype_in;
2510 tree scalar_type;
2511 optab optab;
2512 enum machine_mode vec_mode;
2514 /* Is STMT a vectorizable type-demotion operation? */
2516 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2517 return false;
2519 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2521 if (STMT_VINFO_LIVE_P (stmt_info))
2523 /* FORNOW: not yet supported. */
2524 if (vect_print_dump_info (REPORT_DETAILS))
2525 fprintf (vect_dump, "value used after loop.");
2526 return false;
2529 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2530 return false;
2532 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2533 return false;
2535 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2536 code = TREE_CODE (operation);
2537 if (code != NOP_EXPR && code != CONVERT_EXPR)
2538 return false;
2540 op0 = TREE_OPERAND (operation, 0);
2541 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2542 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2544 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2545 scalar_type = TREE_TYPE (scalar_dest);
2546 vectype_out = get_vectype_for_scalar_type (scalar_type);
2547 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2548 if (nunits_in != nunits_out / 2) /* FORNOW */
2549 return false;
2551 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2552 gcc_assert (ncopies >= 1);
2554 if (! INTEGRAL_TYPE_P (scalar_type)
2555 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2556 return false;
2558 /* Check the operands of the operation. */
2559 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2561 if (vect_print_dump_info (REPORT_DETAILS))
2562 fprintf (vect_dump, "use not simple.");
2563 return false;
2566 /* Supportable by target? */
2567 code = VEC_PACK_MOD_EXPR;
2568 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2569 if (!optab)
2570 return false;
2572 vec_mode = TYPE_MODE (vectype_in);
2573 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2574 return false;
2576 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2578 if (!vec_stmt) /* transformation not required. */
2580 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2581 return true;
2584 /** Transform. **/
2586 if (vect_print_dump_info (REPORT_DETAILS))
2587 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2588 ncopies);
2590 /* Handle def. */
2591 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2593 /* In case the vectorization factor (VF) is bigger than the number
2594 of elements that we can fit in a vectype (nunits), we have to generate
2595 more than one vector stmt; i.e., we need to "unroll" the
2596 vector stmt by a factor VF/nunits. */
2597 prev_stmt_info = NULL;
2598 for (j = 0; j < ncopies; j++)
2600 /* Handle uses. */
2601 if (j == 0)
2603 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2604 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2606 else
2608 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2609 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2612 /* Arguments are ready. Create the new vector stmt. */
2613 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2614 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2615 new_temp = make_ssa_name (vec_dest, new_stmt);
2616 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2617 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2619 if (j == 0)
2620 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2621 else
2622 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2624 prev_stmt_info = vinfo_for_stmt (new_stmt);
2627 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2628 return true;
2632 /* Function vect_gen_widened_results_half
2634 Create a vector stmt whose code, type, number of arguments, and result
2635 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2636 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2637 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2638 needs to be created (DECL is a function-decl of a target-builtin).
2639 STMT is the original scalar stmt that we are vectorizing. */
2641 static tree
2642 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2643 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2644 tree vec_dest, block_stmt_iterator *bsi,
2645 tree stmt)
2647 tree expr;
2648 tree new_stmt;
2649 tree new_temp;
2650 tree sym;
2651 ssa_op_iter iter;
2653 /* Generate half of the widened result: */
2654 if (code == CALL_EXPR)
2656 /* Target specific support */
2657 if (op_type == binary_op)
2658 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2659 else
2660 expr = build_call_expr (decl, 1, vec_oprnd0);
2662 else
2664 /* Generic support */
2665 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2666 if (op_type == binary_op)
2667 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2668 else
2669 expr = build1 (code, vectype, vec_oprnd0);
2671 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2672 new_temp = make_ssa_name (vec_dest, new_stmt);
2673 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2674 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2676 if (code == CALL_EXPR)
2678 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2680 if (TREE_CODE (sym) == SSA_NAME)
2681 sym = SSA_NAME_VAR (sym);
2682 mark_sym_for_renaming (sym);
2686 return new_stmt;
2690 /* Function vectorizable_type_promotion
2692 Check if STMT performs a binary or unary operation that involves
2693 type promotion, and if it can be vectorized.
2694 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2695 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2696 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
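/* For illustration (hypothetical stmt): vectorizing
   "ib[i] = (int) sa[i]" with V8HI inputs and V4SI outputs must produce
   two result vectors per input vector; supportable_widening_operation
   (used below) supplies the two tree-codes (code1/code2) or target
   builtins (decl1/decl2) that generate the high and low halves of the
   widened result.  */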
2698 bool
2699 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2700 tree *vec_stmt)
2702 tree vec_dest;
2703 tree scalar_dest;
2704 tree operation;
2705 tree op0, op1 = NULL;
2706 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2707 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2708 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2709 enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
2710 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2711 int op_type;
2712 tree def, def_stmt;
2713 enum vect_def_type dt0, dt1;
2714 tree new_stmt;
2715 stmt_vec_info prev_stmt_info;
2716 int nunits_in;
2717 int nunits_out;
2718 tree vectype_out;
2719 int ncopies;
2720 int j;
2721 tree vectype_in;
2723 /* Is STMT a vectorizable type-promotion operation? */
2725 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2726 return false;
2728 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2730 if (STMT_VINFO_LIVE_P (stmt_info))
2732 /* FORNOW: not yet supported. */
2733 if (vect_print_dump_info (REPORT_DETAILS))
2734 fprintf (vect_dump, "value used after loop.");
2735 return false;
2738 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2739 return false;
2741 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2742 return false;
2744 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2745 code = TREE_CODE (operation);
2746 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2747 return false;
2749 op0 = TREE_OPERAND (operation, 0);
2750 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2751 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2752 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2753 gcc_assert (ncopies >= 1);
2755 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2756 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2757 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2758 if (nunits_out != nunits_in / 2) /* FORNOW */
2759 return false;
2761 if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2762 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2763 return false;
2765 /* Check the operands of the operation. */
2766 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2768 if (vect_print_dump_info (REPORT_DETAILS))
2769 fprintf (vect_dump, "use not simple.");
2770 return false;
2773 op_type = TREE_CODE_LENGTH (code);
2774 if (op_type == binary_op)
2776 op1 = TREE_OPERAND (operation, 1);
2777 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2779 if (vect_print_dump_info (REPORT_DETAILS))
2780 fprintf (vect_dump, "use not simple.");
2781 return false;
2785 /* Supportable by target? */
2786 if (!supportable_widening_operation (code, stmt, vectype_in,
2787 &decl1, &decl2, &code1, &code2))
2788 return false;
2790 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2792 if (!vec_stmt) /* transformation not required. */
2794 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2795 return true;
2798 /** Transform. **/
2800 if (vect_print_dump_info (REPORT_DETAILS))
2801 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2802 ncopies);
2804 /* Handle def. */
2805 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2807 /* In case the vectorization factor (VF) is bigger than the number
2808 of elements that we can fit in a vectype (nunits), we have to generate
2809 more than one vector stmt; i.e., we need to "unroll" the
2810 vector stmt by a factor VF/nunits. */
2812 prev_stmt_info = NULL;
2813 for (j = 0; j < ncopies; j++)
2815 /* Handle uses. */
2816 if (j == 0)
2818 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2819 if (op_type == binary_op)
2820 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2822 else
2824 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2825 if (op_type == binary_op)
2826 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2829 /* Arguments are ready. Create the new vector stmt. We are creating
2830 two vector defs because the widened result does not fit in one vector.
2831 The vectorized stmt can be expressed as a call to a target builtin,
2832 or using a tree-code. */
2833 /* Generate first half of the widened result: */
2834 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2835 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2836 if (j == 0)
2837 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2838 else
2839 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2840 prev_stmt_info = vinfo_for_stmt (new_stmt);
2842 /* Generate second half of the widened result: */
2843 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2844 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2845 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2846 prev_stmt_info = vinfo_for_stmt (new_stmt);
2850 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2851 return true;
2855 /* Function vect_strided_store_supported.
2857 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2858 and FALSE otherwise. */
2860 static bool
2861 vect_strided_store_supported (tree vectype)
2863 optab interleave_high_optab, interleave_low_optab;
2864 int mode;
2866 mode = (int) TYPE_MODE (vectype);
2868 /* Check that the operation is supported. */
2869 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2870 vectype);
2871 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2872 vectype);
2873 if (!interleave_high_optab || !interleave_low_optab)
2875 if (vect_print_dump_info (REPORT_DETAILS))
2876 fprintf (vect_dump, "no optab for interleave.");
2877 return false;
2880 if (interleave_high_optab->handlers[(int) mode].insn_code
2881 == CODE_FOR_nothing
2882 || interleave_low_optab->handlers[(int) mode].insn_code
2883 == CODE_FOR_nothing)
2885 if (vect_print_dump_info (REPORT_DETAILS))
2886 fprintf (vect_dump, "interleave op not supported by target.");
2887 return false;
2889 return true;
2893 /* Function vect_permute_store_chain.
2895 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2896 a power of 2, generate interleave_high/low stmts to reorder the data
2897 correctly for the stores. Return the final references for stores in
2898 RESULT_CHAIN.
2900 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2901 The input is 4 vectors each containing 8 elements. We assign a number to each
2902 element, the input sequence is:
2904 1st vec: 0 1 2 3 4 5 6 7
2905 2nd vec: 8 9 10 11 12 13 14 15
2906 3rd vec: 16 17 18 19 20 21 22 23
2907 4th vec: 24 25 26 27 28 29 30 31
2909 The output sequence should be:
2911 1st vec: 0 8 16 24 1 9 17 25
2912 2nd vec: 2 10 18 26 3 11 19 27
2913 3rd vec: 4 12 20 28 5 13 21 29
2914 4th vec: 6 14 22 30 7 15 23 31
2916 i.e., we interleave the contents of the four vectors in their order.
2918 We use interleave_high/low instructions to create such output. The input of
2919 each interleave_high/low operation is two vectors:
2920 1st vec 2nd vec
2921 0 1 2 3 4 5 6 7
2922 the even elements of the result vector are obtained left-to-right from the
2923 high/low elements of the first vector. The odd elements of the result are
2924 obtained left-to-right from the high/low elements of the second vector.
2925 The output of interleave_high will be: 0 4 1 5
2926 and of interleave_low: 2 6 3 7
2929 The permutation is done in log LENGTH stages. In each stage interleave_high
2930 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2931 where the first argument is taken from the first half of DR_CHAIN and the
2932 second argument from its second half.
2933 In our example,
2935 I1: interleave_high (1st vec, 3rd vec)
2936 I2: interleave_low (1st vec, 3rd vec)
2937 I3: interleave_high (2nd vec, 4th vec)
2938 I4: interleave_low (2nd vec, 4th vec)
2940 The output for the first stage is:
2942 I1: 0 16 1 17 2 18 3 19
2943 I2: 4 20 5 21 6 22 7 23
2944 I3: 8 24 9 25 10 26 11 27
2945 I4: 12 28 13 29 14 30 15 31
2947 The output of the second stage, i.e. the final result is:
2949 I1: 0 8 16 24 1 9 17 25
2950 I2: 2 10 18 26 3 11 19 27
2951 I3: 4 12 20 28 5 13 21 29
2952 I4: 6 14 22 30 7 15 23 31. */
2954 static bool
2955 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2956 unsigned int length,
2957 tree stmt,
2958 block_stmt_iterator *bsi,
2959 VEC(tree,heap) **result_chain)
2961 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2962 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2963 tree scalar_dest;
2964 int i;
2965 unsigned int j;
2966 VEC(tree,heap) *first, *second;
2968 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2969 first = VEC_alloc (tree, heap, length/2);
2970 second = VEC_alloc (tree, heap, length/2);
2972 /* Check that the operation is supported. */
2973 if (!vect_strided_store_supported (vectype))
2974 return false;
2976 *result_chain = VEC_copy (tree, heap, dr_chain);
2978 for (i = 0; i < exact_log2 (length); i++)
2980 for (j = 0; j < length/2; j++)
2982 vect1 = VEC_index (tree, dr_chain, j);
2983 vect2 = VEC_index (tree, dr_chain, j+length/2);
2985 /* Create interleaving stmt:
2986 in the case of big endian:
2987 high = interleave_high (vect1, vect2)
2988 and in the case of little endian:
2989 high = interleave_low (vect1, vect2). */
2990 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2991 DECL_GIMPLE_REG_P (perm_dest) = 1;
2992 add_referenced_var (perm_dest);
2993 if (BYTES_BIG_ENDIAN)
2994 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2995 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2996 vect1, vect2));
2997 else
2998 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2999 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
3000 vect1, vect2));
3001 high = make_ssa_name (perm_dest, perm_stmt);
3002 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3003 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3004 VEC_replace (tree, *result_chain, 2*j, high);
3006 /* Create interleaving stmt:
3007 in the case of big endian:
3008 low = interleave_low (vect1, vect2)
3009 and in the case of little endian:
3010 low = interleave_high (vect1, vect2). */
3011 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3012 DECL_GIMPLE_REG_P (perm_dest) = 1;
3013 add_referenced_var (perm_dest);
3014 if (BYTES_BIG_ENDIAN)
3015 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3016 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
3017 vect1, vect2));
3018 else
3019 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3020 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
3021 vect1, vect2));
3022 low = make_ssa_name (perm_dest, perm_stmt);
3023 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3024 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3025 VEC_replace (tree, *result_chain, 2*j+1, low);
3027 dr_chain = VEC_copy (tree, heap, *result_chain);
3029 return true;
3033 /* Function vectorizable_store.
3035 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3036 can be vectorized.
3037 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3038 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3039 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3041 bool
3042 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3044 tree scalar_dest;
3045 tree data_ref;
3046 tree op;
3047 tree vec_oprnd = NULL_TREE;
3048 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3049 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3050 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3051 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3052 enum machine_mode vec_mode;
3053 tree dummy;
3054 enum dr_alignment_support alignment_support_scheme;
3055 ssa_op_iter iter;
3056 def_operand_p def_p;
3057 tree def, def_stmt;
3058 enum vect_def_type dt;
3059 stmt_vec_info prev_stmt_info = NULL;
3060 tree dataref_ptr = NULL_TREE;
3061 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3062 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3063 int j;
3064 tree next_stmt, first_stmt;
3065 bool strided_store = false;
3066 unsigned int group_size, i;
3067 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3068 gcc_assert (ncopies >= 1);
3070 /* Is vectorizable store? */
3072 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3073 return false;
3075 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3076 if (TREE_CODE (scalar_dest) != ARRAY_REF
3077 && TREE_CODE (scalar_dest) != INDIRECT_REF
3078 && !DR_GROUP_FIRST_DR (stmt_info))
3079 return false;
3081 op = GIMPLE_STMT_OPERAND (stmt, 1);
3082 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3084 if (vect_print_dump_info (REPORT_DETAILS))
3085 fprintf (vect_dump, "use not simple.");
3086 return false;
3089 vec_mode = TYPE_MODE (vectype);
3090 /* FORNOW. In some cases we can vectorize even if the data-type is not
3091 supported (e.g., array initialization with 0). */
3092 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3093 return false;
3095 if (!STMT_VINFO_DATA_REF (stmt_info))
3096 return false;
3098 if (DR_GROUP_FIRST_DR (stmt_info))
3100 strided_store = true;
3101 if (!vect_strided_store_supported (vectype))
3102 return false;
3105 if (!vec_stmt) /* transformation not required. */
3107 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3108 return true;
3111 /** Transform. **/
3113 if (vect_print_dump_info (REPORT_DETAILS))
3114 fprintf (vect_dump, "transform store. ncopies = %d", ncopies);
3116 if (strided_store)
3118 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3119 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3120 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3122 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3124 /* We vectorize all the stmts of the interleaving group when we
3125 reach the last stmt in the group. */
3126 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3127 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3129 *vec_stmt = NULL_TREE;
3130 return true;
3133 else
3135 first_stmt = stmt;
3136 first_dr = dr;
3137 group_size = 1;
3140 dr_chain = VEC_alloc (tree, heap, group_size);
3141 oprnds = VEC_alloc (tree, heap, group_size);
3143 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3144 gcc_assert (alignment_support_scheme);
3145 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
3147 /* In case the vectorization factor (VF) is bigger than the number
3148 of elements that we can fit in a vectype (nunits), we have to generate
3149 more than one vector stmt; i.e., we need to "unroll" the
3150 vector stmt by a factor VF/nunits. For more details see documentation in
3151 vect_get_vec_def_for_copy_stmt. */
3153 /* In case of interleaving (non-unit strided access):
3155 S1: &base + 2 = x2
3156 S2: &base = x0
3157 S3: &base + 1 = x1
3158 S4: &base + 3 = x3
3160 We create vectorized stores starting from base address (the access of the
3161 first stmt in the chain (S2 in the above example), when the last store stmt
3162 of the chain (S4) is reached:
3164 VS1: &base = vx2
3165 VS2: &base + vec_size*1 = vx0
3166 VS3: &base + vec_size*2 = vx1
3167 VS4: &base + vec_size*3 = vx3
3169 Then permutation statements are generated:
3171 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3172 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3175 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3176 (the order of the data-refs in the output of vect_permute_store_chain
3177 corresponds to the order of scalar stmts in the interleaving chain - see
3178 the documentation of vect_permute_store_chain()).
3180 In case of both multiple types and interleaving, the above vector stores and
3181 permutation stmts are created for every copy. The result vector stmts are
3182 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3183 STMT_VINFO_RELATED_STMT for the next copies.
3186 prev_stmt_info = NULL;
3187 for (j = 0; j < ncopies; j++)
3189 tree new_stmt;
3190 tree ptr_incr;
3192 if (j == 0)
3194 /* For interleaved stores we collect vectorized defs for all the
3195 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3196 as an input to vect_permute_store_chain(), and OPRNDS as an input
3197 to vect_get_vec_def_for_stmt_copy() for the next copy.
3198 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3199 OPRNDS are of size 1. */
3200 next_stmt = first_stmt;
3201 for (i = 0; i < group_size; i++)
3203 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3204 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3205 can't be NULL_TREE. In case there is no interleaving,
3206 GROUP_SIZE is 1, and only one iteration of the loop will be
3207 executed. */
3208 gcc_assert (next_stmt);
3209 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3210 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3211 VEC_quick_push (tree, dr_chain, vec_oprnd);
3212 VEC_quick_push (tree, oprnds, vec_oprnd);
3213 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3215 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3216 &dummy, &ptr_incr, false,
3217 TREE_TYPE (vec_oprnd));
3219 else
3221 /* For interleaved stores we created vectorized defs for all the
3222 defs stored in OPRNDS in the previous iteration (previous copy).
3223 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3224 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3225 next copy.
3226 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3227 OPRNDS are of size 1. */
3228 for (i = 0; i < group_size; i++)
3230 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3231 VEC_index (tree, oprnds, i));
3232 VEC_replace (tree, dr_chain, i, vec_oprnd);
3233 VEC_replace (tree, oprnds, i, vec_oprnd);
3235 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3238 if (strided_store)
3240 result_chain = VEC_alloc (tree, heap, group_size);
3241 /* Permute. */
3242 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3243 &result_chain))
3244 return false;
3247 next_stmt = first_stmt;
3248 for (i = 0; i < group_size; i++)
3250 /* For strided stores vectorized defs are interleaved in
3251 vect_permute_store_chain(). */
3252 if (strided_store)
3253 vec_oprnd = VEC_index (tree, result_chain, i);
3255 data_ref = build_fold_indirect_ref (dataref_ptr);
3256 /* Arguments are ready. Create the new vector stmt. */
3257 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, data_ref,
3258 vec_oprnd);
3259 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3261 /* Set the VDEFs for the vector pointer. If this virtual def
3262 has a use outside the loop and a loop peel is performed
3263 then the def may be renamed by the peel. Mark it for
3264 renaming so the later use will also be renamed. */
3265 copy_virtual_operands (new_stmt, next_stmt);
3266 if (j == 0)
3268 /* The original store is deleted so the same SSA_NAMEs
3269 can be used. */
3270 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3272 SSA_NAME_DEF_STMT (def) = new_stmt;
3273 mark_sym_for_renaming (SSA_NAME_VAR (def));
3276 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3278 else
3280 /* Create new names for all the definitions created by COPY and
3281 add replacement mappings for each new name. */
3282 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3284 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3285 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3288 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3291 prev_stmt_info = vinfo_for_stmt (new_stmt);
3292 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3293 if (!next_stmt)
3294 break;
3295 /* Bump the vector pointer. */
3296 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3300 return true;
3304 /* Function vect_setup_realignment
3306 This function is called when vectorizing an unaligned load using
3307 the dr_unaligned_software_pipeline scheme.
3308 This function generates the following code at the loop prolog:
3310 p = initial_addr;
3311 msq_init = *(floor(p)); # prolog load
3312 realignment_token = call target_builtin;
3313 loop:
3314 msq = phi (msq_init, ---)
3316 The code above sets up a new (vector) pointer, pointing to the first
3317 location accessed by STMT, and a "floor-aligned" load using that pointer.
3318 It also generates code to compute the "realignment-token" (if the relevant
3319 target hook was defined), and creates a phi-node at the loop-header bb
3320 whose arguments are the result of the prolog-load (created by this
3321 function) and the result of a load that takes place in the loop (to be
3322 created by the caller to this function).
3323 The caller to this function uses the phi-result (msq) to create the
3324 realignment code inside the loop, and sets up the missing phi argument,
3325 as follows:
3327 loop:
3328 msq = phi (msq_init, lsq)
3329 lsq = *(floor(p')); # load in loop
3330 result = realign_load (msq, lsq, realignment_token);
3332 Input:
3333 STMT - (scalar) load stmt to be vectorized. This load accesses
3334 a memory location that may be unaligned.
3335 BSI - place where new code is to be inserted.
3337 Output:
3338 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3339 target hook, if defined.
3340 Return value - the result of the loop-header phi node. */
3342 static tree
3343 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3344 tree *realignment_token)
3346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3347 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3348 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3349 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3350 edge pe = loop_preheader_edge (loop);
3351 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3352 tree vec_dest;
3353 tree init_addr;
3354 tree inc;
3355 tree ptr;
3356 tree data_ref;
3357 tree new_stmt;
3358 basic_block new_bb;
3359 tree msq_init;
3360 tree new_temp;
3361 tree phi_stmt;
3362 tree msq;
3364 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3365 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3366 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3367 NULL_TREE);
3368 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3369 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
3370 new_temp = make_ssa_name (vec_dest, new_stmt);
3371 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3372 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3373 gcc_assert (!new_bb);
3374 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3375 copy_virtual_operands (new_stmt, stmt);
3376 update_vuses_to_preheader (new_stmt, loop);
3378 /* 2. Create permutation mask, if required, in loop preheader. */
3379 if (targetm.vectorize.builtin_mask_for_load)
3381 tree builtin_decl;
3383 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3384 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
3385 vec_dest = vect_create_destination_var (scalar_dest,
3386 TREE_TYPE (new_stmt));
3387 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3388 new_stmt);
3389 new_temp = make_ssa_name (vec_dest, new_stmt);
3390 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3391 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3392 gcc_assert (!new_bb);
3393 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3395 /* The result of the CALL_EXPR to this builtin is determined from
3396 the value of the parameter, and no global variables are touched,
3397 which makes the builtin a "const" function. Requiring the
3398 builtin to have the "const" attribute makes it unnecessary
3399 to call mark_call_clobbered. */
3400 gcc_assert (TREE_READONLY (builtin_decl));
3403 /* 3. Create msq = phi <msq_init, lsq> in loop */
3404 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3405 msq = make_ssa_name (vec_dest, NULL_TREE);
3406 phi_stmt = create_phi_node (msq, loop->header);
3407 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3408 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3410 return msq;
3414 /* Function vect_strided_load_supported.
3416 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3417 and FALSE otherwise. */
3419 static bool
3420 vect_strided_load_supported (tree vectype)
3422 optab perm_even_optab, perm_odd_optab;
3423 int mode;
3425 mode = (int) TYPE_MODE (vectype);
3427 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3428 if (!perm_even_optab)
3430 if (vect_print_dump_info (REPORT_DETAILS))
3431 fprintf (vect_dump, "no optab for perm_even.");
3432 return false;
3435 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3437 if (vect_print_dump_info (REPORT_DETAILS))
3438 fprintf (vect_dump, "perm_even op not supported by target.");
3439 return false;
3442 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3443 if (!perm_odd_optab)
3445 if (vect_print_dump_info (REPORT_DETAILS))
3446 fprintf (vect_dump, "no optab for perm_odd.");
3447 return false;
3450 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3452 if (vect_print_dump_info (REPORT_DETAILS))
3453 fprintf (vect_dump, "perm_odd op not supported by target.");
3454 return false;
3456 return true;
3460 /* Function vect_permute_load_chain.
3462 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3463 a power of 2, generate extract_even/odd stmts to reorder the input data
3464 correctly. Return the final references for loads in RESULT_CHAIN.
3466 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3467 The input is 4 vectors each containing 8 elements. We assign a number to each
3468 element; the input sequence is:
3470 1st vec: 0 1 2 3 4 5 6 7
3471 2nd vec: 8 9 10 11 12 13 14 15
3472 3rd vec: 16 17 18 19 20 21 22 23
3473 4th vec: 24 25 26 27 28 29 30 31
3475 The output sequence should be:
3477 1st vec: 0 4 8 12 16 20 24 28
3478 2nd vec: 1 5 9 13 17 21 25 29
3479 3rd vec: 2 6 10 14 18 22 26 30
3480 4th vec: 3 7 11 15 19 23 27 31
3482 i.e., the first output vector should contain the first elements of each
3483 interleaving group, etc.
3485 We use extract_even/odd instructions to create such output. The input of each
3486 extract_even/odd operation is two vectors
3487 1st vec 2nd vec
3488 0 1 2 3 4 5 6 7
3490 and the output is the vector of extracted even/odd elements. The output of
3491 extract_even will be: 0 2 4 6
3492 and of extract_odd: 1 3 5 7
3495 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
3496 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3497 order. In our example,
3499 E1: extract_even (1st vec, 2nd vec)
3500 E2: extract_odd (1st vec, 2nd vec)
3501 E3: extract_even (3rd vec, 4th vec)
3502 E4: extract_odd (3rd vec, 4th vec)
3504 The output for the first stage will be:
3506 E1: 0 2 4 6 8 10 12 14
3507 E2: 1 3 5 7 9 11 13 15
3508 E3: 16 18 20 22 24 26 28 30
3509 E4: 17 19 21 23 25 27 29 31
3511 In order to proceed and create the correct sequence for the next stage (or
3512 for the correct output, if the second stage is the last one, as in our
3513 example), we first put the output of extract_even operation and then the
3514 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3515 The input for the second stage is:
3517 1st vec (E1): 0 2 4 6 8 10 12 14
3518 2nd vec (E3): 16 18 20 22 24 26 28 30
3519 3rd vec (E2): 1 3 5 7 9 11 13 15
3520 4th vec (E4): 17 19 21 23 25 27 29 31
3522 The output of the second stage:
3524 E1: 0 4 8 12 16 20 24 28
3525 E2: 2 6 10 14 18 22 26 30
3526 E3: 1 5 9 13 17 21 25 29
3527 E4: 3 7 11 15 19 23 27 31
3529 And RESULT_CHAIN after reordering:
3531 1st vec (E1): 0 4 8 12 16 20 24 28
3532 2nd vec (E3): 1 5 9 13 17 21 25 29
3533 3rd vec (E2): 2 6 10 14 18 22 26 30
3534 4th vec (E4): 3 7 11 15 19 23 27 31. */
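/* In the simplest case LENGTH is 2 and a single stage suffices
(log2 (2) == 1); e.g., with 4-element vectors:
1st vec: 0 1 2 3
2nd vec: 4 5 6 7
E1: extract_even (1st vec, 2nd vec) = 0 2 4 6
E2: extract_odd (1st vec, 2nd vec) = 1 3 5 7
and RESULT_CHAIN is simply {E1, E2} - the two de-interleaved streams. */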
3536 static bool
3537 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3538 unsigned int length,
3539 tree stmt,
3540 block_stmt_iterator *bsi,
3541 VEC(tree,heap) **result_chain)
3543 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3544 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3545 int i;
3546 unsigned int j;
3548 /* Check that the operation is supported. */
3549 if (!vect_strided_load_supported (vectype))
3550 return false;
3552 *result_chain = VEC_copy (tree, heap, dr_chain);
3553 for (i = 0; i < exact_log2 (length); i++)
3555 for (j = 0; j < length; j += 2)
3557 first_vect = VEC_index (tree, dr_chain, j);
3558 second_vect = VEC_index (tree, dr_chain, j+1);
3560 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3561 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3562 DECL_GIMPLE_REG_P (perm_dest) = 1;
3563 add_referenced_var (perm_dest);
3565 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3566 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3567 first_vect, second_vect));
3569 data_ref = make_ssa_name (perm_dest, perm_stmt);
3570 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3571 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3572 mark_symbols_for_renaming (perm_stmt);
3574 VEC_replace (tree, *result_chain, j/2, data_ref);
3576 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3577 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3578 DECL_GIMPLE_REG_P (perm_dest) = 1;
3579 add_referenced_var (perm_dest);
3581 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3582 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3583 first_vect, second_vect));
3584 data_ref = make_ssa_name (perm_dest, perm_stmt);
3585 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3586 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3587 mark_symbols_for_renaming (perm_stmt);
3589 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3591 dr_chain = VEC_copy (tree, heap, *result_chain);
3593 return true;
3597 /* Function vect_transform_strided_load.
3599 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3600 to perform their permutation and assign the resulting vectorized statements to
3601 the scalar statements. */
3604 static bool
3605 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3606 block_stmt_iterator *bsi)
3608 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3609 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3610 tree next_stmt, new_stmt;
3611 VEC(tree,heap) *result_chain = NULL;
3612 unsigned int i, gap_count;
3613 tree tmp_data_ref;
3615 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3616 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3617 vectors, that are ready for vector computation. */
3618 result_chain = VEC_alloc (tree, heap, size);
3619 /* Permute. */
3620 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3621 return false;
3623 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3624 Since we scan the chain starting from its first node, their order
3625 corresponds to the order of data-refs in RESULT_CHAIN. */
3626 next_stmt = first_stmt;
3627 gap_count = 1;
3628 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3630 if (!next_stmt)
3631 break;
3633 /* Skip the gaps. Loads created for the gaps will be removed by the dead
3634 code elimination pass later.
3635 DR_GROUP_GAP is the number of steps in elements from the previous
3636 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3637 correspond to the gaps. */
3639 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3641 gap_count++;
3642 continue;
3645 while (next_stmt)
3647 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3648 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3649 copies, and we put the new vector statement in the first available
3650 RELATED_STMT. */
3651 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3652 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3653 else
3655 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3656 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3657 vinfo_for_stmt (prev_stmt));
3658 while (rel_stmt)
3660 prev_stmt = rel_stmt;
3661 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3663 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3665 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3666 gap_count = 1;
3667 /* If NEXT_STMT accesses the same DR as the previous statement,
3668 put the same TMP_DATA_REF as its vectorized statement; otherwise
3669 get the next data-ref from RESULT_CHAIN. */
3670 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3671 break;
3674 return true;
3678 /* vectorizable_load.
3680 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3681 can be vectorized.
3682 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3683 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3684 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3686 bool
3687 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3689 tree scalar_dest;
3690 tree vec_dest = NULL;
3691 tree data_ref = NULL;
3692 tree op;
3693 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3694 stmt_vec_info prev_stmt_info;
3695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3696 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3697 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3698 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3699 tree new_temp;
3700 int mode;
3701 tree new_stmt = NULL_TREE;
3702 tree dummy;
3703 enum dr_alignment_support alignment_support_scheme;
3704 tree dataref_ptr = NULL_TREE;
3705 tree ptr_incr;
3706 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3707 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3708 int i, j, group_size;
3709 tree msq = NULL_TREE, lsq;
3710 tree offset = NULL_TREE;
3711 tree realignment_token = NULL_TREE;
3712 tree phi_stmt = NULL_TREE;
3713 VEC(tree,heap) *dr_chain = NULL;
3714 bool strided_load = false;
3715 tree first_stmt;
3717 /* Is vectorizable load? */
3718 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3719 return false;
3721 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3723 if (STMT_VINFO_LIVE_P (stmt_info))
3725 /* FORNOW: not yet supported. */
3726 if (vect_print_dump_info (REPORT_DETAILS))
3727 fprintf (vect_dump, "value used after loop.");
3728 return false;
3731 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3732 return false;
3734 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3735 if (TREE_CODE (scalar_dest) != SSA_NAME)
3736 return false;
3738 op = GIMPLE_STMT_OPERAND (stmt, 1);
3739 if (TREE_CODE (op) != ARRAY_REF
3740 && TREE_CODE (op) != INDIRECT_REF
3741 && !DR_GROUP_FIRST_DR (stmt_info))
3742 return false;
3744 if (!STMT_VINFO_DATA_REF (stmt_info))
3745 return false;
3747 mode = (int) TYPE_MODE (vectype);
3749 /* FORNOW. In some cases we can vectorize even if the data-type is not supported
3750 (e.g. - data copies). */
3751 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3753 if (vect_print_dump_info (REPORT_DETAILS))
3754 fprintf (vect_dump, "Aligned load, but unsupported type.");
3755 return false;
3758 /* Check if the load is a part of an interleaving chain. */
3759 if (DR_GROUP_FIRST_DR (stmt_info))
3761 strided_load = true;
3763 /* Check if interleaving is supported. */
3764 if (!vect_strided_load_supported (vectype))
3765 return false;
3768 if (!vec_stmt) /* transformation not required. */
3770 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3771 return true;
3774 /** Transform. **/
3776 if (vect_print_dump_info (REPORT_DETAILS))
3777 fprintf (vect_dump, "transform load.");
3779 if (strided_load)
3781 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3782 /* Check if the chain of loads is already vectorized. */
3783 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3785 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3786 return true;
3788 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3789 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3790 dr_chain = VEC_alloc (tree, heap, group_size);
3792 else
3794 first_stmt = stmt;
3795 first_dr = dr;
3796 group_size = 1;
3799 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3800 gcc_assert (alignment_support_scheme);
3803 /* In case the vectorization factor (VF) is bigger than the number
3804 of elements that we can fit in a vectype (nunits), we have to generate
3805 more than one vector stmt - i.e - we need to "unroll" the
3806 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3807 from one copy of the vector stmt to the next, in the field
3808 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3809 stages to find the correct vector defs to be used when vectorizing
3810 stmts that use the defs of the current stmt. The example below illustrates
3811 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3812 4 vectorized stmts):
3814 before vectorization:
3815 RELATED_STMT VEC_STMT
3816 S1: x = memref - -
3817 S2: z = x + 1 - -
3819 step 1: vectorize stmt S1:
3820 We first create the vector stmt VS1_0, and, as usual, record a
3821 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3822 Next, we create the vector stmt VS1_1, and record a pointer to
3823 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3824 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3825 stmts and pointers:
3826 RELATED_STMT VEC_STMT
3827 VS1_0: vx0 = memref0 VS1_1 -
3828 VS1_1: vx1 = memref1 VS1_2 -
3829 VS1_2: vx2 = memref2 VS1_3 -
3830 VS1_3: vx3 = memref3 - -
3831 S1: x = load - VS1_0
3832 S2: z = x + 1 - -
3834 See the documentation of vect_get_vec_def_for_stmt_copy for how the
3835 information we recorded in the RELATED_STMT field is used to vectorize
3836 stmt S2. */
3838 /* In case of interleaving (non-unit strided access):
3840 S1: x2 = &base + 2
3841 S2: x0 = &base
3842 S3: x1 = &base + 1
3843 S4: x3 = &base + 3
3845 Vectorized loads are created in the order of memory accesses
3846 starting from the access of the first stmt of the chain:
3848 VS1: vx0 = &base
3849 VS2: vx1 = &base + vec_size*1
3850 VS3: vx3 = &base + vec_size*2
3851 VS4: vx4 = &base + vec_size*3
3853 Then permutation statements are generated:
3855 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3856 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3859 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3860 (the order of the data-refs in the output of vect_permute_load_chain
3861 corresponds to the order of scalar stmts in the interleaving chain - see
3862 the documentation of vect_permute_load_chain()).
3863 The generation of permutation stmts and recording them in
3864 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3866 In case of both multiple types and interleaving, the vector loads and
3867 permutation stmts above are created for every copy. The result vector stmts
3868 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3869 STMT_VINFO_RELATED_STMT for the next copies. */
3871 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3872 on a target that supports unaligned accesses (dr_unaligned_supported)
3873 we generate the following code:
3874 p = initial_addr;
3875 indx = 0;
3876 loop {
3877 p = p + indx * vectype_size;
3878 vec_dest = *(p);
3879 indx = indx + 1;
3882 Otherwise, the data reference is potentially unaligned on a target that
3883 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3884 then generate the following code, in which the data in each iteration is
3885 obtained by two vector loads, one from the previous iteration, and one
3886 from the current iteration:
3887 p1 = initial_addr;
3888 msq_init = *(floor(p1))
3889 p2 = initial_addr + VS - 1;
3890 realignment_token = call target_builtin;
3891 indx = 0;
3892 loop {
3893 p2 = p2 + indx * vectype_size
3894 lsq = *(floor(p2))
3895 vec_dest = realign_load (msq, lsq, realignment_token)
3896 indx = indx + 1;
3897 msq = lsq;
3898 } */
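/* For example, assuming VS = 16 bytes and initial_addr = 0x1004:
msq_init loads from floor(0x1004) = 0x1000, while p2 = 0x1004 + 15 = 0x1013,
so the first iteration's lsq loads from floor(0x1013) = 0x1010.
realign_load then combines the aligned vectors at 0x1000 and 0x1010 into
the 16 bytes starting at 0x1004. Adding VS - 1 to p2 ensures that floor(p2)
always names the aligned vector just above the one msq came from. */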
3900 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3902 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3903 phi_stmt = SSA_NAME_DEF_STMT (msq);
3904 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3907 prev_stmt_info = NULL;
3908 for (j = 0; j < ncopies; j++)
3910 /* 1. Create the vector pointer update chain. */
3911 if (j == 0)
3912 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3913 &ptr_incr, false, NULL_TREE);
3914 else
3915 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3917 for (i = 0; i < group_size; i++)
3919 /* 2. Create the vector-load in the loop. */
3920 switch (alignment_support_scheme)
3922 case dr_aligned:
3923 gcc_assert (aligned_access_p (first_dr));
3924 data_ref = build_fold_indirect_ref (dataref_ptr);
3925 break;
3926 case dr_unaligned_supported:
3928 int mis = DR_MISALIGNMENT (first_dr);
3929 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3931 gcc_assert (!aligned_access_p (first_dr));
3932 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3933 data_ref =
3934 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3935 break;
3937 case dr_unaligned_software_pipeline:
3938 gcc_assert (!aligned_access_p (first_dr));
3939 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3940 break;
3941 default:
3942 gcc_unreachable ();
3944 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3945 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3946 data_ref);
3947 new_temp = make_ssa_name (vec_dest, new_stmt);
3948 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3949 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3950 copy_virtual_operands (new_stmt, stmt);
3951 mark_symbols_for_renaming (new_stmt);
3953 /* 3. Handle explicit realignment if necessary/supported. */
3954 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3956 /* Create in loop:
3957 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3958 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3959 if (!realignment_token)
3960 realignment_token = dataref_ptr;
3961 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3962 new_stmt =
3963 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3964 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3965 new_stmt);
3966 new_temp = make_ssa_name (vec_dest, new_stmt);
3967 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3968 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3969 if (i == group_size - 1 && j == ncopies - 1)
3970 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3971 msq = lsq;
3973 if (strided_load)
3974 VEC_quick_push (tree, dr_chain, new_temp);
3975 if (i < group_size - 1)
3976 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3979 if (strided_load)
3981 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3982 return false;
3983 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3984 dr_chain = VEC_alloc (tree, heap, group_size);
3986 else
3988 if (j == 0)
3989 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3990 else
3991 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3992 prev_stmt_info = vinfo_for_stmt (new_stmt);
3996 return true;
4000 /* Function vectorizable_live_operation.
4002 STMT computes a value that is used outside the loop. Check if
4003 it can be supported. */
4005 bool
4006 vectorizable_live_operation (tree stmt,
4007 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
4008 tree *vec_stmt ATTRIBUTE_UNUSED)
4010 tree operation;
4011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4012 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4013 int i;
4014 int op_type;
4015 tree op;
4016 tree def, def_stmt;
4017 enum vect_def_type dt;
4019 if (!STMT_VINFO_LIVE_P (stmt_info))
4020 return false;
4022 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4023 return false;
4025 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4026 return false;
4028 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4029 op_type = TREE_OPERAND_LENGTH (operation);
4031 /* FORNOW: support only if all uses are invariant. This means
4032 that the scalar operations can remain in place, unvectorized.
4033 The original last scalar value that they compute will be used. */
4035 for (i = 0; i < op_type; i++)
4037 op = TREE_OPERAND (operation, i);
4038 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4040 if (vect_print_dump_info (REPORT_DETAILS))
4041 fprintf (vect_dump, "use not simple.");
4042 return false;
4045 if (dt != vect_invariant_def && dt != vect_constant_def)
4046 return false;
4049 /* No transformation is required for the cases we currently support. */
4050 return true;
4054 /* Function vect_is_simple_cond.
4056 Input:
4057 LOOP - the loop that is being vectorized.
4058 COND - Condition that is checked for simple use.
4060 Returns whether a COND can be vectorized. Checks whether
4061 condition operands are supportable using vect_is_simple_use. */
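/* For example, a condition like 'x_1 < 5' is simple when x_1 has a
supportable definition per vect_is_simple_use; a bare boolean operand 'b_3'
is rejected because it is not a comparison, and a comparison operand that is
neither an SSA_NAME, an INTEGER_CST nor a REAL_CST is rejected as well. */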
4063 static bool
4064 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4066 tree lhs, rhs;
4067 tree def;
4068 enum vect_def_type dt;
4070 if (!COMPARISON_CLASS_P (cond))
4071 return false;
4073 lhs = TREE_OPERAND (cond, 0);
4074 rhs = TREE_OPERAND (cond, 1);
4076 if (TREE_CODE (lhs) == SSA_NAME)
4078 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4079 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4080 return false;
4082 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
4083 return false;
4085 if (TREE_CODE (rhs) == SSA_NAME)
4087 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4088 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4089 return false;
4091 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4092 return false;
4094 return true;
4097 /* vectorizable_condition.
4099 Check if STMT is a conditional modify expression that can be vectorized.
4100 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4101 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
4102 at BSI.
4104 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
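/* For example, the scalar statement 'x = a_1 < b_2 ? c_3 : d_4' is
transformed into vx = VEC_COND_EXPR <va < vb, vc, vd>, which performs the
comparison and the selection element-wise on whole vectors. */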
4106 bool
4107 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4109 tree scalar_dest = NULL_TREE;
4110 tree vec_dest = NULL_TREE;
4111 tree op = NULL_TREE;
4112 tree cond_expr, then_clause, else_clause;
4113 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4114 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4115 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4116 tree vec_compare, vec_cond_expr;
4117 tree new_temp;
4118 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4119 enum machine_mode vec_mode;
4120 tree def;
4121 enum vect_def_type dt;
4122 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4123 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4125 gcc_assert (ncopies >= 1);
4126 if (ncopies > 1)
4127 return false; /* FORNOW */
4129 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4130 return false;
4132 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
4134 if (STMT_VINFO_LIVE_P (stmt_info))
4136 /* FORNOW: not yet supported. */
4137 if (vect_print_dump_info (REPORT_DETAILS))
4138 fprintf (vect_dump, "value used after loop.");
4139 return false;
4142 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4143 return false;
4145 op = GIMPLE_STMT_OPERAND (stmt, 1);
4147 if (TREE_CODE (op) != COND_EXPR)
4148 return false;
4150 cond_expr = TREE_OPERAND (op, 0);
4151 then_clause = TREE_OPERAND (op, 1);
4152 else_clause = TREE_OPERAND (op, 2);
4154 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4155 return false;
4157 /* We do not handle two different vector types for the condition
4158 and the values. */
4159 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4160 return false;
4162 if (TREE_CODE (then_clause) == SSA_NAME)
4164 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4165 if (!vect_is_simple_use (then_clause, loop_vinfo,
4166 &then_def_stmt, &def, &dt))
4167 return false;
4169 else if (TREE_CODE (then_clause) != INTEGER_CST
4170 && TREE_CODE (then_clause) != REAL_CST)
4171 return false;
4173 if (TREE_CODE (else_clause) == SSA_NAME)
4175 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4176 if (!vect_is_simple_use (else_clause, loop_vinfo,
4177 &else_def_stmt, &def, &dt))
4178 return false;
4180 else if (TREE_CODE (else_clause) != INTEGER_CST
4181 && TREE_CODE (else_clause) != REAL_CST)
4182 return false;
4185 vec_mode = TYPE_MODE (vectype);
4187 if (!vec_stmt)
4189 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4190 return expand_vec_cond_expr_p (op, vec_mode);
4193 /* Transform */
4195 /* Handle def. */
4196 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4197 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4199 /* Handle cond expr. */
4200 vec_cond_lhs =
4201 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4202 vec_cond_rhs =
4203 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4204 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4205 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4207 /* Arguments are ready. Create the new vector stmt. */
4208 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4209 vec_cond_lhs, vec_cond_rhs);
4210 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4211 vec_compare, vec_then_clause, vec_else_clause);
4213 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
4214 vec_cond_expr);
4215 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4216 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4217 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4219 return true;
4222 /* Function vect_transform_stmt.
4224 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4226 bool
4227 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4229 bool is_store = false;
4230 tree vec_stmt = NULL_TREE;
4231 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4232 tree orig_stmt_in_pattern;
4233 bool done;
4235 if (STMT_VINFO_RELEVANT_P (stmt_info))
4237 switch (STMT_VINFO_TYPE (stmt_info))
4239 case type_demotion_vec_info_type:
4240 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4241 gcc_assert (done);
4242 break;
4244 case type_promotion_vec_info_type:
4245 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4246 gcc_assert (done);
4247 break;
4249 case type_conversion_vec_info_type:
4250 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4251 gcc_assert (done);
4252 break;
4254 case op_vec_info_type:
4255 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4256 gcc_assert (done);
4257 break;
4259 case assignment_vec_info_type:
4260 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4261 gcc_assert (done);
4262 break;
4264 case load_vec_info_type:
4265 done = vectorizable_load (stmt, bsi, &vec_stmt);
4266 gcc_assert (done);
4267 break;
4269 case store_vec_info_type:
4270 done = vectorizable_store (stmt, bsi, &vec_stmt);
4271 gcc_assert (done);
4272 if (DR_GROUP_FIRST_DR (stmt_info))
4274 /* In case of interleaving, the whole chain is vectorized when the
4275 last store in the chain is reached. Store stmts before the last
4276 one are skipped, and their vec_stmt_info shouldn't be freed
4277 meanwhile. */
4278 *strided_store = true;
4279 if (STMT_VINFO_VEC_STMT (stmt_info))
4280 is_store = true;
4282 else
4283 is_store = true;
4284 break;
4286 case condition_vec_info_type:
4287 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4288 gcc_assert (done);
4289 break;
4291 case call_vec_info_type:
4292 done = vectorizable_call (stmt, bsi, &vec_stmt);
4293 break;
4295 default:
4296 if (vect_print_dump_info (REPORT_DETAILS))
4297 fprintf (vect_dump, "stmt not supported.");
4298 gcc_unreachable ();
4301 gcc_assert (vec_stmt || *strided_store);
4302 if (vec_stmt)
4304 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4305 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4306 if (orig_stmt_in_pattern)
4308 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4309 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4311 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4313 /* STMT was inserted by the vectorizer to replace a
4314 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
4315 original sequence that computed this idiom. We need to
4316 record a pointer to VEC_STMT in the stmt_info of
4317 ORIG_STMT_IN_PATTERN. See more details in the
4318 documentation of vect_pattern_recog. */
4320 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4326 if (STMT_VINFO_LIVE_P (stmt_info))
4328 switch (STMT_VINFO_TYPE (stmt_info))
4330 case reduc_vec_info_type:
4331 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4332 gcc_assert (done);
4333 break;
4335 default:
4336 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4337 gcc_assert (done);
4341 return is_store;
4345 /* This function builds ni_name = number of iterations the loop executes,
4346 inserting the computation on the loop preheader edge. */
4348 static tree
4349 vect_build_loop_niters (loop_vec_info loop_vinfo)
4351 tree ni_name, stmt, var;
4352 edge pe;
4353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4354 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4356 var = create_tmp_var (TREE_TYPE (ni), "niters");
4357 add_referenced_var (var);
4358 ni_name = force_gimple_operand (ni, &stmt, false, var);
4360 pe = loop_preheader_edge (loop);
4361 if (stmt)
4363 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4364 gcc_assert (!new_bb);
4367 return ni_name;
4371 /* This function generates the following statements:
4373 ni_name = number of iterations loop executes
4374 ratio = ni_name / vf
4375 ratio_mult_vf_name = ratio * vf
4377 and places them at the loop preheader edge. */
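/* For example, assuming ni_name = 103 and vf = 4 (so log2(vf) = 2):
ratio = 103 >> 2 = 25 and ratio_mult_vf_name = 25 << 2 = 100; the
vectorized loop then executes 25 iterations covering the first 100 scalar
iterations, leaving 3 for the epilog loop. */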
4379 static void
4380 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4381 tree *ni_name_ptr,
4382 tree *ratio_mult_vf_name_ptr,
4383 tree *ratio_name_ptr)
4386 edge pe;
4387 basic_block new_bb;
4388 tree stmt, ni_name;
4389 tree var;
4390 tree ratio_name;
4391 tree ratio_mult_vf_name;
4392 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4393 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4394 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4395 tree log_vf;
4397 pe = loop_preheader_edge (loop);
4399 /* Generate a temporary variable that contains the
4400 number of iterations the loop executes. */
4402 ni_name = vect_build_loop_niters (loop_vinfo);
4403 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4405 /* Create: ratio = ni >> log2(vf) */
4407 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4408 if (!is_gimple_val (ratio_name))
4410 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4411 add_referenced_var (var);
4413 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4414 pe = loop_preheader_edge (loop);
4415 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4416 gcc_assert (!new_bb);
4419 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4421 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4422 ratio_name, log_vf);
4423 if (!is_gimple_val (ratio_mult_vf_name))
4425 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4426 add_referenced_var (var);
4428 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4429 true, var);
4430 pe = loop_preheader_edge (loop);
4431 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4432 gcc_assert (!new_bb);
4435 *ni_name_ptr = ni_name;
4436 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4437 *ratio_name_ptr = ratio_name;
4439 return;
4443 /* Function update_vuses_to_preheader.
4445 Input:
4446 STMT - a statement with potential VUSEs.
4447 LOOP - the loop whose preheader will contain STMT.
4449 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4450 appears to be defined in a VDEF in another statement in a loop.
4451 One such case is when the VUSE is at the dereference of a __restricted__
4452 pointer in a load and the VDEF is at the dereference of a different
4453 __restricted__ pointer in a store. Vectorization may result in
4454 copy_virtual_operands being called to copy the problematic VUSE to a new
4455 statement that is being inserted in the loop preheader. This procedure
4456 is called to change the SSA_NAME in the new statement's VUSE from the
4457 SSA_NAME updated in the loop to the related SSA_NAME available on the
4458 path entering the loop.
4460 When this function is called, we have the following situation:
4462 # vuse <name1>
4463 S1: vload
4464 do {
4465 # name1 = phi < name0 , name2>
4467 # vuse <name1>
4468 S2: vload
4470 # name2 = vdef <name1>
4471 S3: vstore
4473 }while...
4475 Stmt S1 was created in the loop preheader block as part of misaligned-load
4476 handling. This function fixes the name of the vuse of S1 from 'name1' to
4477 'name0'. */
4479 static void
4480 update_vuses_to_preheader (tree stmt, struct loop *loop)
4482 basic_block header_bb = loop->header;
4483 edge preheader_e = loop_preheader_edge (loop);
4484 ssa_op_iter iter;
4485 use_operand_p use_p;
4487 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4489 tree ssa_name = USE_FROM_PTR (use_p);
4490 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4491 tree name_var = SSA_NAME_VAR (ssa_name);
4492 basic_block bb = bb_for_stmt (def_stmt);
4494 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4495 if (!IS_EMPTY_STMT (def_stmt)
4496 && flow_bb_inside_loop_p (loop, bb))
4498 /* If the block containing the statement defining the SSA_NAME
4499 is in the loop then it's necessary to find the definition
4500 outside the loop using the PHI nodes of the header. */
4501 tree phi;
4502 bool updated = false;
4504 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4506 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4508 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4509 updated = true;
4510 break;
4513 gcc_assert (updated);
4519 /* Function vect_update_ivs_after_vectorizer.
4521 "Advance" the induction variables of LOOP to the value they should take
4522 after the execution of LOOP. This is currently necessary because the
4523 vectorizer does not handle induction variables that are used after the
4524 loop. Such a situation occurs when the last iterations of LOOP are
4525 peeled, because:
4526 1. We introduced new uses after LOOP for IVs that were not originally used
4527 after LOOP: the IVs of LOOP are now used by an epilog loop.
4528 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4529 times, whereas the loop IVs should be bumped N times.
4531 Input:
4532 - LOOP - a loop that is going to be vectorized. The last few iterations
4533 of LOOP were peeled.
4534 - NITERS - the number of iterations that LOOP executes (before it is
4535 vectorized). i.e, the number of times the ivs should be bumped.
4536 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4537 coming out from LOOP on which there are uses of the LOOP ivs
4538 (this is the path from LOOP->exit to epilog_loop->preheader).
4540 The new definitions of the ivs are placed in LOOP->exit.
4541 The phi args associated with the edge UPDATE_E in the bb
4542 UPDATE_E->dest are updated accordingly.
4544 Assumption 1: Like the rest of the vectorizer, this function assumes
4545 a single loop exit that has a single predecessor.
4547 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4548 organized in the same order.
4550 Assumption 3: The access function of the ivs is simple enough (see
4551 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4553 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4554 coming out of LOOP on which the ivs of LOOP are used (this is the path
4555 that leads to the epilog loop; other paths skip the epilog loop). This
4556 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4557 needs to have its phis updated. */
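/* For example, for an IV with access function {0, +, 4} (init 0, step 4)
and NITERS = n: the code inserted in the exit bb computes ni = 0 + n * 4,
the value the IV would have reached after the n scalar iterations, and the
corresponding phi argument in update_bb is redirected to it. */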
4560 static void
4561 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4562 edge update_e)
4564 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4565 basic_block exit_bb = single_exit (loop)->dest;
4566 tree phi, phi1;
4567 basic_block update_bb = update_e->dest;
4569 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4571 /* Make sure there exists a single-predecessor exit bb: */
4572 gcc_assert (single_pred_p (exit_bb));
4574 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4575 phi && phi1;
4576 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4578 tree access_fn = NULL;
4579 tree evolution_part;
4580 tree init_expr;
4581 tree step_expr;
4582 tree var, stmt, ni, ni_name;
4583 block_stmt_iterator last_bsi;
4585 if (vect_print_dump_info (REPORT_DETAILS))
4587 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4588 print_generic_expr (vect_dump, phi, TDF_SLIM);
4591 /* Skip virtual phis. */
4592 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4594 if (vect_print_dump_info (REPORT_DETAILS))
4595 fprintf (vect_dump, "virtual phi. skip.");
4596 continue;
4599 /* Skip reduction phis. */
4600 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4602 if (vect_print_dump_info (REPORT_DETAILS))
4603 fprintf (vect_dump, "reduc phi. skip.");
4604 continue;
4607 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4608 gcc_assert (access_fn);
4609 evolution_part =
4610 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4611 gcc_assert (evolution_part != NULL_TREE);
4613 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4614 of degree >= 2 or exponential. */
4615 gcc_assert (!tree_is_chrec (evolution_part));
4617 step_expr = evolution_part;
4618 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4619 loop->num));
4621 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4622 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4623 fold_convert (TREE_TYPE (init_expr),
4624 niters),
4625 step_expr),
4626 init_expr);
4628 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4629 add_referenced_var (var);
4631 ni_name = force_gimple_operand (ni, &stmt, false, var);
4633 /* Insert stmt into exit_bb. */
4634 last_bsi = bsi_last (exit_bb);
4635 if (stmt)
4636 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4638 /* Fix phi expressions in the successor bb. */
4639 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4644 /* Function vect_do_peeling_for_loop_bound
4646 Peel the last iterations of the loop represented by LOOP_VINFO.
4647 The peeled iterations form a new epilog loop. Given that the loop now
4648 iterates NITERS times, the new epilog loop iterates
4649 NITERS % VECTORIZATION_FACTOR times.
4651 The original loop will later be made to iterate
4652 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
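/* For example, if the loop originally iterates NITERS = 38 times and
VECTORIZATION_FACTOR = 4, the vectorized loop executes 38 / 4 = 9
iterations (36 scalar iterations) and the epilog loop executes the
remaining 38 % 4 = 2. */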
4654 static void
4655 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4657 tree ni_name, ratio_mult_vf_name;
4658 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4659 struct loop *new_loop;
4660 edge update_e;
4661 basic_block preheader;
4662 int loop_num;
4663 unsigned int th;
4665 if (vect_print_dump_info (REPORT_DETAILS))
4666 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4668 initialize_original_copy_tables ();
4670 /* Generate the following variables on the preheader of the original loop:
4672 ni_name = number of iterations the original loop executes
4673 ratio = ni_name / vf
4674 ratio_mult_vf_name = ratio * vf */
4675 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4676 &ratio_mult_vf_name, ratio);
4678 loop_num = loop->num;
4679 /* Threshold for vectorized loop. */
4680 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4681 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4682 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4683 ratio_mult_vf_name, ni_name, false, th);
4684 gcc_assert (new_loop);
4685 gcc_assert (loop_num == loop->num);
4686 #ifdef ENABLE_CHECKING
4687 slpeel_verify_cfg_after_peeling (loop, new_loop);
4688 #endif
4690 /* A guard that controls whether the new_loop is to be executed or skipped
4691 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4692 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4693 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4694 is on the path where the LOOP IVs are used and need to be updated. */
4696 preheader = loop_preheader_edge (new_loop)->src;
4697 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4698 update_e = EDGE_PRED (preheader, 0);
4699 else
4700 update_e = EDGE_PRED (preheader, 1);
4702 /* Update IVs of original loop as if they were advanced
4703 by ratio_mult_vf_name steps. */
4704 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4706 /* After peeling we have to reset scalar evolution analyzer. */
4707 scev_reset ();
4709 free_original_copy_tables ();
4713 /* Function vect_gen_niters_for_prolog_loop
4715 Set the number of iterations for the loop represented by LOOP_VINFO
4716 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4717 and the misalignment of DR - the data reference recorded in
4718 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4719 this loop, the data reference DR will refer to an aligned location.
4721 The following computation is generated:
4723 If the misalignment of DR is known at compile time:
4724 addr_mis = int mis = DR_MISALIGNMENT (dr);
4725 Else, compute address misalignment in bytes:
4726 addr_mis = addr & (vectype_size - 1)
4728 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4730 (elem_size = element type size; an element is the scalar element
4731 whose type is the inner type of the vectype)
4733 For interleaving,
4735 prolog_niters = min ( LOOP_NITERS ,
4736 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4737 where group_size is the size of the interleaved group. */
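/* For example, assuming VF = 4, 4-byte elements, no interleaving
(group_size = 1), and a data reference misaligned by 8 bytes:
addr_mis/elem_size = 2, so prolog_niters = (4 - 2) & 3 = 2; after peeling
two scalar iterations the access has advanced 8 bytes and reaches a
vector-aligned address. */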
4740 static tree
4741 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4743 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4744 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4745 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4746 tree var, stmt;
4747 tree iters, iters_name;
4748 edge pe;
4749 basic_block new_bb;
4750 tree dr_stmt = DR_STMT (dr);
4751 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4752 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4753 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4754 tree niters_type = TREE_TYPE (loop_niters);
4755 int group_size = 1;
4756 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4758 if (DR_GROUP_FIRST_DR (stmt_info))
4760 /* For interleaved access, the element size must be multiplied by the size of
4761 the interleaved group. */
4762 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4763 DR_GROUP_FIRST_DR (stmt_info)));
4764 element_size *= group_size;
4767 pe = loop_preheader_edge (loop);
4769 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4771 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4772 int elem_misalign = byte_misalign / element_size;
4774 if (vect_print_dump_info (REPORT_DETAILS))
4775 fprintf (vect_dump, "known misalignment = %d.", byte_misalign);
4776 iters = build_int_cst (niters_type,
4777 (vf - elem_misalign)&(vf/group_size-1));
4779 else
4781 tree new_stmts = NULL_TREE;
4782 tree start_addr =
4783 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4784 tree ptr_type = TREE_TYPE (start_addr);
4785 tree size = TYPE_SIZE (ptr_type);
4786 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4787 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4788 tree elem_size_log =
4789 build_int_cst (type, exact_log2 (vectype_align/vf));
4790 tree vf_minus_1 = build_int_cst (type, vf - 1);
4791 tree vf_tree = build_int_cst (type, vf);
4792 tree byte_misalign;
4793 tree elem_misalign;
4795 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4796 gcc_assert (!new_bb);
4798 /* Create: byte_misalign = addr & (vectype_size - 1) */
4799 byte_misalign =
4800 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4802 /* Create: elem_misalign = byte_misalign / element_size */
4803 elem_misalign =
4804 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4806 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4807 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4808 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4809 iters = fold_convert (niters_type, iters);
4812 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4813 /* If the loop bound is known at compile time we already verified that it is
4814 greater than vf; since the misalignment ('iters') is at most vf, there's
4815 no need to generate the MIN_EXPR in this case. */
4816 if (TREE_CODE (loop_niters) != INTEGER_CST)
4817 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4819 if (vect_print_dump_info (REPORT_DETAILS))
4821 fprintf (vect_dump, "niters for prolog loop: ");
4822 print_generic_expr (vect_dump, iters, TDF_SLIM);
4825 var = create_tmp_var (niters_type, "prolog_loop_niters");
4826 add_referenced_var (var);
4827 iters_name = force_gimple_operand (iters, &stmt, false, var);
4829 /* Insert stmt on loop preheader edge. */
4830 if (stmt)
4832 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4833 gcc_assert (!new_bb);
4836 return iters_name;
4840 /* Function vect_update_init_of_dr
4842 NITERS iterations were peeled from LOOP. DR represents a data reference
4843 in LOOP. This function updates the information recorded in DR to
4844 account for the fact that the first NITERS iterations had already been
4845 executed. Specifically, it updates the OFFSET field of DR. */
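/* For example, for a data reference with DR_STEP 4 (the access advances
4 bytes per scalar iteration) and NITERS peeled iterations, the recorded
offset grows by 4 * NITERS. */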
4847 static void
4848 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4850 tree offset = DR_OFFSET (dr);
4852 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4853 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4854 DR_OFFSET (dr) = offset;
4858 /* Function vect_update_inits_of_drs
4860 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4861 This function updates the information recorded for the data references in
4862 the loop to account for the fact that the first NITERS iterations had
4863 already been executed. Specifically, it updates the initial_condition of the
4864 access_function of all the data_references in the loop. */
4866 static void
4867 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4869 unsigned int i;
4870 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4871 struct data_reference *dr;
4873 if (vect_dump && (dump_flags & TDF_DETAILS))
4874 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4876 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4877 vect_update_init_of_dr (dr, niters);
4881 /* Function vect_do_peeling_for_alignment
4883 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4884 'niters' is set to the misalignment of one of the data references in the
4885 loop, thereby forcing it to refer to an aligned location at the beginning
4886 of the execution of this loop. The data reference for which we are
4887 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4889 static void
4890 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4892 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4893 tree niters_of_prolog_loop, ni_name;
4894 tree n_iters;
4895 struct loop *new_loop;
4897 if (vect_print_dump_info (REPORT_DETAILS))
4898 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4900 initialize_original_copy_tables ();
4902 ni_name = vect_build_loop_niters (loop_vinfo);
4903 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4905 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4906 new_loop =
4907 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4908 niters_of_prolog_loop, ni_name, true, 0);
4909 gcc_assert (new_loop);
4910 #ifdef ENABLE_CHECKING
4911 slpeel_verify_cfg_after_peeling (new_loop, loop);
4912 #endif
4914 /* Update number of times loop executes. */
4915 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4916 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4917 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4919 /* Update the init conditions of the access functions of all data refs. */
4920 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4922 /* After peeling we have to reset scalar evolution analyzer. */
4923 scev_reset ();
4925 free_original_copy_tables ();
4929 /* Function vect_create_cond_for_align_checks.
4931 Create a conditional expression that represents the alignment checks for
4932 all of the data references (array element references) whose alignment must be
4933 checked at runtime.
4935 Input:
4936 LOOP_VINFO - two fields of the loop information are used.
4937 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4938 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4940 Output:
4941 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4942 expression.
4943 The returned value is the conditional expression to be used in the if
4944 statement that controls which version of the loop gets executed at runtime.
4946 The algorithm makes two assumptions:
4947 1) The number of bytes "n" in a vector is a power of 2.
4948 2) An address "a" is aligned if a%n is zero, and this
4949 test can be done as a&(n-1) == 0. For example, for 16
4950 byte vectors the test is a&0xf == 0. */
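/* For example, with two potentially misaligned addresses a and b and
16-byte vectors (LOOP_VINFO_PTR_MASK == 0xf), the generated condition is
((a | b) & 0xf) == 0: ORing the addresses accumulates the low bits of each
one, so the AND is zero only if both addresses are 16-byte aligned. */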
4952 static tree
4953 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4954 tree *cond_expr_stmt_list)
4956 VEC(tree,heap) *may_misalign_stmts
4957 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4958 tree ref_stmt;
4959 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4960 tree mask_cst;
4961 unsigned int i;
4962 tree psize;
4963 tree int_ptrsize_type;
4964 char tmp_name[20];
4965 tree or_tmp_name = NULL_TREE;
4966 tree and_tmp, and_tmp_name, and_stmt;
4967 tree ptrsize_zero;
4969 /* Check that mask is one less than a power of 2, i.e., mask is
4970 all zeros followed by all ones. */
4971 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4973 /* CHECKME: what is the best integer or unsigned type to use to hold a
4974 cast from a pointer value? */
4975 psize = TYPE_SIZE (ptr_type_node);
4976 int_ptrsize_type
4977 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4979 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
4980 of the first vector of the i'th data reference. */
4982 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4984 tree new_stmt_list = NULL_TREE;
4985 tree addr_base;
4986 tree addr_tmp, addr_tmp_name, addr_stmt;
4987 tree or_tmp, new_or_tmp_name, or_stmt;
4989 /* create: addr_tmp = (int)(address_of_first_vector) */
4990 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4991 &new_stmt_list,
4992 NULL_TREE);
4994 if (new_stmt_list != NULL_TREE)
4995 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4997 sprintf (tmp_name, "%s%d", "addr2int", i);
4998 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4999 add_referenced_var (addr_tmp);
5000 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
5001 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
5002 addr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5003 addr_tmp_name, addr_stmt);
5004 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
5005 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
5007 /* The addresses are ORed together. */
5009 if (or_tmp_name != NULL_TREE)
5011 /* create: or_tmp = or_tmp | addr_tmp */
5012 sprintf (tmp_name, "%s%d", "orptrs", i);
5013 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5014 add_referenced_var (or_tmp);
5015 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5016 or_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5017 new_or_tmp_name,
5018 build2 (BIT_IOR_EXPR, int_ptrsize_type,
5019 or_tmp_name,
5020 addr_tmp_name));
5021 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5022 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5023 or_tmp_name = new_or_tmp_name;
5025 else
5026 or_tmp_name = addr_tmp_name;
5028 } /* end for i */
5030 mask_cst = build_int_cst (int_ptrsize_type, mask);
5032 /* create: and_tmp = or_tmp & mask */
5033 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
5034 add_referenced_var (and_tmp);
5035 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5037 and_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5038 and_tmp_name,
5039 build2 (BIT_AND_EXPR, int_ptrsize_type,
5040 or_tmp_name, mask_cst));
5041 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5042 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5044 /* Make and_tmp the left operand of the conditional test against zero.
5045 If and_tmp has a nonzero bit, then some address is unaligned. */
5046 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5047 return build2 (EQ_EXPR, boolean_type_node,
5048 and_tmp_name, ptrsize_zero);
5052 /* Function vect_transform_loop.
5054 The analysis phase has determined that the loop is vectorizable.
5055 Vectorize the loop - create vectorized stmts to replace the scalar
5056 stmts in the loop, and update the loop exit condition. */
5058 void
5059 vect_transform_loop (loop_vec_info loop_vinfo)
5061 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5063 int nbbs = loop->num_nodes;
5064 block_stmt_iterator si;
5065 int i;
5066 tree ratio = NULL;
5067 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5068 bool strided_store;
5070 if (vect_print_dump_info (REPORT_DETAILS))
5071 fprintf (vect_dump, "=== vect_transform_loop ===");
5073 /* If the loop has data references that may or may not be aligned then
5074 two versions of the loop need to be generated, one which is vectorized
5075 and one which isn't. A test is then generated to control which of the
5076 loops is executed. The test checks for the alignment of all of the
5077 data references that may or may not be aligned. */
5079 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5081 struct loop *nloop;
5082 tree cond_expr;
5083 tree cond_expr_stmt_list = NULL_TREE;
5084 basic_block condition_bb;
5085 block_stmt_iterator cond_exp_bsi;
5086 basic_block merge_bb;
5087 basic_block new_exit_bb;
5088 edge new_exit_e, e;
5089 tree orig_phi, new_phi, arg;
5090 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
5092 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5093 &cond_expr_stmt_list);
5094 initialize_original_copy_tables ();
5095 nloop = loop_version (loop, cond_expr, &condition_bb,
5096 prob, prob, REG_BR_PROB_BASE - prob, true);
5097 free_original_copy_tables ();
5099 /** Loop versioning violates an assumption we try to maintain during
5100 vectorization - that the loop exit block has a single predecessor.
5101 After versioning, the exit block of both loop versions is the same
5102 basic block (i.e. it has two predecessors). Just in order to simplify
5103 following transformations in the vectorizer, we fix this situation
5104 here by adding a new (empty) block on the exit-edge of the loop,
5105 with the proper loop-exit phis to maintain loop-closed-form. **/
5107 merge_bb = single_exit (loop)->dest;
5108 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5109 new_exit_bb = split_edge (single_exit (loop));
5110 new_exit_e = single_exit (loop);
5111 e = EDGE_SUCC (new_exit_bb, 0);
5113 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5114 orig_phi = PHI_CHAIN (orig_phi))
5116 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5117 new_exit_bb);
5118 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5119 add_phi_arg (new_phi, arg, new_exit_e);
5120 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));

      /** end loop-exit-fixes after versioning  **/

      update_ssa (TODO_update_ssa);
      cond_exp_bsi = bsi_last (condition_bb);
      bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
    }

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Peeling can force the alignment of at most one such data ref.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);
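
  /* Illustration (not part of the original source): with 4-byte elements,
     16-byte vectors (VF = 4), and a chosen data ref that turns out at
     runtime to start one element past a 16-byte boundary, the prolog loop
     generated by the peeling executes 3 scalar iterations so that the
     vectorized loop starts at an aligned address.  */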

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that is not a multiple of
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
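
  /* Illustration (not part of the original source): for n = 103 and
     VF = 4, the vectorized loop executes 103/4 = 25 vector iterations,
     covering the first 100 scalar iterations, and the scalar epilog
     executes the remaining 103%4 = 3 iterations.  */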

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                           LOOP_VINFO_INT_NITERS (loop_vinfo)
                           / vectorization_factor);
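
  /* Illustration (not part of the original source): when the iteration
     count is known and divisible, e.g. n = 16 and VF = 4, no epilog is
     needed and RATIO is simply the constant 16/4 = 4; RATIO is used at
     the end of this function by slpeel_make_loop_iterate_ntimes to make
     the vectorized loop iterate RATIO times.  */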

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will
     need to be reconsidered.  */
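
  /* Illustration (not part of the original source): the supported shape is
     a simple counted loop such as

         for (i = 0; i < n; i++)
           a[i] = b[i] + c[i];

     whose body is a single basic block; loops with control flow in the
     body must be flattened (e.g. by if-conversion) before this point, or
     they are rejected during analysis.  */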

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = bsi_start (bb); !bsi_end_p (si);)
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info;
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_generic_expr (vect_dump, stmt, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);
          gcc_assert (stmt_info);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              bsi_next (&si);
              continue;
            }

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store);
          if (is_store)
            {
              stmt_ann_t ann;
              if (DR_GROUP_FIRST_DR (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization
                     of the interleaving chain was completed - free all the
                     stores in the chain.  */
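                  /* Illustration (not part of the original source): for an
                     interleaved group such as  a[2*i] = x; a[2*i+1] = y;
                     the whole group is vectorized only when the last store
                     in the chain is reached; the earlier stores were
                     removed as skipped stores (see below), and the walk
                     that follows releases the stmt_vec_info of every
                     element of the chain.  */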
                  tree next = DR_GROUP_FIRST_DR (stmt_info);
                  tree tmp;
                  stmt_vec_info next_stmt_info;

                  while (next)
                    {
                      next_stmt_info = vinfo_for_stmt (next);
                      /* Free the attached stmt_vec_info and remove the
                         stmt.  */
                      ann = stmt_ann (next);
                      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
                      free (next_stmt_info);
                      set_stmt_info (ann, NULL);
                      next = tmp;
                    }
                  bsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  ann = stmt_ann (stmt);
                  free (stmt_info);
                  set_stmt_info (ann, NULL);
                  bsi_remove (&si, true);
                  continue;
                }
            }
          else
            {
              if (strided_store)
                {
                  /* This is the case of a skipped interleaved store.  We
                     don't free its stmt_vec_info.  */
                  bsi_remove (&si, true);
                  continue;
                }
            }
          bsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");