/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
  {
  case vect_simple_var:
    prefix = "vect_";
    break;
  case vect_scalar_var:
    prefix = "stmp_";
    break;
  case vect_pointer_var:
    prefix = "vect_p";
    break;
  default:
    gcc_unreachable ();
  }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
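
/* For illustration (names are hypothetical; create_tmp_var may append
   uniquifying suffixes): given a scalar variable "a",

     vect_get_new_vect_var (vectype, vect_simple_var, "a")      -> "vect_a"
     vect_get_new_vect_var (scalar_type, vect_scalar_var, "a")  -> "stmp_a"
     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")    -> "vect_pa"  */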
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access step we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
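
/* Sketch of the emitted sequence (illustrative only; assuming a data-ref
   with DR_BASE_ADDRESS == &a, DR_OFFSET == off, DR_INIT == init, and no
   OFFSET argument), NEW_STMT_LIST receives roughly:

     base_off.0 = off + init;
     vect_pa.1 = (vectype *) (&a + base_off.0);

   and vect_pa.1 is the SSA_NAME returned to the caller.  */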
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */

static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop: **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
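
/* Putting the pieces together (illustrative, for a V4SI data-ref and
   ONLY_INIT == false; variable names are schematic):

     # preheader:
     vp.0 = (v4si *) &a[init];
     # loop:
     vp.1 = PHI <vp.0 (preheader), vp.2 (latch)>
     ... = *vp.1;                   access via the returned pointer
     vp.2 = vp.1 + 16;              16 == TYPE_SIZE_UNIT (v4si)

   vp.1 is the returned pointer, and the "vp.2 = ..." stmt is returned
   in PTR_INCR.  */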
/* Function bump_vector_ptr.

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
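
/* For example (illustrative, V8HI vectype, so vector_size == 16 bytes):
   given DATAREF_PTR == p_1 with increment "p_2 = p_1 + 16", this emits

     p_3 = p_1 + 16;

   and rewires the increment to "p_2 = p_3 + 16", so consecutive calls
   walk the pointer across the vectors accessed within one iteration.  */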
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
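
/* For example (illustrative): initializing a V4SI vector with the
   constant 3 inserts in the loop preheader

     vect_cst_.0 = { 3, 3, 3, 3 };

   and returns the SSA_NAME vect_cst_.0 for use inside the loop.  */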
/* Function get_initial_def_for_induction.

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }

  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_name = init_expr;
  t = NULL_TREE;
  t = tree_cons (NULL_TREE, init_expr, t);
  for (i = 1; i < nunits; i++)
    {
      tree tmp;

      /* Create: new_name = new_name + step_expr  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);
      tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
      init_stmt = build_gimple_modify_stmt (new_var, tmp);
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          tree tmp;

          /* vec_i = vec_prev + vec_{step*nunits}  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
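
/* Worked example (illustrative): for an iv with initial value X == 0 and
   step S == 3, with nunits == 4 and VF == 4 (hence ncopies == 1), the
   code above creates

     # preheader:
     vec_init = { 0, 3, 6, 9 };       [X, X+S, X+2*S, X+3*S]
     vec_step = { 12, 12, 12, 12 };   VF*S in every element
     # loop:
     vec_iv = PHI <vec_init, vec_loop>
     ...
     vec_loop = vec_iv + vec_step;  */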
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */

static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy.

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0: vx.0 = memref0      VS1.1
                        VS1.1: vx.1 = memref1      VS1.2
                        VS1.2: vx.2 = memref2      VS1.3
                        VS1.3: vx.3 = memref3

   S2: z = x + ...      VSnew.0: vz0 = vx.0 + ...  VSnew.1
                        VSnew.1: vz1 = vx.1 + ...  VSnew.2
                        VSnew.2: vz2 = vx.2 + ...  VSnew.3
                        VSnew.3: vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */

static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}


#define ADJUST_IN_EPILOG 1
/* Function get_initial_def_for_reduction.

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */

static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  int nelements;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
  {
  case WIDEN_SUM_EXPR:
  case DOT_PROD_EXPR:
  case PLUS_EXPR:
    if (INTEGRAL_TYPE_P (type))
      def = build_int_cst (type, 0);
    else
      def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
    /* All the 'nunits' elements are set to 0.  The final result will be
       adjusted by 'init_val' at the loop epilog.  */
    nelements = nunits;
    need_epilog_adjust = true;
#else
    /* 'nunits - 1' elements are set to 0; the last element is set to
       'init_val'.  No further adjustments at the epilog are needed.  */
    nelements = nunits - 1;
    need_epilog_adjust = false;
#endif
    break;

  case MIN_EXPR:
  case MAX_EXPR:
    def = init_val;
    nelements = nunits;
    need_epilog_adjust = false;
    break;

  default:
    gcc_unreachable ();
  }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}
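
/* Worked example (illustrative): for the sum reduction

     s = 5;
     for (i = 0; i < n; i++)
       s += a[i];

   with a V4SI vectype, the "ADJUST_IN_EPILOG" scheme returns the vector
   { 0, 0, 0, 0 } and sets *SCALAR_DEF to 5; the epilog code later folds
   the 5 back into the reduced result.  */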
/* Function vect_create_epilog_for_reduction.

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */

static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_OPERAND_LENGTH (operation);
  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }

  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

   ***/

  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
     (i.e. when reduc_code is not available) and in the final adjustment code
     (if needed).  Also get the original scalar reduction variable as
     defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
     represents a reduction pattern), the tree-code and scalar-def are
     taken from the original stmt that the pattern-stmt (STMT) replaces.
     Otherwise (it is a regular reduction) - the tree-code and scalar-def
     are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      tree tmp;

      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
      epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
               {
                 Create:  va' = vec_shift <va, offset>
                 Create:  va = vop <va, va'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
              tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              tmp = build2 (code, vectype, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree tmp;
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              tmp = build2 (code, scalar_type, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.6 Replace uses of s_out0 with uses of s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
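
/* Illustrative run of scheme 2 (whole-vector shifts) for a V4SI sum;
   element placement is schematic and in practice depends on target
   endianness.  Starting from v_out1 = [a,b,c,d]:

     v.1 = vec_shift <v_out1, VS/2>      [c,d,_,_]
     v.2 = v_out1 + v.1                  [a+c, b+d, _, _]
     v.3 = vec_shift <v.2, VS/4>         [b+d, _, _, _]
     v.4 = v.2 + v.3                     [a+b+c+d, _, _, _]
     s_out3 = extract_field <v.4, 0>     the scalar result

   followed, when 'scalar_initial_def' is set, by
   "s_out4 = s_out3 op scalar_initial_def" (step 3).  */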
1444 /* Function vectorizable_reduction.
1446 Check if STMT performs a reduction operation that can be vectorized.
1447 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1448 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1449 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1451 This function also handles reduction idioms (patterns) that have been
1452 recognized in advance during vect_pattern_recog. In this case, STMT may be
1453 of this form:
1454 X = pattern_expr (arg0, arg1, ..., X)
1455 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
1456 sequence that had been detected and replaced by the pattern-stmt (STMT).
1458 In some cases of reduction patterns, the type of the reduction variable X is
1459 different than the type of the other arguments of STMT.
1460 In such cases, the vectype that is used when transforming STMT into a vector
1461 stmt is different than the vectype that is used to determine the
1462 vectorization factor, because it consists of a different number of elements
1463 than the actual number of elements that are being operated upon in parallel.
1465 For example, consider an accumulation of shorts into an int accumulator.
1466 On some targets it's possible to vectorize this pattern operating on 8
1467 shorts at a time (hence, the vectype for purposes of determining the
1468 vectorization factor should be V8HI); on the other hand, the vectype that
1469 is used to create the vector form is actually V4SI (the type of the result).
1471 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1472 indicates what is the actual level of parallelism (V8HI in the example), so
1473 that the right vectorization factor would be derived. This vectype
1474 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1475 be used to create the vectorized stmt. The right vectype for the vectorized
1476 stmt is obtained from the type of the result X:
1477 get_vectype_for_scalar_type (TREE_TYPE (X))
1479 This means that, contrary to "regular" reductions (or "regular" stmts in
1480 general), the following equation:
1481 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1482 does *NOT* necessarily hold for reduction patterns. */
1484 bool
1485 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1487 tree vec_dest;
1488 tree scalar_dest;
1489 tree op;
1490 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1491 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1492 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1493 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1494 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1495 tree operation;
1496 enum tree_code code, orig_code, epilog_reduc_code = 0;
1497 enum machine_mode vec_mode;
1498 int op_type;
1499 optab optab, reduc_optab;
1500 tree new_temp = NULL_TREE;
1501 tree def, def_stmt;
1502 enum vect_def_type dt;
1503 tree new_phi;
1504 tree scalar_type;
1505 bool is_simple_use;
1506 tree orig_stmt;
1507 stmt_vec_info orig_stmt_info;
1508 tree expr = NULL_TREE;
1509 int i;
1510 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1511 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1512 stmt_vec_info prev_stmt_info;
1513 tree reduc_def;
1514 tree new_stmt = NULL_TREE;
1515 int j;
1517 gcc_assert (ncopies >= 1);
1519 /* 1. Is vectorizable reduction? */
1521 /* Not supportable if the reduction variable is used in the loop. */
1522 if (STMT_VINFO_RELEVANT_P (stmt_info))
1523 return false;
1525 if (!STMT_VINFO_LIVE_P (stmt_info))
1526 return false;
1528 /* Make sure it was already recognized as a reduction computation. */
1529 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1530 return false;
1532 /* 2. Has this been recognized as a reduction pattern?
1534 Check if STMT represents a pattern that has been recognized
1535 in earlier analysis stages. For stmts that represent a pattern,
1536 the STMT_VINFO_RELATED_STMT field records the last stmt in
1537 the original sequence that constitutes the pattern. */
1539 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1540 if (orig_stmt)
1542 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1543 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1544 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1545 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1548 /* 3. Check the operands of the operation. The first operands are defined
1549 inside the loop body. The last operand is the reduction variable,
1550 which is defined by the loop-header-phi. */
1552 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
1554 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1555 code = TREE_CODE (operation);
1556 op_type = TREE_OPERAND_LENGTH (operation);
1557 if (op_type != binary_op && op_type != ternary_op)
1558 return false;
1559 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1560 scalar_type = TREE_TYPE (scalar_dest);
1562 /* All uses but the last are expected to be defined in the loop.
1563 The last use is the reduction variable. */
1564 for (i = 0; i < op_type-1; i++)
1566 op = TREE_OPERAND (operation, i);
1567 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1568 gcc_assert (is_simple_use);
1569 if (dt != vect_loop_def
1570 && dt != vect_invariant_def
1571 && dt != vect_constant_def
1572 && dt != vect_induction_def)
1573 return false;
1576 op = TREE_OPERAND (operation, i);
1577 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1578 gcc_assert (is_simple_use);
1579 gcc_assert (dt == vect_reduction_def);
1580 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1581 if (orig_stmt)
1582 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1583 else
1584 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1586 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1587 return false;
1589 /* 4. Supportable by target? */
1591 /* 4.1. check support for the operation in the loop */
1592 optab = optab_for_tree_code (code, vectype);
1593 if (!optab)
1595 if (vect_print_dump_info (REPORT_DETAILS))
1596 fprintf (vect_dump, "no optab.");
1597 return false;
1599 vec_mode = TYPE_MODE (vectype);
1600 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1602 if (vect_print_dump_info (REPORT_DETAILS))
1603 fprintf (vect_dump, "op not supported by target.");
1604 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1605 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1606 < vect_min_worthwhile_factor (code))
1607 return false;
1608 if (vect_print_dump_info (REPORT_DETAILS))
1609 fprintf (vect_dump, "proceeding using word mode.");
1612 /* Worthwhile without SIMD support? */
1613 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1614 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1615 < vect_min_worthwhile_factor (code))
1617 if (vect_print_dump_info (REPORT_DETAILS))
1618 fprintf (vect_dump, "not worthwhile without SIMD support.");
1619 return false;
1622 /* 4.2. Check support for the epilog operation.
1624 If STMT represents a reduction pattern, then the type of the
1625 reduction variable may be different than the type of the rest
1626 of the arguments. For example, consider the case of accumulation
1627 of shorts into an int accumulator; The original code:
1628 S1: int_a = (int) short_a;
1629 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1631 was replaced with:
1632 STMT: int_acc = widen_sum <short_a, int_acc>
1634 This means that:
1635 1. The tree-code that is used to create the vector operation in the
1636 epilog code (that reduces the partial results) is not the
1637 tree-code of STMT, but is rather the tree-code of the original
1638 stmt from the pattern that STMT is replacing. I.e, in the example
1639 above we want to use 'widen_sum' in the loop, but 'plus' in the
1640 epilog.
1641 2. The type (mode) we use to check available target support
1642 for the vector operation to be created in the *epilog*, is
1643 determined by the type of the reduction variable (in the example
1644 above we'd check this: plus_optab[vect_int_mode]).
1645 However the type (mode) we use to check available target support
1646 for the vector operation to be created *inside the loop*, is
1647 determined by the type of the other arguments to STMT (in the
1648 example we'd check this: widen_sum_optab[vect_short_mode]).
1650 This is contrary to "regular" reductions, in which the types of all
1651 the arguments are the same as the type of the reduction variable.
1652 For "regular" reductions we can therefore use the same vector type
1653 (and also the same tree-code) when generating the epilog code and
1654 when generating the code inside the loop. */
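/* Note: in the pattern case handled below, VECTYPE and VEC_MODE are
   recomputed from DEF (the scalar reduction variable found in step 3),
   since, as explained above, the epilog support check must use the
   vector type of the reduction variable. */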
1656 if (orig_stmt)
1658 /* This is a reduction pattern: get the vectype from the type of the
1659 reduction variable, and get the tree-code from orig_stmt. */
1660 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1661 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1662 vec_mode = TYPE_MODE (vectype);
1664 else
1666 /* Regular reduction: the same vectype and tree-code as used for
1667 the vector code inside the loop can also be used for the epilog code. */
1668 orig_code = code;
1671 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1672 return false;
1673 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1674 if (!reduc_optab)
1676 if (vect_print_dump_info (REPORT_DETAILS))
1677 fprintf (vect_dump, "no optab for reduction.");
1678 epilog_reduc_code = NUM_TREE_CODES;
1680 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1682 if (vect_print_dump_info (REPORT_DETAILS))
1683 fprintf (vect_dump, "reduc op not supported by target.");
1684 epilog_reduc_code = NUM_TREE_CODES;
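/* Leaving EPILOG_REDUC_CODE as NUM_TREE_CODES is not a failure; it makes
   vect_create_epilog_for_reduction generate the epilog without a
   reduction instruction (using vector shifts or scalar extraction
   instead). */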
1687 if (!vec_stmt) /* transformation not required. */
1689 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1690 return true;
1693 /** Transform. **/
1695 if (vect_print_dump_info (REPORT_DETAILS))
1696 fprintf (vect_dump, "transform reduction.");
1698 /* Create the destination vector */
1699 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1701 /* Create the reduction-phi that defines the reduction-operand. */
1702 new_phi = create_phi_node (vec_dest, loop->header);
1704 /* In case the vectorization factor (VF) is bigger than the number
1705 of elements that we can fit in a vectype (nunits), we have to generate
1706 more than one vector stmt - i.e., we need to "unroll" the
1707 vector stmt by a factor VF/nunits. For more details see documentation
1708 in vectorizable_operation. */
1710 prev_stmt_info = NULL;
1711 for (j = 0; j < ncopies; j++)
1713 /* Handle uses. */
1714 if (j == 0)
1716 op = TREE_OPERAND (operation, 0);
1717 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1718 if (op_type == ternary_op)
1720 op = TREE_OPERAND (operation, 1);
1721 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1724 /* Get the vector def for the reduction variable from the phi node */
1725 reduc_def = PHI_RESULT (new_phi);
1727 else
1729 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1730 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1731 if (op_type == ternary_op)
1732 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1734 /* Get the vector def for the reduction variable from the vectorized
1735 reduction operation generated in the previous iteration (j-1) */
1736 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
1739 /* Arguments are ready. Create the new vector stmt. */
1741 if (op_type == binary_op)
1742 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1743 else
1744 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1745 reduc_def);
1746 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
1747 new_temp = make_ssa_name (vec_dest, new_stmt);
1748 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1749 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1751 if (j == 0)
1752 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1753 else
1754 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1755 prev_stmt_info = vinfo_for_stmt (new_stmt);
1758 /* Finalize the reduction-phi (set its arguments) and create the
1759 epilog reduction code. */
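/* Conceptually, for a sum-reduction the epilog reduces the lanes of the
   final vector result into a single scalar and combines it with the
   initial value of the reduction variable; the exact scheme used is
   chosen in vect_create_epilog_for_reduction. */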
1760 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1761 return true;
1764 /* Checks if CALL can be vectorized in type VECTYPE. Returns
1765 a function declaration if the target has a vectorized version
1766 of the function, or NULL_TREE if the function cannot be vectorized. */
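/* For example, for a loop computing sinf (x[i]), the hook may return the
   decl of a builtin that computes several sines at once on vector
   operands; whether such a builtin exists is entirely target-specific. */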
1768 tree
1769 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
1771 tree fndecl = get_callee_fndecl (call);
1772 enum built_in_function code;
1774 /* We only handle functions that do not read or clobber memory -- i.e.
1775 const or novops ones. */
1776 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1777 return NULL_TREE;
1779 if (!fndecl
1780 || TREE_CODE (fndecl) != FUNCTION_DECL
1781 || !DECL_BUILT_IN (fndecl))
1782 return NULL_TREE;
1784 code = DECL_FUNCTION_CODE (fndecl);
1785 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
1786 vectype_in);
1789 /* Function vectorizable_call.
1791 Check if STMT performs a function call that can be vectorized.
1792 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1793 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1794 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1796 bool
1797 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1799 tree vec_dest;
1800 tree scalar_dest;
1801 tree operation;
1802 tree op, type;
1803 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1804 tree vectype_out, vectype_in;
1805 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1806 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1807 enum vect_def_type dt[2];
1808 int ncopies, j, nargs;
1809 call_expr_arg_iterator iter;
1811 /* Is STMT a vectorizable call? */
1812 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1813 return false;
1815 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1816 return false;
1818 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1819 if (TREE_CODE (operation) != CALL_EXPR)
1820 return false;
1822 /* Process function arguments. */
1823 rhs_type = NULL_TREE;
1824 nargs = 0;
1825 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1827 ++nargs;
1829 /* Bail out if the function has more than two arguments; we do
1830 not have interesting builtin functions to vectorize with
1831 more than two arguments. */
1832 if (nargs > 2)
1833 return false;
1835 /* We can only handle calls with arguments of the same type. */
1836 if (rhs_type
1837 && rhs_type != TREE_TYPE (op))
1839 if (vect_print_dump_info (REPORT_DETAILS))
1840 fprintf (vect_dump, "argument types differ.");
1841 return false;
1843 rhs_type = TREE_TYPE (op);
1845 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
1847 if (vect_print_dump_info (REPORT_DETAILS))
1848 fprintf (vect_dump, "use not simple.");
1849 return false;
1853 /* No arguments is also not good. */
1854 if (nargs == 0)
1855 return false;
1857 vectype_in = get_vectype_for_scalar_type (rhs_type);
1859 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1860 vectype_out = get_vectype_for_scalar_type (lhs_type);
1862 /* Only handle the case of vectors with the same number of elements.
1863 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1864 instruction which converts V2DFmode to V4SImode but only
1865 using the lower half of the V4SImode result. */
1866 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1867 return false;
1869 /* For now, we only vectorize functions if a target specific builtin
1870 is available. TODO -- in some cases, it might be profitable to
1871 insert the calls for pieces of the vector, in order to be able
1872 to vectorize other operations in the loop. */
1873 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1874 if (fndecl == NULL_TREE)
1876 if (vect_print_dump_info (REPORT_DETAILS))
1877 fprintf (vect_dump, "function is not vectorizable.");
1879 return false;
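/* The ECF_CONST/ECF_NOVOPS check above guarantees that the call has no
   virtual operands. */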
1882 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1884 if (!vec_stmt) /* transformation not required. */
1886 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1887 return true;
1890 /** Transform. **/
1892 if (vect_print_dump_info (REPORT_DETAILS))
1893 fprintf (vect_dump, "transform operation.");
1895 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1896 / TYPE_VECTOR_SUBPARTS (vectype_out));
1897 gcc_assert (ncopies >= 1);
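/* E.g. (hypothetical numbers), with a vectorization factor of 8 and
   4-element result vectors, two copies of the vectorized call are
   generated and chained via STMT_VINFO_RELATED_STMT, as described in
   vectorizable_operation. */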
1899 /* Handle def. */
1900 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1901 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1903 prev_stmt_info = NULL;
1904 for (j = 0; j < ncopies; ++j)
1906 tree new_stmt, vargs;
1907 tree vec_oprnd[2];
1908 int n;
1910 /* Build argument list for the vectorized call. */
1911 /* FIXME: Rewrite this so that it doesn't construct a temporary
1912 list. */
1913 vargs = NULL_TREE;
1914 n = -1;
1915 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1917 ++n;
1919 if (j == 0)
1920 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1921 else
1922 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1924 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1926 vargs = nreverse (vargs);
1928 rhs = build_function_call_expr (fndecl, vargs);
1929 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
1930 new_temp = make_ssa_name (vec_dest, new_stmt);
1931 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1933 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1935 if (j == 0)
1936 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1937 else
1938 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1939 prev_stmt_info = vinfo_for_stmt (new_stmt);
1942 /* The call in STMT might prevent it from being removed in dce. We however
1943 cannot remove it here, due to the way the ssa name it defines is mapped
1944 to the new definition. So just replace rhs of the statement with something
1945 harmless. */
1946 type = TREE_TYPE (scalar_dest);
1947 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1949 return true;
1953 /* Function vectorizable_conversion.
1955 Check if STMT performs a conversion operation that can be vectorized.
1956 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1957 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1958 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1960 bool
1961 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
1962 tree *vec_stmt)
1964 tree vec_dest;
1965 tree scalar_dest;
1966 tree operation;
1967 tree op0;
1968 tree vec_oprnd0 = NULL_TREE;
1969 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1970 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1971 enum tree_code code;
1972 tree new_temp;
1973 tree def, def_stmt;
1974 enum vect_def_type dt0;
1975 tree new_stmt;
1976 int nunits_in;
1977 int nunits_out;
1978 int ncopies, j;
1979 tree vectype_out, vectype_in;
1980 tree rhs_type, lhs_type;
1981 tree builtin_decl;
1982 stmt_vec_info prev_stmt_info;
1984 /* Is STMT a vectorizable conversion? */
1986 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1987 return false;
1989 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1991 if (STMT_VINFO_LIVE_P (stmt_info))
1993 /* FORNOW: not yet supported. */
1994 if (vect_print_dump_info (REPORT_DETAILS))
1995 fprintf (vect_dump, "value used after loop.");
1996 return false;
1999 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2000 return false;
2002 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2003 return false;
2005 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2006 code = TREE_CODE (operation);
2007 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2008 return false;
2010 /* Check types of lhs and rhs */
2011 op0 = TREE_OPERAND (operation, 0);
2012 rhs_type = TREE_TYPE (op0);
2013 vectype_in = get_vectype_for_scalar_type (rhs_type);
2014 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2016 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2017 lhs_type = TREE_TYPE (scalar_dest);
2018 vectype_out = get_vectype_for_scalar_type (lhs_type);
2019 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2020 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2022 /* FORNOW: need to extend to support short<->float conversions as well. */
2023 if (nunits_out != nunits_in)
2024 return false;
2026 /* Bail out if the types are both integral or both non-integral. */
2027 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2028 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2029 return false;
2031 /* Sanity check: make sure that at least one copy of the vectorized stmt
2032 needs to be generated. */
2033 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2034 gcc_assert (ncopies >= 1);
2036 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2038 if (vect_print_dump_info (REPORT_DETAILS))
2039 fprintf (vect_dump, "use not simple.");
2040 return false;
2043 /* Supportable by target? */
2044 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2046 if (vect_print_dump_info (REPORT_DETAILS))
2047 fprintf (vect_dump, "op not supported by target.");
2048 return false;
2051 if (!vec_stmt) /* transformation not required. */
2053 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2054 return true;
2057 /** Transform. **/
2059 if (vect_print_dump_info (REPORT_DETAILS))
2060 fprintf (vect_dump, "transform conversion.");
2062 /* Handle def. */
2063 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2065 prev_stmt_info = NULL;
2066 for (j = 0; j < ncopies; j++)
2068 tree sym;
2069 ssa_op_iter iter;
2071 if (j == 0)
2072 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2073 else
2074 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2076 builtin_decl =
2077 targetm.vectorize.builtin_conversion (code, vectype_in);
2078 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2080 /* Arguments are ready. Create the new vector stmt. */
2081 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
2082 new_temp = make_ssa_name (vec_dest, new_stmt);
2083 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2084 vect_finish_stmt_generation (stmt, new_stmt, bsi);
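/* Mark the symbols of any virtual operands of the new call for
   renaming, so that the SSA form is updated to reflect them. */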
2085 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2087 if (TREE_CODE (sym) == SSA_NAME)
2088 sym = SSA_NAME_VAR (sym);
2089 mark_sym_for_renaming (sym);
2092 if (j == 0)
2093 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2094 else
2095 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2096 prev_stmt_info = vinfo_for_stmt (new_stmt);
2098 return true;
2102 /* Function vectorizable_assignment.
2104 Check if STMT performs an assignment (copy) that can be vectorized.
2105 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2106 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2107 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2109 bool
2110 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2112 tree vec_dest;
2113 tree scalar_dest;
2114 tree op;
2115 tree vec_oprnd;
2116 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2117 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2118 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2119 tree new_temp;
2120 tree def, def_stmt;
2121 enum vect_def_type dt;
2122 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2123 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2125 gcc_assert (ncopies >= 1);
2126 if (ncopies > 1)
2127 return false; /* FORNOW */
2129 /* Is vectorizable assignment? */
2130 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2131 return false;
2133 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2135 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2136 return false;
2138 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2139 if (TREE_CODE (scalar_dest) != SSA_NAME)
2140 return false;
2142 op = GIMPLE_STMT_OPERAND (stmt, 1);
2143 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2145 if (vect_print_dump_info (REPORT_DETAILS))
2146 fprintf (vect_dump, "use not simple.");
2147 return false;
2150 if (!vec_stmt) /* transformation not required. */
2152 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2153 return true;
2156 /** Transform. **/
2157 if (vect_print_dump_info (REPORT_DETAILS))
2158 fprintf (vect_dump, "transform assignment.");
2160 /* Handle def. */
2161 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2163 /* Handle use. */
2164 op = GIMPLE_STMT_OPERAND (stmt, 1);
2165 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2167 /* Arguments are ready. Create the new vector stmt. */
2168 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
2169 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2170 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2171 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2173 return true;
2177 /* Function vect_min_worthwhile_factor.
2179 For a loop where we could vectorize the operation indicated by CODE,
2180 return the minimum vectorization factor that makes it worthwhile
2181 to use generic vectors. */
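/* The idea: synthesizing a vector operation out of scalar word-mode
   operations has overhead, so it only pays off when enough elements are
   combined per operation; the thresholds below are heuristic. */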
2182 static int
2183 vect_min_worthwhile_factor (enum tree_code code)
2185 switch (code)
2187 case PLUS_EXPR:
2188 case MINUS_EXPR:
2189 case NEGATE_EXPR:
2190 return 4;
2192 case BIT_AND_EXPR:
2193 case BIT_IOR_EXPR:
2194 case BIT_XOR_EXPR:
2195 case BIT_NOT_EXPR:
2196 return 2;
2198 default:
2199 return INT_MAX;
2204 /* Function vectorizable_operation.
2206 Check if STMT performs a binary or unary operation that can be vectorized.
2207 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2208 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2209 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2211 bool
2212 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2214 tree vec_dest;
2215 tree scalar_dest;
2216 tree operation;
2217 tree op0, op1 = NULL;
2218 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2219 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2221 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2222 enum tree_code code;
2223 enum machine_mode vec_mode;
2224 tree new_temp;
2225 int op_type;
2226 optab optab;
2227 int icode;
2228 enum machine_mode optab_op2_mode;
2229 tree def, def_stmt;
2230 enum vect_def_type dt0, dt1;
2231 tree new_stmt;
2232 stmt_vec_info prev_stmt_info;
2233 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2234 int nunits_out;
2235 tree vectype_out;
2236 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2237 int j;
2239 gcc_assert (ncopies >= 1);
2241 /* Is STMT a vectorizable binary/unary operation? */
2242 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2243 return false;
2245 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2247 if (STMT_VINFO_LIVE_P (stmt_info))
2249 /* FORNOW: not yet supported. */
2250 if (vect_print_dump_info (REPORT_DETAILS))
2251 fprintf (vect_dump, "value used after loop.");
2252 return false;
2255 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2256 return false;
2258 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2259 return false;
2261 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2262 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2263 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2264 if (nunits_out != nunits_in)
2265 return false;
2267 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2268 code = TREE_CODE (operation);
2269 optab = optab_for_tree_code (code, vectype);
2271 /* Support only unary or binary operations. */
2272 op_type = TREE_OPERAND_LENGTH (operation);
2273 if (op_type != unary_op && op_type != binary_op)
2275 if (vect_print_dump_info (REPORT_DETAILS))
2276 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2277 return false;
2280 op0 = TREE_OPERAND (operation, 0);
2281 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2283 if (vect_print_dump_info (REPORT_DETAILS))
2284 fprintf (vect_dump, "use not simple.");
2285 return false;
2288 if (op_type == binary_op)
2290 op1 = TREE_OPERAND (operation, 1);
2291 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2293 if (vect_print_dump_info (REPORT_DETAILS))
2294 fprintf (vect_dump, "use not simple.");
2295 return false;
2299 /* Supportable by target? */
2300 if (!optab)
2302 if (vect_print_dump_info (REPORT_DETAILS))
2303 fprintf (vect_dump, "no optab.");
2304 return false;
2306 vec_mode = TYPE_MODE (vectype);
2307 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2308 if (icode == CODE_FOR_nothing)
2310 if (vect_print_dump_info (REPORT_DETAILS))
2311 fprintf (vect_dump, "op not supported by target.");
2312 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2313 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2314 < vect_min_worthwhile_factor (code))
2315 return false;
2316 if (vect_print_dump_info (REPORT_DETAILS))
2317 fprintf (vect_dump, "proceeding using word mode.");
2320 /* Worthwhile without SIMD support? */
2321 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2322 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2323 < vect_min_worthwhile_factor (code))
2325 if (vect_print_dump_info (REPORT_DETAILS))
2326 fprintf (vect_dump, "not worthwhile without SIMD support.");
2327 return false;
2330 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2332 /* FORNOW: not yet supported. */
2333 if (!VECTOR_MODE_P (vec_mode))
2334 return false;
2336 /* Invariant argument is needed for a vector shift
2337 by a scalar shift operand. */
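/* E.g., 'a[i] << 3' fits such a vector-by-scalar shift pattern, whereas
   'a[i] << b[i]' requires operand 2 of the insn to be a vector. */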
2338 optab_op2_mode = insn_data[icode].operand[2].mode;
2339 if (! (VECTOR_MODE_P (optab_op2_mode)
2340 || dt1 == vect_constant_def
2341 || dt1 == vect_invariant_def))
2343 if (vect_print_dump_info (REPORT_DETAILS))
2344 fprintf (vect_dump, "operand mode requires invariant argument.");
2345 return false;
2349 if (!vec_stmt) /* transformation not required. */
2351 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2352 return true;
2355 /** Transform. **/
2357 if (vect_print_dump_info (REPORT_DETAILS))
2358 fprintf (vect_dump, "transform binary/unary operation.");
2360 /* Handle def. */
2361 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2363 /* In case the vectorization factor (VF) is bigger than the number
2364 of elements that we can fit in a vectype (nunits), we have to generate
2365 more than one vector stmt - i.e., we need to "unroll" the
2366 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2367 from one copy of the vector stmt to the next, in the field
2368 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2369 stages to find the correct vector defs to be used when vectorizing
2370 stmts that use the defs of the current stmt. The example below illustrates
2371 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
2372 4 vectorized stmts):
2374 before vectorization:
2375 RELATED_STMT VEC_STMT
2376 S1: x = memref - -
2377 S2: z = x + 1 - -
2379 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2380 there):
2381 RELATED_STMT VEC_STMT
2382 VS1_0: vx0 = memref0 VS1_1 -
2383 VS1_1: vx1 = memref1 VS1_2 -
2384 VS1_2: vx2 = memref2 VS1_3 -
2385 VS1_3: vx3 = memref3 - -
2386 S1: x = load - VS1_0
2387 S2: z = x + 1 - -
2389 step2: vectorize stmt S2 (done here):
2390 To vectorize stmt S2 we first need to find the relevant vector
2391 def for the first operand 'x'. This is, as usual, obtained from
2392 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2393 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2394 relevant vector def 'vx0'. Having found 'vx0' we can generate
2395 the vector stmt VS2_0, and as usual, record it in the
2396 STMT_VINFO_VEC_STMT of stmt S2.
2397 When creating the second copy (VS2_1), we obtain the relevant vector
2398 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2399 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2400 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2401 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2402 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2403 chain of stmts and pointers:
2404 RELATED_STMT VEC_STMT
2405 VS1_0: vx0 = memref0 VS1_1 -
2406 VS1_1: vx1 = memref1 VS1_2 -
2407 VS1_2: vx2 = memref2 VS1_3 -
2408 VS1_3: vx3 = memref3 - -
2409 S1: x = load - VS1_0
2410 VS2_0: vz0 = vx0 + v1 VS2_1 -
2411 VS2_1: vz1 = vx1 + v1 VS2_2 -
2412 VS2_2: vz2 = vx2 + v1 VS2_3 -
2413 VS2_3: vz3 = vx3 + v1 - -
2414 S2: z = x + 1 - VS2_0 */
2416 prev_stmt_info = NULL;
2417 for (j = 0; j < ncopies; j++)
2419 /* Handle uses. */
2420 if (j == 0)
2422 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2423 if (op_type == binary_op)
2425 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2427 /* Vector shl and shr insn patterns can be defined with
2428 scalar operand 2 (shift operand). In this case, use
2429 constant or loop invariant op1 directly, without
2430 extending it to vector mode first. */
2431 optab_op2_mode = insn_data[icode].operand[2].mode;
2432 if (!VECTOR_MODE_P (optab_op2_mode))
2434 if (vect_print_dump_info (REPORT_DETAILS))
2435 fprintf (vect_dump, "operand 1 using scalar mode.");
2436 vec_oprnd1 = op1;
2439 if (!vec_oprnd1)
2440 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2443 else
2445 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2446 if (op_type == binary_op)
2447 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2450 /* Arguments are ready. Create the new vector stmt. */
2452 if (op_type == binary_op)
2453 new_stmt = build_gimple_modify_stmt (vec_dest,
2454 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2455 else
2456 new_stmt = build_gimple_modify_stmt (vec_dest,
2457 build1 (code, vectype, vec_oprnd0));
2458 new_temp = make_ssa_name (vec_dest, new_stmt);
2459 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2460 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2462 if (j == 0)
2463 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2464 else
2465 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2466 prev_stmt_info = vinfo_for_stmt (new_stmt);
2469 return true;
2473 /* Function vectorizable_type_demotion
2475 Check if STMT performs a binary or unary operation that involves
2476 type demotion, and if it can be vectorized.
2477 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2478 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2479 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2481 bool
2482 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2483 tree *vec_stmt)
2485 tree vec_dest;
2486 tree scalar_dest;
2487 tree operation;
2488 tree op0;
2489 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2491 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2492 enum tree_code code;
2493 tree new_temp;
2494 tree def, def_stmt;
2495 enum vect_def_type dt0;
2496 tree new_stmt;
2497 stmt_vec_info prev_stmt_info;
2498 int nunits_in;
2499 int nunits_out;
2500 tree vectype_out;
2501 int ncopies;
2502 int j;
2503 tree expr;
2504 tree vectype_in;
2505 tree scalar_type;
2506 optab optab;
2507 enum machine_mode vec_mode;
2509 /* Is STMT a vectorizable type-demotion operation? */
2511 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2512 return false;
2514 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2516 if (STMT_VINFO_LIVE_P (stmt_info))
2518 /* FORNOW: not yet supported. */
2519 if (vect_print_dump_info (REPORT_DETAILS))
2520 fprintf (vect_dump, "value used after loop.");
2521 return false;
2524 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2525 return false;
2527 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2528 return false;
2530 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2531 code = TREE_CODE (operation);
2532 if (code != NOP_EXPR && code != CONVERT_EXPR)
2533 return false;
2535 op0 = TREE_OPERAND (operation, 0);
2536 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2537 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2539 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2540 scalar_type = TREE_TYPE (scalar_dest);
2541 vectype_out = get_vectype_for_scalar_type (scalar_type);
2542 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2543 if (nunits_in != nunits_out / 2) /* FORNOW */
2544 return false;
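/* E.g. (hypothetical modes), demoting two V4SImode vectors packs into
   one V8HImode vector, which is why each copy below consumes two input
   vector defs per generated vector stmt. */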
2546 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2547 gcc_assert (ncopies >= 1);
2549 if (! INTEGRAL_TYPE_P (scalar_type)
2550 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2551 return false;
2553 /* Check the operands of the operation. */
2554 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2556 if (vect_print_dump_info (REPORT_DETAILS))
2557 fprintf (vect_dump, "use not simple.");
2558 return false;
2561 /* Supportable by target? */
2562 code = VEC_PACK_MOD_EXPR;
2563 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2564 if (!optab)
2565 return false;
2567 vec_mode = TYPE_MODE (vectype_in);
2568 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2569 return false;
2571 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2573 if (!vec_stmt) /* transformation not required. */
2575 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2576 return true;
2579 /** Transform. **/
2581 if (vect_print_dump_info (REPORT_DETAILS))
2582 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2583 ncopies);
2585 /* Handle def. */
2586 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2588 /* In case the vectorization factor (VF) is bigger than the number
2589 of elements that we can fit in a vectype (nunits), we have to generate
2590 more than one vector stmt - i.e., we need to "unroll" the
2591 vector stmt by a factor VF/nunits. */
2592 prev_stmt_info = NULL;
2593 for (j = 0; j < ncopies; j++)
2595 /* Handle uses. */
2596 if (j == 0)
2598 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2599 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2601 else
2603 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2604 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2607 /* Arguments are ready. Create the new vector stmt. */
2608 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2609 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2610 new_temp = make_ssa_name (vec_dest, new_stmt);
2611 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2612 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2614 if (j == 0)
2615 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2616 else
2617 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2619 prev_stmt_info = vinfo_for_stmt (new_stmt);
2622 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2623 return true;
2627 /* Function vect_gen_widened_results_half
2629 Create a vector stmt whose code, type, number of arguments, and result
2630 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2631 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2632 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2633 needs to be created (DECL is a function-decl of a target-builtin).
2634 STMT is the original scalar stmt that we are vectorizing. */
2636 static tree
2637 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2638 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2639 tree vec_dest, block_stmt_iterator *bsi,
2640 tree stmt)
2642 tree expr;
2643 tree new_stmt;
2644 tree new_temp;
2645 tree sym;
2646 ssa_op_iter iter;
2648 /* Generate half of the widened result: */
2649 if (code == CALL_EXPR)
2651 /* Target specific support */
2652 if (op_type == binary_op)
2653 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2654 else
2655 expr = build_call_expr (decl, 1, vec_oprnd0);
2657 else
2659 /* Generic support */
2660 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2661 if (op_type == binary_op)
2662 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2663 else
2664 expr = build1 (code, vectype, vec_oprnd0);
2666 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2667 new_temp = make_ssa_name (vec_dest, new_stmt);
2668 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2669 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2671 if (code == CALL_EXPR)
2673 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2675 if (TREE_CODE (sym) == SSA_NAME)
2676 sym = SSA_NAME_VAR (sym);
2677 mark_sym_for_renaming (sym);
2681 return new_stmt;
2685 /* Function vectorizable_type_promotion
2687 Check if STMT performs a binary or unary operation that involves
2688 type promotion, and if it can be vectorized.
2689 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2690 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2691 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2693 bool
2694 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2695 tree *vec_stmt)
2697 tree vec_dest;
2698 tree scalar_dest;
2699 tree operation;
2700 tree op0, op1 = NULL;
2701 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2702 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2703 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2704 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2705 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2706 int op_type;
2707 tree def, def_stmt;
2708 enum vect_def_type dt0, dt1;
2709 tree new_stmt;
2710 stmt_vec_info prev_stmt_info;
2711 int nunits_in;
2712 int nunits_out;
2713 tree vectype_out;
2714 int ncopies;
2715 int j;
2716 tree vectype_in;
2718 /* Is STMT a vectorizable type-promotion operation? */
2720 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2721 return false;
2723 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2725 if (STMT_VINFO_LIVE_P (stmt_info))
2727 /* FORNOW: not yet supported. */
2728 if (vect_print_dump_info (REPORT_DETAILS))
2729 fprintf (vect_dump, "value used after loop.");
2730 return false;
2733 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2734 return false;
2736 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2737 return false;
2739 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2740 code = TREE_CODE (operation);
2741 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2742 return false;
2744 op0 = TREE_OPERAND (operation, 0);
2745 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2746 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2747 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2748 gcc_assert (ncopies >= 1);
2750 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2751 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2752 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2753 if (nunits_out != nunits_in / 2) /* FORNOW */
2754 return false;
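/* E.g. (hypothetical modes), promoting one V8HImode vector yields two
   V4SImode vectors, which is why each copy below generates two halves
   via vect_gen_widened_results_half. */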
2756 if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2757 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2758 return false;
2760 /* Check the operands of the operation. */
2761 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2763 if (vect_print_dump_info (REPORT_DETAILS))
2764 fprintf (vect_dump, "use not simple.");
2765 return false;
2768 op_type = TREE_CODE_LENGTH (code);
2769 if (op_type == binary_op)
2771 op1 = TREE_OPERAND (operation, 1);
2772 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2774 if (vect_print_dump_info (REPORT_DETAILS))
2775 fprintf (vect_dump, "use not simple.");
2776 return false;
2780 /* Supportable by target? */
2781 if (!supportable_widening_operation (code, stmt, vectype_in,
2782 &decl1, &decl2, &code1, &code2))
2783 return false;
2785 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2787 if (!vec_stmt) /* transformation not required. */
2789 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2790 return true;
2793 /** Transform. **/
2795 if (vect_print_dump_info (REPORT_DETAILS))
2796 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2797 ncopies);
2799 /* Handle def. */
2800 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2802 /* In case the vectorization factor (VF) is bigger than the number
2803 of elements that we can fit in a vectype (nunits), we have to generate
2804 more than one vector stmt - i.e., we need to "unroll" the
2805 vector stmt by a factor VF/nunits. */
2807 prev_stmt_info = NULL;
2808 for (j = 0; j < ncopies; j++)
2810 /* Handle uses. */
2811 if (j == 0)
2813 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2814 if (op_type == binary_op)
2815 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2817 else
2819 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2820 if (op_type == binary_op)
2821 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2824 /* Arguments are ready. Create the new vector stmt. We are creating
2825 two vector defs because the widened result does not fit in one vector.
2826 The vectorized stmt can be expressed as a call to a target builtin,
2827 or using a tree-code. */
2828 /* Generate first half of the widened result: */
2829 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2830 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2831 if (j == 0)
2832 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2833 else
2834 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2835 prev_stmt_info = vinfo_for_stmt (new_stmt);
2837 /* Generate second half of the widened result: */
2838 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2839 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2840 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2841 prev_stmt_info = vinfo_for_stmt (new_stmt);
2845 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2846 return true;
2850 /* Function vect_strided_store_supported.
2852 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2853 and FALSE otherwise. */
2855 static bool
2856 vect_strided_store_supported (tree vectype)
2858 optab interleave_high_optab, interleave_low_optab;
2859 int mode;
2861 mode = (int) TYPE_MODE (vectype);
2863 /* Check that the operation is supported. */
2864 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2865 vectype);
2866 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2867 vectype);
2868 if (!interleave_high_optab || !interleave_low_optab)
2870 if (vect_print_dump_info (REPORT_DETAILS))
2871 fprintf (vect_dump, "no optab for interleave.");
2872 return false;
2875 if (interleave_high_optab->handlers[(int) mode].insn_code
2876 == CODE_FOR_nothing
2877 || interleave_low_optab->handlers[(int) mode].insn_code
2878 == CODE_FOR_nothing)
2880 if (vect_print_dump_info (REPORT_DETAILS))
2881 fprintf (vect_dump, "interleave op not supported by target.");
2882 return false;
2884 return true;
2888 /* Function vect_permute_store_chain.
2890 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2891 a power of 2, generate interleave_high/low stmts to reorder the data
2892 correctly for the stores. Return the final references for stores in
2893 RESULT_CHAIN.
2895 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2896 The input is 4 vectors each containing 8 elements. We assign a number to
2897 each element; the input sequence is:
2899 1st vec: 0 1 2 3 4 5 6 7
2900 2nd vec: 8 9 10 11 12 13 14 15
2901 3rd vec: 16 17 18 19 20 21 22 23
2902 4th vec: 24 25 26 27 28 29 30 31
2904 The output sequence should be:
2906 1st vec: 0 8 16 24 1 9 17 25
2907 2nd vec: 2 10 18 26 3 11 19 27
2908 3rd vec: 4 12 20 28 5 13 21 29
2909 4th vec: 6 14 22 30 7 15 23 31
2911 i.e., we interleave the contents of the four vectors in their order.
2913 We use interleave_high/low instructions to create such output. The input of
2914 each interleave_high/low operation is two vectors:
2915 1st vec: 0 1 2 3
2916 2nd vec: 4 5 6 7
2917 the even elements of the result vector are obtained left-to-right from the
2918 high/low elements of the first vector. The odd elements of the result are
2919 obtained left-to-right from the high/low elements of the second vector.
2920 The output of interleave_high will be: 0 4 1 5
2921 and of interleave_low: 2 6 3 7
2924 The permutation is done in log2 (LENGTH) stages. In each stage
2925 interleave_high and interleave_low stmts are created for each pair of
2926 vectors in DR_CHAIN, where the first argument is taken from the first
2927 half of DR_CHAIN and the second argument from its second half.
2928 In our example,
2930 I1: interleave_high (1st vec, 3rd vec)
2931 I2: interleave_low (1st vec, 3rd vec)
2932 I3: interleave_high (2nd vec, 4th vec)
2933 I4: interleave_low (2nd vec, 4th vec)
2935 The output for the first stage is:
2937 I1: 0 16 1 17 2 18 3 19
2938 I2: 4 20 5 21 6 22 7 23
2939 I3: 8 24 9 25 10 26 11 27
2940 I4: 12 28 13 29 14 30 15 31
2942 The output of the second stage, i.e. the final result is:
2944 I1: 0 8 16 24 1 9 17 25
2945 I2: 2 10 18 26 3 11 19 27
2946 I3: 4 12 20 28 5 13 21 29
2947 I4: 6 14 22 30 7 15 23 31. */
2949 static bool
2950 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2951 unsigned int length,
2952 tree stmt,
2953 block_stmt_iterator *bsi,
2954 VEC(tree,heap) **result_chain)
2956 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2957 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2958 tree scalar_dest, tmp;
2959 int i;
2960 unsigned int j;
2961 VEC(tree,heap) *first, *second;
2963 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2964 first = VEC_alloc (tree, heap, length/2);
2965 second = VEC_alloc (tree, heap, length/2);
2967 /* Check that the operation is supported. */
2968 if (!vect_strided_store_supported (vectype))
2969 return false;
2971 *result_chain = VEC_copy (tree, heap, dr_chain);
2973 for (i = 0; i < exact_log2 (length); i++)
2975 for (j = 0; j < length/2; j++)
2977 vect1 = VEC_index (tree, dr_chain, j);
2978 vect2 = VEC_index (tree, dr_chain, j+length/2);
2980 /* Create interleaving stmt:
2981 in the case of big endian:
2982 high = interleave_high (vect1, vect2)
2983 and in the case of little endian:
2984 high = interleave_low (vect1, vect2). */
2985 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2986 DECL_GIMPLE_REG_P (perm_dest) = 1;
2987 add_referenced_var (perm_dest);
2988 if (BYTES_BIG_ENDIAN)
2989 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
2990 else
2991 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
2992 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
2993 high = make_ssa_name (perm_dest, perm_stmt);
2994 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
2995 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2996 VEC_replace (tree, *result_chain, 2*j, high);
2998 /* Create interleaving stmt:
2999 in the case of big endian:
3000 low = interleave_low (vect1, vect2)
3001 and in the case of little endian:
3002 low = interleave_high (vect1, vect2). */
3003 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3004 DECL_GIMPLE_REG_P (perm_dest) = 1;
3005 add_referenced_var (perm_dest);
3006 if (BYTES_BIG_ENDIAN)
3007 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3008 else
3009 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3010 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3011 low = make_ssa_name (perm_dest, perm_stmt);
3012 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3013 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3014 VEC_replace (tree, *result_chain, 2*j+1, low);
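/* The result of this stage becomes the input of the next stage. */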
3016 dr_chain = VEC_copy (tree, heap, *result_chain);
3018 return true;
3022 /* Function vectorizable_store.
3024 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3025 can be vectorized.
3026 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3027 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3028 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3030 bool
3031 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3033 tree scalar_dest;
3034 tree data_ref;
3035 tree op;
3036 tree vec_oprnd = NULL_TREE;
3037 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3038 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3039 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3040 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3041 enum machine_mode vec_mode;
3042 tree dummy;
3043 enum dr_alignment_support alignment_support_scheme;
3044 ssa_op_iter iter;
3045 def_operand_p def_p;
3046 tree def, def_stmt;
3047 enum vect_def_type dt;
3048 stmt_vec_info prev_stmt_info = NULL;
3049 tree dataref_ptr = NULL_TREE;
3050 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3051 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3052 int j;
3053 tree next_stmt, first_stmt;
3054 bool strided_store = false;
3055 unsigned int group_size, i;
3056 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3057 gcc_assert (ncopies >= 1);
3059 /* Is vectorizable store? */
3061 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3062 return false;
3064 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3065 if (TREE_CODE (scalar_dest) != ARRAY_REF
3066 && TREE_CODE (scalar_dest) != INDIRECT_REF
3067 && !DR_GROUP_FIRST_DR (stmt_info))
3068 return false;
3070 op = GIMPLE_STMT_OPERAND (stmt, 1);
3071 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3073 if (vect_print_dump_info (REPORT_DETAILS))
3074 fprintf (vect_dump, "use not simple.");
3075 return false;
3078 vec_mode = TYPE_MODE (vectype);
3079 /* FORNOW. In some cases we can vectorize even if the data-type is not
3080 supported (e.g., array initialization with 0). */
3081 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3082 return false;
3084 if (!STMT_VINFO_DATA_REF (stmt_info))
3085 return false;
3087 if (DR_GROUP_FIRST_DR (stmt_info))
3089 strided_store = true;
3090 if (!vect_strided_store_supported (vectype))
3091 return false;
3094 if (!vec_stmt) /* transformation not required. */
3096 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3097 return true;
3100 /** Transform. **/
3102 if (vect_print_dump_info (REPORT_DETAILS))
3103 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
3105 if (strided_store)
3107 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3108 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3109 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3111 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3113 /* We vectorize all the stmts of the interleaving group when we
3114 reach the last stmt in the group. */
3115 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3116 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3118 *vec_stmt = NULL_TREE;
3119 return true;
3122 else
3124 first_stmt = stmt;
3125 first_dr = dr;
3126 group_size = 1;
3129 dr_chain = VEC_alloc (tree, heap, group_size);
3130 oprnds = VEC_alloc (tree, heap, group_size);
3132 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3133 gcc_assert (alignment_support_scheme);
3134 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
3136 /* In case the vectorization factor (VF) is bigger than the number
3137 of elements that we can fit in a vectype (nunits), we have to generate
3138 more than one vector stmt - i.e., we need to "unroll" the
3139 vector stmt by a factor VF/nunits. For more details see documentation in
3140 vect_get_vec_def_for_copy_stmt. */
3142 /* In case of interleaving (non-unit strided access):
3144 S1: &base + 2 = x2
3145 S2: &base = x0
3146 S3: &base + 1 = x1
3147 S4: &base + 3 = x3
3149 We create vectorized stores starting from the base address (the access
3150 of the first stmt in the chain, S2 in the above example), when the last
3151 store stmt of the chain (S4) is reached:
3153 VS1: &base = vx2
3154 VS2: &base + vec_size*1 = vx0
3155 VS3: &base + vec_size*2 = vx1
3156 VS4: &base + vec_size*3 = vx3
3158 Then permutation statements are generated:
3160 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3161 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3164 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3165 (the order of the data-refs in the output of vect_permute_store_chain
3166 corresponds to the order of scalar stmts in the interleaving chain - see
3167 the documentation of vect_permute_store_chain()).
3169 In case of both multiple types and interleaving, the above vector stores
3170 and permutation stmts are created for every copy. The result vector stmts
3171 are put in STMT_VINFO_VEC_STMT for the first copy and in the
3172 corresponding STMT_VINFO_RELATED_STMT for the next copies. */
3175 prev_stmt_info = NULL;
3176 for (j = 0; j < ncopies; j++)
3178 tree new_stmt;
3179 tree ptr_incr;
3181 if (j == 0)
3183 /* For interleaved stores we collect vectorized defs for all the
3184 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3185 as an input to vect_permute_store_chain(), and OPRNDS as an input
3186 to vect_get_vec_def_for_stmt_copy() for the next copy.
3187 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3188 OPRNDS are of size 1. */
3189 next_stmt = first_stmt;
3190 for (i = 0; i < group_size; i++)
3192 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3193 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3194 can't be NULL_TREE. In case that there is no interleaving,
3195 GROUP_SIZE is 1, and only one iteration of the loop will be
3196 executed. */
3197 gcc_assert (next_stmt);
3198 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3199 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3200 VEC_quick_push (tree, dr_chain, vec_oprnd);
3201 VEC_quick_push (tree, oprnds, vec_oprnd);
3202 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3204 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3205 &dummy, &ptr_incr, false,
3206 TREE_TYPE (vec_oprnd));
3208 else
3210 /* For interleaved stores we created vectorized defs for all the
3211 defs stored in OPRNDS in the previous iteration (previous copy).
3212 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3213 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3214 next copy.
3215 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3216 OPRNDS are of size 1. */
3217 for (i = 0; i < group_size; i++)
3219 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3220 VEC_index (tree, oprnds, i));
3221 VEC_replace (tree, dr_chain, i, vec_oprnd);
3222 VEC_replace (tree, oprnds, i, vec_oprnd);
3224 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3227 if (strided_store)
3229 result_chain = VEC_alloc (tree, heap, group_size);
3230 /* Permute. */
3231 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3232 &result_chain))
3233 return false;
3236 next_stmt = first_stmt;
3237 for (i = 0; i < group_size; i++)
3239 /* For strided stores vectorized defs are interleaved in
3240 vect_permute_store_chain(). */
3241 if (strided_store)
3242 vec_oprnd = VEC_index (tree, result_chain, i);
3244 data_ref = build_fold_indirect_ref (dataref_ptr);
3245 /* Arguments are ready. Create the new vector stmt. */
3246 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
3247 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3249 /* Set the VDEFs for the vector pointer. If this virtual def
3250 has a use outside the loop and a loop peel is performed
3251 then the def may be renamed by the peel. Mark it for
3252 renaming so the later use will also be renamed. */
3253 copy_virtual_operands (new_stmt, next_stmt);
3254 if (j == 0)
3256 /* The original store is deleted so the same SSA_NAMEs
3257 can be used. */
3258 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3260 SSA_NAME_DEF_STMT (def) = new_stmt;
3261 mark_sym_for_renaming (SSA_NAME_VAR (def));
3264 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3266 else
3268 /* Create new names for all the definitions created by COPY and
3269 add replacement mappings for each new name. */
3270 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3272 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3273 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3276 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3279 prev_stmt_info = vinfo_for_stmt (new_stmt);
3280 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3281 if (!next_stmt)
3282 break;
3283 /* Bump the vector pointer. */
3284 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3288 return true;
3292 /* Function vect_setup_realignment
3294 This function is called when vectorizing an unaligned load using
3295 the dr_unaligned_software_pipeline scheme.
3296 This function generates the following code at the loop prolog:
3298 p = initial_addr;
3299 msq_init = *(floor(p)); # prolog load
3300 realignment_token = call target_builtin;
3301 loop:
3302 msq = phi (msq_init, ---)
3304 The code above sets up a new (vector) pointer, pointing to the first
3305 location accessed by STMT, and a "floor-aligned" load using that pointer.
3306 It also generates code to compute the "realignment-token" (if the relevant
3307 target hook was defined), and creates a phi-node at the loop-header bb
3308 whose arguments are the result of the prolog-load (created by this
3309 function) and the result of a load that takes place in the loop (to be
3310 created by the caller to this function).
3311 The caller to this function uses the phi-result (msq) to create the
3312 realignment code inside the loop, and sets up the missing phi argument,
3313 as follows:
3315 loop:
3316 msq = phi (msq_init, lsq)
3317 lsq = *(floor(p')); # load in loop
3318 result = realign_load (msq, lsq, realignment_token);
3320 Input:
3321 STMT - (scalar) load stmt to be vectorized. This load accesses
3322 a memory location that may be unaligned.
3323 BSI - place where new code is to be inserted.
3325 Output:
3326 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3327 target hook, if defined.
3328 Return value - the result of the loop-header phi node. */
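/* The net effect is a software pipeline: each iteration issues one
   aligned load (lsq) and combines it, via realign_load, with the
   aligned load of the previous iteration, carried around in msq. */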
3330 static tree
3331 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3332 tree *realignment_token)
3334 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3335 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3336 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3337 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3338 edge pe = loop_preheader_edge (loop);
3339 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3340 tree vec_dest;
3341 tree init_addr;
3342 tree inc;
3343 tree ptr;
3344 tree data_ref;
3345 tree new_stmt;
3346 basic_block new_bb;
3347 tree msq_init;
3348 tree new_temp;
3349 tree phi_stmt;
3350 tree msq;
3352 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3353 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3354 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3355 NULL_TREE);
3356 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3357 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3358 new_temp = make_ssa_name (vec_dest, new_stmt);
3359 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3360 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3361 gcc_assert (!new_bb);
3362 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3363 copy_virtual_operands (new_stmt, stmt);
3364 update_vuses_to_preheader (new_stmt, loop);
3366 /* 2. Create permutation mask, if required, in loop preheader. */
3367 if (targetm.vectorize.builtin_mask_for_load)
3369 tree builtin_decl;
3371 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3372 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
3373 vec_dest = vect_create_destination_var (scalar_dest,
3374 TREE_TYPE (new_stmt));
3375 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3376 new_temp = make_ssa_name (vec_dest, new_stmt);
3377 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3378 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3379 gcc_assert (!new_bb);
3380 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3382 /* The result of the CALL_EXPR to this builtin is determined from
3383 the value of the parameter, and no global variables are touched,
3384 which makes the builtin a "const" function. Requiring the
3385 builtin to have the "const" attribute makes it unnecessary
3386 to call mark_call_clobbered. */
3387 gcc_assert (TREE_READONLY (builtin_decl));
3390 /* 3. Create msq = phi <msq_init, lsq> in loop */
3391 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3392 msq = make_ssa_name (vec_dest, NULL_TREE);
3393 phi_stmt = create_phi_node (msq, loop->header);
3394 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3395 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3397 return msq;
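
/* Editor's illustrative sketch (not part of GCC): a scalar emulation of
   the realignment scheme set up above, for 16-byte "vectors" of 4 ints.
   MSQ plays the role of the loop-header phi created by
   vect_setup_realignment; each iteration combines the previous aligned
   load (msq) with the next one (lsq), mimicking REALIGN_LOAD.  All names
   are hypothetical; kept under #if 0 so it does not affect the build
   (assumes <stdint.h> and <string.h> if compiled standalone).  */
#if 0
static void
example_realign_loads (const int *p /* possibly unaligned */, int n,
                       int out[][4])
{
  const int *aligned = (const int *) ((uintptr_t) p & ~(uintptr_t) 15);
  int shift = (int) (p - aligned);   /* element misalignment, 0..3 */
  int msq[4], lsq[4], i, j;

  memcpy (msq, aligned, sizeof msq);                    /* prolog load */
  for (i = 0; i < n; i++)
    {
      /* Load in loop; like the vectorized code, this reads the floor-
         aligned vector, so it may read a few elements past p[4*n-1].  */
      memcpy (lsq, aligned + 4 * (i + 1), sizeof lsq);
      for (j = 0; j < 4; j++)                           /* realign_load */
        out[i][j] = shift + j < 4 ? msq[shift + j] : lsq[shift + j - 4];
      memcpy (msq, lsq, sizeof msq);                    /* msq = lsq */
    }
}
#endif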
3401 /* Function vect_strided_load_supported.
3403 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3404 and FALSE otherwise. */
3406 static bool
3407 vect_strided_load_supported (tree vectype)
3409 optab perm_even_optab, perm_odd_optab;
3410 int mode;
3412 mode = (int) TYPE_MODE (vectype);
3414 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3415 if (!perm_even_optab)
3417 if (vect_print_dump_info (REPORT_DETAILS))
3418 fprintf (vect_dump, "no optab for perm_even.");
3419 return false;
3422 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3424 if (vect_print_dump_info (REPORT_DETAILS))
3425 fprintf (vect_dump, "perm_even op not supported by target.");
3426 return false;
3429 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3430 if (!perm_odd_optab)
3432 if (vect_print_dump_info (REPORT_DETAILS))
3433 fprintf (vect_dump, "no optab for perm_odd.");
3434 return false;
3437 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3439 if (vect_print_dump_info (REPORT_DETAILS))
3440 fprintf (vect_dump, "perm_odd op not supported by target.");
3441 return false;
3443 return true;
3447 /* Function vect_permute_load_chain.
3449 Given a chain of interleaved loads in DR_CHAIN of LENGTH, which must be
3450 a power of 2, generate extract_even/odd stmts to reorder the input data
3451 correctly. Return the final references for loads in RESULT_CHAIN.
3453 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3454 The input is 4 vectors, each containing 8 elements. We assign a number to
3455 each element; the input sequence is:
3457 1st vec: 0 1 2 3 4 5 6 7
3458 2nd vec: 8 9 10 11 12 13 14 15
3459 3rd vec: 16 17 18 19 20 21 22 23
3460 4th vec: 24 25 26 27 28 29 30 31
3462 The output sequence should be:
3464 1st vec: 0 4 8 12 16 20 24 28
3465 2nd vec: 1 5 9 13 17 21 25 29
3466 3rd vec: 2 6 10 14 18 22 26 30
3467 4th vec: 3 7 11 15 19 23 27 31
3469 i.e., the first output vector should contain the first elements of each
3470 interleaving group, etc.
3472 We use extract_even/odd instructions to create such output. The input of each
3473 extract_even/odd operation is two vectors
3474 1st vec 2nd vec
3475 0 1 2 3 4 5 6 7
3477 and the output is the vector of extracted even/odd elements. The output of
3478 extract_even will be: 0 2 4 6
3479 and of extract_odd: 1 3 5 7
3482 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
3483 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3484 order. In our example,
3486 E1: extract_even (1st vec, 2nd vec)
3487 E2: extract_odd (1st vec, 2nd vec)
3488 E3: extract_even (3rd vec, 4th vec)
3489 E4: extract_odd (3rd vec, 4th vec)
3491 The output for the first stage will be:
3493 E1: 0 2 4 6 8 10 12 14
3494 E2: 1 3 5 7 9 11 13 15
3495 E3: 16 18 20 22 24 26 28 30
3496 E4: 17 19 21 23 25 27 29 31
3498 In order to proceed and create the correct sequence for the next stage (or
3499 for the correct output, if the second stage is the last one, as in our
3500 example), we first put the output of extract_even operation and then the
3501 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3502 The input for the second stage is:
3504 1st vec (E1): 0 2 4 6 8 10 12 14
3505 2nd vec (E3): 16 18 20 22 24 26 28 30
3506 3rd vec (E2): 1 3 5 7 9 11 13 15
3507 4th vec (E4): 17 19 21 23 25 27 29 31
3509 The output of the second stage:
3511 E1: 0 4 8 12 16 20 24 28
3512 E2: 2 6 10 14 18 22 26 30
3513 E3: 1 5 9 13 17 21 25 29
3514 E4: 3 7 11 15 19 23 27 31
3516 And RESULT_CHAIN after reordering:
3518 1st vec (E1): 0 4 8 12 16 20 24 28
3519 2nd vec (E3): 1 5 9 13 17 21 25 29
3520 3rd vec (E2): 2 6 10 14 18 22 26 30
3521 4th vec (E4): 3 7 11 15 19 23 27 31. */
3523 static bool
3524 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3525 unsigned int length,
3526 tree stmt,
3527 block_stmt_iterator *bsi,
3528 VEC(tree,heap) **result_chain)
3530 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3531 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3532 tree tmp;
3533 int i;
3534 unsigned int j;
3536 /* Check that the operation is supported. */
3537 if (!vect_strided_load_supported (vectype))
3538 return false;
3540 *result_chain = VEC_copy (tree, heap, dr_chain);
3541 for (i = 0; i < exact_log2 (length); i++)
3543 for (j = 0; j < length; j += 2)
3545 first_vect = VEC_index (tree, dr_chain, j);
3546 second_vect = VEC_index (tree, dr_chain, j+1);
3548 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3549 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3550 DECL_GIMPLE_REG_P (perm_dest) = 1;
3551 add_referenced_var (perm_dest);
3553 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3554 first_vect, second_vect);
3555 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3557 data_ref = make_ssa_name (perm_dest, perm_stmt);
3558 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3559 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3560 mark_symbols_for_renaming (perm_stmt);
3562 VEC_replace (tree, *result_chain, j/2, data_ref);
3564 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3565 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3566 DECL_GIMPLE_REG_P (perm_dest) = 1;
3567 add_referenced_var (perm_dest);
3569 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3570 first_vect, second_vect);
3571 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3572 data_ref = make_ssa_name (perm_dest, perm_stmt);
3573 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3574 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3575 mark_symbols_for_renaming (perm_stmt);
3577 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3579 dr_chain = VEC_copy (tree, heap, *result_chain);
3581 return true;
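
/* Editor's illustrative sketch (not part of GCC): a scalar model of the
   permutation performed above, for LENGTH vectors of 8 ints each.  Each
   stage writes all extract_even results to the first half of the chain
   and all extract_odd results to the second half, matching the order in
   which vect_permute_load_chain fills RESULT_CHAIN.  Hypothetical names;
   under #if 0 so it does not affect the build (assumes <string.h>).  */
#if 0
static void
example_permute_load_chain (int chain[][8], int length /* power of 2 */)
{
  int tmp[16][8];               /* scratch; assumes length <= 16 */
  int s, j, k;

  for (s = length; s > 1; s /= 2)       /* log2 (length) stages */
    {
      for (j = 0; j < length; j += 2)
        for (k = 0; k < 8; k++)
          {
            /* extract_even (chain[j], chain[j+1]) */
            tmp[j / 2][k] =
              k < 4 ? chain[j][2 * k] : chain[j + 1][2 * k - 8];
            /* extract_odd (chain[j], chain[j+1]) */
            tmp[j / 2 + length / 2][k] =
              k < 4 ? chain[j][2 * k + 1] : chain[j + 1][2 * k - 7];
          }
      memcpy (chain, tmp, (size_t) length * sizeof chain[0]);
    }
}
#endif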
3585 /* Function vect_transform_strided_load.
3587 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3588 to perform their permutation and ascribe the resulting vectorized statements
3589 to the scalar statements. */
3592 static bool
3593 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3594 block_stmt_iterator *bsi)
3596 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3597 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3598 tree next_stmt, new_stmt;
3599 VEC(tree,heap) *result_chain = NULL;
3600 unsigned int i, gap_count;
3601 tree tmp_data_ref;
3603 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3604 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
3605 vectors that are ready for vector computation. */
3606 result_chain = VEC_alloc (tree, heap, size);
3607 /* Permute. */
3608 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3609 return false;
3611 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3612 Since we scan the chain starting from its first node, their order
3613 corresponds to the order of data-refs in RESULT_CHAIN. */
3614 next_stmt = first_stmt;
3615 gap_count = 1;
3616 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3618 if (!next_stmt)
3619 break;
3621 /* Skip the gaps. Loads created for the gaps will be removed by dead
3622 code elimination pass later.
3623 DR_GROUP_GAP is the number of steps in elements from the previous
3624 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3625 correspond to the gaps. */
3627 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3629 gap_count++;
3630 continue;
3633 while (next_stmt)
3635 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3636 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3637 copies, and we put the new vector statement in the first available
3638 RELATED_STMT. */
3639 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3640 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3641 else
3643 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3644 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3645 vinfo_for_stmt (prev_stmt));
3646 while (rel_stmt)
3648 prev_stmt = rel_stmt;
3649 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3651 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3653 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3654 gap_count = 1;
3655 /* If NEXT_STMT accesses the same DR as the previous statement,
3656 put the same TMP_DATA_REF as its vectorized statement; otherwise
3657 get the next data-ref from RESULT_CHAIN. */
3658 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3659 break;
3662 return true;
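
/* Editor's worked example (not from the original sources): suppose the
   interleaving group is {a, <gap>, b} with group_size 3, so b has
   DR_GROUP_GAP == 2.  The walk above assigns result_chain[0] to a, then
   sees gap_count (1) < DR_GROUP_GAP (b) and skips result_chain[1] (a
   dead load, cleaned up later by DCE), and finally assigns
   result_chain[2] to b.  */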
3666 /* vectorizable_load.
3668 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3669 can be vectorized.
3670 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3671 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3672 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3674 bool
3675 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3677 tree scalar_dest;
3678 tree vec_dest = NULL;
3679 tree data_ref = NULL;
3680 tree op;
3681 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3682 stmt_vec_info prev_stmt_info;
3683 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3684 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3685 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3686 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3687 tree new_temp;
3688 int mode;
3689 tree new_stmt = NULL_TREE;
3690 tree dummy;
3691 enum dr_alignment_support alignment_support_scheme;
3692 tree dataref_ptr = NULL_TREE;
3693 tree ptr_incr;
3694 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3695 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3696 int i, j, group_size;
3697 tree msq = NULL_TREE, lsq;
3698 tree offset = NULL_TREE;
3699 tree realignment_token = NULL_TREE;
3700 tree phi_stmt = NULL_TREE;
3701 VEC(tree,heap) *dr_chain = NULL;
3702 bool strided_load = false;
3703 tree first_stmt;
3705 /* Is vectorizable load? */
3706 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3707 return false;
3709 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3711 if (STMT_VINFO_LIVE_P (stmt_info))
3713 /* FORNOW: not yet supported. */
3714 if (vect_print_dump_info (REPORT_DETAILS))
3715 fprintf (vect_dump, "value used after loop.");
3716 return false;
3719 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3720 return false;
3722 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3723 if (TREE_CODE (scalar_dest) != SSA_NAME)
3724 return false;
3726 op = GIMPLE_STMT_OPERAND (stmt, 1);
3727 if (TREE_CODE (op) != ARRAY_REF
3728 && TREE_CODE (op) != INDIRECT_REF
3729 && !DR_GROUP_FIRST_DR (stmt_info))
3730 return false;
3732 if (!STMT_VINFO_DATA_REF (stmt_info))
3733 return false;
3735 mode = (int) TYPE_MODE (vectype);
3737 /* FORNOW. In some cases we can vectorize even if the data type is not
3738 supported (e.g. - data copies). */
3739 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3741 if (vect_print_dump_info (REPORT_DETAILS))
3742 fprintf (vect_dump, "Aligned load, but unsupported type.");
3743 return false;
3746 /* Check if the load is a part of an interleaving chain. */
3747 if (DR_GROUP_FIRST_DR (stmt_info))
3749 strided_load = true;
3751 /* Check if interleaving is supported. */
3752 if (!vect_strided_load_supported (vectype))
3753 return false;
3756 if (!vec_stmt) /* transformation not required. */
3758 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3759 return true;
3762 /** Transform. **/
3764 if (vect_print_dump_info (REPORT_DETAILS))
3765 fprintf (vect_dump, "transform load.");
3767 if (strided_load)
3769 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3770 /* Check if the chain of loads is already vectorized. */
3771 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3773 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3774 return true;
3776 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3777 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3778 dr_chain = VEC_alloc (tree, heap, group_size);
3780 else
3782 first_stmt = stmt;
3783 first_dr = dr;
3784 group_size = 1;
3787 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3788 gcc_assert (alignment_support_scheme);
3791 /* In case the vectorization factor (VF) is bigger than the number
3792 of elements that we can fit in a vectype (nunits), we have to generate
3793 more than one vector stmt - i.e - we need to "unroll" the
3794 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3795 from one copy of the vector stmt to the next, in the field
3796 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3797 stages to find the correct vector defs to be used when vectorizing
3798 stmts that use the defs of the current stmt. The example below illustrates
3799 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3800 4 vectorized stmts):
3802 before vectorization:
3803 RELATED_STMT VEC_STMT
3804 S1: x = memref - -
3805 S2: z = x + 1 - -
3807 step 1: vectorize stmt S1:
3808 We first create the vector stmt VS1_0, and, as usual, record a
3809 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3810 Next, we create the vector stmt VS1_1, and record a pointer to
3811 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3812 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3813 stmts and pointers:
3814 RELATED_STMT VEC_STMT
3815 VS1_0: vx0 = memref0 VS1_1 -
3816 VS1_1: vx1 = memref1 VS1_2 -
3817 VS1_2: vx2 = memref2 VS1_3 -
3818 VS1_3: vx3 = memref3 - -
3819 S1: x = load - VS1_0
3820 S2: z = x + 1 - -
3822 See the documentation of vect_get_vec_def_for_stmt_copy for how the
3823 information we recorded in the RELATED_STMT field is used to vectorize
3824 stmt S2. */
3826 /* In case of interleaving (non-unit strided access):
3828 S1: x2 = &base + 2
3829 S2: x0 = &base
3830 S3: x1 = &base + 1
3831 S4: x3 = &base + 3
3833 Vectorized loads are created in the order of memory accesses
3834 starting from the access of the first stmt of the chain:
3836 VS1: vx0 = &base
3837 VS2: vx1 = &base + vec_size*1
3838 VS3: vx2 = &base + vec_size*2
3839 VS4: vx3 = &base + vec_size*3
3841 Then permutation statements are generated:
3843 VS5: vx4 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3844 VS6: vx5 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3847 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3848 (the order of the data-refs in the output of vect_permute_load_chain
3849 corresponds to the order of scalar stmts in the interleaving chain - see
3850 the documentation of vect_permute_load_chain()).
3851 The generation of permutation stmts and recording them in
3852 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3854 In case of both multiple types and interleaving, the vector loads and
3855 permutation stmts above are created for every copy. The result vector stmts
3856 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3857 STMT_VINFO_RELATED_STMT for the next copies. */
3859 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3860 on a target that supports unaligned accesses (dr_unaligned_supported)
3861 we generate the following code:
3862 p = initial_addr;
3863 indx = 0;
3864 loop {
3865 p = p + indx * vectype_size;
3866 vec_dest = *(p);
3867 indx = indx + 1;
3870 Otherwise, if the data reference is potentially unaligned on a target that
3871 does not support unaligned accesses (dr_unaligned_software_pipeline),
3872 then generate the following code, in which the data in each iteration is
3873 obtained by two vector loads, one from the previous iteration, and one
3874 from the current iteration:
3875 p1 = initial_addr;
3876 msq_init = *(floor(p1))
3877 p2 = initial_addr + VS - 1;
3878 realignment_token = call target_builtin;
3879 indx = 0;
3880 loop {
3881 p2 = p2 + indx * vectype_size
3882 lsq = *(floor(p2))
3883 vec_dest = realign_load (msq, lsq, realignment_token)
3884 indx = indx + 1;
3885 msq = lsq;
3886 } */
3888 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3890 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3891 phi_stmt = SSA_NAME_DEF_STMT (msq);
3892 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3895 prev_stmt_info = NULL;
3896 for (j = 0; j < ncopies; j++)
3898 /* 1. Create the vector pointer update chain. */
3899 if (j == 0)
3900 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3901 &ptr_incr, false, NULL_TREE);
3902 else
3903 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3905 for (i = 0; i < group_size; i++)
3907 /* 2. Create the vector-load in the loop. */
3908 switch (alignment_support_scheme)
3910 case dr_aligned:
3911 gcc_assert (aligned_access_p (first_dr));
3912 data_ref = build_fold_indirect_ref (dataref_ptr);
3913 break;
3914 case dr_unaligned_supported:
3916 int mis = DR_MISALIGNMENT (first_dr);
3917 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3919 gcc_assert (!aligned_access_p (first_dr));
3920 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
3921 data_ref =
3922 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3923 break;
3925 case dr_unaligned_software_pipeline:
3926 gcc_assert (!aligned_access_p (first_dr));
3927 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3928 break;
3929 default:
3930 gcc_unreachable ();
3932 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3933 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3934 new_temp = make_ssa_name (vec_dest, new_stmt);
3935 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3936 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3937 copy_virtual_operands (new_stmt, stmt);
3938 mark_symbols_for_renaming (new_stmt);
3940 /* 3. Handle explicit realignment if necessary/supported. */
3941 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3943 /* Create in loop:
3944 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3945 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3946 if (!realignment_token)
3947 realignment_token = dataref_ptr;
3948 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3949 new_stmt =
3950 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3951 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3952 new_temp = make_ssa_name (vec_dest, new_stmt);
3953 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3954 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3955 if (i == group_size - 1 && j == ncopies - 1)
3956 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3957 msq = lsq;
3959 if (strided_load)
3960 VEC_quick_push (tree, dr_chain, new_temp);
3961 if (i < group_size - 1)
3962 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3965 if (strided_load)
3967 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3968 return false;
3969 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3970 dr_chain = VEC_alloc (tree, heap, group_size);
3972 else
3974 if (j == 0)
3975 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3976 else
3977 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3978 prev_stmt_info = vinfo_for_stmt (new_stmt);
3982 return true;
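
/* Editor's illustrative sketch (not part of GCC): the shape of the code
   generated above for an aligned, non-strided load with VF = 16 and
   nunits = 4, i.e. ncopies = 4.  vectype is modeled as four ints; the
   inner loop plays the role of the dataref_ptr bumps between copies
   (VS1_0 .. VS1_3 in the comment above).  Hypothetical names; under
   #if 0 (assumes <string.h>).  */
#if 0
typedef struct { int e[4]; } v4si_model;

static void
example_unrolled_vector_loads (const int *p, int vec_iters,
                               v4si_model vx[][4])
{
  int i, j;

  for (i = 0; i < vec_iters; i++)       /* one vectorized iteration */
    for (j = 0; j < 4; j++)             /* ncopies = VF / nunits */
      memcpy (&vx[i][j], p + 16 * i + 4 * j, sizeof (v4si_model));
}
#endif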
3986 /* Function vectorizable_live_operation.
3988 STMT computes a value that is used outside the loop. Check if
3989 it can be supported. */
3991 bool
3992 vectorizable_live_operation (tree stmt,
3993 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3994 tree *vec_stmt ATTRIBUTE_UNUSED)
3996 tree operation;
3997 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3998 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3999 int i;
4000 int op_type;
4001 tree op;
4002 tree def, def_stmt;
4003 enum vect_def_type dt;
4005 if (!STMT_VINFO_LIVE_P (stmt_info))
4006 return false;
4008 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4009 return false;
4011 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4012 return false;
4014 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4015 op_type = TREE_OPERAND_LENGTH (operation);
4017 /* FORNOW: support only if all uses are invariant. This means
4018 that the scalar operations can remain in place, unvectorized.
4019 The original last scalar value that they compute will be used. */
4021 for (i = 0; i < op_type; i++)
4023 op = TREE_OPERAND (operation, i);
4024 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4026 if (vect_print_dump_info (REPORT_DETAILS))
4027 fprintf (vect_dump, "use not simple.");
4028 return false;
4031 if (dt != vect_invariant_def && dt != vect_constant_def)
4032 return false;
4035 /* No transformation is required for the cases we currently support. */
4036 return true;
4040 /* Function vect_is_simple_cond.
4042 Input:
4043 LOOP - the loop that is being vectorized.
4044 COND - Condition that is checked for simple use.
4046 Returns whether a COND can be vectorized. Checks whether
4047 condition operands are supportable using vect_is_simple_use.
4049 static bool
4050 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4052 tree lhs, rhs;
4053 tree def;
4054 enum vect_def_type dt;
4056 if (!COMPARISON_CLASS_P (cond))
4057 return false;
4059 lhs = TREE_OPERAND (cond, 0);
4060 rhs = TREE_OPERAND (cond, 1);
4062 if (TREE_CODE (lhs) == SSA_NAME)
4064 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4065 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4066 return false;
4068 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
4069 return false;
4071 if (TREE_CODE (rhs) == SSA_NAME)
4073 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4074 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4075 return false;
4077 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4078 return false;
4080 return true;
4083 /* vectorizable_condition.
4085 Check if STMT is a conditional modify expression that can be vectorized.
4086 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4087 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
4088 at BSI.
4090 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4092 bool
4093 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4095 tree scalar_dest = NULL_TREE;
4096 tree vec_dest = NULL_TREE;
4097 tree op = NULL_TREE;
4098 tree cond_expr, then_clause, else_clause;
4099 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4100 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4101 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4102 tree vec_compare, vec_cond_expr;
4103 tree new_temp;
4104 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4105 enum machine_mode vec_mode;
4106 tree def;
4107 enum vect_def_type dt;
4108 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4109 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4111 gcc_assert (ncopies >= 1);
4112 if (ncopies > 1)
4113 return false; /* FORNOW */
4115 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4116 return false;
4118 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
4120 if (STMT_VINFO_LIVE_P (stmt_info))
4122 /* FORNOW: not yet supported. */
4123 if (vect_print_dump_info (REPORT_DETAILS))
4124 fprintf (vect_dump, "value used after loop.");
4125 return false;
4128 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4129 return false;
4131 op = GIMPLE_STMT_OPERAND (stmt, 1);
4133 if (TREE_CODE (op) != COND_EXPR)
4134 return false;
4136 cond_expr = TREE_OPERAND (op, 0);
4137 then_clause = TREE_OPERAND (op, 1);
4138 else_clause = TREE_OPERAND (op, 2);
4140 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4141 return false;
4143 /* We do not handle two different vector types for the condition
4144 and the values. */
4145 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4146 return false;
4148 if (TREE_CODE (then_clause) == SSA_NAME)
4150 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4151 if (!vect_is_simple_use (then_clause, loop_vinfo,
4152 &then_def_stmt, &def, &dt))
4153 return false;
4155 else if (TREE_CODE (then_clause) != INTEGER_CST
4156 && TREE_CODE (then_clause) != REAL_CST)
4157 return false;
4159 if (TREE_CODE (else_clause) == SSA_NAME)
4161 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4162 if (!vect_is_simple_use (else_clause, loop_vinfo,
4163 &else_def_stmt, &def, &dt))
4164 return false;
4166 else if (TREE_CODE (else_clause) != INTEGER_CST
4167 && TREE_CODE (else_clause) != REAL_CST)
4168 return false;
4171 vec_mode = TYPE_MODE (vectype);
4173 if (!vec_stmt)
4175 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4176 return expand_vec_cond_expr_p (op, vec_mode);
4179 /* Transform. */
4181 /* Handle def. */
4182 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4183 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4185 /* Handle cond expr. */
4186 vec_cond_lhs =
4187 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4188 vec_cond_rhs =
4189 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4190 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4191 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4193 /* Arguments are ready. Create the new vector stmt. */
4194 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4195 vec_cond_lhs, vec_cond_rhs);
4196 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4197 vec_compare, vec_then_clause, vec_else_clause);
4199 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
4200 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4201 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4202 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4204 return true;
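
/* Editor's illustrative sketch (not part of GCC): the element-wise
   semantics of the VEC_COND_EXPR built above, for 4-int vectors and an
   LT_EXPR condition.  Hypothetical names; under #if 0.  */
#if 0
static void
example_vec_cond (const int a[4], const int b[4], const int then_v[4],
                  const int else_v[4], int dest[4])
{
  int k;

  for (k = 0; k < 4; k++)
    dest[k] = a[k] < b[k] ? then_v[k] : else_v[k];
}
#endif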
4207 /* Function vect_transform_stmt.
4209 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4211 bool
4212 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4214 bool is_store = false;
4215 tree vec_stmt = NULL_TREE;
4216 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4217 tree orig_stmt_in_pattern;
4218 bool done;
4220 if (STMT_VINFO_RELEVANT_P (stmt_info))
4222 switch (STMT_VINFO_TYPE (stmt_info))
4224 case type_demotion_vec_info_type:
4225 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4226 gcc_assert (done);
4227 break;
4229 case type_promotion_vec_info_type:
4230 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4231 gcc_assert (done);
4232 break;
4234 case type_conversion_vec_info_type:
4235 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4236 gcc_assert (done);
4237 break;
4239 case op_vec_info_type:
4240 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4241 gcc_assert (done);
4242 break;
4244 case assignment_vec_info_type:
4245 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4246 gcc_assert (done);
4247 break;
4249 case load_vec_info_type:
4250 done = vectorizable_load (stmt, bsi, &vec_stmt);
4251 gcc_assert (done);
4252 break;
4254 case store_vec_info_type:
4255 done = vectorizable_store (stmt, bsi, &vec_stmt);
4256 gcc_assert (done);
4257 if (DR_GROUP_FIRST_DR (stmt_info))
4259 /* In case of interleaving, the whole chain is vectorized when the
4260 last store in the chain is reached. Store stmts before the last
4261 one are skipped, and their stmt_vec_info shouldn't be freed
4262 meanwhile. */
4263 *strided_store = true;
4264 if (STMT_VINFO_VEC_STMT (stmt_info))
4265 is_store = true;
4267 else
4268 is_store = true;
4269 break;
4271 case condition_vec_info_type:
4272 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4273 gcc_assert (done);
4274 break;
4276 case call_vec_info_type:
4277 done = vectorizable_call (stmt, bsi, &vec_stmt);
4278 break;
4280 default:
4281 if (vect_print_dump_info (REPORT_DETAILS))
4282 fprintf (vect_dump, "stmt not supported.");
4283 gcc_unreachable ();
4286 gcc_assert (vec_stmt || *strided_store);
4287 if (vec_stmt)
4289 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4290 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4291 if (orig_stmt_in_pattern)
4293 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4294 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4296 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4298 /* STMT was inserted by the vectorizer to replace a
4299 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
4300 original sequence that computed this idiom. We need to
4301 record a pointer to VEC_STMT in the stmt_info of
4302 ORIG_STMT_IN_PATTERN. See more details in the
4303 documentation of vect_pattern_recog. */
4305 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4311 if (STMT_VINFO_LIVE_P (stmt_info))
4313 switch (STMT_VINFO_TYPE (stmt_info))
4315 case reduc_vec_info_type:
4316 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4317 gcc_assert (done);
4318 break;
4320 default:
4321 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4322 gcc_assert (done);
4326 return is_store;
4330 /* This function builds ni_name = the number of iterations the loop
4331 executes, placing the computation on the loop preheader edge. */
4333 static tree
4334 vect_build_loop_niters (loop_vec_info loop_vinfo)
4336 tree ni_name, stmt, var;
4337 edge pe;
4338 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4339 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4341 var = create_tmp_var (TREE_TYPE (ni), "niters");
4342 add_referenced_var (var);
4343 ni_name = force_gimple_operand (ni, &stmt, false, var);
4345 pe = loop_preheader_edge (loop);
4346 if (stmt)
4348 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4349 gcc_assert (!new_bb);
4352 return ni_name;
4356 /* This function generates the following statements:
4358 ni_name = number of iterations the loop executes
4359 ratio = ni_name / vf
4360 ratio_mult_vf_name = ratio * vf
4362 and places them at the loop preheader edge. */
4364 static void
4365 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4366 tree *ni_name_ptr,
4367 tree *ratio_mult_vf_name_ptr,
4368 tree *ratio_name_ptr)
4371 edge pe;
4372 basic_block new_bb;
4373 tree stmt, ni_name;
4374 tree var;
4375 tree ratio_name;
4376 tree ratio_mult_vf_name;
4377 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4378 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4379 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4380 tree log_vf;
4382 pe = loop_preheader_edge (loop);
4384 /* Generate a temporary variable that contains the
4385 number of iterations the loop executes. */
4387 ni_name = vect_build_loop_niters (loop_vinfo);
4388 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4390 /* Create: ratio = ni >> log2(vf) */
4392 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4393 if (!is_gimple_val (ratio_name))
4395 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4396 add_referenced_var (var);
4398 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4399 pe = loop_preheader_edge (loop);
4400 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4401 gcc_assert (!new_bb);
4404 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4406 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4407 ratio_name, log_vf);
4408 if (!is_gimple_val (ratio_mult_vf_name))
4410 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4411 add_referenced_var (var);
4413 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4414 true, var);
4415 pe = loop_preheader_edge (loop);
4416 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4417 gcc_assert (!new_bb);
4420 *ni_name_ptr = ni_name;
4421 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4422 *ratio_name_ptr = ratio_name;
4424 return;
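
/* Editor's illustrative sketch (not part of GCC): the two preheader
   values computed above, using the same shift formulation.  E.g. for
   ni = 41 and vf = 4: ratio = 10, ratio_mult_vf = 40, leaving one
   iteration for the epilog loop.  Hypothetical names; under #if 0.  */
#if 0
static unsigned
example_ratio (unsigned ni, unsigned log_vf, unsigned *ratio_mult_vf)
{
  unsigned ratio = ni >> log_vf;        /* ratio = ni / vf */

  *ratio_mult_vf = ratio << log_vf;     /* ratio * vf */
  return ratio;
}
#endif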
4428 /* Function update_vuses_to_preheader.
4430 Input:
4431 STMT - a statement with potential VUSEs.
4432 LOOP - the loop whose preheader will contain STMT.
4434 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4435 appears to be defined in a VDEF in another statement in a loop.
4436 One such case is when the VUSE is at the dereference of a __restrict__
4437 pointer in a load and the VDEF is at the dereference of a different
4438 __restrict__ pointer in a store. Vectorization may result in
4439 copy_virtual_operands being called to copy the problematic VUSE to a new
4440 statement that is being inserted in the loop preheader. This procedure
4441 is called to change the SSA_NAME in the new statement's VUSE from the
4442 SSA_NAME updated in the loop to the related SSA_NAME available on the
4443 path entering the loop.
4445 When this function is called, we have the following situation:
4447 # vuse <name1>
4448 S1: vload
4449 do {
4450 # name1 = phi < name0 , name2>
4452 # vuse <name1>
4453 S2: vload
4455 # name2 = vdef <name1>
4456 S3: vstore
4458 }while...
4460 Stmt S1 was created in the loop preheader block as part of misaligned-load
4461 handling. This function fixes the name of the vuse of S1 from 'name1' to
4462 'name0'. */
4464 static void
4465 update_vuses_to_preheader (tree stmt, struct loop *loop)
4467 basic_block header_bb = loop->header;
4468 edge preheader_e = loop_preheader_edge (loop);
4469 ssa_op_iter iter;
4470 use_operand_p use_p;
4472 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4474 tree ssa_name = USE_FROM_PTR (use_p);
4475 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4476 tree name_var = SSA_NAME_VAR (ssa_name);
4477 basic_block bb = bb_for_stmt (def_stmt);
4479 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4480 if (!IS_EMPTY_STMT (def_stmt)
4481 && flow_bb_inside_loop_p (loop, bb))
4483 /* If the block containing the statement defining the SSA_NAME
4484 is in the loop then it's necessary to find the definition
4485 outside the loop using the PHI nodes of the header. */
4486 tree phi;
4487 bool updated = false;
4489 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4491 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4493 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4494 updated = true;
4495 break;
4498 gcc_assert (updated);
4504 /* Function vect_update_ivs_after_vectorizer.
4506 "Advance" the induction variables of LOOP to the value they should take
4507 after the execution of LOOP. This is currently necessary because the
4508 vectorizer does not handle induction variables that are used after the
4509 loop. Such a situation occurs when the last iterations of LOOP are
4510 peeled, because:
4511 1. We introduced new uses after LOOP for IVs that were not originally used
4512 after LOOP: the IVs of LOOP are now used by an epilog loop.
4513 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4514 times, whereas the loop IVs should be bumped N times.
4516 Input:
4517 - LOOP - a loop that is going to be vectorized. The last few iterations
4518 of LOOP were peeled.
4519 - NITERS - the number of iterations that LOOP executes (before it is
4520 vectorized). i.e, the number of times the ivs should be bumped.
4521 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4522 coming out from LOOP on which there are uses of the LOOP ivs
4523 (this is the path from LOOP->exit to epilog_loop->preheader).
4525 The new definitions of the ivs are placed in LOOP->exit.
4526 The phi args associated with the edge UPDATE_E in the bb
4527 UPDATE_E->dest are updated accordingly.
4529 Assumption 1: Like the rest of the vectorizer, this function assumes
4530 a single loop exit that has a single predecessor.
4532 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4533 organized in the same order.
4535 Assumption 3: The access function of the ivs is simple enough (see
4536 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4538 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4539 coming out of LOOP on which the ivs of LOOP are used (this is the path
4540 that leads to the epilog loop; other paths skip the epilog loop). This
4541 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4542 needs to have its phis updated. */
4545 static void
4546 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4547 edge update_e)
4549 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4550 basic_block exit_bb = single_exit (loop)->dest;
4551 tree phi, phi1;
4552 basic_block update_bb = update_e->dest;
4554 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4556 /* Make sure there exists a single-predecessor exit bb: */
4557 gcc_assert (single_pred_p (exit_bb));
4559 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4560 phi && phi1;
4561 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4563 tree access_fn = NULL;
4564 tree evolution_part;
4565 tree init_expr;
4566 tree step_expr;
4567 tree var, stmt, ni, ni_name;
4568 block_stmt_iterator last_bsi;
4570 if (vect_print_dump_info (REPORT_DETAILS))
4572 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4573 print_generic_expr (vect_dump, phi, TDF_SLIM);
4576 /* Skip virtual phis. */
4577 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4579 if (vect_print_dump_info (REPORT_DETAILS))
4580 fprintf (vect_dump, "virtual phi. skip.");
4581 continue;
4584 /* Skip reduction phis. */
4585 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4587 if (vect_print_dump_info (REPORT_DETAILS))
4588 fprintf (vect_dump, "reduc phi. skip.");
4589 continue;
4592 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4593 gcc_assert (access_fn);
4594 evolution_part =
4595 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4596 gcc_assert (evolution_part != NULL_TREE);
4598 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4599 of degree >= 2 or exponential. */
4600 gcc_assert (!tree_is_chrec (evolution_part));
4602 step_expr = evolution_part;
4603 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4604 loop->num));
4606 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4607 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4608 fold_convert (TREE_TYPE (init_expr),
4609 niters),
4610 step_expr),
4611 init_expr);
4613 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4614 add_referenced_var (var);
4616 ni_name = force_gimple_operand (ni, &stmt, false, var);
4618 /* Insert stmt into exit_bb. */
4619 last_bsi = bsi_last (exit_bb);
4620 if (stmt)
4621 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4623 /* Fix phi expressions in the successor bb. */
4624 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
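
/* Editor's illustrative sketch (not part of GCC): the value built above
   for each IV; with init_expr 7, step_expr 3 and niters 20 the advanced
   IV is 7 + 20 * 3 = 67.  Hypothetical name; under #if 0.  */
#if 0
static int
example_advance_iv (int init_expr, int step_expr, int niters)
{
  return init_expr + niters * step_expr;  /* ni = init + niters * step */
}
#endif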
4629 /* Function vect_do_peeling_for_loop_bound
4631 Peel the last iterations of the loop represented by LOOP_VINFO.
4632 The peeled iterations form a new epilog loop. Given that the loop now
4633 iterates NITERS times, the new epilog loop iterates
4634 NITERS % VECTORIZATION_FACTOR times.
4636 The original loop will later be made to iterate
4637 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4639 static void
4640 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4642 tree ni_name, ratio_mult_vf_name;
4643 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4644 struct loop *new_loop;
4645 edge update_e;
4646 basic_block preheader;
4647 int loop_num;
4648 unsigned int th;
4650 if (vect_print_dump_info (REPORT_DETAILS))
4651 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4653 initialize_original_copy_tables ();
4655 /* Generate the following variables on the preheader of original loop:
4657 ni_name = number of iterations the original loop executes
4658 ratio = ni_name / vf
4659 ratio_mult_vf_name = ratio * vf */
4660 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4661 &ratio_mult_vf_name, ratio);
4663 loop_num = loop->num;
4664 /* Threshold for vectorized loop. */
4665 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4666 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4667 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4668 ratio_mult_vf_name, ni_name, false, th);
4669 gcc_assert (new_loop);
4670 gcc_assert (loop_num == loop->num);
4671 #ifdef ENABLE_CHECKING
4672 slpeel_verify_cfg_after_peeling (loop, new_loop);
4673 #endif
4675 /* A guard that controls whether the new_loop is to be executed or skipped
4676 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4677 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4678 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4679 is on the path where the LOOP IVs are used and need to be updated. */
4681 preheader = loop_preheader_edge (new_loop)->src;
4682 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4683 update_e = EDGE_PRED (preheader, 0);
4684 else
4685 update_e = EDGE_PRED (preheader, 1);
4687 /* Update IVs of original loop as if they were advanced
4688 by ratio_mult_vf_name steps. */
4689 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4691 /* After peeling we have to reset scalar evolution analyzer. */
4692 scev_reset ();
4694 free_original_copy_tables ();
4698 /* Function vect_gen_niters_for_prolog_loop
4700 Set the number of iterations for the loop represented by LOOP_VINFO
4701 to the minimum of LOOP_NITERS (the original iteration count of the loop)
4702 and the number of iterations needed to align DR - the data reference
4703 recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the
4704 execution of this loop, the data reference DR will refer to an aligned location.
4706 The following computation is generated:
4708 If the misalignment of DR is known at compile time:
4709 addr_mis = DR_MISALIGNMENT (dr);
4710 Else, compute address misalignment in bytes:
4711 addr_mis = addr & (vectype_size - 1)
4713 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4715 (elem_size = element type size; an element is the scalar element
4716 whose type is the inner type of the vectype)
4718 For interleaving,
4720 prolog_niters = min ( LOOP_NITERS ,
4721 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4722 where group_size is the size of the interleaved group. */
4725 static tree
4726 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4728 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4729 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4730 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4731 tree var, stmt;
4732 tree iters, iters_name;
4733 edge pe;
4734 basic_block new_bb;
4735 tree dr_stmt = DR_STMT (dr);
4736 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4737 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4738 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4739 tree niters_type = TREE_TYPE (loop_niters);
4740 int group_size = 1;
4741 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4743 if (DR_GROUP_FIRST_DR (stmt_info))
4745 /* For interleaved access element size must be multiplied by the size of
4746 the interleaved group. */
4747 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4748 DR_GROUP_FIRST_DR (stmt_info)));
4749 element_size *= group_size;
4752 pe = loop_preheader_edge (loop);
4754 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4756 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4757 int elem_misalign = byte_misalign / element_size;
4759 if (vect_print_dump_info (REPORT_DETAILS))
4760 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4761 iters = build_int_cst (niters_type,
4762 (vf - elem_misalign)&(vf/group_size-1));
4764 else
4766 tree new_stmts = NULL_TREE;
4767 tree start_addr =
4768 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4769 tree ptr_type = TREE_TYPE (start_addr);
4770 tree size = TYPE_SIZE (ptr_type);
4771 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4772 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4773 tree elem_size_log =
4774 build_int_cst (type, exact_log2 (vectype_align/vf));
4775 tree vf_minus_1 = build_int_cst (type, vf - 1);
4776 tree vf_tree = build_int_cst (type, vf);
4777 tree byte_misalign;
4778 tree elem_misalign;
4780 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4781 gcc_assert (!new_bb);
4783 /* Create: byte_misalign = addr & (vectype_size - 1) */
4784 byte_misalign =
4785 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4787 /* Create: elem_misalign = byte_misalign / element_size */
4788 elem_misalign =
4789 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4791 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4792 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4793 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4794 iters = fold_convert (niters_type, iters);
4797 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4798 /* If the loop bound is known at compile time we already verified that it is
4799 greater than vf; since the misalignment ('iters') is at most vf, there's
4800 no need to generate the MIN_EXPR in this case. */
4801 if (TREE_CODE (loop_niters) != INTEGER_CST)
4802 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4804 if (vect_print_dump_info (REPORT_DETAILS))
4806 fprintf (vect_dump, "niters for prolog loop: ");
4807 print_generic_expr (vect_dump, iters, TDF_SLIM);
4810 var = create_tmp_var (niters_type, "prolog_loop_niters");
4811 add_referenced_var (var);
4812 iters_name = force_gimple_operand (iters, &stmt, false, var);
4814 /* Insert stmt on loop preheader edge. */
4815 if (stmt)
4817 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4818 gcc_assert (!new_bb);
4821 return iters_name;
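
/* Editor's illustrative sketch (not part of GCC): the runtime prolog
   iteration count computed above for group_size == 1, where
   vectype_align / vf is the element size in bytes.  Hypothetical names;
   under #if 0 (assumes <stdint.h>).  */
#if 0
static unsigned
example_prolog_niters (uintptr_t start_addr, unsigned vf,
                       unsigned vectype_align)
{
  unsigned elem_size = vectype_align / vf;
  unsigned byte_misalign = (unsigned) (start_addr & (vectype_align - 1));
  unsigned elem_misalign = byte_misalign / elem_size;

  return (vf - elem_misalign) & (vf - 1);
}
#endif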
4825 /* Function vect_update_init_of_dr
4827 NITERS iterations were peeled from LOOP. DR represents a data reference
4828 in LOOP. This function updates the information recorded in DR to
4829 account for the fact that the first NITERS iterations had already been
4830 executed. Specifically, it updates the OFFSET field of DR. */
4832 static void
4833 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4835 tree offset = DR_OFFSET (dr);
4837 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4838 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4839 DR_OFFSET (dr) = offset;
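
/* Editor's illustrative sketch (not part of GCC): the update above in
   scalar form; peeling NITERS iterations of a DR whose step is STEP
   advances its offset by NITERS * STEP.  Hypothetical name; under #if 0.  */
#if 0
static long
example_updated_offset (long offset, long step, long niters)
{
  return offset + niters * step;
}
#endif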
4843 /* Function vect_update_inits_of_drs
4845 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4846 This function updates the information recorded for the data references in
4847 the loop to account for the fact that the first NITERS iterations had
4848 already been executed. Specifically, it updates the initial_condition of the
4849 access_function of all the data_references in the loop. */
4851 static void
4852 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4854 unsigned int i;
4855 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4856 struct data_reference *dr;
4858 if (vect_dump && (dump_flags & TDF_DETAILS))
4859 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4861 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4862 vect_update_init_of_dr (dr, niters);
4866 /* Function vect_do_peeling_for_alignment
4868 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4869 'niters' is set to the misalignment of one of the data references in the
4870 loop, thereby forcing it to refer to an aligned location at the beginning
4871 of the execution of this loop. The data reference for which we are
4872 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4874 static void
4875 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4877 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4878 tree niters_of_prolog_loop, ni_name;
4879 tree n_iters;
4880 struct loop *new_loop;
4882 if (vect_print_dump_info (REPORT_DETAILS))
4883 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4885 initialize_original_copy_tables ();
4887 ni_name = vect_build_loop_niters (loop_vinfo);
4888 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4890 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4891 new_loop =
4892 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4893 niters_of_prolog_loop, ni_name, true, 0);
4894 gcc_assert (new_loop);
4895 #ifdef ENABLE_CHECKING
4896 slpeel_verify_cfg_after_peeling (new_loop, loop);
4897 #endif
4899 /* Update number of times loop executes. */
4900 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4901 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4902 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4904 /* Update the init conditions of the access functions of all data refs. */
4905 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4907 /* After peeling we have to reset scalar evolution analyzer. */
4908 scev_reset ();
4910 free_original_copy_tables ();
4914 /* Function vect_create_cond_for_align_checks.
4916 Create a conditional expression that represents the alignment checks for
4917 all of the data references (array element references) whose alignment must be
4918 checked at runtime.
4920 Input:
4921 LOOP_VINFO - two fields of the loop information are used.
4922 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4923 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4925 Output:
4926 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4927 expression.
4928 The returned value is the conditional expression to be used in the if
4929 statement that controls which version of the loop gets executed at runtime.
4931 The algorithm makes two assumptions:
4932 1) The number of bytes "n" in a vector is a power of 2.
4933 2) An address "a" is aligned if a%n is zero, and this
4934 test can be done as a&(n-1) == 0. For example, for 16
4935 byte vectors the test is a&0xf == 0. */
4937 static tree
4938 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4939 tree *cond_expr_stmt_list)
4941 VEC(tree,heap) *may_misalign_stmts
4942 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4943 tree ref_stmt, tmp;
4944 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4945 tree mask_cst;
4946 unsigned int i;
4947 tree psize;
4948 tree int_ptrsize_type;
4949 char tmp_name[20];
4950 tree or_tmp_name = NULL_TREE;
4951 tree and_tmp, and_tmp_name, and_stmt;
4952 tree ptrsize_zero;
4954 /* Check that mask is one less than a power of 2, i.e., mask is
4955 all zeros followed by all ones. */
4956 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4958 /* CHECKME: what is the best integer or unsigned type to use to hold a
4959 cast from a pointer value? */
4960 psize = TYPE_SIZE (ptr_type_node);
4961 int_ptrsize_type
4962 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4964 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
4965 of the first vector of the i'th data reference. */
4967 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4969 tree new_stmt_list = NULL_TREE;
4970 tree addr_base;
4971 tree addr_tmp, addr_tmp_name, addr_stmt;
4972 tree or_tmp, new_or_tmp_name, or_stmt;
4974 /* create: addr_tmp = (int)(address_of_first_vector) */
4975 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4976 &new_stmt_list,
4977 NULL_TREE);
4979 if (new_stmt_list != NULL_TREE)
4980 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4982 sprintf (tmp_name, "%s%d", "addr2int", i);
4983 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4984 add_referenced_var (addr_tmp);
4985 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4986 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4987 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
4988 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4989 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4991 /* The addresses are ORed together. */
4993 if (or_tmp_name != NULL_TREE)
4995 /* create: or_tmp = or_tmp | addr_tmp */
4996 sprintf (tmp_name, "%s%d", "orptrs", i);
4997 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4998 add_referenced_var (or_tmp);
4999 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5000 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
5001 or_tmp_name, addr_tmp_name);
5002 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
5003 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5004 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5005 or_tmp_name = new_or_tmp_name;
5007 else
5008 or_tmp_name = addr_tmp_name;
5010 } /* end for i */
5012 mask_cst = build_int_cst (int_ptrsize_type, mask);
5014 /* create: and_tmp = or_tmp & mask */
5015 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
5016 add_referenced_var (and_tmp);
5017 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5019 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
5020 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
5021 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5022 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5024 /* Make and_tmp the left operand of the conditional test against zero.
5025 If and_tmp has a nonzero bit, then some address is unaligned. */
5026 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5027 return build2 (EQ_EXPR, boolean_type_node,
5028 and_tmp_name, ptrsize_zero);
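
/* Editor's illustrative sketch (not part of GCC): the runtime test built
   above, for two data references and 16-byte vectors (mask == 0xf).  A
   zero result of the AND means every address is aligned, so the
   vectorized version of the loop may run.  Hypothetical names; under
   #if 0 (assumes <stdint.h>).  */
#if 0
static int
example_all_aligned (const void *dr_1, const void *dr_2)
{
  uintptr_t or_tmp = (uintptr_t) dr_1 | (uintptr_t) dr_2;

  return (or_tmp & 0xf) == 0;
}
#endif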
5032 /* Function vect_transform_loop.
5034 The analysis phase has determined that the loop is vectorizable.
5035 Vectorize the loop - create vectorized stmts to replace the scalar
5036 stmts in the loop, and update the loop exit condition. */
5038 void
5039 vect_transform_loop (loop_vec_info loop_vinfo)
5041 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5042 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5043 int nbbs = loop->num_nodes;
5044 block_stmt_iterator si, next_si;
5045 int i;
5046 tree ratio = NULL;
5047 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5048 bool strided_store;
5050 if (vect_print_dump_info (REPORT_DETAILS))
5051 fprintf (vect_dump, "=== vect_transform_loop ===");
5053 /* If the loop has data references that may or may not be aligned then
5054 two versions of the loop need to be generated, one which is vectorized
5055 and one which isn't. A test is then generated to control which of the
5056 loops is executed. The test checks for the alignment of all of the
5057 data references that may or may not be aligned. */
5059 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5061 struct loop *nloop;
5062 tree cond_expr;
5063 tree cond_expr_stmt_list = NULL_TREE;
5064 basic_block condition_bb;
5065 block_stmt_iterator cond_exp_bsi;
5066 basic_block merge_bb;
5067 basic_block new_exit_bb;
5068 edge new_exit_e, e;
5069 tree orig_phi, new_phi, arg;
5070 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
5072 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5073 &cond_expr_stmt_list);
5074 initialize_original_copy_tables ();
5075 nloop = loop_version (loop, cond_expr, &condition_bb,
5076 prob, prob, REG_BR_PROB_BASE - prob, true);
5077 free_original_copy_tables ();
5079 /** Loop versioning violates an assumption we try to maintain during
5080 vectorization - that the loop exit block has a single predecessor.
5081 After versioning, the exit block of both loop versions is the same
5082 basic block (i.e. it has two predecessors). Just in order to simplify
5083 following transformations in the vectorizer, we fix this situation
5084 here by adding a new (empty) block on the exit-edge of the loop,
5085 with the proper loop-exit phis to maintain loop-closed-form. **/
5087 merge_bb = single_exit (loop)->dest;
5088 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5089 new_exit_bb = split_edge (single_exit (loop));
5090 new_exit_e = single_exit (loop);
5091 e = EDGE_SUCC (new_exit_bb, 0);
5093 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5094 orig_phi = PHI_CHAIN (orig_phi))
5096 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5097 new_exit_bb);
5098 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5099 add_phi_arg (new_phi, arg, new_exit_e);
5100 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
5103 /** end loop-exit-fixes after versioning **/
5105 update_ssa (TODO_update_ssa);
5106 cond_exp_bsi = bsi_last (condition_bb);
5107 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
5110 /* CHECKME: we wouldn't need this if we called update_ssa once
5111 for all loops. */
5112 bitmap_zero (vect_memsyms_to_rename);
5114 /* Peel the loop if there are data refs with unknown alignment.
5115 Only one data ref with unknown alignment is allowed. */
5117 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5118 vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that is not a multiple of
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
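
  /* For example, with n = 103 and VF = 4, the vectorized loop executes
     103/4 = 25 iterations, covering the first 100 scalar iterations,
     and the scalar epilog loop executes the remaining 103%4 = 3
     iterations.  */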

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of
     one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed will need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = bsi_start (bb); !bsi_end_p (si);)
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info;
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_generic_expr (vect_dump, stmt, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);
          gcc_assert (stmt_info);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              bsi_next (&si);
              continue;
            }

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");
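
          /* Example of the multiple-types case (illustrative): in a
             loop mixing shorts and ints vectorized with VF = 8, an int
             stmt has vectype V4SI with only 4 subparts, so more than
             one copy of the vector stmt is needed to cover all 8
             scalar iterations.  */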

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store);
          if (is_store)
            {
              stmt_ann_t ann;
              if (DR_GROUP_FIRST_DR (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization
                     of the interleaving chain was completed - free all
                     the stores in the chain.  */
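
                  /* For example (illustrative): for interleaved stores
                     a[2*i] = x and a[2*i+1] = y, the vector stores for
                     the whole group were emitted when the chain was
                     transformed, so each scalar store in the chain,
                     reached via DR_GROUP_NEXT_DR, is now dead and can
                     be removed.  */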
                  tree next = DR_GROUP_FIRST_DR (stmt_info);
                  tree tmp;
                  stmt_vec_info next_stmt_info;

                  while (next)
                    {
                      next_si = bsi_for_stmt (next);
                      next_stmt_info = vinfo_for_stmt (next);
                      /* Free the attached stmt_vec_info and remove the
                         stmt.  */
                      ann = stmt_ann (next);
                      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
                      free (next_stmt_info);
                      set_stmt_info (ann, NULL);
                      bsi_remove (&next_si, true);
                      next = tmp;
                    }
                  bsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  ann = stmt_ann (stmt);
                  free (stmt_info);
                  set_stmt_info (ann, NULL);
                  bsi_remove (&si, true);
                  continue;
                }
            }

          bsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");