/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"

/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);

/* Function vect_get_new_vect_var.

   Returns a new variable of type TYPE for the vectorizer.  The current
   naming scheme uses the prefix "vect_", "stmp_" or "vect_p" (depending
   on the value of VAR_KIND); the prefix is prepended to NAME if NAME is
   provided, and used on its own otherwise.  */

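/* For illustration only (these names are made up, not taken from a real
   dump): with NAME == "x", vect_simple_var yields a variable named
   "vect_x", vect_scalar_var yields "stmp_x", and vect_pointer_var yields
   "vect_px"; with NAME == NULL the bare prefix alone is used.  */
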
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}

/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

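/* For illustration only (a sketch; the temporary names below are made up):
   for a data reference with DR_BASE_ADDRESS == &a, DR_OFFSET == 0 and
   DR_INIT == 16, the statements appended to NEW_STMT_LIST compute roughly

       base_off.0 = 0 + 16;
       vect_pa.1 = (vectype *) (&a + base_off.0);

   and the SSA_NAME holding 'vect_pa.1' is returned.  */
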
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access step we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}

/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */

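/* For illustration only (a sketch of the !ONLY_INIT case; the names and
   the 16-byte V8HI step are assumptions): the code generated around the
   loop looks roughly like

       preheader:
         vect_pa.1 = (v8hi *) initial_address;
       loop:
         vect_pa.2 = PHI <vect_pa.1 (preheader), vect_pa.3 (latch)>
         ...
         vect_pa.3 = vect_pa.2 + 16;        <-- returned in PTR_INCR

   and the PHI result 'vect_pa.2' is the pointer returned to the
   caller.  */
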
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop: **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}

/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}

/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}

/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */

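/* For illustration only (the names below are made up): for
   VECTOR_VAR == {0,0,0,0} of type v4si, a preheader statement of roughly
   the form

       vect_cst_.0 = {0,0,0,0};

   is emitted, and the SSA_NAME 'vect_cst_.0' is returned for use inside
   the loop.  */
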
static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}

/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  tree stmts;

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }

  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
  add_referenced_var (new_var);

  new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
  if (stmts)
    {
      new_bb = bsi_insert_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  t = NULL_TREE;
  t = tree_cons (NULL_TREE, new_name, t);
  for (i = 1; i < nunits; i++)
    {
      tree tmp;

      /* Create: new_name = new_name + step_expr  */
      tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
      init_stmt = build_gimple_modify_stmt (new_var, tmp);
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          tree tmp;

          /* vec_i = vec_prev + vec_{step*nunits}  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}

/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */

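/* For illustration only (a made-up scalar loop): in

       for (i = 0; i < n; i++)
         a[i] = b[i] + c;

   the operand of the addition that was loaded from b[i] is defined inside
   the loop (case 3 below), while 'c' is loop invariant (case 2), so a
   vector {c,c,...,c} is created for it in the preheader via
   vect_init_vector.  */
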
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}

/* Function vect_get_vec_def_for_stmt_copy

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0:  vx.0 = memref0      VS1.1
                        VS1.1:  vx.1 = memref1      VS1.2
                        VS1.2:  vx.2 = memref2      VS1.3
                        VS1.3:  vx.3 = memref3

   S2: z = x + ...      VSnew.0:  vz0 = vx.0 + ...  VSnew.1
                        VSnew.1:  vz1 = vx.1 + ...  VSnew.2
                        VSnew.2:  vz2 = vx.2 + ...  VSnew.3
                        VSnew.3:  vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */

static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}

/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}

#define ADJUST_IN_EPILOG 1

/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */

static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int nelements;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; the last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}

/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>
*/

static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_OPERAND_LENGTH (operation);
  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }

  /*** 2. Create epilog code
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

          ***/

  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      tree tmp;

      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
      epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create:  va' = vec_shift <va, offset>
                  Create:  va = vop <va, va'>
                }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
              tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              tmp = build2 (code, vectype, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree tmp;
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              tmp = build2 (code, scalar_type, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.6 Replace uses of s_out0 with uses of s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}

/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */

bool
vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree op;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree operation;
  enum tree_code code, orig_code, epilog_reduc_code = 0;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def, def_stmt;
  enum vect_def_type dt;
  tree new_phi;
  tree scalar_type;
  bool is_simple_use;
  tree orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  stmt_vec_info prev_stmt_info;
  tree reduc_def;
  tree new_stmt = NULL_TREE;
  int j;

  gcc_assert (ncopies >= 1);

  /* 1. Is vectorizable reduction?  */

  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (!STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);

  operation = GIMPLE_STMT_OPERAND (stmt, 1);
  code = TREE_CODE (operation);
  op_type = TREE_OPERAND_LENGTH (operation);
  if (op_type != binary_op && op_type != ternary_op)
    return false;
  scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  */
  for (i = 0; i < op_type-1; i++)
    {
      op = TREE_OPERAND (operation, i);
      is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_loop_def
          && dt != vect_invariant_def
          && dt != vect_constant_def
          && dt != vect_induction_def)
        return false;
    }

  op = TREE_OPERAND (operation, i);
  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def);
  gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
    return false;

  /* 4. Supportable by target?  */

  /* 4.1. check support for the operation in the loop  */
  optab = optab_for_tree_code (code, vectype);
  if (!optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab.");
      return false;
    }
  vec_mode = TYPE_MODE (vectype);
  if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "op not supported by target.");
      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        return false;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "proceeding using word mode.");
    }

  /* Worthwhile without SIMD support?  */
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
         < vect_min_worthwhile_factor (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not worthwhile without SIMD support.");
      return false;
    }

  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; the original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e, in the example
             above we want to use 'widen_sum' in the loop, but 'plus' in the
             epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */

1664 if (orig_stmt)
1666 /* This is a reduction pattern: get the vectype from the type of the
1667 reduction variable, and get the tree-code from orig_stmt. */
1668 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1669 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1670 vec_mode = TYPE_MODE (vectype);
1672 else
1674 /* Regular reduction: the same vectype and tree-code used for the
1675 vector code inside the loop can also be used for the epilog code. */
1676 orig_code = code;
1679 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1680 return false;
1681 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1682 if (!reduc_optab)
1684 if (vect_print_dump_info (REPORT_DETAILS))
1685 fprintf (vect_dump, "no optab for reduction.");
1686 epilog_reduc_code = NUM_TREE_CODES;
1688 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1690 if (vect_print_dump_info (REPORT_DETAILS))
1691 fprintf (vect_dump, "reduc op not supported by target.");
1692 epilog_reduc_code = NUM_TREE_CODES;
1695 if (!vec_stmt) /* transformation not required. */
1697 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1698 return true;
1701 /** Transform. **/
1703 if (vect_print_dump_info (REPORT_DETAILS))
1704 fprintf (vect_dump, "transform reduction.");
1706 /* Create the destination vector */
1707 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1709 /* Create the reduction-phi that defines the reduction-operand. */
1710 new_phi = create_phi_node (vec_dest, loop->header);
1712 /* In case the vectorization factor (VF) is bigger than the number
1713 of elements that we can fit in a vectype (nunits), we have to generate
1714 more than one vector stmt - i.e - we need to "unroll" the
1715 vector stmt by a factor VF/nunits. For more details see documentation
1716 in vectorizable_operation. */
1718 prev_stmt_info = NULL;
1719 for (j = 0; j < ncopies; j++)
1721 /* Handle uses. */
1722 if (j == 0)
1724 op = TREE_OPERAND (operation, 0);
1725 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1726 if (op_type == ternary_op)
1728 op = TREE_OPERAND (operation, 1);
1729 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1732 /* Get the vector def for the reduction variable from the phi node */
1733 reduc_def = PHI_RESULT (new_phi);
1735 else
1737 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1738 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1739 if (op_type == ternary_op)
1740 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1742 /* Get the vector def for the reduction variable from the vectorized
1743 reduction operation generated in the previous iteration (j-1) */
1744 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
1747 /* Arguments are ready. Create the new vector stmt. */
1749 if (op_type == binary_op)
1750 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1751 else
1752 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1753 reduc_def);
1754 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
1755 new_temp = make_ssa_name (vec_dest, new_stmt);
1756 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1757 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1759 if (j == 0)
1760 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1761 else
1762 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1763 prev_stmt_info = vinfo_for_stmt (new_stmt);
1766 /* Finalize the reduction-phi (set its arguments) and create the
1767 epilog reduction code. */
1768 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1769 return true;
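/* Illustration (editor's sketch, not compiler code): the kind of scalar
   loop vectorizable_reduction handles. Assuming a V4SI target; the
   function and variable names below are hypothetical.

   int sum_array (int *a, int n)
   {
     int sum = 0;                   // reduction variable, defined by the
     for (int i = 0; i < n; i++)    // loop-header phi once in SSA form
       sum += a[i];                 // the reduction stmt handled above
     return sum;
   }

   The loop body becomes one vector add that accumulates four partial
   sums per iteration; vect_create_epilog_for_reduction then folds the
   four lanes of the final vector accumulator into a single scalar.  */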
1772 /* Checks if CALL can be vectorized in type VECTYPE. Returns
1773 a function declaration if the target has a vectorized version
1774 of the function, or NULL_TREE if the function cannot be vectorized. */
1776 tree
1777 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
1779 tree fndecl = get_callee_fndecl (call);
1780 enum built_in_function code;
1782 /* We only handle functions that do not read or clobber memory -- i.e.
1783 const or novops ones. */
1784 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1785 return NULL_TREE;
1787 if (!fndecl
1788 || TREE_CODE (fndecl) != FUNCTION_DECL
1789 || !DECL_BUILT_IN (fndecl))
1790 return NULL_TREE;
1792 code = DECL_FUNCTION_CODE (fndecl);
1793 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
1794 vectype_in);
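/* A minimal sketch of what the builtin_vectorized_function target hook
   is expected to do (editor's illustration; the builtin decl below is
   hypothetical and target-specific):

   static tree
   example_builtin_vectorized_function (enum built_in_function code,
                                        tree type_out, tree type_in)
   {
     if (code == BUILT_IN_SQRTF
         && VECTOR_MODE_P (TYPE_MODE (type_out))
         && VECTOR_MODE_P (TYPE_MODE (type_in)))
       return example_v4sf_sqrt_decl;   // decl the backend registered
     return NULL_TREE;                  // no vectorized variant known
   }  */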
1797 /* Function vectorizable_call.
1799 Check if STMT performs a function call that can be vectorized.
1800 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1801 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1802 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1804 bool
1805 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1807 tree vec_dest;
1808 tree scalar_dest;
1809 tree operation;
1810 tree op, type;
1811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1812 tree vectype_out, vectype_in;
1813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1814 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1815 enum vect_def_type dt[2];
1816 int ncopies, j, nargs;
1817 call_expr_arg_iterator iter;
1819 /* Is STMT a vectorizable call? */
1820 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1821 return false;
1823 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1824 return false;
1826 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1827 if (TREE_CODE (operation) != CALL_EXPR)
1828 return false;
1830 /* Process function arguments. */
1831 rhs_type = NULL_TREE;
1832 nargs = 0;
1833 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1835 ++nargs;
1837 /* Bail out if the function has more than two arguments; we do
1838 not have interesting builtin functions to vectorize with
1839 more than two arguments. */
1840 if (nargs > 2)
1841 return false;
1843 /* We can only handle calls with arguments of the same type. */
1844 if (rhs_type
1845 && rhs_type != TREE_TYPE (op))
1847 if (vect_print_dump_info (REPORT_DETAILS))
1848 fprintf (vect_dump, "argument types differ.");
1849 return false;
1851 rhs_type = TREE_TYPE (op);
1853 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
1855 if (vect_print_dump_info (REPORT_DETAILS))
1856 fprintf (vect_dump, "use not simple.");
1857 return false;
1861 /* A call with no arguments is not vectorizable either. */
1862 if (nargs == 0)
1863 return false;
1865 vectype_in = get_vectype_for_scalar_type (rhs_type);
1867 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1868 vectype_out = get_vectype_for_scalar_type (lhs_type);
1870 /* Only handle the case of vectors with the same number of elements.
1871 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1872 instruction which converts V2DFmode to V4SImode but only
1873 using the lower half of the V4SImode result. */
1874 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1875 return false;
1877 /* For now, we only vectorize functions if a target specific builtin
1878 is available. TODO -- in some cases, it might be profitable to
1879 insert the calls for pieces of the vector, in order to be able
1880 to vectorize other operations in the loop. */
1881 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1882 if (fndecl == NULL_TREE)
1884 if (vect_print_dump_info (REPORT_DETAILS))
1885 fprintf (vect_dump, "function is not vectorizable.");
1887 return false;
1890 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1892 if (!vec_stmt) /* transformation not required. */
1894 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1895 return true;
1898 /** Transform. **/
1900 if (vect_print_dump_info (REPORT_DETAILS))
1901 fprintf (vect_dump, "transform call.");
1903 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1904 / TYPE_VECTOR_SUBPARTS (vectype_out));
1905 gcc_assert (ncopies >= 1);
1907 /* Handle def. */
1908 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1909 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1911 prev_stmt_info = NULL;
1912 for (j = 0; j < ncopies; ++j)
1914 tree new_stmt, vargs;
1915 tree vec_oprnd[2];
1916 int n;
1918 /* Build argument list for the vectorized call. */
1919 /* FIXME: Rewrite this so that it doesn't construct a temporary
1920 list. */
1921 vargs = NULL_TREE;
1922 n = -1;
1923 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1925 ++n;
1927 if (j == 0)
1928 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1929 else
1930 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1932 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1934 vargs = nreverse (vargs);
1936 rhs = build_function_call_expr (fndecl, vargs);
1937 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
1938 new_temp = make_ssa_name (vec_dest, new_stmt);
1939 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1941 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1943 if (j == 0)
1944 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1945 else
1946 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1947 prev_stmt_info = vinfo_for_stmt (new_stmt);
1950 /* The call in STMT might prevent it from being removed in dce. We,
1951 however, cannot remove it here, due to the way the ssa name it defines
1952 is mapped to the new definition. So just replace the rhs of the
1953 statement with something harmless. */
1954 type = TREE_TYPE (scalar_dest);
1955 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1957 return true;
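/* Illustration (editor's note): given a loop statement x = sqrtf (y)
   and a target that advertises a V4SF square root, the code above
   emits vx = vsqrtf (vy) (builtin name hypothetical) once per copy,
   and finally rewrites the scalar call's rhs to a zero constant so
   that dce can discard the now-unused original call.  */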
1961 /* Function vectorizable_conversion.
1963 Check if STMT performs a conversion operation, that can be vectorized.
1964 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1965 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1966 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1968 bool
1969 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
1970 tree * vec_stmt)
1972 tree vec_dest;
1973 tree scalar_dest;
1974 tree operation;
1975 tree op0;
1976 tree vec_oprnd0 = NULL_TREE;
1977 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1978 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1979 enum tree_code code;
1980 tree new_temp;
1981 tree def, def_stmt;
1982 enum vect_def_type dt0;
1983 tree new_stmt;
1984 int nunits_in;
1985 int nunits_out;
1986 int ncopies, j;
1987 tree vectype_out, vectype_in;
1988 tree rhs_type, lhs_type;
1989 tree builtin_decl;
1990 stmt_vec_info prev_stmt_info;
1992 /* Is STMT a vectorizable conversion? */
1994 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1995 return false;
1997 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1999 if (STMT_VINFO_LIVE_P (stmt_info))
2001 /* FORNOW: not yet supported. */
2002 if (vect_print_dump_info (REPORT_DETAILS))
2003 fprintf (vect_dump, "value used after loop.");
2004 return false;
2007 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2008 return false;
2010 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2011 return false;
2013 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2014 code = TREE_CODE (operation);
2015 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2016 return false;
2018 /* Check the types of the lhs and rhs. */
2019 op0 = TREE_OPERAND (operation, 0);
2020 rhs_type = TREE_TYPE (op0);
2021 vectype_in = get_vectype_for_scalar_type (rhs_type);
2022 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2024 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2025 lhs_type = TREE_TYPE (scalar_dest);
2026 vectype_out = get_vectype_for_scalar_type (lhs_type);
2027 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2028 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2030 /* FORNOW: need to extend to support short<->float conversions as well. */
2031 if (nunits_out != nunits_in)
2032 return false;
2034 /* Bail out if the types are both integral or both non-integral. */
2035 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2036 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2037 return false;
2039 /* Sanity check: make sure that at least one copy of the vectorized stmt
2040 needs to be generated. */
2041 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2042 gcc_assert (ncopies >= 1);
2044 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2046 if (vect_print_dump_info (REPORT_DETAILS))
2047 fprintf (vect_dump, "use not simple.");
2048 return false;
2051 /* Supportable by target? */
2052 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2054 if (vect_print_dump_info (REPORT_DETAILS))
2055 fprintf (vect_dump, "op not supported by target.");
2056 return false;
2059 if (!vec_stmt) /* transformation not required. */
2061 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2062 return true;
2065 /** Transform. **/
2067 if (vect_print_dump_info (REPORT_DETAILS))
2068 fprintf (vect_dump, "transform conversion.");
2070 /* Handle def. */
2071 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2073 prev_stmt_info = NULL;
2074 for (j = 0; j < ncopies; j++)
2076 tree sym;
2077 ssa_op_iter iter;
2079 if (j == 0)
2080 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2081 else
2082 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2084 builtin_decl =
2085 targetm.vectorize.builtin_conversion (code, vectype_in);
2086 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2088 /* Arguments are ready. Create the new vector stmt. */
2089 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
2090 new_temp = make_ssa_name (vec_dest, new_stmt);
2091 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2092 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2093 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2095 if (TREE_CODE (sym) == SSA_NAME)
2096 sym = SSA_NAME_VAR (sym);
2097 mark_sym_for_renaming (sym);
2100 if (j == 0)
2101 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2102 else
2103 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2104 prev_stmt_info = vinfo_for_stmt (new_stmt);
2106 return true;
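/* Illustration (editor's sketch): the conversions handled above, e.g.

   int a[N]; float b[N];
   for (i = 0; i < N; i++)
     b[i] = (float) a[i];          // FLOAT_EXPR, int -> float

   With V4SI/V4SF vectors (same nunits on both sides, as required
   above), each copy becomes a single call to the target's conversion
   builtin: vb = builtin_conversion (va).  */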
2110 /* Function vectorizable_assignment.
2112 Check if STMT performs an assignment (copy) that can be vectorized.
2113 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2114 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2115 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2117 bool
2118 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2120 tree vec_dest;
2121 tree scalar_dest;
2122 tree op;
2123 tree vec_oprnd;
2124 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2125 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2127 tree new_temp;
2128 tree def, def_stmt;
2129 enum vect_def_type dt;
2130 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2131 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2133 gcc_assert (ncopies >= 1);
2134 if (ncopies > 1)
2135 return false; /* FORNOW */
2137 /* Is vectorizable assignment? */
2138 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2139 return false;
2141 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2143 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2144 return false;
2146 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2147 if (TREE_CODE (scalar_dest) != SSA_NAME)
2148 return false;
2150 op = GIMPLE_STMT_OPERAND (stmt, 1);
2151 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2153 if (vect_print_dump_info (REPORT_DETAILS))
2154 fprintf (vect_dump, "use not simple.");
2155 return false;
2158 if (!vec_stmt) /* transformation not required. */
2160 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2161 return true;
2164 /** Transform. **/
2165 if (vect_print_dump_info (REPORT_DETAILS))
2166 fprintf (vect_dump, "transform assignment.");
2168 /* Handle def. */
2169 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2171 /* Handle use. */
2172 op = GIMPLE_STMT_OPERAND (stmt, 1);
2173 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2175 /* Arguments are ready. Create the new vector stmt. */
2176 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
2177 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2178 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2179 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2181 return true;
2185 /* Function vect_min_worthwhile_factor.
2187 For a loop where we could vectorize the operation indicated by CODE,
2188 return the minimum vectorization factor that makes it worthwhile
2189 to use generic vectors. */
2190 static int
2191 vect_min_worthwhile_factor (enum tree_code code)
2193 switch (code)
2195 case PLUS_EXPR:
2196 case MINUS_EXPR:
2197 case NEGATE_EXPR:
2198 return 4;
2200 case BIT_AND_EXPR:
2201 case BIT_IOR_EXPR:
2202 case BIT_XOR_EXPR:
2203 case BIT_NOT_EXPR:
2204 return 2;
2206 default:
2207 return INT_MAX;
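/* Illustration of the thresholds above (editor's sketch): without real
   SIMD support the vector operation is emulated in a word-sized
   register. Bitwise ops work unchanged on the packed word, but PLUS
   must suppress carries between lanes, e.g. for four bytes in a
   32-bit word:

   unsigned
   word_add_v4qi (unsigned a, unsigned b)
   {
     unsigned low = (a & 0x7f7f7f7fU) + (b & 0x7f7f7f7fU); // per-lane add, no carry-out
     return low ^ ((a ^ b) & 0x80808080U);                 // fix up the top bit of each lane
   }

   The extra masking is why PLUS/MINUS/NEGATE only pay off at factor 4,
   while AND/IOR/XOR/NOT are already worthwhile at factor 2.  */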
2212 /* Function vectorizable_operation.
2214 Check if STMT performs a binary or unary operation that can be vectorized.
2215 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2216 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2217 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2219 bool
2220 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2222 tree vec_dest;
2223 tree scalar_dest;
2224 tree operation;
2225 tree op0, op1 = NULL;
2226 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2227 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2229 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2230 enum tree_code code;
2231 enum machine_mode vec_mode;
2232 tree new_temp;
2233 int op_type;
2234 optab optab;
2235 int icode;
2236 enum machine_mode optab_op2_mode;
2237 tree def, def_stmt;
2238 enum vect_def_type dt0, dt1;
2239 tree new_stmt;
2240 stmt_vec_info prev_stmt_info;
2241 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2242 int nunits_out;
2243 tree vectype_out;
2244 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2245 int j;
2247 gcc_assert (ncopies >= 1);
2249 /* Is STMT a vectorizable binary/unary operation? */
2250 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2251 return false;
2253 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2255 if (STMT_VINFO_LIVE_P (stmt_info))
2257 /* FORNOW: not yet supported. */
2258 if (vect_print_dump_info (REPORT_DETAILS))
2259 fprintf (vect_dump, "value used after loop.");
2260 return false;
2263 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2264 return false;
2266 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2267 return false;
2269 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2270 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2271 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2272 if (nunits_out != nunits_in)
2273 return false;
2275 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2276 code = TREE_CODE (operation);
2277 optab = optab_for_tree_code (code, vectype);
2279 /* Support only unary or binary operations. */
2280 op_type = TREE_OPERAND_LENGTH (operation);
2281 if (op_type != unary_op && op_type != binary_op)
2283 if (vect_print_dump_info (REPORT_DETAILS))
2284 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2285 return false;
2288 op0 = TREE_OPERAND (operation, 0);
2289 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2291 if (vect_print_dump_info (REPORT_DETAILS))
2292 fprintf (vect_dump, "use not simple.");
2293 return false;
2296 if (op_type == binary_op)
2298 op1 = TREE_OPERAND (operation, 1);
2299 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2301 if (vect_print_dump_info (REPORT_DETAILS))
2302 fprintf (vect_dump, "use not simple.");
2303 return false;
2307 /* Supportable by target? */
2308 if (!optab)
2310 if (vect_print_dump_info (REPORT_DETAILS))
2311 fprintf (vect_dump, "no optab.");
2312 return false;
2314 vec_mode = TYPE_MODE (vectype);
2315 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2316 if (icode == CODE_FOR_nothing)
2318 if (vect_print_dump_info (REPORT_DETAILS))
2319 fprintf (vect_dump, "op not supported by target.");
2320 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2321 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2322 < vect_min_worthwhile_factor (code))
2323 return false;
2324 if (vect_print_dump_info (REPORT_DETAILS))
2325 fprintf (vect_dump, "proceeding using word mode.");
2328 /* Worthwhile without SIMD support? */
2329 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2330 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2331 < vect_min_worthwhile_factor (code))
2333 if (vect_print_dump_info (REPORT_DETAILS))
2334 fprintf (vect_dump, "not worthwhile without SIMD support.");
2335 return false;
2338 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2340 /* FORNOW: not yet supported. */
2341 if (!VECTOR_MODE_P (vec_mode))
2342 return false;
2344 /* Invariant argument is needed for a vector shift
2345 by a scalar shift operand. */
2346 optab_op2_mode = insn_data[icode].operand[2].mode;
2347 if (! (VECTOR_MODE_P (optab_op2_mode)
2348 || dt1 == vect_constant_def
2349 || dt1 == vect_invariant_def))
2351 if (vect_print_dump_info (REPORT_DETAILS))
2352 fprintf (vect_dump, "operand mode requires invariant argument.");
2353 return false;
2357 if (!vec_stmt) /* transformation not required. */
2359 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2360 return true;
2363 /** Transform. **/
2365 if (vect_print_dump_info (REPORT_DETAILS))
2366 fprintf (vect_dump, "transform binary/unary operation.");
2368 /* Handle def. */
2369 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2371 /* In case the vectorization factor (VF) is bigger than the number
2372 of elements that we can fit in a vectype (nunits), we have to generate
2373 more than one vector stmt - i.e - we need to "unroll" the
2374 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2375 from one copy of the vector stmt to the next, in the field
2376 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2377 stages to find the correct vector defs to be used when vectorizing
2378 stmts that use the defs of the current stmt. The example below illustrates
2379 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
2380 4 vectorized stmts):
2382 before vectorization:
2383 RELATED_STMT VEC_STMT
2384 S1: x = memref - -
2385 S2: z = x + 1 - -
2387 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2388 there):
2389 RELATED_STMT VEC_STMT
2390 VS1_0: vx0 = memref0 VS1_1 -
2391 VS1_1: vx1 = memref1 VS1_2 -
2392 VS1_2: vx2 = memref2 VS1_3 -
2393 VS1_3: vx3 = memref3 - -
2394 S1: x = load - VS1_0
2395 S2: z = x + 1 - -
2397 step2: vectorize stmt S2 (done here):
2398 To vectorize stmt S2 we first need to find the relevant vector
2399 def for the first operand 'x'. This is, as usual, obtained from
2400 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2401 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2402 relevant vector def 'vx0'. Having found 'vx0' we can generate
2403 the vector stmt VS2_0, and as usual, record it in the
2404 STMT_VINFO_VEC_STMT of stmt S2.
2405 When creating the second copy (VS2_1), we obtain the relevant vector
2406 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2407 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2408 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2409 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2410 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2411 chain of stmts and pointers:
2412 RELATED_STMT VEC_STMT
2413 VS1_0: vx0 = memref0 VS1_1 -
2414 VS1_1: vx1 = memref1 VS1_2 -
2415 VS1_2: vx2 = memref2 VS1_3 -
2416 VS1_3: vx3 = memref3 - -
2417 S1: x = load - VS1_0
2418 VS2_0: vz0 = vx0 + v1 VS2_1 -
2419 VS2_1: vz1 = vx1 + v1 VS2_2 -
2420 VS2_2: vz2 = vx2 + v1 VS2_3 -
2421 VS2_3: vz3 = vx3 + v1 - -
2422 S2: z = x + 1 - VS2_0 */
2424 prev_stmt_info = NULL;
2425 for (j = 0; j < ncopies; j++)
2427 /* Handle uses. */
2428 if (j == 0)
2430 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2431 if (op_type == binary_op)
2433 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2435 /* Vector shl and shr insn patterns can be defined with
2436 scalar operand 2 (shift operand). In this case, use
2437 constant or loop invariant op1 directly, without
2438 extending it to vector mode first. */
2439 optab_op2_mode = insn_data[icode].operand[2].mode;
2440 if (!VECTOR_MODE_P (optab_op2_mode))
2442 if (vect_print_dump_info (REPORT_DETAILS))
2443 fprintf (vect_dump, "operand 1 using scalar mode.");
2444 vec_oprnd1 = op1;
2447 if (!vec_oprnd1)
2448 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2451 else
2453 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2454 if (op_type == binary_op)
2455 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2458 /* Arguments are ready. Create the new vector stmt. */
2460 if (op_type == binary_op)
2461 new_stmt = build_gimple_modify_stmt (vec_dest,
2462 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2463 else
2464 new_stmt = build_gimple_modify_stmt (vec_dest,
2465 build1 (code, vectype, vec_oprnd0));
2466 new_temp = make_ssa_name (vec_dest, new_stmt);
2467 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2468 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2470 if (j == 0)
2471 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2472 else
2473 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2474 prev_stmt_info = vinfo_for_stmt (new_stmt);
2477 return true;
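/* Illustration (editor's note): the shift-by-invariant case above, e.g.

   for (i = 0; i < N; i++)
     b[i] = a[i] << 3;

   If the target's vector shift pattern takes a scalar operand 2, the
   constant 3 is used directly as vec_oprnd1; otherwise it is first
   broadcast into a vector {3, 3, 3, 3} by vect_get_vec_def_for_operand.  */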
2481 /* Function vectorizable_type_demotion
2483 Check if STMT performs a binary or unary operation that involves
2484 type demotion, and if it can be vectorized.
2485 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2486 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2487 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2489 bool
2490 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2491 tree *vec_stmt)
2493 tree vec_dest;
2494 tree scalar_dest;
2495 tree operation;
2496 tree op0;
2497 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2498 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2499 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2500 enum tree_code code;
2501 tree new_temp;
2502 tree def, def_stmt;
2503 enum vect_def_type dt0;
2504 tree new_stmt;
2505 stmt_vec_info prev_stmt_info;
2506 int nunits_in;
2507 int nunits_out;
2508 tree vectype_out;
2509 int ncopies;
2510 int j;
2511 tree expr;
2512 tree vectype_in;
2513 tree scalar_type;
2514 optab optab;
2515 enum machine_mode vec_mode;
2517 /* Is STMT a vectorizable type-demotion operation? */
2519 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2520 return false;
2522 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2524 if (STMT_VINFO_LIVE_P (stmt_info))
2526 /* FORNOW: not yet supported. */
2527 if (vect_print_dump_info (REPORT_DETAILS))
2528 fprintf (vect_dump, "value used after loop.");
2529 return false;
2532 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2533 return false;
2535 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2536 return false;
2538 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2539 code = TREE_CODE (operation);
2540 if (code != NOP_EXPR && code != CONVERT_EXPR)
2541 return false;
2543 op0 = TREE_OPERAND (operation, 0);
2544 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2545 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2547 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2548 scalar_type = TREE_TYPE (scalar_dest);
2549 vectype_out = get_vectype_for_scalar_type (scalar_type);
2550 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2551 if (nunits_in != nunits_out / 2) /* FORNOW */
2552 return false;
2554 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2555 gcc_assert (ncopies >= 1);
2557 if (! INTEGRAL_TYPE_P (scalar_type)
2558 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2559 return false;
2561 /* Check the operands of the operation. */
2562 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2564 if (vect_print_dump_info (REPORT_DETAILS))
2565 fprintf (vect_dump, "use not simple.");
2566 return false;
2569 /* Supportable by target? */
2570 code = VEC_PACK_MOD_EXPR;
2571 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2572 if (!optab)
2573 return false;
2575 vec_mode = TYPE_MODE (vectype_in);
2576 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2577 return false;
2579 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2581 if (!vec_stmt) /* transformation not required. */
2583 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2584 return true;
2587 /** Transform. **/
2589 if (vect_print_dump_info (REPORT_DETAILS))
2590 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2591 ncopies);
2593 /* Handle def. */
2594 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2596 /* In case the vectorization factor (VF) is bigger than the number
2597 of elements that we can fit in a vectype (nunits), we have to generate
2598 more than one vector stmt - i.e - we need to "unroll" the
2599 vector stmt by a factor VF/nunits. */
2600 prev_stmt_info = NULL;
2601 for (j = 0; j < ncopies; j++)
2603 /* Handle uses. */
2604 if (j == 0)
2606 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2607 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2609 else
2611 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2612 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2615 /* Arguments are ready. Create the new vector stmt. */
2616 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2617 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2618 new_temp = make_ssa_name (vec_dest, new_stmt);
2619 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2620 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2622 if (j == 0)
2623 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2624 else
2625 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2627 prev_stmt_info = vinfo_for_stmt (new_stmt);
2630 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2631 return true;
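/* Illustration (editor's sketch): a demotion such as

   int a[N]; short b[N];
   for (i = 0; i < N; i++)
     b[i] = (short) a[i];

   with V4SI in and V8HI out consumes two input vectors per output
   vector, which is why the loop above keeps two operands live:

   vb = VEC_PACK_MOD_EXPR <va0, va1>;   // 4 + 4 ints -> 8 shorts (mod 2^16)  */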
2635 /* Function vect_gen_widened_results_half
2637 Create a vector stmt whose code, type, number of arguments, and result
2638 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2639 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2640 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2641 needs to be created (DECL is a function-decl of a target-builtin).
2642 STMT is the original scalar stmt that we are vectorizing. */
2644 static tree
2645 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2646 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2647 tree vec_dest, block_stmt_iterator *bsi,
2648 tree stmt)
2650 tree expr;
2651 tree new_stmt;
2652 tree new_temp;
2653 tree sym;
2654 ssa_op_iter iter;
2656 /* Generate half of the widened result: */
2657 if (code == CALL_EXPR)
2659 /* Target specific support */
2660 if (op_type == binary_op)
2661 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2662 else
2663 expr = build_call_expr (decl, 1, vec_oprnd0);
2665 else
2667 /* Generic support */
2668 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2669 if (op_type == binary_op)
2670 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2671 else
2672 expr = build1 (code, vectype, vec_oprnd0);
2674 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2675 new_temp = make_ssa_name (vec_dest, new_stmt);
2676 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2677 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2679 if (code == CALL_EXPR)
2681 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2683 if (TREE_CODE (sym) == SSA_NAME)
2684 sym = SSA_NAME_VAR (sym);
2685 mark_sym_for_renaming (sym);
2689 return new_stmt;
2693 /* Function vectorizable_type_promotion
2695 Check if STMT performs a binary or unary operation that involves
2696 type promotion, and if it can be vectorized.
2697 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2698 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2699 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2701 bool
2702 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2703 tree *vec_stmt)
2705 tree vec_dest;
2706 tree scalar_dest;
2707 tree operation;
2708 tree op0, op1 = NULL;
2709 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2710 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2711 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2712 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2713 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2714 int op_type;
2715 tree def, def_stmt;
2716 enum vect_def_type dt0, dt1;
2717 tree new_stmt;
2718 stmt_vec_info prev_stmt_info;
2719 int nunits_in;
2720 int nunits_out;
2721 tree vectype_out;
2722 int ncopies;
2723 int j;
2724 tree vectype_in;
2726 /* Is STMT a vectorizable type-promotion operation? */
2728 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2729 return false;
2731 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2733 if (STMT_VINFO_LIVE_P (stmt_info))
2735 /* FORNOW: not yet supported. */
2736 if (vect_print_dump_info (REPORT_DETAILS))
2737 fprintf (vect_dump, "value used after loop.");
2738 return false;
2741 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2742 return false;
2744 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2745 return false;
2747 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2748 code = TREE_CODE (operation);
2749 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2750 return false;
2752 op0 = TREE_OPERAND (operation, 0);
2753 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2754 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2755 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2756 gcc_assert (ncopies >= 1);
2758 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2759 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2760 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2761 if (nunits_out != nunits_in / 2) /* FORNOW */
2762 return false;
2764 if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2765 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2766 return false;
2768 /* Check the operands of the operation. */
2769 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2771 if (vect_print_dump_info (REPORT_DETAILS))
2772 fprintf (vect_dump, "use not simple.");
2773 return false;
2776 op_type = TREE_CODE_LENGTH (code);
2777 if (op_type == binary_op)
2779 op1 = TREE_OPERAND (operation, 1);
2780 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2782 if (vect_print_dump_info (REPORT_DETAILS))
2783 fprintf (vect_dump, "use not simple.");
2784 return false;
2788 /* Supportable by target? */
2789 if (!supportable_widening_operation (code, stmt, vectype_in,
2790 &decl1, &decl2, &code1, &code2))
2791 return false;
2793 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2795 if (!vec_stmt) /* transformation not required. */
2797 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2798 return true;
2801 /** Transform. **/
2803 if (vect_print_dump_info (REPORT_DETAILS))
2804 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2805 ncopies);
2807 /* Handle def. */
2808 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2810 /* In case the vectorization factor (VF) is bigger than the number
2811 of elements that we can fit in a vectype (nunits), we have to generate
2812 more than one vector stmt - i.e - we need to "unroll" the
2813 vector stmt by a factor VF/nunits. */
2815 prev_stmt_info = NULL;
2816 for (j = 0; j < ncopies; j++)
2818 /* Handle uses. */
2819 if (j == 0)
2821 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2822 if (op_type == binary_op)
2823 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2825 else
2827 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2828 if (op_type == binary_op)
2829 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2832 /* Arguments are ready. Create the new vector stmt. We are creating
2833 two vector defs because the widened result does not fit in one vector.
2834 The vectorized stmt can be expressed as a call to a target builtin,
2835 or by using a tree-code. */
2836 /* Generate first half of the widened result: */
2837 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2838 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2839 if (j == 0)
2840 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2841 else
2842 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2843 prev_stmt_info = vinfo_for_stmt (new_stmt);
2845 /* Generate second half of the widened result: */
2846 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2847 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2848 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2849 prev_stmt_info = vinfo_for_stmt (new_stmt);
2853 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2854 return true;
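/* Illustration (editor's sketch): a promotion such as

   short a[N]; int b[N];
   for (i = 0; i < N; i++)
     b[i] = (int) a[i];

   with V8HI in and V4SI out produces two output vectors per input
   vector, hence the two halves generated above:

   vb0 = unpack_high <va>;   // first 4 shorts widened to ints
   vb1 = unpack_low  <va>;   // last 4 shorts widened to ints

   (the actual tree-codes or builtins are whatever
   supportable_widening_operation selected).  */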
2858 /* Function vect_strided_store_supported.
2860 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2861 and FALSE otherwise. */
2863 static bool
2864 vect_strided_store_supported (tree vectype)
2866 optab interleave_high_optab, interleave_low_optab;
2867 int mode;
2869 mode = (int) TYPE_MODE (vectype);
2871 /* Check that the operation is supported. */
2872 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2873 vectype);
2874 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2875 vectype);
2876 if (!interleave_high_optab || !interleave_low_optab)
2878 if (vect_print_dump_info (REPORT_DETAILS))
2879 fprintf (vect_dump, "no optab for interleave.");
2880 return false;
2883 if (interleave_high_optab->handlers[(int) mode].insn_code
2884 == CODE_FOR_nothing
2885 || interleave_low_optab->handlers[(int) mode].insn_code
2886 == CODE_FOR_nothing)
2888 if (vect_print_dump_info (REPORT_DETAILS))
2889 fprintf (vect_dump, "interleave op not supported by target.");
2890 return false;
2892 return true;
2896 /* Function vect_permute_store_chain.
2898 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2899 a power of 2, generate interleave_high/low stmts to reorder the data
2900 correctly for the stores. Return the final references for stores in
2901 RESULT_CHAIN.
2903 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2904 The input is 4 vectors each containing 8 elements. We assign a number to each
2905 element; the input sequence is:
2907 1st vec: 0 1 2 3 4 5 6 7
2908 2nd vec: 8 9 10 11 12 13 14 15
2909 3rd vec: 16 17 18 19 20 21 22 23
2910 4th vec: 24 25 26 27 28 29 30 31
2912 The output sequence should be:
2914 1st vec: 0 8 16 24 1 9 17 25
2915 2nd vec: 2 10 18 26 3 11 19 27
2916 3rd vec: 4 12 20 28 5 13 21 29
2917 4th vec: 6 14 22 30 7 15 23 31
2919 i.e., we interleave the contents of the four vectors in their order.
2921 We use interleave_high/low instructions to create such output. The input of
2922 each interleave_high/low operation is two vectors:
2923 1st vec 2nd vec
2924 0 1 2 3 4 5 6 7
2925 the even elements of the result vector are obtained left-to-right from the
2926 high/low elements of the first vector. The odd elements of the result are
2927 obtained left-to-right from the high/low elements of the second vector.
2928 The output of interleave_high will be: 0 4 1 5
2929 and of interleave_low: 2 6 3 7
2932 The permutation is done in log LENGTH stages. In each stage interleave_high
2933 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2934 where the first argument is taken from the first half of DR_CHAIN and the
2935 second argument from its second half.
2936 In our example,
2938 I1: interleave_high (1st vec, 3rd vec)
2939 I2: interleave_low (1st vec, 3rd vec)
2940 I3: interleave_high (2nd vec, 4th vec)
2941 I4: interleave_low (2nd vec, 4th vec)
2943 The output for the first stage is:
2945 I1: 0 16 1 17 2 18 3 19
2946 I2: 4 20 5 21 6 22 7 23
2947 I3: 8 24 9 25 10 26 11 27
2948 I4: 12 28 13 29 14 30 15 31
2950 The output of the second stage, i.e. the final result is:
2952 I1: 0 8 16 24 1 9 17 25
2953 I2: 2 10 18 26 3 11 19 27
2954 I3: 4 12 20 28 5 13 21 29
2955 I4: 6 14 22 30 7 15 23 31. */
2957 static bool
2958 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2959 unsigned int length,
2960 tree stmt,
2961 block_stmt_iterator *bsi,
2962 VEC(tree,heap) **result_chain)
2964 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2965 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2966 tree scalar_dest, tmp;
2967 int i;
2968 unsigned int j;
2969 VEC(tree,heap) *first, *second;
2971 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2972 first = VEC_alloc (tree, heap, length/2);
2973 second = VEC_alloc (tree, heap, length/2);
2975 /* Check that the operation is supported. */
2976 if (!vect_strided_store_supported (vectype))
2977 return false;
2979 *result_chain = VEC_copy (tree, heap, dr_chain);
2981 for (i = 0; i < exact_log2 (length); i++)
2983 for (j = 0; j < length/2; j++)
2985 vect1 = VEC_index (tree, dr_chain, j);
2986 vect2 = VEC_index (tree, dr_chain, j+length/2);
2988 /* Create interleaving stmt:
2989 in the case of big endian:
2990 high = interleave_high (vect1, vect2)
2991 and in the case of little endian:
2992 high = interleave_low (vect1, vect2). */
2993 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2994 DECL_GIMPLE_REG_P (perm_dest) = 1;
2995 add_referenced_var (perm_dest);
2996 if (BYTES_BIG_ENDIAN)
2997 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
2998 else
2999 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3000 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3001 high = make_ssa_name (perm_dest, perm_stmt);
3002 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3003 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3004 VEC_replace (tree, *result_chain, 2*j, high);
3006 /* Create interleaving stmt:
3007 in the case of big endian:
3008 low = interleave_low (vect1, vect2)
3009 and in the case of little endian:
3010 low = interleave_high (vect1, vect2). */
3011 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3012 DECL_GIMPLE_REG_P (perm_dest) = 1;
3013 add_referenced_var (perm_dest);
3014 if (BYTES_BIG_ENDIAN)
3015 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3016 else
3017 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3018 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3019 low = make_ssa_name (perm_dest, perm_stmt);
3020 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3021 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3022 VEC_replace (tree, *result_chain, 2*j+1, low);
3024 dr_chain = VEC_copy (tree, heap, *result_chain);
3026 return true;
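/* A self-contained scalar model of one interleave stage (editor's
   illustration; not used by the compiler). With NUNITS lanes per
   vector, the two operations behave as:

   void
   interleave_high (int *a, int *b, int *out, int nunits)
   {
     for (int k = 0; k < nunits / 2; k++)
       {
         out[2 * k] = a[k];        // even result lanes: first vector
         out[2 * k + 1] = b[k];    // odd result lanes: second vector
       }
   }

   interleave_low is identical but reads a[nunits/2 + k] and
   b[nunits/2 + k]. Iterating the log2(LENGTH) stages described above
   on vectors numbered 0..31 reproduces the output sequences shown.  */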
3030 /* Function vectorizable_store.
3032 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3033 can be vectorized.
3034 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3035 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3036 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3038 bool
3039 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3041 tree scalar_dest;
3042 tree data_ref;
3043 tree op;
3044 tree vec_oprnd = NULL_TREE;
3045 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3046 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3047 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3048 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3049 enum machine_mode vec_mode;
3050 tree dummy;
3051 enum dr_alignment_support alignment_support_scheme;
3052 ssa_op_iter iter;
3053 def_operand_p def_p;
3054 tree def, def_stmt;
3055 enum vect_def_type dt;
3056 stmt_vec_info prev_stmt_info = NULL;
3057 tree dataref_ptr = NULL_TREE;
3058 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3059 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3060 int j;
3061 tree next_stmt, first_stmt;
3062 bool strided_store = false;
3063 unsigned int group_size, i;
3064 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3065 gcc_assert (ncopies >= 1);
3067 /* Is vectorizable store? */
3069 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3070 return false;
3072 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3073 if (TREE_CODE (scalar_dest) != ARRAY_REF
3074 && TREE_CODE (scalar_dest) != INDIRECT_REF
3075 && !DR_GROUP_FIRST_DR (stmt_info))
3076 return false;
3078 op = GIMPLE_STMT_OPERAND (stmt, 1);
3079 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3081 if (vect_print_dump_info (REPORT_DETAILS))
3082 fprintf (vect_dump, "use not simple.");
3083 return false;
3086 vec_mode = TYPE_MODE (vectype);
3087 /* FORNOW. In some cases we can vectorize even if the data-type is not
3088 supported (e.g., array initialization with 0). */
3089 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3090 return false;
3092 if (!STMT_VINFO_DATA_REF (stmt_info))
3093 return false;
3095 if (DR_GROUP_FIRST_DR (stmt_info))
3097 strided_store = true;
3098 if (!vect_strided_store_supported (vectype))
3099 return false;
3102 if (!vec_stmt) /* transformation not required. */
3104 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3105 return true;
3108 /** Transform. **/
3110 if (vect_print_dump_info (REPORT_DETAILS))
3111 fprintf (vect_dump, "transform store. ncopies = %d", ncopies);
3113 if (strided_store)
3115 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3116 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3117 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3119 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3121 /* We vectorize all the stmts of the interleaving group when we
3122 reach the last stmt in the group. */
3123 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3124 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3126 *vec_stmt = NULL_TREE;
3127 return true;
3130 else
3132 first_stmt = stmt;
3133 first_dr = dr;
3134 group_size = 1;
3137 dr_chain = VEC_alloc (tree, heap, group_size);
3138 oprnds = VEC_alloc (tree, heap, group_size);
3140 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3141 gcc_assert (alignment_support_scheme);
3142 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
3144 /* In case the vectorization factor (VF) is bigger than the number
3145 of elements that we can fit in a vectype (nunits), we have to generate
3146 more than one vector stmt - i.e - we need to "unroll" the
3147 vector stmt by a factor VF/nunits. For more details see documentation in
3148 vect_get_vec_def_for_copy_stmt. */
3150 /* In case of interleaving (non-unit strided access):
3152 S1: &base + 2 = x2
3153 S2: &base = x0
3154 S3: &base + 1 = x1
3155 S4: &base + 3 = x3
3157 We create vectorized stores starting from base address (the access of the
3158 first stmt in the chain (S2 in the above example), when the last store stmt
3159 of the chain (S4) is reached:
3161 VS1: &base = vx2
3162 VS2: &base + vec_size*1 = vx0
3163 VS3: &base + vec_size*2 = vx1
3164 VS4: &base + vec_size*3 = vx3
3166 Then permutation statements are generated:
3168 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3169 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3172 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3173 (the order of the data-refs in the output of vect_permute_store_chain
3174 corresponds to the order of scalar stmts in the interleaving chain - see
3175 the documentation of vect_permute_store_chain()).
3177 In case of both multiple types and interleaving, the above vector stores and
3178 permutation stmts are created for every copy. The result vector stmts are
3179 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3180 STMT_VINFO_RELATED_STMT for the next copies.
3183 prev_stmt_info = NULL;
3184 for (j = 0; j < ncopies; j++)
3186 tree new_stmt;
3187 tree ptr_incr;
3189 if (j == 0)
3191 /* For interleaved stores we collect vectorized defs for all the
3192 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3193 as an input to vect_permute_store_chain(), and OPRNDS as an input
3194 to vect_get_vec_def_for_stmt_copy() for the next copy.
3195 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3196 OPRNDS are of size 1. */
3197 next_stmt = first_stmt;
3198 for (i = 0; i < group_size; i++)
3200 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3201 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3202 can't be NULL_TREE. In case that there is no interleaving,
3203 GROUP_SIZE is 1, and only one iteration of the loop will be
3204 executed. */
3205 gcc_assert (next_stmt);
3206 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3207 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3208 VEC_quick_push(tree, dr_chain, vec_oprnd);
3209 VEC_quick_push(tree, oprnds, vec_oprnd);
3210 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3212 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3213 &dummy, &ptr_incr, false,
3214 TREE_TYPE (vec_oprnd));
3216 else
3218 /* For interleaved stores we created vectorized defs for all the
3219 defs stored in OPRNDS in the previous iteration (previous copy).
3220 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3221 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3222 next copy.
3223 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3224 OPRNDS are of size 1. */
3225 for (i = 0; i < group_size; i++)
3227 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3228 VEC_index (tree, oprnds, i));
3229 VEC_replace(tree, dr_chain, i, vec_oprnd);
3230 VEC_replace(tree, oprnds, i, vec_oprnd);
3232 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3235 if (strided_store)
3237 result_chain = VEC_alloc (tree, heap, group_size);
3238 /* Permute. */
3239 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3240 &result_chain))
3241 return false;
3244 next_stmt = first_stmt;
3245 for (i = 0; i < group_size; i++)
3247 /* For strided stores vectorized defs are interleaved in
3248 vect_permute_store_chain(). */
3249 if (strided_store)
3250 vec_oprnd = VEC_index(tree, result_chain, i);
3252 data_ref = build_fold_indirect_ref (dataref_ptr);
3253 /* Arguments are ready. Create the new vector stmt. */
3254 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
3255 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3257 /* Set the VDEFs for the vector pointer. If this virtual def
3258 has a use outside the loop and a loop peel is performed
3259 then the def may be renamed by the peel. Mark it for
3260 renaming so the later use will also be renamed. */
3261 copy_virtual_operands (new_stmt, next_stmt);
3262 if (j == 0)
3264 /* The original store is deleted so the same SSA_NAMEs
3265 can be used. */
3266 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3268 SSA_NAME_DEF_STMT (def) = new_stmt;
3269 mark_sym_for_renaming (SSA_NAME_VAR (def));
3272 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3274 else
3276 /* Create new names for all the definitions created by COPY and
3277 add replacement mappings for each new name. */
3278 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3280 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3281 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3284 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3287 prev_stmt_info = vinfo_for_stmt (new_stmt);
3288 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3289 if (!next_stmt)
3290 break;
3291 /* Bump the vector pointer. */
3292 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3296 return true;
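/* Illustration (editor's note): a strided-store group such as

   for (i = 0; i < N; i++)
     {
       out[2 * i] = x[i];       // first stmt in the interleaving chain
       out[2 * i + 1] = y[i];   // last stmt: triggers the code above
     }

   collects the two vector defs in DR_CHAIN, interleaves them with
   vect_permute_store_chain, and emits two consecutive vector stores
   through DATAREF_PTR, bumping the pointer between them.  */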
3300 /* Function vect_setup_realignment
3302 This function is called when vectorizing an unaligned load using
3303 the dr_unaligned_software_pipeline scheme.
3304 This function generates the following code at the loop prolog:
3306 p = initial_addr;
3307 msq_init = *(floor(p)); # prolog load
3308 realignment_token = call target_builtin;
3309 loop:
3310 msq = phi (msq_init, ---)
3312 The code above sets up a new (vector) pointer, pointing to the first
3313 location accessed by STMT, and a "floor-aligned" load using that pointer.
3314 It also generates code to compute the "realignment-token" (if the relevant
3315 target hook was defined), and creates a phi-node at the loop-header bb
3316 whose arguments are the result of the prolog-load (created by this
3317 function) and the result of a load that takes place in the loop (to be
3318 created by the caller to this function).
3319 The caller to this function uses the phi-result (msq) to create the
3320 realignment code inside the loop, and sets up the missing phi argument,
3321 as follows:
3323 loop:
3324 msq = phi (msq_init, lsq)
3325 lsq = *(floor(p')); # load in loop
3326 result = realign_load (msq, lsq, realignment_token);
3328 Input:
3329 STMT - (scalar) load stmt to be vectorized. This load accesses
3330 a memory location that may be unaligned.
3331 BSI - place where new code is to be inserted.
3333 Output:
3334 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3335 target hook, if defined.
3336 Return value - the result of the loop-header phi node. */
3338 static tree
3339 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3340 tree *realignment_token)
3342 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3343 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3344 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3345 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3346 edge pe = loop_preheader_edge (loop);
3347 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3348 tree vec_dest;
3349 tree init_addr;
3350 tree inc;
3351 tree ptr;
3352 tree data_ref;
3353 tree new_stmt;
3354 basic_block new_bb;
3355 tree msq_init;
3356 tree new_temp;
3357 tree phi_stmt;
3358 tree msq;
3360 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3361 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3362 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3363 NULL_TREE);
3364 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3365 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3366 new_temp = make_ssa_name (vec_dest, new_stmt);
3367 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3368 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3369 gcc_assert (!new_bb);
3370 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3371 copy_virtual_operands (new_stmt, stmt);
3372 update_vuses_to_preheader (new_stmt, loop);
3374 /* 2. Create permutation mask, if required, in loop preheader. */
3375 if (targetm.vectorize.builtin_mask_for_load)
3377 tree builtin_decl;
3379 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3380 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
3381 vec_dest = vect_create_destination_var (scalar_dest,
3382 TREE_TYPE (new_stmt));
3383 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3384 new_temp = make_ssa_name (vec_dest, new_stmt);
3385 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3386 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3387 gcc_assert (!new_bb);
3388 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3390 /* The result of the CALL_EXPR to this builtin is determined from
3391 the value of the parameter and no global variables are touched
3392 which makes the builtin a "const" function. Requiring the
3393 builtin to have the "const" attribute makes it unnecessary
3394 to call mark_call_clobbered. */
3395 gcc_assert (TREE_READONLY (builtin_decl));
3398 /* 3. Create msq = phi <msq_init, lsq> in loop */
3399 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3400 msq = make_ssa_name (vec_dest, NULL_TREE);
3401 phi_stmt = create_phi_node (msq, loop->header);
3402 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3403 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3405 return msq;
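
/* [Editorial sketch, not part of GCC.] The scheme above in standalone
   scalar C, assuming hypothetical 4-byte "vectors" packed into a 32-bit
   unsigned int and a pointer that fits in unsigned long: two aligned
   loads straddling the misaligned address are combined by shifting,
   which is the job realign_load does with the realignment token.
   Illustration only; the vectorizer emits trees, not this code.  */

static unsigned int
sketch_realign_load (const unsigned char *p)
{
  /* floor (p): round the address down to a 4-byte boundary.  */
  const unsigned char *fp
    = (const unsigned char *) ((unsigned long) p & ~3UL);
  unsigned int off = (unsigned long) p & 3;
  unsigned int msq = 0, lsq = 0;
  unsigned int i;

  for (i = 0; i < 4; i++)
    {
      msq = (msq << 8) | fp[i];      /* msq = *(floor (p))      */
      lsq = (lsq << 8) | fp[i + 4];  /* lsq = *(floor (p) + 4)  */
    }
  /* realign_load: take the 4 bytes of msq||lsq starting at P's offset.
     The off == 0 case is split out to avoid a shift by 32.  */
  return off ? (msq << (8 * off)) | (lsq >> (8 * (4 - off))) : msq;
}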
3409 /* Function vect_strided_load_supported.
3411 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3412 and FALSE otherwise. */
3414 static bool
3415 vect_strided_load_supported (tree vectype)
3417 optab perm_even_optab, perm_odd_optab;
3418 int mode;
3420 mode = (int) TYPE_MODE (vectype);
3422 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3423 if (!perm_even_optab)
3425 if (vect_print_dump_info (REPORT_DETAILS))
3426 fprintf (vect_dump, "no optab for perm_even.");
3427 return false;
3430 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3432 if (vect_print_dump_info (REPORT_DETAILS))
3433 fprintf (vect_dump, "perm_even op not supported by target.");
3434 return false;
3437 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3438 if (!perm_odd_optab)
3440 if (vect_print_dump_info (REPORT_DETAILS))
3441 fprintf (vect_dump, "no optab for perm_odd.");
3442 return false;
3445 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3447 if (vect_print_dump_info (REPORT_DETAILS))
3448 fprintf (vect_dump, "perm_odd op not supported by target.");
3449 return false;
3451 return true;
3455 /* Function vect_permute_load_chain.
3457 Given a chain of interleaved loads in DR_CHAIN, whose LENGTH must be
3458 a power of 2, generate extract_even/odd stmts to reorder the input data
3459 correctly. Return the final references for loads in RESULT_CHAIN.
3461 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3462 The input is 4 vectors each containing 8 elements. We assign a number to each
3463 element; the input sequence is:
3465 1st vec: 0 1 2 3 4 5 6 7
3466 2nd vec: 8 9 10 11 12 13 14 15
3467 3rd vec: 16 17 18 19 20 21 22 23
3468 4th vec: 24 25 26 27 28 29 30 31
3470 The output sequence should be:
3472 1st vec: 0 4 8 12 16 20 24 28
3473 2nd vec: 1 5 9 13 17 21 25 29
3474 3rd vec: 2 6 10 14 18 22 26 30
3475 4th vec: 3 7 11 15 19 23 27 31
3477 i.e., the first output vector should contain the first elements of each
3478 interleaving group, etc.
3480 We use extract_even/odd instructions to create such output. The input of each
3481 extract_even/odd operation is two vectors
3482 1st vec 2nd vec
3483 0 1 2 3 4 5 6 7
3485 and the output is the vector of extracted even/odd elements. The output of
3486 extract_even will be: 0 2 4 6
3487 and of extract_odd: 1 3 5 7
3490 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
3491 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3492 order. In our example,
3494 E1: extract_even (1st vec, 2nd vec)
3495 E2: extract_odd (1st vec, 2nd vec)
3496 E3: extract_even (3rd vec, 4th vec)
3497 E4: extract_odd (3rd vec, 4th vec)
3499 The output for the first stage will be:
3501 E1: 0 2 4 6 8 10 12 14
3502 E2: 1 3 5 7 9 11 13 15
3503 E3: 16 18 20 22 24 26 28 30
3504 E4: 17 19 21 23 25 27 29 31
3506 In order to proceed and create the correct sequence for the next stage (or
3507 for the correct output, if the second stage is the last one, as in our
3508 example), we first put the output of the extract_even operation and then the
3509 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3510 The input for the second stage is:
3512 1st vec (E1): 0 2 4 6 8 10 12 14
3513 2nd vec (E3): 16 18 20 22 24 26 28 30
3514 3rd vec (E2): 1 3 5 7 9 11 13 15
3515 4th vec (E4): 17 19 21 23 25 27 29 31
3517 The output of the second stage:
3519 E1: 0 4 8 12 16 20 24 28
3520 E2: 2 6 10 14 18 22 26 30
3521 E3: 1 5 9 13 17 21 25 29
3522 E4: 3 7 11 15 19 23 27 31
3524 And RESULT_CHAIN after reordering:
3526 1st vec (E1): 0 4 8 12 16 20 24 28
3527 2nd vec (E3): 1 5 9 13 17 21 25 29
3528 3rd vec (E2): 2 6 10 14 18 22 26 30
3529 4th vec (E4): 3 7 11 15 19 23 27 31. */
3531 static bool
3532 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3533 unsigned int length,
3534 tree stmt,
3535 block_stmt_iterator *bsi,
3536 VEC(tree,heap) **result_chain)
3538 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3539 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3540 tree tmp;
3541 int i;
3542 unsigned int j;
3544 /* Check that the operation is supported. */
3545 if (!vect_strided_load_supported (vectype))
3546 return false;
3548 *result_chain = VEC_copy (tree, heap, dr_chain);
3549 for (i = 0; i < exact_log2 (length); i++)
3551 for (j = 0; j < length; j += 2)
3553 first_vect = VEC_index (tree, dr_chain, j);
3554 second_vect = VEC_index (tree, dr_chain, j+1);
3556 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3557 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3558 DECL_GIMPLE_REG_P (perm_dest) = 1;
3559 add_referenced_var (perm_dest);
3561 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3562 first_vect, second_vect);
3563 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3565 data_ref = make_ssa_name (perm_dest, perm_stmt);
3566 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3567 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3568 mark_symbols_for_renaming (perm_stmt);
3570 VEC_replace (tree, *result_chain, j/2, data_ref);
3572 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3573 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3574 DECL_GIMPLE_REG_P (perm_dest) = 1;
3575 add_referenced_var (perm_dest);
3577 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3578 first_vect, second_vect);
3579 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3580 data_ref = make_ssa_name (perm_dest, perm_stmt);
3581 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3582 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3583 mark_symbols_for_renaming (perm_stmt);
3585 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3587 dr_chain = VEC_copy (tree, heap, *result_chain);
3589 return true;
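
/* [Editorial sketch, not part of GCC.] The permutation above on plain
   arrays: a hypothetical extract_even/odd over 4-element "vectors",
   applied in log2 (LENGTH) stages with the even results placed before
   the odd ones, de-interleaves the chain exactly as the comment above
   describes.  LENGTH <= 8 is assumed for the scratch array.  */

#define SKETCH_NUNITS 4

static void
sketch_extract_even_odd (const int *v0, const int *v1, int *even, int *odd)
{
  int i;
  for (i = 0; i < SKETCH_NUNITS / 2; i++)
    {
      even[i] = v0[2 * i];                       /* evens of 1st vec  */
      even[i + SKETCH_NUNITS / 2] = v1[2 * i];   /* evens of 2nd vec  */
      odd[i] = v0[2 * i + 1];
      odd[i + SKETCH_NUNITS / 2] = v1[2 * i + 1];
    }
}

static void
sketch_permute_load_chain (int chain[][SKETCH_NUNITS], int length)
{
  int tmp[8][SKETCH_NUNITS];
  int stage, j, k;

  for (stage = 1; stage < length; stage *= 2)  /* log2 (LENGTH) stages */
    {
      for (j = 0; j < length; j += 2)
        /* Even results go to slot j/2, odd ones to j/2 + length/2,
           matching the VEC_replace calls above.  */
        sketch_extract_even_odd (chain[j], chain[j + 1],
                                 tmp[j / 2], tmp[j / 2 + length / 2]);
      for (j = 0; j < length; j++)
        for (k = 0; k < SKETCH_NUNITS; k++)
          chain[j][k] = tmp[j][k];   /* copy back, as dr_chain above */
    }
}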
3593 /* Function vect_transform_strided_load.
3595 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3596 to perform their permutation and assign the resulting vectorized statements to
3597 the scalar statements. */
3600 static bool
3601 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3602 block_stmt_iterator *bsi)
3604 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3605 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3606 tree next_stmt, new_stmt;
3607 VEC(tree,heap) *result_chain = NULL;
3608 unsigned int i, gap_count;
3609 tree tmp_data_ref;
3611 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3612 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
3613 vectors that are ready for vector computation. */
3614 result_chain = VEC_alloc (tree, heap, size);
3615 /* Permute. */
3616 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3617 return false;
3619 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3620 Since we scan the chain starting from its first node, their order
3621 corresponds to the order of data-refs in RESULT_CHAIN. */
3622 next_stmt = first_stmt;
3623 gap_count = 1;
3624 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3626 if (!next_stmt)
3627 break;
3629 /* Skip the gaps. Loads created for the gaps will be removed by the dead
3630 code elimination pass later.
3631 DR_GROUP_GAP is the number of steps in elements from the previous
3632 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3633 correspond to the gaps. */
3635 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3637 gap_count++;
3638 continue;
3641 while (next_stmt)
3643 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3644 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3645 copies, and we put the new vector statement in the first available
3646 RELATED_STMT. */
3647 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3648 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3649 else
3651 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3652 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3653 vinfo_for_stmt (prev_stmt));
3654 while (rel_stmt)
3656 prev_stmt = rel_stmt;
3657 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3659 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3661 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3662 gap_count = 1;
3663 /* If NEXT_STMT accesses the same DR as the previous statement,
3664 put the same TMP_DATA_REF as its vectorized statement; otherwise
3665 get the next data-ref from RESULT_CHAIN. */
3666 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3667 break;
3670 return true;
3674 /* vectorizable_load.
3676 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3677 can be vectorized.
3678 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3679 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3680 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3682 bool
3683 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3685 tree scalar_dest;
3686 tree vec_dest = NULL;
3687 tree data_ref = NULL;
3688 tree op;
3689 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3690 stmt_vec_info prev_stmt_info;
3691 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3692 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3693 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3694 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3695 tree new_temp;
3696 int mode;
3697 tree new_stmt = NULL_TREE;
3698 tree dummy;
3699 enum dr_alignment_support alignment_support_scheme;
3700 tree dataref_ptr = NULL_TREE;
3701 tree ptr_incr;
3702 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3703 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3704 int i, j, group_size;
3705 tree msq = NULL_TREE, lsq;
3706 tree offset = NULL_TREE;
3707 tree realignment_token = NULL_TREE;
3708 tree phi_stmt = NULL_TREE;
3709 VEC(tree,heap) *dr_chain = NULL;
3710 bool strided_load = false;
3711 tree first_stmt;
3713 /* Is vectorizable load? */
3714 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3715 return false;
3717 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3719 if (STMT_VINFO_LIVE_P (stmt_info))
3721 /* FORNOW: not yet supported. */
3722 if (vect_print_dump_info (REPORT_DETAILS))
3723 fprintf (vect_dump, "value used after loop.");
3724 return false;
3727 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3728 return false;
3730 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3731 if (TREE_CODE (scalar_dest) != SSA_NAME)
3732 return false;
3734 op = GIMPLE_STMT_OPERAND (stmt, 1);
3735 if (TREE_CODE (op) != ARRAY_REF
3736 && TREE_CODE (op) != INDIRECT_REF
3737 && !DR_GROUP_FIRST_DR (stmt_info))
3738 return false;
3740 if (!STMT_VINFO_DATA_REF (stmt_info))
3741 return false;
3743 mode = (int) TYPE_MODE (vectype);
3745 /* FORNOW. In some cases we can vectorize even if the data type is not
3746 supported (e.g., data copies). */
3747 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3749 if (vect_print_dump_info (REPORT_DETAILS))
3750 fprintf (vect_dump, "Aligned load, but unsupported type.");
3751 return false;
3754 /* Check if the load is a part of an interleaving chain. */
3755 if (DR_GROUP_FIRST_DR (stmt_info))
3757 strided_load = true;
3759 /* Check if interleaving is supported. */
3760 if (!vect_strided_load_supported (vectype))
3761 return false;
3764 if (!vec_stmt) /* transformation not required. */
3766 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3767 return true;
3770 /** Transform. **/
3772 if (vect_print_dump_info (REPORT_DETAILS))
3773 fprintf (vect_dump, "transform load.");
3775 if (strided_load)
3777 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3778 /* Check if the chain of loads is already vectorized. */
3779 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3781 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3782 return true;
3784 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3785 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3786 dr_chain = VEC_alloc (tree, heap, group_size);
3788 else
3790 first_stmt = stmt;
3791 first_dr = dr;
3792 group_size = 1;
3795 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3796 gcc_assert (alignment_support_scheme);
3799 /* In case the vectorization factor (VF) is bigger than the number
3800 of elements that we can fit in a vectype (nunits), we have to generate
3801 more than one vector stmt - i.e., we need to "unroll" the
3802 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3803 from one copy of the vector stmt to the next, in the field
3804 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3805 stages to find the correct vector defs to be used when vectorizing
3806 stmts that use the defs of the current stmt. The example below illustrates
3807 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
3808 4 vectorized stmts):
3810 before vectorization:
3811 RELATED_STMT VEC_STMT
3812 S1: x = memref - -
3813 S2: z = x + 1 - -
3815 step 1: vectorize stmt S1:
3816 We first create the vector stmt VS1_0, and, as usual, record a
3817 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3818 Next, we create the vector stmt VS1_1, and record a pointer to
3819 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3820 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3821 stmts and pointers:
3822 RELATED_STMT VEC_STMT
3823 VS1_0: vx0 = memref0 VS1_1 -
3824 VS1_1: vx1 = memref1 VS1_2 -
3825 VS1_2: vx2 = memref2 VS1_3 -
3826 VS1_3: vx3 = memref3 - -
3827 S1: x = load - VS1_0
3828 S2: z = x + 1 - -
3830 See the documentation of vect_get_vec_def_for_stmt_copy for how the
3831 information we recorded in RELATED_STMT field is used to vectorize
3832 stmt S2. */
3834 /* In case of interleaving (non-unit strided access):
3836 S1: x2 = &base + 2
3837 S2: x0 = &base
3838 S3: x1 = &base + 1
3839 S4: x3 = &base + 3
3841 Vectorized loads are created in the order of memory accesses
3842 starting from the access of the first stmt of the chain:
3844 VS1: vx0 = &base
3845 VS2: vx1 = &base + vec_size*1
3846 VS3: vx2 = &base + vec_size*2
3847 VS4: vx3 = &base + vec_size*3
3849 Then permutation statements are generated:
3851 VS5: vx4 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3852 VS6: vx5 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3855 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3856 (the order of the data-refs in the output of vect_permute_load_chain
3857 corresponds to the order of scalar stmts in the interleaving chain - see
3858 the documentation of vect_permute_load_chain()).
3859 The generation of permutation stmts and recording them in
3860 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3862 In case of both multiple types and interleaving, the vector loads and
3863 permutation stmts above are created for every copy. The result vector stmts
3864 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3865 STMT_VINFO_RELATED_STMT for the next copies. */
3867 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3868 on a target that supports unaligned accesses (dr_unaligned_supported)
3869 we generate the following code:
3870 p = initial_addr;
3871 indx = 0;
3872 loop {
3873 p = p + indx * vectype_size;
3874 vec_dest = *(p);
3875 indx = indx + 1;
3876 }
3878 Otherwise, the data reference is potentially unaligned on a target that
3879 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3880 then generate the following code, in which the data in each iteration is
3881 obtained by two vector loads, one from the previous iteration, and one
3882 from the current iteration:
3883 p1 = initial_addr;
3884 msq_init = *(floor(p1))
3885 p2 = initial_addr + VS - 1;
3886 realignment_token = call target_builtin;
3887 indx = 0;
3888 loop {
3889 p2 = p2 + indx * vectype_size
3890 lsq = *(floor(p2))
3891 vec_dest = realign_load (msq, lsq, realignment_token)
3892 indx = indx + 1;
3893 msq = lsq;
3894 } */
3896 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3898 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3899 phi_stmt = SSA_NAME_DEF_STMT (msq);
3900 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3903 prev_stmt_info = NULL;
3904 for (j = 0; j < ncopies; j++)
3906 /* 1. Create the vector pointer update chain. */
3907 if (j == 0)
3908 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3909 &ptr_incr, false, NULL_TREE);
3910 else
3911 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3913 for (i = 0; i < group_size; i++)
3915 /* 2. Create the vector-load in the loop. */
3916 switch (alignment_support_scheme)
3918 case dr_aligned:
3919 gcc_assert (aligned_access_p (first_dr));
3920 data_ref = build_fold_indirect_ref (dataref_ptr);
3921 break;
3922 case dr_unaligned_supported:
3924 int mis = DR_MISALIGNMENT (first_dr);
3925 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3927 gcc_assert (!aligned_access_p (first_dr));
3928 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
3929 data_ref =
3930 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3931 break;
3933 case dr_unaligned_software_pipeline:
3934 gcc_assert (!aligned_access_p (first_dr));
3935 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3936 break;
3937 default:
3938 gcc_unreachable ();
3940 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3941 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3942 new_temp = make_ssa_name (vec_dest, new_stmt);
3943 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3944 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3945 copy_virtual_operands (new_stmt, stmt);
3946 mark_symbols_for_renaming (new_stmt);
3948 /* 3. Handle explicit realignment if necessary/supported. */
3949 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3951 /* Create in loop:
3952 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3953 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3954 if (!realignment_token)
3955 realignment_token = dataref_ptr;
3956 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3957 new_stmt =
3958 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3959 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3960 new_temp = make_ssa_name (vec_dest, new_stmt);
3961 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3962 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3963 if (i == group_size - 1 && j == ncopies - 1)
3964 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3965 msq = lsq;
3967 if (strided_load)
3968 VEC_quick_push (tree, dr_chain, new_temp);
3969 if (i < group_size - 1)
3970 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3973 if (strided_load)
3975 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3976 return false;
3977 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3978 dr_chain = VEC_alloc (tree, heap, group_size);
3980 else
3982 if (j == 0)
3983 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3984 else
3985 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3986 prev_stmt_info = vinfo_for_stmt (new_stmt);
3990 return true;
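
/* [Editorial sketch, not part of GCC.] Putting the pieces of the
   dr_unaligned_software_pipeline case together, the generated loop
   corresponds to the following standalone scalar code, again with
   hypothetical 4-byte "vectors" packed into a 32-bit unsigned int and
   a pointer assumed to fit in unsigned long: one new aligned load per
   iteration, the previous load (msq) being carried around the loop by
   the loop-header phi.  */

static void
sketch_pipelined_loads (const unsigned char *p, unsigned int *out,
                        unsigned long n)
{
  const unsigned char *fp
    = (const unsigned char *) ((unsigned long) p & ~3UL);
  unsigned int off = (unsigned long) p & 3;
  unsigned int msq = 0, lsq;
  unsigned long i, j;

  /* Prolog: msq_init = *(floor (p1)), as vect_setup_realignment emits.  */
  for (j = 0; j < 4; j++)
    msq = (msq << 8) | fp[j];

  for (i = 0; i < n; i++)
    {
      /* lsq = *(floor (p2)): the next aligned word.  */
      lsq = 0;
      for (j = 0; j < 4; j++)
        lsq = (lsq << 8) | fp[4 * (i + 1) + j];
      /* vec_dest = realign_load (msq, lsq, realignment_token).  */
      out[i] = off ? (msq << (8 * off)) | (lsq >> (8 * (4 - off))) : msq;
      msq = lsq;                     /* feeds the loop-header phi */
    }
}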
3994 /* Function vectorizable_live_operation.
3996 STMT computes a value that is used outside the loop. Check if
3997 it can be supported. */
3999 bool
4000 vectorizable_live_operation (tree stmt,
4001 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
4002 tree *vec_stmt ATTRIBUTE_UNUSED)
4004 tree operation;
4005 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4006 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4007 int i;
4008 int op_type;
4009 tree op;
4010 tree def, def_stmt;
4011 enum vect_def_type dt;
4013 if (!STMT_VINFO_LIVE_P (stmt_info))
4014 return false;
4016 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4017 return false;
4019 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4020 return false;
4022 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4023 op_type = TREE_OPERAND_LENGTH (operation);
4025 /* FORNOW: support only if all uses are invariant. This means
4026 that the scalar operations can remain in place, unvectorized.
4027 The original last scalar value that they compute will be used. */
4029 for (i = 0; i < op_type; i++)
4031 op = TREE_OPERAND (operation, i);
4032 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4034 if (vect_print_dump_info (REPORT_DETAILS))
4035 fprintf (vect_dump, "use not simple.");
4036 return false;
4039 if (dt != vect_invariant_def && dt != vect_constant_def)
4040 return false;
4043 /* No transformation is required for the cases we currently support. */
4044 return true;
4048 /* Function vect_is_simple_cond.
4050 Input:
4051 LOOP - the loop that is being vectorized.
4052 COND - Condition that is checked for simple use.
4054 Returns whether a COND can be vectorized. Checks whether
4055 condition operands are supportable using vect_is_simple_use. */
4057 static bool
4058 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4060 tree lhs, rhs;
4061 tree def;
4062 enum vect_def_type dt;
4064 if (!COMPARISON_CLASS_P (cond))
4065 return false;
4067 lhs = TREE_OPERAND (cond, 0);
4068 rhs = TREE_OPERAND (cond, 1);
4070 if (TREE_CODE (lhs) == SSA_NAME)
4072 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4073 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4074 return false;
4076 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
4077 return false;
4079 if (TREE_CODE (rhs) == SSA_NAME)
4081 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4082 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4083 return false;
4085 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4086 return false;
4088 return true;
4091 /* vectorizable_condition.
4093 Check if STMT is a conditional modify expression that can be vectorized.
4094 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4095 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
4096 at BSI.
4098 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4100 bool
4101 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4103 tree scalar_dest = NULL_TREE;
4104 tree vec_dest = NULL_TREE;
4105 tree op = NULL_TREE;
4106 tree cond_expr, then_clause, else_clause;
4107 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4108 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4109 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4110 tree vec_compare, vec_cond_expr;
4111 tree new_temp;
4112 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4113 enum machine_mode vec_mode;
4114 tree def;
4115 enum vect_def_type dt;
4116 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4117 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4119 gcc_assert (ncopies >= 1);
4120 if (ncopies > 1)
4121 return false; /* FORNOW */
4123 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4124 return false;
4126 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
4128 if (STMT_VINFO_LIVE_P (stmt_info))
4130 /* FORNOW: not yet supported. */
4131 if (vect_print_dump_info (REPORT_DETAILS))
4132 fprintf (vect_dump, "value used after loop.");
4133 return false;
4136 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4137 return false;
4139 op = GIMPLE_STMT_OPERAND (stmt, 1);
4141 if (TREE_CODE (op) != COND_EXPR)
4142 return false;
4144 cond_expr = TREE_OPERAND (op, 0);
4145 then_clause = TREE_OPERAND (op, 1);
4146 else_clause = TREE_OPERAND (op, 2);
4148 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4149 return false;
4151 /* We do not handle two different vector types for the condition
4152 and the values. */
4153 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4154 return false;
4156 if (TREE_CODE (then_clause) == SSA_NAME)
4158 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4159 if (!vect_is_simple_use (then_clause, loop_vinfo,
4160 &then_def_stmt, &def, &dt))
4161 return false;
4163 else if (TREE_CODE (then_clause) != INTEGER_CST
4164 && TREE_CODE (then_clause) != REAL_CST)
4165 return false;
4167 if (TREE_CODE (else_clause) == SSA_NAME)
4169 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4170 if (!vect_is_simple_use (else_clause, loop_vinfo,
4171 &else_def_stmt, &def, &dt))
4172 return false;
4174 else if (TREE_CODE (else_clause) != INTEGER_CST
4175 && TREE_CODE (else_clause) != REAL_CST)
4176 return false;
4179 vec_mode = TYPE_MODE (vectype);
4181 if (!vec_stmt)
4183 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4184 return expand_vec_cond_expr_p (op, vec_mode);
4187 /* Transform */
4189 /* Handle def. */
4190 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4191 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4193 /* Handle cond expr. */
4194 vec_cond_lhs =
4195 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4196 vec_cond_rhs =
4197 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4198 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4199 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4201 /* Arguments are ready. Create the new vector stmt. */
4202 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4203 vec_cond_lhs, vec_cond_rhs);
4204 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4205 vec_compare, vec_then_clause, vec_else_clause);
4207 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
4208 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4209 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4210 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4212 return true;
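
/* [Editorial sketch, not part of GCC.] Element-wise semantics of the
   VEC_COND_EXPR built above, over hypothetical 4-element int vectors
   and assuming the scalar condition was LT_EXPR: each lane
   independently selects the then- or else-value.  */

static void
sketch_vec_cond (const int *cond_lhs, const int *cond_rhs,
                 const int *then_v, const int *else_v, int *dest)
{
  int i;
  for (i = 0; i < 4; i++)
    /* vec_compare per lane, then the per-lane select.  */
    dest[i] = cond_lhs[i] < cond_rhs[i] ? then_v[i] : else_v[i];
}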
4215 /* Function vect_transform_stmt.
4217 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4219 bool
4220 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4222 bool is_store = false;
4223 tree vec_stmt = NULL_TREE;
4224 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4225 tree orig_stmt_in_pattern;
4226 bool done;
4228 if (STMT_VINFO_RELEVANT_P (stmt_info))
4230 switch (STMT_VINFO_TYPE (stmt_info))
4232 case type_demotion_vec_info_type:
4233 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4234 gcc_assert (done);
4235 break;
4237 case type_promotion_vec_info_type:
4238 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4239 gcc_assert (done);
4240 break;
4242 case type_conversion_vec_info_type:
4243 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4244 gcc_assert (done);
4245 break;
4247 case op_vec_info_type:
4248 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4249 gcc_assert (done);
4250 break;
4252 case assignment_vec_info_type:
4253 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4254 gcc_assert (done);
4255 break;
4257 case load_vec_info_type:
4258 done = vectorizable_load (stmt, bsi, &vec_stmt);
4259 gcc_assert (done);
4260 break;
4262 case store_vec_info_type:
4263 done = vectorizable_store (stmt, bsi, &vec_stmt);
4264 gcc_assert (done);
4265 if (DR_GROUP_FIRST_DR (stmt_info))
4267 /* In case of interleaving, the whole chain is vectorized when the
4268 last store in the chain is reached. Store stmts before the last
4269 one are skipped, and their stmt_vec_info shouldn't be freed
4270 meanwhile. */
4271 *strided_store = true;
4272 if (STMT_VINFO_VEC_STMT (stmt_info))
4273 is_store = true;
4275 else
4276 is_store = true;
4277 break;
4279 case condition_vec_info_type:
4280 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4281 gcc_assert (done);
4282 break;
4284 case call_vec_info_type:
4285 done = vectorizable_call (stmt, bsi, &vec_stmt);
4286 break;
4288 default:
4289 if (vect_print_dump_info (REPORT_DETAILS))
4290 fprintf (vect_dump, "stmt not supported.");
4291 gcc_unreachable ();
4294 gcc_assert (vec_stmt || *strided_store);
4295 if (vec_stmt)
4297 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4298 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4299 if (orig_stmt_in_pattern)
4301 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4302 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4304 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4306 /* STMT was inserted by the vectorizer to replace a
4307 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
4308 original sequence that computed this idiom. We need to
4309 record a pointer to VEC_STMT in the stmt_info of
4310 ORIG_STMT_IN_PATTERN. See more details in the
4311 documentation of vect_pattern_recog. */
4313 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4319 if (STMT_VINFO_LIVE_P (stmt_info))
4321 switch (STMT_VINFO_TYPE (stmt_info))
4323 case reduc_vec_info_type:
4324 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4325 gcc_assert (done);
4326 break;
4328 default:
4329 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4330 gcc_assert (done);
4334 return is_store;
4338 /* This function builds, on the loop preheader,
4339 ni_name = the number of iterations the loop executes. */
4341 static tree
4342 vect_build_loop_niters (loop_vec_info loop_vinfo)
4344 tree ni_name, stmt, var;
4345 edge pe;
4346 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4347 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4349 var = create_tmp_var (TREE_TYPE (ni), "niters");
4350 add_referenced_var (var);
4351 ni_name = force_gimple_operand (ni, &stmt, false, var);
4353 pe = loop_preheader_edge (loop);
4354 if (stmt)
4356 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4357 gcc_assert (!new_bb);
4360 return ni_name;
4364 /* This function generates the following statements:
4366 ni_name = number of iterations the loop executes
4367 ratio = ni_name / vf
4368 ratio_mult_vf_name = ratio * vf
4370 and places them at the loop preheader edge. */
4372 static void
4373 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4374 tree *ni_name_ptr,
4375 tree *ratio_mult_vf_name_ptr,
4376 tree *ratio_name_ptr)
4379 edge pe;
4380 basic_block new_bb;
4381 tree stmt, ni_name;
4382 tree var;
4383 tree ratio_name;
4384 tree ratio_mult_vf_name;
4385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4386 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4387 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4388 tree log_vf;
4390 pe = loop_preheader_edge (loop);
4392 /* Generate a temporary variable that contains the
4393 number of iterations the loop executes. */
4395 ni_name = vect_build_loop_niters (loop_vinfo);
4396 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4398 /* Create: ratio = ni >> log2(vf) */
4400 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4401 if (!is_gimple_val (ratio_name))
4403 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4404 add_referenced_var (var);
4406 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4407 pe = loop_preheader_edge (loop);
4408 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4409 gcc_assert (!new_bb);
4412 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4414 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4415 ratio_name, log_vf);
4416 if (!is_gimple_val (ratio_mult_vf_name))
4418 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4419 add_referenced_var (var);
4421 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4422 true, var);
4423 pe = loop_preheader_edge (loop);
4424 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4425 gcc_assert (!new_bb);
4428 *ni_name_ptr = ni_name;
4429 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4430 *ratio_name_ptr = ratio_name;
4432 return;
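
/* [Editorial sketch, not part of GCC.] The shift arithmetic above in
   plain C, valid under the same assumption that VF is a power of 2.
   E.g. ni = 103, vf = 4: ratio = 103 >> 2 = 25 vector iterations and
   ratio_mult_vf = 25 << 2 = 100, leaving 103 - 100 = 3 iterations for
   the scalar epilog loop.  */

static void
sketch_split_niters (unsigned long ni, unsigned int log2_vf,
                     unsigned long *ratio, unsigned long *ratio_mult_vf)
{
  *ratio = ni >> log2_vf;              /* ratio = ni / vf              */
  *ratio_mult_vf = *ratio << log2_vf;  /* iterations done vectorized   */
}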
4436 /* Function update_vuses_to_preheader.
4438 Input:
4439 STMT - a statement with potential VUSEs.
4440 LOOP - the loop whose preheader will contain STMT.
4442 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4443 appears to be defined in a VDEF in another statement in a loop.
4444 One such case is when the VUSE is at the dereference of a __restricted__
4445 pointer in a load and the VDEF is at the dereference of a different
4446 __restricted__ pointer in a store. Vectorization may result in
4447 copy_virtual_operands being called to copy the problematic VUSE to a new
4448 statement that is being inserted in the loop preheader. This procedure
4449 is called to change the SSA_NAME in the new statement's VUSE from the
4450 SSA_NAME updated in the loop to the related SSA_NAME available on the
4451 path entering the loop.
4453 When this function is called, we have the following situation:
4455 # vuse <name1>
4456 S1: vload
4457 do {
4458 # name1 = phi < name0 , name2>
4460 # vuse <name1>
4461 S2: vload
4463 # name2 = vdef <name1>
4464 S3: vstore
4466 }while...
4468 Stmt S1 was created in the loop preheader block as part of misaligned-load
4469 handling. This function fixes the name of the vuse of S1 from 'name1' to
4470 'name0'. */
4472 static void
4473 update_vuses_to_preheader (tree stmt, struct loop *loop)
4475 basic_block header_bb = loop->header;
4476 edge preheader_e = loop_preheader_edge (loop);
4477 ssa_op_iter iter;
4478 use_operand_p use_p;
4480 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4482 tree ssa_name = USE_FROM_PTR (use_p);
4483 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4484 tree name_var = SSA_NAME_VAR (ssa_name);
4485 basic_block bb = bb_for_stmt (def_stmt);
4487 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4488 if (!IS_EMPTY_STMT (def_stmt)
4489 && flow_bb_inside_loop_p (loop, bb))
4491 /* If the block containing the statement defining the SSA_NAME
4492 is in the loop then it's necessary to find the definition
4493 outside the loop using the PHI nodes of the header. */
4494 tree phi;
4495 bool updated = false;
4497 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4499 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4501 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4502 updated = true;
4503 break;
4506 gcc_assert (updated);
4512 /* Function vect_update_ivs_after_vectorizer.
4514 "Advance" the induction variables of LOOP to the value they should take
4515 after the execution of LOOP. This is currently necessary because the
4516 vectorizer does not handle induction variables that are used after the
4517 loop. Such a situation occurs when the last iterations of LOOP are
4518 peeled, because:
4519 1. We introduced new uses after LOOP for IVs that were not originally used
4520 after LOOP: the IVs of LOOP are now used by an epilog loop.
4521 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4522 times, whereas the loop IVs should be bumped N times.
4524 Input:
4525 - LOOP - a loop that is going to be vectorized. The last few iterations
4526 of LOOP were peeled.
4527 - NITERS - the number of iterations that LOOP executes (before it is
4528 vectorized), i.e., the number of times the ivs should be bumped.
4529 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4530 coming out from LOOP on which there are uses of the LOOP ivs
4531 (this is the path from LOOP->exit to epilog_loop->preheader).
4533 The new definitions of the ivs are placed in LOOP->exit.
4534 The phi args associated with the edge UPDATE_E in the bb
4535 UPDATE_E->dest are updated accordingly.
4537 Assumption 1: Like the rest of the vectorizer, this function assumes
4538 a single loop exit that has a single predecessor.
4540 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4541 organized in the same order.
4543 Assumption 3: The access function of the ivs is simple enough (see
4544 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4546 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4547 coming out of LOOP on which the ivs of LOOP are used (this is the path
4548 that leads to the epilog loop; other paths skip the epilog loop). This
4549 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4550 needs to have its phis updated. */
4553 static void
4554 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4555 edge update_e)
4557 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4558 basic_block exit_bb = single_exit (loop)->dest;
4559 tree phi, phi1;
4560 basic_block update_bb = update_e->dest;
4562 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4564 /* Make sure there exists a single-predecessor exit bb: */
4565 gcc_assert (single_pred_p (exit_bb));
4567 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4568 phi && phi1;
4569 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4571 tree access_fn = NULL;
4572 tree evolution_part;
4573 tree init_expr;
4574 tree step_expr;
4575 tree var, stmt, ni, ni_name;
4576 block_stmt_iterator last_bsi;
4578 if (vect_print_dump_info (REPORT_DETAILS))
4580 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4581 print_generic_expr (vect_dump, phi, TDF_SLIM);
4584 /* Skip virtual phis. */
4585 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4587 if (vect_print_dump_info (REPORT_DETAILS))
4588 fprintf (vect_dump, "virtual phi. skip.");
4589 continue;
4592 /* Skip reduction phis. */
4593 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4595 if (vect_print_dump_info (REPORT_DETAILS))
4596 fprintf (vect_dump, "reduc phi. skip.");
4597 continue;
4600 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4601 gcc_assert (access_fn);
4602 evolution_part =
4603 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4604 gcc_assert (evolution_part != NULL_TREE);
4606 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4607 of degree >= 2 or exponential. */
4608 gcc_assert (!tree_is_chrec (evolution_part));
4610 step_expr = evolution_part;
4611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4612 loop->num));
4614 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4615 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4616 fold_convert (TREE_TYPE (init_expr),
4617 niters),
4618 step_expr),
4619 init_expr);
4621 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4622 add_referenced_var (var);
4624 ni_name = force_gimple_operand (ni, &stmt, false, var);
4626 /* Insert stmt into exit_bb. */
4627 last_bsi = bsi_last (exit_bb);
4628 if (stmt)
4629 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4631 /* Fix phi expressions in the successor bb. */
4632 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
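
/* [Editorial sketch, not part of GCC.] For each affine IV the update
   built above is just init + niters * step; e.g. an IV starting at 3
   with step 2 is advanced to 3 + 100 * 2 = 203 after niters = 100
   iterations. An integer IV is assumed for the sketch.  */

static long
sketch_advance_iv (long init_expr, long step_expr, long niters)
{
  /* Mirrors the PLUS_EXPR/MULT_EXPR trees composed above.  */
  return init_expr + niters * step_expr;
}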
4637 /* Function vect_do_peeling_for_loop_bound.
4639 Peel the last iterations of the loop represented by LOOP_VINFO.
4640 The peeled iterations form a new epilog loop. Given that the loop now
4641 iterates NITERS times, the new epilog loop iterates
4642 NITERS % VECTORIZATION_FACTOR times.
4644 The original loop will later be made to iterate
4645 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4647 static void
4648 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4650 tree ni_name, ratio_mult_vf_name;
4651 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4652 struct loop *new_loop;
4653 edge update_e;
4654 basic_block preheader;
4655 int loop_num;
4656 unsigned int th;
4658 if (vect_print_dump_info (REPORT_DETAILS))
4659 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4661 initialize_original_copy_tables ();
4663 /* Generate the following variables on the preheader of original loop:
4665 ni_name = number of iterations the original loop executes
4666 ratio = ni_name / vf
4667 ratio_mult_vf_name = ratio * vf */
4668 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4669 &ratio_mult_vf_name, ratio);
4671 loop_num = loop->num;
4672 /* Threshold for vectorized loop. */
4673 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4674 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4675 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4676 ratio_mult_vf_name, ni_name, false, th);
4677 gcc_assert (new_loop);
4678 gcc_assert (loop_num == loop->num);
4679 #ifdef ENABLE_CHECKING
4680 slpeel_verify_cfg_after_peeling (loop, new_loop);
4681 #endif
4683 /* A guard that controls whether the new_loop is to be executed or skipped
4684 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4685 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4686 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4687 is on the path where the LOOP IVs are used and need to be updated. */
4689 preheader = loop_preheader_edge (new_loop)->src;
4690 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4691 update_e = EDGE_PRED (preheader, 0);
4692 else
4693 update_e = EDGE_PRED (preheader, 1);
4695 /* Update IVs of original loop as if they were advanced
4696 by ratio_mult_vf_name steps. */
4697 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4699 /* After peeling we have to reset scalar evolution analyzer. */
4700 scev_reset ();
4702 free_original_copy_tables ();
4706 /* Function vect_gen_niters_for_prolog_loop.
4708 Set the number of iterations for the loop represented by LOOP_VINFO
4709 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4710 and the number of iterations needed to align DR - the data reference recorded
4711 in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4712 this loop, the data reference DR will refer to an aligned location.
4714 The following computation is generated:
4716 If the misalignment of DR is known at compile time:
4717 addr_mis = DR_MISALIGNMENT (dr);
4718 Else, compute address misalignment in bytes:
4719 addr_mis = addr & (vectype_size - 1)
4721 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4723 (elem_size = element type size; an element is the scalar element
4724 whose type is the inner type of the vectype)
4726 For interleaving,
4728 prolog_niters = min ( LOOP_NITERS ,
4729 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4730 where group_size is the size of the interleaved group. */
4733 static tree
4734 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4736 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4737 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4738 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4739 tree var, stmt;
4740 tree iters, iters_name;
4741 edge pe;
4742 basic_block new_bb;
4743 tree dr_stmt = DR_STMT (dr);
4744 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4745 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4746 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4747 tree niters_type = TREE_TYPE (loop_niters);
4748 int group_size = 1;
4749 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4751 if (DR_GROUP_FIRST_DR (stmt_info))
4753 /* For interleaved access, the element size must be multiplied by the size of
4754 the interleaved group. */
4755 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4756 DR_GROUP_FIRST_DR (stmt_info)));
4757 element_size *= group_size;
4760 pe = loop_preheader_edge (loop);
4762 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4764 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4765 int elem_misalign = byte_misalign / element_size;
4767 if (vect_print_dump_info (REPORT_DETAILS))
4768 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4769 iters = build_int_cst (niters_type,
4770 (vf - elem_misalign)&(vf/group_size-1));
4772 else
4774 tree new_stmts = NULL_TREE;
4775 tree start_addr =
4776 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4777 tree ptr_type = TREE_TYPE (start_addr);
4778 tree size = TYPE_SIZE (ptr_type);
4779 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4780 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4781 tree elem_size_log =
4782 build_int_cst (type, exact_log2 (vectype_align/vf));
4783 tree vf_minus_1 = build_int_cst (type, vf - 1);
4784 tree vf_tree = build_int_cst (type, vf);
4785 tree byte_misalign;
4786 tree elem_misalign;
4788 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4789 gcc_assert (!new_bb);
4791 /* Create: byte_misalign = addr & (vectype_size - 1) */
4792 byte_misalign =
4793 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4795 /* Create: elem_misalign = byte_misalign / element_size */
4796 elem_misalign =
4797 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4799 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4800 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4801 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4802 iters = fold_convert (niters_type, iters);
4805 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4806 /* If the loop bound is known at compile time we already verified that it is
4807 greater than vf; since the misalignment ('iters') is at most vf, there's
4808 no need to generate the MIN_EXPR in this case. */
4809 if (TREE_CODE (loop_niters) != INTEGER_CST)
4810 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4812 if (vect_print_dump_info (REPORT_DETAILS))
4814 fprintf (vect_dump, "niters for prolog loop: ");
4815 print_generic_expr (vect_dump, iters, TDF_SLIM);
4818 var = create_tmp_var (niters_type, "prolog_loop_niters");
4819 add_referenced_var (var);
4820 iters_name = force_gimple_operand (iters, &stmt, false, var);
4822 /* Insert stmt on loop preheader edge. */
4823 if (stmt)
4825 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4826 gcc_assert (!new_bb);
4829 return iters_name;
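
/* [Editorial sketch, not part of GCC.] The run-time branch of the
   computation above, assuming VF is a power of 2, no interleaving
   (group_size = 1), and an address that fits in unsigned long.
   E.g. vf = 4, elem_size = 4 (16-byte vectors of ints) and
   start_addr & 15 == 8: elem_misalign = 2, so (4 - 2) & 3 = 2 prolog
   iterations make the access aligned.  */

static unsigned long
sketch_prolog_niters (unsigned long start_addr, unsigned long vf,
                      unsigned long elem_size, unsigned long loop_niters)
{
  unsigned long byte_misalign = start_addr & (vf * elem_size - 1);
  unsigned long elem_misalign = byte_misalign / elem_size;
  unsigned long iters = (vf - elem_misalign) & (vf - 1);

  /* prolog_loop_niters = min (iters, loop_niters) */
  return iters < loop_niters ? iters : loop_niters;
}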
4833 /* Function vect_update_init_of_dr.
4835 NITERS iterations were peeled from LOOP. DR represents a data reference
4836 in LOOP. This function updates the information recorded in DR to
4837 account for the fact that the first NITERS iterations had already been
4838 executed. Specifically, it updates the OFFSET field of DR. */
4840 static void
4841 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4843 tree offset = DR_OFFSET (dr);
4845 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4846 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4847 DR_OFFSET (dr) = offset;
4851 /* Function vect_update_inits_of_drs.
4853 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4854 This function updates the information recorded for the data references in
4855 the loop to account for the fact that the first NITERS iterations had
4856 already been executed. Specifically, it updates the initial_condition of the
4857 access_function of all the data_references in the loop. */
4859 static void
4860 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4862 unsigned int i;
4863 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4864 struct data_reference *dr;
4866 if (vect_dump && (dump_flags & TDF_DETAILS))
4867 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4869 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4870 vect_update_init_of_dr (dr, niters);
4874 /* Function vect_do_peeling_for_alignment.
4876 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4877 'niters' is set to the misalignment of one of the data references in the
4878 loop, thereby forcing it to refer to an aligned location at the beginning
4879 of the execution of this loop. The data reference for which we are
4880 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4882 static void
4883 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4885 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4886 tree niters_of_prolog_loop, ni_name;
4887 tree n_iters;
4888 struct loop *new_loop;
4890 if (vect_print_dump_info (REPORT_DETAILS))
4891 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4893 initialize_original_copy_tables ();
4895 ni_name = vect_build_loop_niters (loop_vinfo);
4896 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4898 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4899 new_loop =
4900 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4901 niters_of_prolog_loop, ni_name, true, 0);
4902 gcc_assert (new_loop);
4903 #ifdef ENABLE_CHECKING
4904 slpeel_verify_cfg_after_peeling (new_loop, loop);
4905 #endif
4907 /* Update the number of times the loop executes. */
4908 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4909 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4910 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4912 /* Update the init conditions of the access functions of all data refs. */
4913 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4915 /* After peeling we have to reset scalar evolution analyzer. */
4916 scev_reset ();
4918 free_original_copy_tables ();
4922 /* Function vect_create_cond_for_align_checks.
4924 Create a conditional expression that represents the alignment checks for
4925 all of the data references (array element references) whose alignment must be
4926 checked at runtime.
4928 Input:
4929 LOOP_VINFO - two fields of the loop information are used.
4930 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4931 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4933 Output:
4934 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4935 expression.
4936 The returned value is the conditional expression to be used in the if
4937 statement that controls which version of the loop gets executed at runtime.
4939 The algorithm makes two assumptions:
4940 1) The number of bytes "n" in a vector is a power of 2.
4941 2) An address "a" is aligned if a%n is zero, so the
4942 test can be done as a&(n-1) == 0. For example, for 16
4943 byte vectors the test is a&0xf == 0. */
4945 static tree
4946 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4947 tree *cond_expr_stmt_list)
4949 VEC(tree,heap) *may_misalign_stmts
4950 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4951 tree ref_stmt, tmp;
4952 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4953 tree mask_cst;
4954 unsigned int i;
4955 tree psize;
4956 tree int_ptrsize_type;
4957 char tmp_name[20];
4958 tree or_tmp_name = NULL_TREE;
4959 tree and_tmp, and_tmp_name, and_stmt;
4960 tree ptrsize_zero;
4962 /* Check that mask is one less than a power of 2, i.e., mask is
4963 all zeros followed by all ones. */
4964 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4966 /* CHECKME: what is the best integer or unsigned type to use to hold a
4967 cast from a pointer value? */
4968 psize = TYPE_SIZE (ptr_type_node);
4969 int_ptrsize_type
4970 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4972 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
4973 of the first vector of the i'th data reference. */
4975 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4977 tree new_stmt_list = NULL_TREE;
4978 tree addr_base;
4979 tree addr_tmp, addr_tmp_name, addr_stmt;
4980 tree or_tmp, new_or_tmp_name, or_stmt;
4982 /* create: addr_tmp = (int)(address_of_first_vector) */
4983 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4984 &new_stmt_list,
4985 NULL_TREE);
4987 if (new_stmt_list != NULL_TREE)
4988 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4990 sprintf (tmp_name, "%s%d", "addr2int", i);
4991 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4992 add_referenced_var (addr_tmp);
4993 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4994 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4995 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
4996 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4997 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4999 /* The addresses are ORed together. */
5001 if (or_tmp_name != NULL_TREE)
5003 /* create: or_tmp = or_tmp | addr_tmp */
5004 sprintf (tmp_name, "%s%d", "orptrs", i);
5005 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5006 add_referenced_var (or_tmp);
5007 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5008 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
5009 or_tmp_name, addr_tmp_name);
5010 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
5011 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5012 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5013 or_tmp_name = new_or_tmp_name;
5015 else
5016 or_tmp_name = addr_tmp_name;
5018 } /* end for i */
5020 mask_cst = build_int_cst (int_ptrsize_type, mask);
5022 /* create: and_tmp = or_tmp & mask */
5023 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
5024 add_referenced_var (and_tmp);
5025 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5027 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
5028 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
5029 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5030 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5032 /* Make and_tmp the left operand of the conditional test against zero.
5033 If and_tmp has a nonzero bit then some address is unaligned. */
5034 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5035 return build2 (EQ_EXPR, boolean_type_node,
5036 and_tmp_name, ptrsize_zero);
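
/* [Editorial sketch, not part of GCC.] The versioning test built above
   reduces to this scalar check: OR the start addresses together, AND
   with the mask, and compare against zero; the vectorized loop may run
   only if the result is true. Two pointers and an unsigned long cast
   are assumed for illustration.  */

static int
sketch_all_aligned_p (const void *dr1, const void *dr2, unsigned long mask)
{
  /* mask is vector_size - 1, e.g. 0xf for 16-byte vectors.  */
  unsigned long ors = (unsigned long) dr1 | (unsigned long) dr2;
  return (ors & mask) == 0;   /* the EQ_EXPR against ptrsize_zero */
}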
5040 /* Function vect_transform_loop.
5042 The analysis phase has determined that the loop is vectorizable.
5043 Vectorize the loop - create vectorized stmts to replace the scalar
5044 stmts in the loop, and update the loop exit condition. */
5046 void
5047 vect_transform_loop (loop_vec_info loop_vinfo)
5049 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5051 int nbbs = loop->num_nodes;
5052 block_stmt_iterator si, next_si;
5053 int i;
5054 tree ratio = NULL;
5055 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5056 bool strided_store;
5058 if (vect_print_dump_info (REPORT_DETAILS))
5059 fprintf (vect_dump, "=== vect_transform_loop ===");
5061 /* If the loop has data references that may or may not be aligned then
5062 two versions of the loop need to be generated, one which is vectorized
5063 and one which isn't. A test is then generated to control which of the
5064 loops is executed. The test checks for the alignment of all of the
5065 data references that may or may not be aligned. */
5067 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5069 struct loop *nloop;
5070 tree cond_expr;
5071 tree cond_expr_stmt_list = NULL_TREE;
5072 basic_block condition_bb;
5073 block_stmt_iterator cond_exp_bsi;
5074 basic_block merge_bb;
5075 basic_block new_exit_bb;
5076 edge new_exit_e, e;
5077 tree orig_phi, new_phi, arg;
5078 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
5080 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5081 &cond_expr_stmt_list);
5082 initialize_original_copy_tables ();
5083 nloop = loop_version (loop, cond_expr, &condition_bb,
5084 prob, prob, REG_BR_PROB_BASE - prob, true);
5085 free_original_copy_tables ();
5087 /** Loop versioning violates an assumption we try to maintain during
5088 vectorization - that the loop exit block has a single predecessor.
5089 After versioning, the exit block of both loop versions is the same
5090 basic block (i.e. it has two predecessors). Just in order to simplify
5091 following transformations in the vectorizer, we fix this situation
5092 here by adding a new (empty) block on the exit-edge of the loop,
5093 with the proper loop-exit phis to maintain loop-closed-form. **/
5095 merge_bb = single_exit (loop)->dest;
5096 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5097 new_exit_bb = split_edge (single_exit (loop));
5098 new_exit_e = single_exit (loop);
5099 e = EDGE_SUCC (new_exit_bb, 0);
5101 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5102 orig_phi = PHI_CHAIN (orig_phi))
5104 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5105 new_exit_bb);
5106 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5107 add_phi_arg (new_phi, arg, new_exit_e);
5108 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
5111 /** end loop-exit-fixes after versioning **/
5113 update_ssa (TODO_update_ssa);
5114 cond_exp_bsi = bsi_last (condition_bb);
5115 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
5118 /* CHECKME: we wouldn't need this if we called update_ssa once
5119 for all loops. */
5120 bitmap_zero (vect_memsyms_to_rename);
5122 /* Peel the loop if there are data refs with unknown alignment.
5123 Only one data ref with unknown alignment is allowed. */
5125 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5126 vect_do_peeling_for_alignment (loop_vinfo);
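
  /* A rough sketch of what peeling for alignment does (illustrative
     only; PROLOG_NITERS is a hypothetical name for the run-time count
     computed so that the chosen data ref reaches an aligned address):

       for (i = 0; i < prolog_niters; i++)   <- scalar prolog
         ...
       for ( ; i < n; i++)                   <- loop to be vectorized;
         ...                                    the chosen ref is aligned  */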

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that is not a multiple of
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
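
  /* Conceptually (an illustrative sketch only):

       for (i = 0; i < (n / VF) * VF; i += VF)   <- this loop, vectorized
         ...
       for ( ; i < n; i++)                       <- scalar epilog copy
         ...  */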

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			   LOOP_VINFO_INT_NITERS (loop_vinfo)
			   / vectorization_factor);
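
  /* E.g. (illustrative): for a loop whose iteration count is known to
     be 100 at compile time with VF == 4, RATIO is the constant 25,
     which is what slpeel_make_loop_iterate_ntimes below makes the
     vectorized loop iterate.  */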

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed will need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = bsi_start (bb); !bsi_end_p (si);)
	{
	  tree stmt = bsi_stmt (si);
	  stmt_vec_info stmt_info;
	  bool is_store;

	  if (vect_print_dump_info (REPORT_DETAILS))
	    {
	      fprintf (vect_dump, "------>vectorizing statement: ");
	      print_generic_expr (vect_dump, stmt, TDF_SLIM);
	    }
	  stmt_info = vinfo_for_stmt (stmt);
	  gcc_assert (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    {
	      bsi_next (&si);
	      continue;
	    }

	  if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
	       != (unsigned HOST_WIDE_INT) vectorization_factor)
	      && vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "multiple-types.");
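
	  /* E.g. (illustrative): in a loop mixing chars and ints where
	     VF == 16, an int statement gets a 4-element vector type
	     (4 != 16), so several vector stmts must be generated per
	     scalar stmt; the dump note above flags that situation.  */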

	  /* -------- vectorize statement ------------ */
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "transform statement.");

	  strided_store = false;
	  is_store = vect_transform_stmt (stmt, &si, &strided_store);
	  if (is_store)
	    {
	      stmt_ann_t ann;
	      if (DR_GROUP_FIRST_DR (stmt_info))
		{
		  /* Interleaving.  If IS_STORE is TRUE, the vectorization
		     of the interleaving chain was completed - free all
		     the stores in the chain.  */
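		  /* E.g. (illustrative): if a[2*i] and a[2*i+1] form one
		     interleaving group, the vector stmts for the whole
		     group were emitted when its last scalar store was
		     transformed, so every member of the chain can now
		     be removed.  */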
		  tree next = DR_GROUP_FIRST_DR (stmt_info);
		  tree tmp;
		  stmt_vec_info next_stmt_info;

		  while (next)
		    {
		      next_si = bsi_for_stmt (next);
		      next_stmt_info = vinfo_for_stmt (next);
		      /* Free the attached stmt_vec_info and remove the
			 stmt.  */
		      ann = stmt_ann (next);
		      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
		      free (next_stmt_info);
		      set_stmt_info (ann, NULL);
		      bsi_remove (&next_si, true);
		      next = tmp;
		    }
		  bsi_remove (&si, true);
		  continue;
		}
	      else
		{
		  /* Free the attached stmt_vec_info and remove the stmt.  */
		  ann = stmt_ann (stmt);
		  free (stmt_info);
		  set_stmt_info (ann, NULL);
		  bsi_remove (&si, true);
		  continue;
		}
	    }
	  bsi_next (&si);
	}			/* stmts in BB */
    }				/* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");