/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends
   the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
   VAR_KIND) to NAME, if NAME is provided; otherwise the prefix alone is
   used.  */
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
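/* An illustrative (made-up) naming sketch: with NAME "a", the calls above
   would produce temporaries named along the lines of "vect_a" for
   vect_simple_var, "stmp_a" for vect_scalar_var, and "vect_pa" for
   vect_pointer_var (the "vect_p" prefix concatenated with "a").  */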
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access step we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
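/* A rough, purely illustrative sketch of what gets appended to
   NEW_STMT_LIST for an access a[i] (assuming DR_BASE_ADDRESS = &a,
   DR_OFFSET = 0, DR_INIT = 0 and a V4SI vectype; all names invented):

     base_off.5 = 0;
     vect_pa.6 = (v4si *) (&a + base_off.5);

   The returned SSA_NAME would then be vect_pa.6.  */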
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

        v8hi *vp;
        vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop: **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
/* Function bump_vector_ptr.

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, ptr_var,
                      build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* Function get_initial_def_for_induction.

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
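/* A worked instance of the above (purely illustrative numbers): for X=0,
   S=1 and VF=4 the initial vector is [0,1,2,3], and the step vector added
   on each iteration (built further below) is [VF*S,VF*S,VF*S,VF*S] =
   [4,4,4,4], so successive iterations compute [4,5,6,7], [8,9,10,11],
   and so on.  */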
static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }

  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_name = init_expr;
  t = NULL_TREE;
  t = tree_cons (NULL_TREE, init_expr, t);
  for (i = 1; i < nunits; i++)
    {
      /* Create: new_name = new_name + step_expr  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);
      init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var,
                          fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr));
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                     build2 (PLUS_EXPR, vectype, induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_{step*nunits}  */
          new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                             build2 (PLUS_EXPR, vectype, vec_def, vec_step));
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy.

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0:  vx.0 = memref0      VS1.1
                        VS1.1:  vx.1 = memref1      VS1.2
                        VS1.2:  vx.2 = memref2      VS1.3
                        VS1.3:  vx.3 = memref3

   S2: z = x + ...      VSnew.0:  vz0 = vx.0 + ...  VSnew.1
                        VSnew.1:  vz1 = vx.1 + ...  VSnew.2
                        VSnew.2:  vz2 = vx.2 + ...  VSnew.3
                        VSnew.3:  vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}
#define ADJUST_IN_EPILOG 1

/* Function get_initial_def_for_reduction.

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */
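/* A small worked instance of the "ADJUST_IN_EPILOG" scheme (illustrative
   numbers only): summing a[] = {1,2,3,4,5,6,7,8} with init_val = 5 and a
   4-unit vector starts from [0,0,0,0], accumulates the partial sums
   [1+5, 2+6, 3+7, 4+8] = [6,8,10,12] in the loop, and the epilog reduces
   these to 36 and then adds back init_val to obtain 41.  */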
static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  int nelements;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; The last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}
/* Function vect_create_epilog_for_reduction.

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
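/* A hedged illustration of the whole-vector-shift scheme (scheme 2 below),
   for a 4-element vector of partial sums [a,b,c,d] and an add reduction
   (lanes marked "_" are don't-care; little-endian layout assumed):

     shift by VS/2 (2 elements):  [a,b,c,d] -> [c,d,_,_]
     vop (add):                   [a+c, b+d, _, _]
     shift by 1 element:          [b+d, _, _, _]
     vop (add):                   [a+b+c+d, _, _, _]

   after which step 2.4 extracts the scalar result from one element of the
   final vector (which element depends on endianness).  */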
static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_OPERAND_LENGTH (operation);
  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }
  /*** 2. Create epilog code
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
             combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

          ***/
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                            build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create:  va' = vec_shift <va, offset>
                  Create:  va = vop <va, va'>
                }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (shift_code, vectype,
                                            new_temp, bitpos));
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (code, vectype,
                                            new_name, new_temp));
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest,
                                    build2 (code, scalar_type, new_name, new_temp));
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4  Extract the final scalar result.  Create:
          s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is NULL_TREE).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest,
                            build2 (code, scalar_type, new_temp, scalar_initial_def));
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }
  /* 2.6 Replace uses of s_out0 with uses of s_out4 (or s_out3, when no
         adjustment was needed).  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
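/* A hedged numeric sketch of the ncopies computation used below: with a
   vectorization factor VF=8 (determined by the V8HI arguments in the example
   above) and a result vectype of V4SI (nunits=4), ncopies = VF/nunits = 2,
   so the reduction stmt is "unrolled" into two vector stmts per iteration.  */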
1486 bool
1487 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1489 tree vec_dest;
1490 tree scalar_dest;
1491 tree op;
1492 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1494 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1495 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1496 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1497 tree operation;
1498 enum tree_code code, orig_code, epilog_reduc_code = 0;
1499 enum machine_mode vec_mode;
1500 int op_type;
1501 optab optab, reduc_optab;
1502 tree new_temp = NULL_TREE;
1503 tree def, def_stmt;
1504 enum vect_def_type dt;
1505 tree new_phi;
1506 tree scalar_type;
1507 bool is_simple_use;
1508 tree orig_stmt;
1509 stmt_vec_info orig_stmt_info;
1510 tree expr = NULL_TREE;
1511 int i;
1512 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1513 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1514 stmt_vec_info prev_stmt_info;
1515 tree reduc_def;
1516 tree new_stmt = NULL_TREE;
1517 int j;
1519 gcc_assert (ncopies >= 1);
1521 /* 1. Is vectorizable reduction? */
1523 /* Not supportable if the reduction variable is used in the loop. */
1524 if (STMT_VINFO_RELEVANT_P (stmt_info))
1525 return false;
1527 if (!STMT_VINFO_LIVE_P (stmt_info))
1528 return false;
1530 /* Make sure it was already recognized as a reduction computation. */
1531 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1532 return false;
1534 /* 2. Has this been recognized as a reduction pattern?
1536 Check if STMT represents a pattern that has been recognized
1537 in earlier analysis stages. For stmts that represent a pattern,
1538 the STMT_VINFO_RELATED_STMT field records the last stmt in
1539 the original sequence that constitutes the pattern. */
1541 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1542 if (orig_stmt)
1544 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1545 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1546 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1547 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1550 /* 3. Check the operands of the operation. The first operands are defined
1551 inside the loop body. The last operand is the reduction variable,
1552 which is defined by the loop-header-phi. */
1554 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
1556 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1557 code = TREE_CODE (operation);
1558 op_type = TREE_OPERAND_LENGTH (operation);
1559 if (op_type != binary_op && op_type != ternary_op)
1560 return false;
1561 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1562 scalar_type = TREE_TYPE (scalar_dest);
1564 /* All uses but the last are expected to be defined in the loop.
1565 The last use is the reduction variable. */
1566 for (i = 0; i < op_type-1; i++)
1568 op = TREE_OPERAND (operation, i);
1569 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1570 gcc_assert (is_simple_use);
1571 if (dt != vect_loop_def
1572 && dt != vect_invariant_def
1573 && dt != vect_constant_def
1574 && dt != vect_induction_def)
1575 return false;
1578 op = TREE_OPERAND (operation, i);
1579 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1580 gcc_assert (is_simple_use);
1581 gcc_assert (dt == vect_reduction_def);
1582 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1583 if (orig_stmt)
1584 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1585 else
1586 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1588 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1589 return false;
1591 /* 4. Supportable by target? */
1593 /* 4.1. check support for the operation in the loop */
1594 optab = optab_for_tree_code (code, vectype);
1595 if (!optab)
1597 if (vect_print_dump_info (REPORT_DETAILS))
1598 fprintf (vect_dump, "no optab.");
1599 return false;
1601 vec_mode = TYPE_MODE (vectype);
1602 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1604 if (vect_print_dump_info (REPORT_DETAILS))
1605 fprintf (vect_dump, "op not supported by target.");
1606 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1607 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1608 < vect_min_worthwhile_factor (code))
1609 return false;
1610 if (vect_print_dump_info (REPORT_DETAILS))
1611 fprintf (vect_dump, "proceeding using word mode.");
1614 /* Worthwhile without SIMD support? */
1615 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1616 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1617 < vect_min_worthwhile_factor (code))
1619 if (vect_print_dump_info (REPORT_DETAILS))
1620 fprintf (vect_dump, "not worthwhile without SIMD support.");
1621 return false;
1624 /* 4.2. Check support for the epilog operation.
1626 If STMT represents a reduction pattern, then the type of the
1627 reduction variable may be different than the type of the rest
1628 of the arguments. For example, consider the case of accumulation
1629 of shorts into an int accumulator; The original code:
1630 S1: int_a = (int) short_a;
1631 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1633 was replaced with:
1634 STMT: int_acc = widen_sum <short_a, int_acc>
1636 This means that:
1637 1. The tree-code that is used to create the vector operation in the
1638 epilog code (that reduces the partial results) is not the
1639 tree-code of STMT, but is rather the tree-code of the original
1640 stmt from the pattern that STMT is replacing. I.e, in the example
1641 above we want to use 'widen_sum' in the loop, but 'plus' in the
1642 epilog.
1643 2. The type (mode) we use to check available target support
1644 for the vector operation to be created in the *epilog*, is
1645 determined by the type of the reduction variable (in the example
1646 above we'd check this: plus_optab[vect_int_mode]).
1647 However the type (mode) we use to check available target support
1648 for the vector operation to be created *inside the loop*, is
1649 determined by the type of the other arguments to STMT (in the
1650 example we'd check this: widen_sum_optab[vect_short_mode]).
1652 This is contrary to "regular" reductions, in which the types of all
1653 the arguments are the same as the type of the reduction variable.
1654 For "regular" reductions we can therefore use the same vector type
1655 (and also the same tree-code) when generating the epilog code and
1656 when generating the code inside the loop. */
1658 if (orig_stmt)
1660 /* This is a reduction pattern: get the vectype from the type of the
1661 reduction variable, and get the tree-code from orig_stmt. */
1662 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1663 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1664 vec_mode = TYPE_MODE (vectype);
1666 else
1668 /* Regular reduction: the same vectype and tree-code used for the
1669 vector code inside the loop can also be used for the epilog code. */
1670 orig_code = code;
1673 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1674 return false;
1675 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1676 if (!reduc_optab)
1678 if (vect_print_dump_info (REPORT_DETAILS))
1679 fprintf (vect_dump, "no optab for reduction.");
1680 epilog_reduc_code = NUM_TREE_CODES;
1682 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1684 if (vect_print_dump_info (REPORT_DETAILS))
1685 fprintf (vect_dump, "reduc op not supported by target.");
1686 epilog_reduc_code = NUM_TREE_CODES;
1689 if (!vec_stmt) /* transformation not required. */
1691 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1692 return true;
1695 /** Transform. **/
1697 if (vect_print_dump_info (REPORT_DETAILS))
1698 fprintf (vect_dump, "transform reduction.");
1700 /* Create the destination vector */
1701 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1703 /* Create the reduction-phi that defines the reduction-operand. */
1704 new_phi = create_phi_node (vec_dest, loop->header);
1706 /* In case the vectorization factor (VF) is bigger than the number
1707 of elements that we can fit in a vectype (nunits), we have to generate
1708 more than one vector stmt, i.e., we need to "unroll" the
1709 vector stmt by a factor VF/nunits. For more details see documentation
1710 in vectorizable_operation. */
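   /* Illustrative example (hypothetical values, not from this file): with
      VF = 8 and a V4SI vectype (nunits = 4), ncopies = VF/nunits = 2, so
      the loop below emits two copies of the vector reduction stmt, chained
      through STMT_VINFO_RELATED_STMT.  */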
1712 prev_stmt_info = NULL;
1713 for (j = 0; j < ncopies; j++)
1715 /* Handle uses. */
1716 if (j == 0)
1718 op = TREE_OPERAND (operation, 0);
1719 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1720 if (op_type == ternary_op)
1722 op = TREE_OPERAND (operation, 1);
1723 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1726 /* Get the vector def for the reduction variable from the phi node */
1727 reduc_def = PHI_RESULT (new_phi);
1729 else
1731 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1732 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1733 if (op_type == ternary_op)
1734 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1736 /* Get the vector def for the reduction variable from the vectorized
1737 reduction operation generated in the previous iteration (j-1) */
1738 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
1741 /* Arguments are ready. Create the new vector stmt. */
1743 if (op_type == binary_op)
1744 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1745 else
1746 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1747 reduc_def);
1748 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
1749 new_temp = make_ssa_name (vec_dest, new_stmt);
1750 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1751 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1753 if (j == 0)
1754 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1755 else
1756 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1757 prev_stmt_info = vinfo_for_stmt (new_stmt);
1760 /* Finalize the reduction-phi (set its arguments) and create the
1761 epilog reduction code. */
1762 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1763 return true;
1766 /* Checks if CALL can be vectorized in type VECTYPE. Returns
1767 a function declaration if the target has a vectorized version
1768 of the function, or NULL_TREE if the function cannot be vectorized. */
1770 tree
1771 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
1773 tree fndecl = get_callee_fndecl (call);
1774 enum built_in_function code;
1776 /* We only handle functions that do not read or clobber memory -- i.e.
1777 const or novops ones. */
1778 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1779 return NULL_TREE;
1781 if (!fndecl
1782 || TREE_CODE (fndecl) != FUNCTION_DECL
1783 || !DECL_BUILT_IN (fndecl))
1784 return NULL_TREE;
1786 code = DECL_FUNCTION_CODE (fndecl);
1787 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
1788 vectype_in);
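/* Illustrative usage sketch (hypothetical target mapping, not taken from
   this file): if the target's builtin_vectorized_function hook maps
   BUILT_IN_SQRTF with V4SF as both VECTYPE_OUT and VECTYPE_IN to a vector
   square-root builtin, then for a loop body such as

     a[i] = sqrtf (b[i]);

   this function returns that builtin's FUNCTION_DECL; otherwise it returns
   NULL_TREE and vectorizable_call rejects the stmt.  */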
1791 /* Function vectorizable_call.
1793 Check if STMT performs a function call that can be vectorized.
1794 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1795 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1796 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1798 bool
1799 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1801 tree vec_dest;
1802 tree scalar_dest;
1803 tree operation;
1804 tree op, type;
1805 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1806 tree vectype_out, vectype_in;
1807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1808 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1809 enum vect_def_type dt[2];
1810 int ncopies, j, nargs;
1811 call_expr_arg_iterator iter;
1813 /* Is STMT a vectorizable call? */
1814 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1815 return false;
1817 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1818 return false;
1820 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1821 if (TREE_CODE (operation) != CALL_EXPR)
1822 return false;
1824 /* Process function arguments. */
1825 rhs_type = NULL_TREE;
1826 nargs = 0;
1827 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1829 ++nargs;
1831 /* Bail out if the function has more than two arguments; we
1832 do not have interesting builtin functions to vectorize with
1833 more than two arguments. */
1834 if (nargs > 2)
1835 return false;
1837 /* We can only handle calls with arguments of the same type. */
1838 if (rhs_type
1839 && rhs_type != TREE_TYPE (op))
1841 if (vect_print_dump_info (REPORT_DETAILS))
1842 fprintf (vect_dump, "argument types differ.");
1843 return false;
1845 rhs_type = TREE_TYPE (op);
1847 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
1849 if (vect_print_dump_info (REPORT_DETAILS))
1850 fprintf (vect_dump, "use not simple.");
1851 return false;
1855 /* No arguments is also not good. */
1856 if (nargs == 0)
1857 return false;
1859 vectype_in = get_vectype_for_scalar_type (rhs_type);
1861 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1862 vectype_out = get_vectype_for_scalar_type (lhs_type);
1864 /* Only handle the case of vectors with the same number of elements.
1865 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1866 instruction which converts V2DFmode to V4SImode but only
1867 using the lower half of the V4SImode result. */
1868 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1869 return false;
1871 /* For now, we only vectorize functions if a target specific builtin
1872 is available. TODO -- in some cases, it might be profitable to
1873 insert the calls for pieces of the vector, in order to be able
1874 to vectorize other operations in the loop. */
1875 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1876 if (fndecl == NULL_TREE)
1878 if (vect_print_dump_info (REPORT_DETAILS))
1879 fprintf (vect_dump, "function is not vectorizable.");
1881 return false;
1884 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1886 if (!vec_stmt) /* transformation not required. */
1888 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1889 return true;
1892 /** Transform. **/
1894 if (vect_print_dump_info (REPORT_DETAILS))
1895 fprintf (vect_dump, "transform call.");
1897 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1898 / TYPE_VECTOR_SUBPARTS (vectype_out));
1899 gcc_assert (ncopies >= 1);
1901 /* Handle def. */
1902 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1903 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1905 prev_stmt_info = NULL;
1906 for (j = 0; j < ncopies; ++j)
1908 tree new_stmt, vargs;
1909 tree vec_oprnd[2];
1910 int n;
1912 /* Build argument list for the vectorized call. */
1913 /* FIXME: Rewrite this so that it doesn't construct a temporary
1914 list. */
1915 vargs = NULL_TREE;
1916 n = -1;
1917 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1919 ++n;
1921 if (j == 0)
1922 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1923 else
1924 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1926 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1928 vargs = nreverse (vargs);
1930 rhs = build_function_call_expr (fndecl, vargs);
1931 new_stmt = build2 (GIMPLE_MODIFY_STMT, NULL_TREE, vec_dest, rhs);
1932 new_temp = make_ssa_name (vec_dest, new_stmt);
1933 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1935 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1937 if (j == 0)
1938 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1939 else
1940 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1941 prev_stmt_info = vinfo_for_stmt (new_stmt);
1944 /* The call in STMT might prevent it from being removed in dce. We cannot,
1945 however, remove it here, due to the way the ssa name it defines is mapped
1946 to the new definition. So just replace the rhs of the statement with
1947 something harmless. */
1948 type = TREE_TYPE (scalar_dest);
1949 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1951 return true;
1955 /* Function vectorizable_conversion.
1957 Check if STMT performs a conversion operation, that can be vectorized.
1958 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1959 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1960 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1962 bool
1963 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
1964 tree * vec_stmt)
1966 tree vec_dest;
1967 tree scalar_dest;
1968 tree operation;
1969 tree op0;
1970 tree vec_oprnd0 = NULL_TREE;
1971 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1972 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1973 enum tree_code code;
1974 tree new_temp;
1975 tree def, def_stmt;
1976 enum vect_def_type dt0;
1977 tree new_stmt;
1978 int nunits_in;
1979 int nunits_out;
1980 int ncopies, j;
1981 tree vectype_out, vectype_in;
1982 tree rhs_type, lhs_type;
1983 tree builtin_decl;
1984 stmt_vec_info prev_stmt_info;
1986 /* Is STMT a vectorizable conversion? */
1988 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1989 return false;
1991 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1993 if (STMT_VINFO_LIVE_P (stmt_info))
1995 /* FORNOW: not yet supported. */
1996 if (vect_print_dump_info (REPORT_DETAILS))
1997 fprintf (vect_dump, "value used after loop.");
1998 return false;
2001 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2002 return false;
2004 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2005 return false;
2007 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2008 code = TREE_CODE (operation);
2009 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2010 return false;
2012 /* Check types of lhs and rhs */
2013 op0 = TREE_OPERAND (operation, 0);
2014 rhs_type = TREE_TYPE (op0);
2015 vectype_in = get_vectype_for_scalar_type (rhs_type);
2016 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2018 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2019 lhs_type = TREE_TYPE (scalar_dest);
2020 vectype_out = get_vectype_for_scalar_type (lhs_type);
2021 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2022 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2024 /* FORNOW: need to extend to support short<->float conversions as well. */
2025 if (nunits_out != nunits_in)
2026 return false;
2028 /* Bail out if the types are both integral or non-integral */
2029 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2030 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2031 return false;
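  /* E.g. (illustrative): FLOAT_EXPR int -> float and FIX_TRUNC_EXPR
     float -> int pass the check above, whereas int -> long long or
     float -> double (both sides integral, or both non-integral) are
     rejected here and left to the other vectorizable_* routines.  */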
2033 /* Sanity check: make sure that at least one copy of the vectorized stmt
2034 needs to be generated. */
2035 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2036 gcc_assert (ncopies >= 1);
2038 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2040 if (vect_print_dump_info (REPORT_DETAILS))
2041 fprintf (vect_dump, "use not simple.");
2042 return false;
2045 /* Supportable by target? */
2046 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2048 if (vect_print_dump_info (REPORT_DETAILS))
2049 fprintf (vect_dump, "op not supported by target.");
2050 return false;
2053 if (!vec_stmt) /* transformation not required. */
2055 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2056 return true;
2059 /** Transform. **/
2061 if (vect_print_dump_info (REPORT_DETAILS))
2062 fprintf (vect_dump, "transform conversion.");
2064 /* Handle def. */
2065 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2067 prev_stmt_info = NULL;
2068 for (j = 0; j < ncopies; j++)
2070 tree sym;
2071 ssa_op_iter iter;
2073 if (j == 0)
2074 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2075 else
2076 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2078 builtin_decl =
2079 targetm.vectorize.builtin_conversion (code, vectype_in);
2080 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2082 /* Arguments are ready. Create the new vector stmt. */
2083 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2084 new_stmt);
2085 new_temp = make_ssa_name (vec_dest, new_stmt);
2086 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2087 vect_finish_stmt_generation (stmt, new_stmt, bsi);
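      /* Note (added for clarity): the target builtin call may reference
         memory through virtual operands; each underlying symbol is marked
         for renaming below so the virtual SSA form stays up to date.  */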
2088 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2090 if (TREE_CODE (sym) == SSA_NAME)
2091 sym = SSA_NAME_VAR (sym);
2092 mark_sym_for_renaming (sym);
2095 if (j == 0)
2096 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2097 else
2098 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2099 prev_stmt_info = vinfo_for_stmt (new_stmt);
2101 return true;
2105 /* Function vectorizable_assignment.
2107 Check if STMT performs an assignment (copy) that can be vectorized.
2108 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2109 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2110 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2112 bool
2113 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2115 tree vec_dest;
2116 tree scalar_dest;
2117 tree op;
2118 tree vec_oprnd;
2119 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2120 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2122 tree new_temp;
2123 tree def, def_stmt;
2124 enum vect_def_type dt;
2125 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2126 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2128 gcc_assert (ncopies >= 1);
2129 if (ncopies > 1)
2130 return false; /* FORNOW */
2132 /* Is vectorizable assignment? */
2133 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2134 return false;
2136 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2138 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2139 return false;
2141 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2142 if (TREE_CODE (scalar_dest) != SSA_NAME)
2143 return false;
2145 op = GIMPLE_STMT_OPERAND (stmt, 1);
2146 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2148 if (vect_print_dump_info (REPORT_DETAILS))
2149 fprintf (vect_dump, "use not simple.");
2150 return false;
2153 if (!vec_stmt) /* transformation not required. */
2155 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2156 return true;
2159 /** Transform. **/
2160 if (vect_print_dump_info (REPORT_DETAILS))
2161 fprintf (vect_dump, "transform assignment.");
2163 /* Handle def. */
2164 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2166 /* Handle use. */
2167 op = GIMPLE_STMT_OPERAND (stmt, 1);
2168 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2170 /* Arguments are ready. Create the new vector stmt. */
2171 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, vec_oprnd);
2172 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2173 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2174 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2176 return true;
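/* Illustrative example (hypothetical loop): a scalar copy such as
     x_1 = y_2;
   inside the loop becomes a single vector copy 'vect_x = vect_y'; the
   multiple-copies case (ncopies > 1) is rejected above (FORNOW).  */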
2180 /* Function vect_min_worthwhile_factor.
2182 For a loop where we could vectorize the operation indicated by CODE,
2183 return the minimum vectorization factor that makes it worthwhile
2184 to use generic vectors. */
2185 static int
2186 vect_min_worthwhile_factor (enum tree_code code)
2188 switch (code)
2190 case PLUS_EXPR:
2191 case MINUS_EXPR:
2192 case NEGATE_EXPR:
2193 return 4;
2195 case BIT_AND_EXPR:
2196 case BIT_IOR_EXPR:
2197 case BIT_XOR_EXPR:
2198 case BIT_NOT_EXPR:
2199 return 2;
2201 default:
2202 return INT_MAX;
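/* Illustrative reading of the table above (hypothetical target): with
   4-byte words and no SIMD support, emulating four QImode additions in one
   word-mode insn is only considered worthwhile from a vectorization factor
   of 4 upwards, bitwise ops already pay off at a factor of 2, and any other
   tree-code (INT_MAX) effectively disables word-mode vectorization.  */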
2207 /* Function vectorizable_operation.
2209 Check if STMT performs a binary or unary operation that can be vectorized.
2210 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2211 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2212 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2214 bool
2215 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2217 tree vec_dest;
2218 tree scalar_dest;
2219 tree operation;
2220 tree op0, op1 = NULL;
2221 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2222 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2223 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2224 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2225 enum tree_code code;
2226 enum machine_mode vec_mode;
2227 tree new_temp;
2228 int op_type;
2229 optab optab;
2230 int icode;
2231 enum machine_mode optab_op2_mode;
2232 tree def, def_stmt;
2233 enum vect_def_type dt0, dt1;
2234 tree new_stmt;
2235 stmt_vec_info prev_stmt_info;
2236 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2237 int nunits_out;
2238 tree vectype_out;
2239 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2240 int j;
2242 gcc_assert (ncopies >= 1);
2244 /* Is STMT a vectorizable binary/unary operation? */
2245 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2246 return false;
2248 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2250 if (STMT_VINFO_LIVE_P (stmt_info))
2252 /* FORNOW: not yet supported. */
2253 if (vect_print_dump_info (REPORT_DETAILS))
2254 fprintf (vect_dump, "value used after loop.");
2255 return false;
2258 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2259 return false;
2261 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2262 return false;
2264 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2265 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2266 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2267 if (nunits_out != nunits_in)
2268 return false;
2270 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2271 code = TREE_CODE (operation);
2272 optab = optab_for_tree_code (code, vectype);
2274 /* Support only unary or binary operations. */
2275 op_type = TREE_OPERAND_LENGTH (operation);
2276 if (op_type != unary_op && op_type != binary_op)
2278 if (vect_print_dump_info (REPORT_DETAILS))
2279 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2280 return false;
2283 op0 = TREE_OPERAND (operation, 0);
2284 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2286 if (vect_print_dump_info (REPORT_DETAILS))
2287 fprintf (vect_dump, "use not simple.");
2288 return false;
2291 if (op_type == binary_op)
2293 op1 = TREE_OPERAND (operation, 1);
2294 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2296 if (vect_print_dump_info (REPORT_DETAILS))
2297 fprintf (vect_dump, "use not simple.");
2298 return false;
2302 /* Supportable by target? */
2303 if (!optab)
2305 if (vect_print_dump_info (REPORT_DETAILS))
2306 fprintf (vect_dump, "no optab.");
2307 return false;
2309 vec_mode = TYPE_MODE (vectype);
2310 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2311 if (icode == CODE_FOR_nothing)
2313 if (vect_print_dump_info (REPORT_DETAILS))
2314 fprintf (vect_dump, "op not supported by target.");
2315 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2316 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2317 < vect_min_worthwhile_factor (code))
2318 return false;
2319 if (vect_print_dump_info (REPORT_DETAILS))
2320 fprintf (vect_dump, "proceeding using word mode.");
2323 /* Worthwhile without SIMD support? */
2324 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2325 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2326 < vect_min_worthwhile_factor (code))
2328 if (vect_print_dump_info (REPORT_DETAILS))
2329 fprintf (vect_dump, "not worthwhile without SIMD support.");
2330 return false;
2333 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2335 /* FORNOW: not yet supported. */
2336 if (!VECTOR_MODE_P (vec_mode))
2337 return false;
2339 /* Invariant argument is needed for a vector shift
2340 by a scalar shift operand. */
2341 optab_op2_mode = insn_data[icode].operand[2].mode;
2342 if (! (VECTOR_MODE_P (optab_op2_mode)
2343 || dt1 == vect_constant_def
2344 || dt1 == vect_invariant_def))
2346 if (vect_print_dump_info (REPORT_DETAILS))
2347 fprintf (vect_dump, "operand mode requires invariant argument.");
2348 return false;
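      /* E.g. (illustrative): for 'a[i] = b[i] << c', if the target's vector
         shift pattern takes a scalar shift amount (operand 2 is not of a
         vector mode), C must be a constant or loop invariant; a shift amount
         that varies per iteration cannot use such a pattern.  */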
2352 if (!vec_stmt) /* transformation not required. */
2354 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2355 return true;
2358 /** Transform. **/
2360 if (vect_print_dump_info (REPORT_DETAILS))
2361 fprintf (vect_dump, "transform binary/unary operation.");
2363 /* Handle def. */
2364 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2366 /* In case the vectorization factor (VF) is bigger than the number
2367 of elements that we can fit in a vectype (nunits), we have to generate
2368 more than one vector stmt, i.e., we need to "unroll" the
2369 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2370 from one copy of the vector stmt to the next, in the field
2371 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2372 stages to find the correct vector defs to be used when vectorizing
2373 stmts that use the defs of the current stmt. The example below illustrates
2374 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
2375 4 vectorized stmts):
2377 before vectorization:
2378 RELATED_STMT VEC_STMT
2379 S1: x = memref - -
2380 S2: z = x + 1 - -
2382 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2383 there):
2384 RELATED_STMT VEC_STMT
2385 VS1_0: vx0 = memref0 VS1_1 -
2386 VS1_1: vx1 = memref1 VS1_2 -
2387 VS1_2: vx2 = memref2 VS1_3 -
2388 VS1_3: vx3 = memref3 - -
2389 S1: x = load - VS1_0
2390 S2: z = x + 1 - -
2392 step2: vectorize stmt S2 (done here):
2393 To vectorize stmt S2 we first need to find the relevant vector
2394 def for the first operand 'x'. This is, as usual, obtained from
2395 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2396 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2397 relevant vector def 'vx0'. Having found 'vx0' we can generate
2398 the vector stmt VS2_0, and as usual, record it in the
2399 STMT_VINFO_VEC_STMT of stmt S2.
2400 When creating the second copy (VS2_1), we obtain the relevant vector
2401 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2402 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2403 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2404 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2405 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2406 chain of stmts and pointers:
2407 RELATED_STMT VEC_STMT
2408 VS1_0: vx0 = memref0 VS1_1 -
2409 VS1_1: vx1 = memref1 VS1_2 -
2410 VS1_2: vx2 = memref2 VS1_3 -
2411 VS1_3: vx3 = memref3 - -
2412 S1: x = load - VS1_0
2413 VS2_0: vz0 = vx0 + v1 VS2_1 -
2414 VS2_1: vz1 = vx1 + v1 VS2_2 -
2415 VS2_2: vz2 = vx2 + v1 VS2_3 -
2416 VS2_3: vz3 = vx3 + v1 - -
2417 S2: z = x + 1 - VS2_0 */
2419 prev_stmt_info = NULL;
2420 for (j = 0; j < ncopies; j++)
2422 /* Handle uses. */
2423 if (j == 0)
2425 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2426 if (op_type == binary_op)
2428 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2430 /* Vector shl and shr insn patterns can be defined with
2431 scalar operand 2 (shift operand). In this case, use
2432 constant or loop invariant op1 directly, without
2433 extending it to vector mode first. */
2434 optab_op2_mode = insn_data[icode].operand[2].mode;
2435 if (!VECTOR_MODE_P (optab_op2_mode))
2437 if (vect_print_dump_info (REPORT_DETAILS))
2438 fprintf (vect_dump, "operand 1 using scalar mode.");
2439 vec_oprnd1 = op1;
2442 if (!vec_oprnd1)
2443 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2446 else
2448 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2449 if (op_type == binary_op)
2450 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2453 /* Arguments are ready. Create the new vector stmt. */
2455 if (op_type == binary_op)
2456 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2457 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2458 else
2459 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2460 build1 (code, vectype, vec_oprnd0));
2461 new_temp = make_ssa_name (vec_dest, new_stmt);
2462 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2463 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2465 if (j == 0)
2466 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2467 else
2468 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2469 prev_stmt_info = vinfo_for_stmt (new_stmt);
2472 return true;
2476 /* Function vectorizable_type_demotion
2478 Check if STMT performs a binary or unary operation that involves
2479 type demotion, and if it can be vectorized.
2480 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2481 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2482 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2484 bool
2485 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2486 tree *vec_stmt)
2488 tree vec_dest;
2489 tree scalar_dest;
2490 tree operation;
2491 tree op0;
2492 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2495 enum tree_code code;
2496 tree new_temp;
2497 tree def, def_stmt;
2498 enum vect_def_type dt0;
2499 tree new_stmt;
2500 stmt_vec_info prev_stmt_info;
2501 int nunits_in;
2502 int nunits_out;
2503 tree vectype_out;
2504 int ncopies;
2505 int j;
2506 tree expr;
2507 tree vectype_in;
2508 tree scalar_type;
2509 optab optab;
2510 enum machine_mode vec_mode;
2512 /* Is STMT a vectorizable type-demotion operation? */
2514 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2515 return false;
2517 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2519 if (STMT_VINFO_LIVE_P (stmt_info))
2521 /* FORNOW: not yet supported. */
2522 if (vect_print_dump_info (REPORT_DETAILS))
2523 fprintf (vect_dump, "value used after loop.");
2524 return false;
2527 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2528 return false;
2530 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2531 return false;
2533 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2534 code = TREE_CODE (operation);
2535 if (code != NOP_EXPR && code != CONVERT_EXPR)
2536 return false;
2538 op0 = TREE_OPERAND (operation, 0);
2539 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2540 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2542 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2543 scalar_type = TREE_TYPE (scalar_dest);
2544 vectype_out = get_vectype_for_scalar_type (scalar_type);
2545 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2546 if (nunits_in != nunits_out / 2) /* FORNOW */
2547 return false;
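  /* E.g. (illustrative): demoting int to short packs two V4SI input vectors
     (nunits_in = 4) into one V8HI result vector (nunits_out = 8), hence the
     nunits_in == nunits_out / 2 requirement above.  */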
2549 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2550 gcc_assert (ncopies >= 1);
2552 if (! INTEGRAL_TYPE_P (scalar_type)
2553 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2554 return false;
2556 /* Check the operands of the operation. */
2557 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2559 if (vect_print_dump_info (REPORT_DETAILS))
2560 fprintf (vect_dump, "use not simple.");
2561 return false;
2564 /* Supportable by target? */
2565 code = VEC_PACK_MOD_EXPR;
2566 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2567 if (!optab)
2568 return false;
2570 vec_mode = TYPE_MODE (vectype_in);
2571 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2572 return false;
2574 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2576 if (!vec_stmt) /* transformation not required. */
2578 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2579 return true;
2582 /** Transform. **/
2584 if (vect_print_dump_info (REPORT_DETAILS))
2585 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2586 ncopies);
2588 /* Handle def. */
2589 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2591 /* In case the vectorization factor (VF) is bigger than the number
2592 of elements that we can fit in a vectype (nunits), we have to generate
2593 more than one vector stmt, i.e., we need to "unroll" the
2594 vector stmt by a factor VF/nunits. */
2595 prev_stmt_info = NULL;
2596 for (j = 0; j < ncopies; j++)
2598 /* Handle uses. */
2599 if (j == 0)
2601 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2602 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2604 else
2606 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2607 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2610 /* Arguments are ready. Create the new vector stmt. */
2611 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2612 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2613 new_temp = make_ssa_name (vec_dest, new_stmt);
2614 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2615 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2617 if (j == 0)
2618 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2619 else
2620 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2622 prev_stmt_info = vinfo_for_stmt (new_stmt);
2625 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2626 return true;
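/* Note (added for clarity): each demotion copy consumes two input vector
   defs, because VEC_PACK_MOD_EXPR packs a pair of wide vectors into one
   narrow vector; that is why, for j > 0 above, vec_oprnd0 is chained off
   vec_oprnd1 of the previous copy rather than off vec_oprnd0.  */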
2630 /* Function vect_gen_widened_results_half
2632 Create a vector stmt whose code, type, number of arguments, and result
2633 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2634 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2635 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2636 needs to be created (DECL is a function-decl of a target-builtin).
2637 STMT is the original scalar stmt that we are vectorizing. */
2639 static tree
2640 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2641 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2642 tree vec_dest, block_stmt_iterator *bsi,
2643 tree stmt)
2645 tree expr;
2646 tree new_stmt;
2647 tree new_temp;
2648 tree sym;
2649 ssa_op_iter iter;
2651 /* Generate half of the widened result: */
2652 if (code == CALL_EXPR)
2654 /* Target specific support */
2655 if (op_type == binary_op)
2656 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2657 else
2658 expr = build_call_expr (decl, 1, vec_oprnd0);
2660 else
2662 /* Generic support */
2663 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2664 if (op_type == binary_op)
2665 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2666 else
2667 expr = build1 (code, vectype, vec_oprnd0);
2669 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2670 new_temp = make_ssa_name (vec_dest, new_stmt);
2671 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2672 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2674 if (code == CALL_EXPR)
2676 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2678 if (TREE_CODE (sym) == SSA_NAME)
2679 sym = SSA_NAME_VAR (sym);
2680 mark_sym_for_renaming (sym);
2684 return new_stmt;
2688 /* Function vectorizable_type_promotion
2690 Check if STMT performs a binary or unary operation that involves
2691 type promotion, and if it can be vectorized.
2692 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2693 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2694 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2696 bool
2697 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2698 tree *vec_stmt)
2700 tree vec_dest;
2701 tree scalar_dest;
2702 tree operation;
2703 tree op0, op1 = NULL;
2704 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2705 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2706 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2707 enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
2708 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2709 int op_type;
2710 tree def, def_stmt;
2711 enum vect_def_type dt0, dt1;
2712 tree new_stmt;
2713 stmt_vec_info prev_stmt_info;
2714 int nunits_in;
2715 int nunits_out;
2716 tree vectype_out;
2717 int ncopies;
2718 int j;
2719 tree vectype_in;
2721 /* Is STMT a vectorizable type-promotion operation? */
2723 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2724 return false;
2726 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2728 if (STMT_VINFO_LIVE_P (stmt_info))
2730 /* FORNOW: not yet supported. */
2731 if (vect_print_dump_info (REPORT_DETAILS))
2732 fprintf (vect_dump, "value used after loop.");
2733 return false;
2736 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2737 return false;
2739 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2740 return false;
2742 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2743 code = TREE_CODE (operation);
2744 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2745 return false;
2747 op0 = TREE_OPERAND (operation, 0);
2748 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2749 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2750 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2751 gcc_assert (ncopies >= 1);
2753 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2754 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2755 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2756 if (nunits_out != nunits_in / 2) /* FORNOW */
2757 return false;
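  /* E.g. (illustrative): widening short to int turns one V8HI input vector
     (nunits_in = 8) into two V4SI result vectors (nunits_out = 4), hence
     the nunits_out == nunits_in / 2 requirement above.  */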
2759 if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2760 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2761 return false;
2763 /* Check the operands of the operation. */
2764 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2766 if (vect_print_dump_info (REPORT_DETAILS))
2767 fprintf (vect_dump, "use not simple.");
2768 return false;
2771 op_type = TREE_CODE_LENGTH (code);
2772 if (op_type == binary_op)
2774 op1 = TREE_OPERAND (operation, 1);
2775 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2777 if (vect_print_dump_info (REPORT_DETAILS))
2778 fprintf (vect_dump, "use not simple.");
2779 return false;
2783 /* Supportable by target? */
2784 if (!supportable_widening_operation (code, stmt, vectype_in,
2785 &decl1, &decl2, &code1, &code2))
2786 return false;
2788 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2790 if (!vec_stmt) /* transformation not required. */
2792 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2793 return true;
2796 /** Transform. **/
2798 if (vect_print_dump_info (REPORT_DETAILS))
2799 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2800 ncopies);
2802 /* Handle def. */
2803 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2805 /* In case the vectorization factor (VF) is bigger than the number
2806 of elements that we can fit in a vectype (nunits), we have to generate
2807 more than one vector stmt, i.e., we need to "unroll" the
2808 vector stmt by a factor VF/nunits. */
2810 prev_stmt_info = NULL;
2811 for (j = 0; j < ncopies; j++)
2813 /* Handle uses. */
2814 if (j == 0)
2816 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2817 if (op_type == binary_op)
2818 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2820 else
2822 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2823 if (op_type == binary_op)
2824 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2827 /* Arguments are ready. Create the new vector stmt. We are creating
2828 two vector defs because the widened result does not fit in one vector.
2829 The vectorized stmt can be expressed as a call to a target builtin,
2830 or by using a tree-code. */
2831 /* Generate first half of the widened result: */
2832 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2833 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2834 if (j == 0)
2835 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2836 else
2837 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2838 prev_stmt_info = vinfo_for_stmt (new_stmt);
2840 /* Generate second half of the widened result: */
2841 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2842 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2843 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2844 prev_stmt_info = vinfo_for_stmt (new_stmt);
2848 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2849 return true;
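/* Note (added for clarity): since each widened result occupies two vectors,
   the loop above emits 2 * ncopies vector stmts per scalar stmt, all chained
   through STMT_VINFO_RELATED_STMT starting from STMT_VINFO_VEC_STMT.  */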
2853 /* Function vect_strided_store_supported.
2855 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2856 and FALSE otherwise. */
2858 static bool
2859 vect_strided_store_supported (tree vectype)
2861 optab interleave_high_optab, interleave_low_optab;
2862 int mode;
2864 mode = (int) TYPE_MODE (vectype);
2866 /* Check that the operation is supported. */
2867 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2868 vectype);
2869 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2870 vectype);
2871 if (!interleave_high_optab || !interleave_low_optab)
2873 if (vect_print_dump_info (REPORT_DETAILS))
2874 fprintf (vect_dump, "no optab for interleave.");
2875 return false;
2878 if (interleave_high_optab->handlers[(int) mode].insn_code
2879 == CODE_FOR_nothing
2880 || interleave_low_optab->handlers[(int) mode].insn_code
2881 == CODE_FOR_nothing)
2883 if (vect_print_dump_info (REPORT_DETAILS))
2884 fprintf (vect_dump, "interleave op not supported by target.");
2885 return false;
2887 return true;
2891 /* Function vect_permute_store_chain.
2893 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2894 a power of 2, generate interleave_high/low stmts to reorder the data
2895 correctly for the stores. Return the final references for stores in
2896 RESULT_CHAIN.
2898 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2899 The input is 4 vectors each containing 8 elements. We assign a number to each
2900 element; the input sequence is:
2902 1st vec: 0 1 2 3 4 5 6 7
2903 2nd vec: 8 9 10 11 12 13 14 15
2904 3rd vec: 16 17 18 19 20 21 22 23
2905 4th vec: 24 25 26 27 28 29 30 31
2907 The output sequence should be:
2909 1st vec: 0 8 16 24 1 9 17 25
2910 2nd vec: 2 10 18 26 3 11 19 27
2911 3rd vec: 4 12 20 28 5 13 21 29
2912 4th vec: 6 14 22 30 7 15 23 31
2914 i.e., we interleave the contents of the four vectors in their order.
2916 We use interleave_high/low instructions to create such output. The input of
2917 each interleave_high/low operation is two vectors:
2918 1st vec 2nd vec
2919 0 1 2 3 4 5 6 7
2920 the even elements of the result vector are obtained left-to-right from the
2921 high/low elements of the first vector. The odd elements of the result are
2922 obtained left-to-right from the high/low elements of the second vector.
2923 The output of interleave_high will be: 0 4 1 5
2924 and of interleave_low: 2 6 3 7
2927 The permutation is done in log LENGTH stages. In each stage interleave_high
2928 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2929 where the first argument is taken from the first half of DR_CHAIN and the
2930 second argument from its second half.
2931 In our example,
2933 I1: interleave_high (1st vec, 3rd vec)
2934 I2: interleave_low (1st vec, 3rd vec)
2935 I3: interleave_high (2nd vec, 4th vec)
2936 I4: interleave_low (2nd vec, 4th vec)
2938 The output for the first stage is:
2940 I1: 0 16 1 17 2 18 3 19
2941 I2: 4 20 5 21 6 22 7 23
2942 I3: 8 24 9 25 10 26 11 27
2943 I4: 12 28 13 29 14 30 15 31
2945 The output of the second stage, i.e. the final result is:
2947 I1: 0 8 16 24 1 9 17 25
2948 I2: 2 10 18 26 3 11 19 27
2949 I3: 4 12 20 28 5 13 21 29
2950 I4: 6 14 22 30 7 15 23 31. */
2952 static bool
2953 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2954 unsigned int length,
2955 tree stmt,
2956 block_stmt_iterator *bsi,
2957 VEC(tree,heap) **result_chain)
2959 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2960 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2961 tree scalar_dest;
2962 int i;
2963 unsigned int j;
2964 VEC(tree,heap) *first, *second;
2966 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2967 first = VEC_alloc (tree, heap, length/2);
2968 second = VEC_alloc (tree, heap, length/2);
2970 /* Check that the operation is supported. */
2971 if (!vect_strided_store_supported (vectype))
2972 return false;
2974 *result_chain = VEC_copy (tree, heap, dr_chain);
2976 for (i = 0; i < exact_log2 (length); i++)
2978 for (j = 0; j < length/2; j++)
2980 vect1 = VEC_index (tree, dr_chain, j);
2981 vect2 = VEC_index (tree, dr_chain, j+length/2);
2983 /* Create interleaving stmt:
2984 in the case of big endian:
2985 high = interleave_high (vect1, vect2)
2986 and in the case of little endian:
2987 high = interleave_low (vect1, vect2). */
2988 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2989 DECL_GIMPLE_REG_P (perm_dest) = 1;
2990 add_referenced_var (perm_dest);
2991 if (BYTES_BIG_ENDIAN)
2992 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2993 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2994 vect1, vect2));
2995 else
2996 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2997 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
2998 vect1, vect2));
2999 high = make_ssa_name (perm_dest, perm_stmt);
3000 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3001 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3002 VEC_replace (tree, *result_chain, 2*j, high);
3004 /* Create interleaving stmt:
3005 in the case of big endian:
3006 low = interleave_low (vect1, vect2)
3007 and in the case of little endian:
3008 low = interleave_high (vect1, vect2). */
3009 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3010 DECL_GIMPLE_REG_P (perm_dest) = 1;
3011 add_referenced_var (perm_dest);
3012 if (BYTES_BIG_ENDIAN)
3013 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3014 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
3015 vect1, vect2));
3016 else
3017 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3018 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
3019 vect1, vect2));
3020 low = make_ssa_name (perm_dest, perm_stmt);
3021 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3022 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3023 VEC_replace (tree, *result_chain, 2*j+1, low);
3025 dr_chain = VEC_copy (tree, heap, *result_chain);
3027 return true;
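/* Worked count (illustrative): for LENGTH = 4 the outer loop above runs
   log2(4) = 2 stages; each stage emits LENGTH/2 = 2 interleave_high/low
   pairs, i.e. 4 permute stmts per stage and 8 in total, after which
   RESULT_CHAIN holds the fully interleaved vectors.  */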
3031 /* Function vectorizable_store.
3033 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3034 can be vectorized.
3035 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3036 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3037 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3039 bool
3040 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3042 tree scalar_dest;
3043 tree data_ref;
3044 tree op;
3045 tree vec_oprnd = NULL_TREE;
3046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3047 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3048 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3050 enum machine_mode vec_mode;
3051 tree dummy;
3052 enum dr_alignment_support alignment_support_scheme;
3053 ssa_op_iter iter;
3054 def_operand_p def_p;
3055 tree def, def_stmt;
3056 enum vect_def_type dt;
3057 stmt_vec_info prev_stmt_info = NULL;
3058 tree dataref_ptr = NULL_TREE;
3059 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3060 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3061 int j;
3062 tree next_stmt, first_stmt;
3063 bool strided_store = false;
3064 unsigned int group_size, i;
3065 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3066 gcc_assert (ncopies >= 1);
3068 /* Is vectorizable store? */
3070 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3071 return false;
3073 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3074 if (TREE_CODE (scalar_dest) != ARRAY_REF
3075 && TREE_CODE (scalar_dest) != INDIRECT_REF
3076 && !DR_GROUP_FIRST_DR (stmt_info))
3077 return false;
3079 op = GIMPLE_STMT_OPERAND (stmt, 1);
3080 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3082 if (vect_print_dump_info (REPORT_DETAILS))
3083 fprintf (vect_dump, "use not simple.");
3084 return false;
3087 vec_mode = TYPE_MODE (vectype);
3088 /* FORNOW. In some cases we can vectorize even if the data-type is not
3089 supported (e.g. array initialization with 0). */
3090 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3091 return false;
3093 if (!STMT_VINFO_DATA_REF (stmt_info))
3094 return false;
3096 if (DR_GROUP_FIRST_DR (stmt_info))
3098 strided_store = true;
3099 if (!vect_strided_store_supported (vectype))
3100 return false;
3103 if (!vec_stmt) /* transformation not required. */
3105 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3106 return true;
3109 /** Transform. **/
3111 if (vect_print_dump_info (REPORT_DETAILS))
3112 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
3114 if (strided_store)
3116 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3117 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3118 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3120 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3122 /* We vectorize all the stmts of the interleaving group when we
3123 reach the last stmt in the group. */
3124 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3125 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3127 *vec_stmt = NULL_TREE;
3128 return true;
3131 else
3133 first_stmt = stmt;
3134 first_dr = dr;
3135 group_size = 1;
3138 dr_chain = VEC_alloc (tree, heap, group_size);
3139 oprnds = VEC_alloc (tree, heap, group_size);
3141 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3142 gcc_assert (alignment_support_scheme);
3143 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
3145 /* In case the vectorization factor (VF) is bigger than the number
3146 of elements that we can fit in a vectype (nunits), we have to generate
3147 more than one vector stmt, i.e., we need to "unroll" the
3148 vector stmt by a factor VF/nunits. For more details see documentation in
3149 vect_get_vec_def_for_copy_stmt. */
3151 /* In case of interleaving (non-unit strided access):
3153 S1: &base + 2 = x2
3154 S2: &base = x0
3155 S3: &base + 1 = x1
3156 S4: &base + 3 = x3
3158 We create vectorized stores starting from the base address (the access of
3159 the first stmt in the chain, S2 in the above example), when the last store
3160 stmt of the chain (S4) is reached:
3162 VS1: &base = vx2
3163 VS2: &base + vec_size*1 = vx0
3164 VS3: &base + vec_size*2 = vx1
3165 VS4: &base + vec_size*3 = vx3
3167 Then permutation statements are generated:
3169 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3170 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3173 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3174 (the order of the data-refs in the output of vect_permute_store_chain
3175 corresponds to the order of scalar stmts in the interleaving chain - see
3176 the documentation of vect_permute_store_chain()).
3178 In case of both multiple types and interleaving, above vector stores and
3179 permutation stmts are created for every copy. The result vector stmts are
3180 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3181 STMT_VINFO_RELATED_STMT for the next copies. */
3184 prev_stmt_info = NULL;
3185 for (j = 0; j < ncopies; j++)
3187 tree new_stmt;
3188 tree ptr_incr;
3190 if (j == 0)
3192 /* For interleaved stores we collect vectorized defs for all the
3193 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3194 as an input to vect_permute_store_chain(), and OPRNDS as an input
3195 to vect_get_vec_def_for_stmt_copy() for the next copy.
3196 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3197 OPRNDS are of size 1. */
3198 next_stmt = first_stmt;
3199 for (i = 0; i < group_size; i++)
3201 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3202 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3203 can't be NULL_TREE. In case there is no interleaving,
3204 GROUP_SIZE is 1, and only one iteration of the loop will be
3205 executed. */
3206 gcc_assert (next_stmt);
3207 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3208 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3209 VEC_quick_push (tree, dr_chain, vec_oprnd);
3210 VEC_quick_push (tree, oprnds, vec_oprnd);
3211 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3213 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3214 &dummy, &ptr_incr, false,
3215 TREE_TYPE (vec_oprnd));
3217 else
3219 /* For interleaved stores we created vectorized defs for all the
3220 defs stored in OPRNDS in the previous iteration (previous copy).
3221 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3222 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3223 next copy.
3224 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3225 OPRNDS are of size 1. */
3226 for (i = 0; i < group_size; i++)
3228 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3229 VEC_index (tree, oprnds, i));
3230 VEC_replace (tree, dr_chain, i, vec_oprnd);
3231 VEC_replace (tree, oprnds, i, vec_oprnd);
3233 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3236 if (strided_store)
3238 result_chain = VEC_alloc (tree, heap, group_size);
3239 /* Permute. */
3240 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3241 &result_chain))
3242 return false;
3245 next_stmt = first_stmt;
3246 for (i = 0; i < group_size; i++)
3248 /* For strided stores vectorized defs are interleaved in
3249 vect_permute_store_chain(). */
3250 if (strided_store)
3251 vec_oprnd = VEC_index (tree, result_chain, i);
3253 data_ref = build_fold_indirect_ref (dataref_ptr);
3254 /* Arguments are ready. Create the new vector stmt. */
3255 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, data_ref,
3256 vec_oprnd);
3257 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3259 /* Set the VDEFs for the vector pointer. If this virtual def
3260 has a use outside the loop and a loop peel is performed
3261 then the def may be renamed by the peel. Mark it for
3262 renaming so the later use will also be renamed. */
3263 copy_virtual_operands (new_stmt, next_stmt);
3264 if (j == 0)
3266 /* The original store is deleted so the same SSA_NAMEs
3267 can be used. */
3268 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3270 SSA_NAME_DEF_STMT (def) = new_stmt;
3271 mark_sym_for_renaming (SSA_NAME_VAR (def));
3274 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3276 else
3278 /* Create new names for all the virtual definitions created by the
3279 new stmt, and add replacement mappings for each new name. */
3280 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3282 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3283 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3286 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3289 prev_stmt_info = vinfo_for_stmt (new_stmt);
3290 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3291 if (!next_stmt)
3292 break;
3293 /* Bump the vector pointer. */
3294 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3298 return true;
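/* Worked count (illustrative): with GROUP_SIZE = 4 and ncopies = 2 the
   j-loop above runs twice; each iteration gathers 4 vector defs, emits the
   permute stmts via vect_permute_store_chain (4 * log2(4) = 8 of them), and
   then emits 4 vector stores, bumping the data-ref pointer between them.  */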
3302 /* Function vect_setup_realignment
3304 This function is called when vectorizing an unaligned load using
3305 the dr_unaligned_software_pipeline scheme.
3306 This function generates the following code at the loop prolog:
3308 p = initial_addr;
3309 msq_init = *(floor(p)); # prolog load
3310 realignment_token = call target_builtin;
3311 loop:
3312 msq = phi (msq_init, ---)
3314 The code above sets up a new (vector) pointer, pointing to the first
3315 location accessed by STMT, and a "floor-aligned" load using that pointer.
3316 It also generates code to compute the "realignment-token" (if the relevant
3317 target hook was defined), and creates a phi-node at the loop-header bb
3318 whose arguments are the result of the prolog-load (created by this
3319 function) and the result of a load that takes place in the loop (to be
3320 created by the caller to this function).
3321 The caller to this function uses the phi-result (msq) to create the
3322 realignment code inside the loop, and sets up the missing phi argument,
3323 as follows:
3325 loop:
3326 msq = phi (msq_init, lsq)
3327 lsq = *(floor(p')); # load in loop
3328 result = realign_load (msq, lsq, realignment_token);
3330 Input:
3331 STMT - (scalar) load stmt to be vectorized. This load accesses
3332 a memory location that may be unaligned.
3333 BSI - place where new code is to be inserted.
3335 Output:
3336 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3337 target hook, if defined.
3338 Return value - the result of the loop-header phi node. */
3340 static tree
3341 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3342 tree *realignment_token)
3344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3345 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3348 edge pe = loop_preheader_edge (loop);
3349 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3350 tree vec_dest;
3351 tree init_addr;
3352 tree inc;
3353 tree ptr;
3354 tree data_ref;
3355 tree new_stmt;
3356 basic_block new_bb;
3357 tree msq_init;
3358 tree new_temp;
3359 tree phi_stmt;
3360 tree msq;
3362 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3363 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3364 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3365 NULL_TREE);
3366 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3367 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
3368 new_temp = make_ssa_name (vec_dest, new_stmt);
3369 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3370 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3371 gcc_assert (!new_bb);
3372 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3373 copy_virtual_operands (new_stmt, stmt);
3374 update_vuses_to_preheader (new_stmt, loop);
3376 /* 2. Create permutation mask, if required, in loop preheader. */
3377 if (targetm.vectorize.builtin_mask_for_load)
3379 tree builtin_decl;
3381 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3382 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
3383 vec_dest = vect_create_destination_var (scalar_dest,
3384 TREE_TYPE (new_stmt));
3385 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3386 new_stmt);
3387 new_temp = make_ssa_name (vec_dest, new_stmt);
3388 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3389 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3390 gcc_assert (!new_bb);
3391 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3393 /* The result of the CALL_EXPR to this builtin is determined from
3394 the value of the parameter, and no global variables are touched,
3395 which makes the builtin a "const" function. Requiring the
3396 builtin to have the "const" attribute makes it unnecessary
3397 to call mark_call_clobbered. */
3398 gcc_assert (TREE_READONLY (builtin_decl));
3401 /* 3. Create msq = phi <msq_init, lsq> in loop */
3402 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3403 msq = make_ssa_name (vec_dest, NULL_TREE);
3404 phi_stmt = create_phi_node (msq, loop->header);
3405 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3406 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3408 return msq;
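
/* Editorial illustration - not part of the vectorizer.  A scalar model of
   the realignment scheme set up above, for a hypothetical 4-element
   vector and a shift-style realignment (the actual realignment_token and
   REALIGN_LOAD semantics are target-specific).  FIRST is the possibly
   unaligned element index; N is assumed to be a multiple of the vector
   width, with one extra vector readable past the end.  */

static void
sketch_realign_load_loop (const int *in, int *out, int first, int n)
{
  int vf = 4;                           /* hypothetical vector width */
  int shift = first & (vf - 1);         /* misalignment in elements */
  const int *p = in + (first - shift);  /* floor (initial_addr) */
  int msq[4], lsq[4], res[4];
  int i, j;

  /* Prolog: msq_init = *(floor (p)).  */
  for (j = 0; j < vf; j++)
    msq[j] = p[j];

  for (i = 0; i < n; i += vf)
    {
      p += vf;
      for (j = 0; j < vf; j++)          /* lsq = *(floor (p')) */
        lsq[j] = p[j];
      for (j = 0; j < vf; j++)          /* realign_load (msq, lsq, token) */
        res[j] = shift + j < vf ? msq[shift + j] : lsq[shift + j - vf];
      for (j = 0; j < vf; j++)
        out[i + j] = res[j];
      for (j = 0; j < vf; j++)          /* msq = lsq: the loop-carried phi */
        msq[j] = lsq[j];
    }
}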
3412 /* Function vect_strided_load_supported.
3414 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3415 and FALSE otherwise. */
3417 static bool
3418 vect_strided_load_supported (tree vectype)
3420 optab perm_even_optab, perm_odd_optab;
3421 int mode;
3423 mode = (int) TYPE_MODE (vectype);
3425 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3426 if (!perm_even_optab)
3428 if (vect_print_dump_info (REPORT_DETAILS))
3429 fprintf (vect_dump, "no optab for perm_even.");
3430 return false;
3433 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3435 if (vect_print_dump_info (REPORT_DETAILS))
3436 fprintf (vect_dump, "perm_even op not supported by target.");
3437 return false;
3440 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3441 if (!perm_odd_optab)
3443 if (vect_print_dump_info (REPORT_DETAILS))
3444 fprintf (vect_dump, "no optab for perm_odd.");
3445 return false;
3448 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3450 if (vect_print_dump_info (REPORT_DETAILS))
3451 fprintf (vect_dump, "perm_odd op not supported by target.");
3452 return false;
3454 return true;
3458 /* Function vect_permute_load_chain.
3460 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3461 a power of 2, generate extract_even/odd stmts to reorder the input data
3462 correctly. Return the final references for loads in RESULT_CHAIN.
3464 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3465 The input is 4 vectors each containing 8 elements. We assign a number to each
3466 element; the input sequence is:
3468 1st vec: 0 1 2 3 4 5 6 7
3469 2nd vec: 8 9 10 11 12 13 14 15
3470 3rd vec: 16 17 18 19 20 21 22 23
3471 4th vec: 24 25 26 27 28 29 30 31
3473 The output sequence should be:
3475 1st vec: 0 4 8 12 16 20 24 28
3476 2nd vec: 1 5 9 13 17 21 25 29
3477 3rd vec: 2 6 10 14 18 22 26 30
3478 4th vec: 3 7 11 15 19 23 27 31
3480 i.e., the first output vector should contain the first elements of each
3481 interleaving group, etc.
3483 We use extract_even/odd instructions to create such output. The input of each
3484 extract_even/odd operation is two vectors
3485 1st vec 2nd vec
3486 0 1 2 3 4 5 6 7
3488 and the output is the vector of extracted even/odd elements. The output of
3489 extract_even will be: 0 2 4 6
3490 and of extract_odd: 1 3 5 7
3493 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
3494 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3495 order. In our example,
3497 E1: extract_even (1st vec, 2nd vec)
3498 E2: extract_odd (1st vec, 2nd vec)
3499 E3: extract_even (3rd vec, 4th vec)
3500 E4: extract_odd (3rd vec, 4th vec)
3502 The output for the first stage will be:
3504 E1: 0 2 4 6 8 10 12 14
3505 E2: 1 3 5 7 9 11 13 15
3506 E3: 16 18 20 22 24 26 28 30
3507 E4: 17 19 21 23 25 27 29 31
3509 In order to proceed and create the correct sequence for the next stage (or
3510 for the correct output, if the second stage is the last one, as in our
3511 example), we first put the output of extract_even operation and then the
3512 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3513 The input for the second stage is:
3515 1st vec (E1): 0 2 4 6 8 10 12 14
3516 2nd vec (E3): 16 18 20 22 24 26 28 30
3517 3rd vec (E2): 1 3 5 7 9 11 13 15
3518 4th vec (E4): 17 19 21 23 25 27 29 31
3520 The output of the second stage:
3522 E1: 0 4 8 12 16 20 24 28
3523 E2: 2 6 10 14 18 22 26 30
3524 E3: 1 5 9 13 17 21 25 29
3525 E4: 3 7 11 15 19 23 27 31
3527 And RESULT_CHAIN after reordering:
3529 1st vec (E1): 0 4 8 12 16 20 24 28
3530 2nd vec (E3): 1 5 9 13 17 21 25 29
3531 3rd vec (E2): 2 6 10 14 18 22 26 30
3532 4th vec (E4): 3 7 11 15 19 23 27 31. */
3534 static bool
3535 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3536 unsigned int length,
3537 tree stmt,
3538 block_stmt_iterator *bsi,
3539 VEC(tree,heap) **result_chain)
3541 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3542 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3543 int i;
3544 unsigned int j;
3546 /* Check that the operation is supported. */
3547 if (!vect_strided_load_supported (vectype))
3548 return false;
3550 *result_chain = VEC_copy (tree, heap, dr_chain);
3551 for (i = 0; i < exact_log2 (length); i++)
3553 for (j = 0; j < length; j += 2)
3555 first_vect = VEC_index (tree, dr_chain, j);
3556 second_vect = VEC_index (tree, dr_chain, j+1);
3558 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3559 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3560 DECL_GIMPLE_REG_P (perm_dest) = 1;
3561 add_referenced_var (perm_dest);
3563 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3564 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3565 first_vect, second_vect));
3567 data_ref = make_ssa_name (perm_dest, perm_stmt);
3568 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3569 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3570 mark_symbols_for_renaming (perm_stmt);
3572 VEC_replace (tree, *result_chain, j/2, data_ref);
3574 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3575 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3576 DECL_GIMPLE_REG_P (perm_dest) = 1;
3577 add_referenced_var (perm_dest);
3579 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3580 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3581 first_vect, second_vect));
3582 data_ref = make_ssa_name (perm_dest, perm_stmt);
3583 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3584 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3585 mark_symbols_for_renaming (perm_stmt);
3587 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3589 dr_chain = VEC_copy (tree, heap, *result_chain);
3591 return true;
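
/* Editorial illustration - not part of the vectorizer.  The
   extract_even/odd permutation documented above, on a chain of LENGTH
   "vectors" of NELT elements each, stored as rows of a flat array.
   LENGTH is a power of 2 and LENGTH * NELT is assumed to fit in TMP;
   with LENGTH == 4 and NELT == 8 this reproduces the tables in the
   comment.  */

static void
sketch_permute_load_chain (int *chain, int length, int nelt)
{
  int tmp[64];  /* scratch copy of the whole chain */
  int stage, j, k;

  /* log2 (LENGTH) stages.  */
  for (stage = 0; (1 << stage) < length; stage++)
    {
      for (j = 0; j < length; j += 2)
        {
          int *first = chain + j * nelt;
          int *second = chain + (j + 1) * nelt;
          int *even = tmp + (j / 2) * nelt;             /* slot j/2 */
          int *odd = tmp + (j / 2 + length / 2) * nelt; /* slot j/2+len/2 */

          for (k = 0; k < nelt / 2; k++)
            {
              even[k] = first[2 * k];                /* extract_even */
              even[nelt / 2 + k] = second[2 * k];
              odd[k] = first[2 * k + 1];             /* extract_odd */
              odd[nelt / 2 + k] = second[2 * k + 1];
            }
        }
      /* The result of this stage becomes the input of the next one,
         mirroring the VEC_copy back into DR_CHAIN above.  */
      for (k = 0; k < length * nelt; k++)
        chain[k] = tmp[k];
    }
}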
3595 /* Function vect_transform_strided_load.
3597 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3598 to perform their permutation and assign the resulting vectorized statements
3599 to the scalar statements. */
3602 static bool
3603 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3604 block_stmt_iterator *bsi)
3606 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3607 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3608 tree next_stmt, new_stmt;
3609 VEC(tree,heap) *result_chain = NULL;
3610 unsigned int i, gap_count;
3611 tree tmp_data_ref;
3613 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3614 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
3615 vectors that are ready for vector computation. */
3616 result_chain = VEC_alloc (tree, heap, size);
3617 /* Permute. */
3618 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3619 return false;
3621 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3622 Since we scan the chain starting from its first node, their order
3623 corresponds to the order of data-refs in RESULT_CHAIN. */
3624 next_stmt = first_stmt;
3625 gap_count = 1;
3626 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3628 if (!next_stmt)
3629 break;
3631 /* Skip the gaps. Loads created for the gaps will be removed later by the
3632 dead code elimination pass.
3633 DR_GROUP_GAP is the number of steps in elements from the previous
3634 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3635 correspond to the gaps. */
3637 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3639 gap_count++;
3640 continue;
3643 while (next_stmt)
3645 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3646 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3647 copies, and we put the new vector statement in the first available
3648 RELATED_STMT. */
3649 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3650 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3651 else
3653 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3654 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3655 vinfo_for_stmt (prev_stmt));
3656 while (rel_stmt)
3658 prev_stmt = rel_stmt;
3659 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3661 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3663 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3664 gap_count = 1;
3665 /* If NEXT_STMT accesses the same DR as the previous statement,
3666 put the same TMP_DATA_REF as its vectorized statement; otherwise
3667 get the next data-ref from RESULT_CHAIN. */
3668 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3669 break;
3672 return true;
3676 /* vectorizable_load.
3678 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3679 can be vectorized.
3680 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3681 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3682 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3684 bool
3685 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3687 tree scalar_dest;
3688 tree vec_dest = NULL;
3689 tree data_ref = NULL;
3690 tree op;
3691 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3692 stmt_vec_info prev_stmt_info;
3693 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3694 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3695 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3696 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3697 tree new_temp;
3698 int mode;
3699 tree new_stmt = NULL_TREE;
3700 tree dummy;
3701 enum dr_alignment_support alignment_support_cheme;
3702 tree dataref_ptr = NULL_TREE;
3703 tree ptr_incr;
3704 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3705 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3706 int i, j, group_size;
3707 tree msq = NULL_TREE, lsq;
3708 tree offset = NULL_TREE;
3709 tree realignment_token = NULL_TREE;
3710 tree phi_stmt = NULL_TREE;
3711 VEC(tree,heap) *dr_chain = NULL;
3712 bool strided_load = false;
3713 tree first_stmt;
3715 /* Is vectorizable load? */
3716 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3717 return false;
3719 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3721 if (STMT_VINFO_LIVE_P (stmt_info))
3723 /* FORNOW: not yet supported. */
3724 if (vect_print_dump_info (REPORT_DETAILS))
3725 fprintf (vect_dump, "value used after loop.");
3726 return false;
3729 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3730 return false;
3732 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3733 if (TREE_CODE (scalar_dest) != SSA_NAME)
3734 return false;
3736 op = GIMPLE_STMT_OPERAND (stmt, 1);
3737 if (TREE_CODE (op) != ARRAY_REF
3738 && TREE_CODE (op) != INDIRECT_REF
3739 && !DR_GROUP_FIRST_DR (stmt_info))
3740 return false;
3742 if (!STMT_VINFO_DATA_REF (stmt_info))
3743 return false;
3745 mode = (int) TYPE_MODE (vectype);
3747 /* FORNOW. In some cases we can vectorize even if the data-type is not
3748 supported (e.g. - data copies). */
3749 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3751 if (vect_print_dump_info (REPORT_DETAILS))
3752 fprintf (vect_dump, "Aligned load, but unsupported type.");
3753 return false;
3756 /* Check if the load is a part of an interleaving chain. */
3757 if (DR_GROUP_FIRST_DR (stmt_info))
3759 strided_load = true;
3761 /* Check if interleaving is supported. */
3762 if (!vect_strided_load_supported (vectype))
3763 return false;
3766 if (!vec_stmt) /* transformation not required. */
3768 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3769 return true;
3772 /** Transform. **/
3774 if (vect_print_dump_info (REPORT_DETAILS))
3775 fprintf (vect_dump, "transform load.");
3777 if (strided_load)
3779 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3780 /* Check if the chain of loads is already vectorized. */
3781 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3783 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3784 return true;
3786 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3787 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3788 dr_chain = VEC_alloc (tree, heap, group_size);
3790 else
3792 first_stmt = stmt;
3793 first_dr = dr;
3794 group_size = 1;
3797 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3798 gcc_assert (alignment_support_cheme);
3801 /* In case the vectorization factor (VF) is bigger than the number
3802 of elements that we can fit in a vectype (nunits), we have to generate
3803 more than one vector stmt - i.e - we need to "unroll" the
3804 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3805 from one copy of the vector stmt to the next, in the field
3806 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3807 stages to find the correct vector defs to be used when vectorizing
3808 stmts that use the defs of the current stmt. The example below illustrates
3809 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3810 4 vectorized stmts):
3812 before vectorization:
3813 RELATED_STMT VEC_STMT
3814 S1: x = memref - -
3815 S2: z = x + 1 - -
3817 step 1: vectorize stmt S1:
3818 We first create the vector stmt VS1_0, and, as usual, record a
3819 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3820 Next, we create the vector stmt VS1_1, and record a pointer to
3821 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3822 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3823 stmts and pointers:
3824 RELATED_STMT VEC_STMT
3825 VS1_0: vx0 = memref0 VS1_1 -
3826 VS1_1: vx1 = memref1 VS1_2 -
3827 VS1_2: vx2 = memref2 VS1_3 -
3828 VS1_3: vx3 = memref3 - -
3829 S1: x = load - VS1_0
3830 S2: z = x + 1 - -
3832 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3833 information we recorded in RELATED_STMT field is used to vectorize
3834 stmt S2. */
3836 /* In case of interleaving (non-unit strided access):
3838 S1: x2 = &base + 2
3839 S2: x0 = &base
3840 S3: x1 = &base + 1
3841 S4: x3 = &base + 3
3843 Vectorized loads are created in the order of memory accesses
3844 starting from the access of the first stmt of the chain:
3846 VS1: vx0 = &base
3847 VS2: vx1 = &base + vec_size*1
3848 VS3: vx2 = &base + vec_size*2
3849 VS4: vx3 = &base + vec_size*3
3851 Then permutation statements are generated:
3853 VS5: vx4 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3854 VS6: vx5 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3857 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3858 (the order of the data-refs in the output of vect_permute_load_chain
3859 corresponds to the order of scalar stmts in the interleaving chain - see
3860 the documentation of vect_permute_load_chain()).
3861 The generation of permutation stmts and recording them in
3862 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3864 In case of both multiple types and interleaving, the vector loads and
3865 permutation stmts above are created for every copy. The result vector stmts
3866 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3867 STMT_VINFO_RELATED_STMT for the next copies. */
3869 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3870 on a target that supports unaligned accesses (dr_unaligned_supported)
3871 we generate the following code:
3872 p = initial_addr;
3873 indx = 0;
3874 loop {
3875 p = p + indx * vectype_size;
3876 vec_dest = *(p);
3877 indx = indx + 1;
3880 Otherwise, if the data reference is potentially unaligned on a target
3881 that does not support unaligned accesses (dr_unaligned_software_pipeline),
3882 then generate the following code, in which the data in each iteration is
3883 obtained by two vector loads, one from the previous iteration, and one
3884 from the current iteration:
3885 p1 = initial_addr;
3886 msq_init = *(floor(p1))
3887 p2 = initial_addr + VS - 1;
3888 realignment_token = call target_builtin;
3889 indx = 0;
3890 loop {
3891 p2 = p2 + indx * vectype_size
3892 lsq = *(floor(p2))
3893 vec_dest = realign_load (msq, lsq, realignment_token)
3894 indx = indx + 1;
3895 msq = lsq;
3896 } */
3898 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3900 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3901 phi_stmt = SSA_NAME_DEF_STMT (msq);
3902 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3905 prev_stmt_info = NULL;
3906 for (j = 0; j < ncopies; j++)
3908 /* 1. Create the vector pointer update chain. */
3909 if (j == 0)
3910 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3911 &ptr_incr, false, NULL_TREE);
3912 else
3913 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3915 for (i = 0; i < group_size; i++)
3917 /* 2. Create the vector-load in the loop. */
3918 switch (alignment_support_cheme)
3920 case dr_aligned:
3921 gcc_assert (aligned_access_p (first_dr));
3922 data_ref = build_fold_indirect_ref (dataref_ptr);
3923 break;
3924 case dr_unaligned_supported:
3926 int mis = DR_MISALIGNMENT (first_dr);
3927 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3929 gcc_assert (!aligned_access_p (first_dr));
3930 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3931 data_ref =
3932 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3933 break;
3935 case dr_unaligned_software_pipeline:
3936 gcc_assert (!aligned_access_p (first_dr));
3937 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3938 break;
3939 default:
3940 gcc_unreachable ();
3942 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3943 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3944 data_ref);
3945 new_temp = make_ssa_name (vec_dest, new_stmt);
3946 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3947 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3948 copy_virtual_operands (new_stmt, stmt);
3949 mark_symbols_for_renaming (new_stmt);
3951 /* 3. Handle explicit realignment if necessary/supported. */
3952 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3954 /* Create in loop:
3955 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3956 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3957 if (!realignment_token)
3958 realignment_token = dataref_ptr;
3959 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3960 new_stmt =
3961 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3962 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3963 new_stmt);
3964 new_temp = make_ssa_name (vec_dest, new_stmt);
3965 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3966 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3967 if (i == group_size - 1 && j == ncopies - 1)
3968 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3969 msq = lsq;
3971 if (strided_load)
3972 VEC_quick_push (tree, dr_chain, new_temp);
3973 if (i < group_size - 1)
3974 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3977 if (strided_load)
3979 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3980 return false;
3981 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3982 dr_chain = VEC_alloc (tree, heap, group_size);
3984 else
3986 if (j == 0)
3987 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3988 else
3989 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3990 prev_stmt_info = vinfo_for_stmt (new_stmt);
3994 return true;
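
/* Editorial illustration - not part of the vectorizer.  The VEC_STMT /
   RELATED_STMT bookkeeping described in the comment above, modelled on a
   hypothetical struct: the first of NCOPIES vector copies is what goes
   into STMT_VINFO_VEC_STMT, and each later copy hangs off the previous
   one through RELATED_STMT.  */

struct sketch_vstmt
{
  int copy_index;               /* which of the NCOPIES this is */
  struct sketch_vstmt *related; /* stands in for RELATED_STMT */
};

static struct sketch_vstmt *
sketch_record_copies (struct sketch_vstmt *pool, int ncopies)
{
  struct sketch_vstmt *first = NULL, *prev = NULL;
  int j;

  for (j = 0; j < ncopies; j++)
    {
      struct sketch_vstmt *vs = &pool[j];

      vs->copy_index = j;
      vs->related = NULL;
      if (!first)
        first = vs;           /* recorded in STMT_VINFO_VEC_STMT */
      else
        prev->related = vs;   /* recorded in RELATED_STMT of the previous */
      prev = vs;
    }
  return first;               /* VS1_0; VS1_1..VS1_n reachable via related */
}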
3998 /* Function vectorizable_live_operation.
4000 STMT computes a value that is used outside the loop. Check if
4001 it can be supported. */
4003 bool
4004 vectorizable_live_operation (tree stmt,
4005 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
4006 tree *vec_stmt ATTRIBUTE_UNUSED)
4008 tree operation;
4009 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4010 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4011 int i;
4012 int op_type;
4013 tree op;
4014 tree def, def_stmt;
4015 enum vect_def_type dt;
4017 if (!STMT_VINFO_LIVE_P (stmt_info))
4018 return false;
4020 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4021 return false;
4023 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4024 return false;
4026 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4027 op_type = TREE_OPERAND_LENGTH (operation);
4029 /* FORNOW: support only if all uses are invariant. This means
4030 that the scalar operations can remain in place, unvectorized.
4031 The original last scalar value that they compute will be used. */
4033 for (i = 0; i < op_type; i++)
4035 op = TREE_OPERAND (operation, i);
4036 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4038 if (vect_print_dump_info (REPORT_DETAILS))
4039 fprintf (vect_dump, "use not simple.");
4040 return false;
4043 if (dt != vect_invariant_def && dt != vect_constant_def)
4044 return false;
4047 /* No transformation is required for the cases we currently support. */
4048 return true;
4052 /* Function vect_is_simple_cond.
4054 Input:
4055 LOOP - the loop that is being vectorized.
4056 COND - Condition that is checked for simple use.
4058 Returns whether a COND can be vectorized. Checks whether
4059 condition operands are supportable using vect_is_simple_use. */
4061 static bool
4062 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4064 tree lhs, rhs;
4065 tree def;
4066 enum vect_def_type dt;
4068 if (!COMPARISON_CLASS_P (cond))
4069 return false;
4071 lhs = TREE_OPERAND (cond, 0);
4072 rhs = TREE_OPERAND (cond, 1);
4074 if (TREE_CODE (lhs) == SSA_NAME)
4076 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4077 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4078 return false;
4080 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
4081 return false;
4083 if (TREE_CODE (rhs) == SSA_NAME)
4085 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4086 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4087 return false;
4089 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4090 return false;
4092 return true;
4095 /* vectorizable_condition.
4097 Check if STMT is a conditional modify expression that can be vectorized.
4098 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4099 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
4100 at BSI.
4102 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4104 bool
4105 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4107 tree scalar_dest = NULL_TREE;
4108 tree vec_dest = NULL_TREE;
4109 tree op = NULL_TREE;
4110 tree cond_expr, then_clause, else_clause;
4111 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4112 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4113 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4114 tree vec_compare, vec_cond_expr;
4115 tree new_temp;
4116 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4117 enum machine_mode vec_mode;
4118 tree def;
4119 enum vect_def_type dt;
4120 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4121 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4123 gcc_assert (ncopies >= 1);
4124 if (ncopies > 1)
4125 return false; /* FORNOW */
4127 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4128 return false;
4130 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
4132 if (STMT_VINFO_LIVE_P (stmt_info))
4134 /* FORNOW: not yet supported. */
4135 if (vect_print_dump_info (REPORT_DETAILS))
4136 fprintf (vect_dump, "value used after loop.");
4137 return false;
4140 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4141 return false;
4143 op = GIMPLE_STMT_OPERAND (stmt, 1);
4145 if (TREE_CODE (op) != COND_EXPR)
4146 return false;
4148 cond_expr = TREE_OPERAND (op, 0);
4149 then_clause = TREE_OPERAND (op, 1);
4150 else_clause = TREE_OPERAND (op, 2);
4152 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4153 return false;
4155 /* We do not handle two different vector types for the condition
4156 and the values. */
4157 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4158 return false;
4160 if (TREE_CODE (then_clause) == SSA_NAME)
4162 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4163 if (!vect_is_simple_use (then_clause, loop_vinfo,
4164 &then_def_stmt, &def, &dt))
4165 return false;
4167 else if (TREE_CODE (then_clause) != INTEGER_CST
4168 && TREE_CODE (then_clause) != REAL_CST)
4169 return false;
4171 if (TREE_CODE (else_clause) == SSA_NAME)
4173 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4174 if (!vect_is_simple_use (else_clause, loop_vinfo,
4175 &else_def_stmt, &def, &dt))
4176 return false;
4178 else if (TREE_CODE (else_clause) != INTEGER_CST
4179 && TREE_CODE (else_clause) != REAL_CST)
4180 return false;
4183 vec_mode = TYPE_MODE (vectype);
4185 if (!vec_stmt)
4187 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4188 return expand_vec_cond_expr_p (op, vec_mode);
4191 /* Transform */
4193 /* Handle def. */
4194 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4195 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4197 /* Handle cond expr. */
4198 vec_cond_lhs =
4199 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4200 vec_cond_rhs =
4201 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4202 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4203 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4205 /* Arguments are ready. Create the new vector stmt. */
4206 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4207 vec_cond_lhs, vec_cond_rhs);
4208 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4209 vec_compare, vec_then_clause, vec_else_clause);
4211 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
4212 vec_cond_expr);
4213 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4214 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4215 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4217 return true;
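
/* Editorial illustration - not part of the vectorizer.  The element-wise
   semantics of the VEC_COND_EXPR built above, written as a plain loop
   over a hypothetical 4-element vector: the comparison yields a per-lane
   mask, and each lane independently selects THEN_V or ELSE_V.  */

static void
sketch_vec_cond (const int *a, const int *b,
                 const int *then_v, const int *else_v, int *dest)
{
  int i;

  for (i = 0; i < 4; i++)
    /* vec_compare = a < b;  dest = vec_compare ? then_v : else_v  */
    dest[i] = a[i] < b[i] ? then_v[i] : else_v[i];
}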
4220 /* Function vect_transform_stmt.
4222 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4224 bool
4225 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4227 bool is_store = false;
4228 tree vec_stmt = NULL_TREE;
4229 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4230 tree orig_stmt_in_pattern;
4231 bool done;
4233 if (STMT_VINFO_RELEVANT_P (stmt_info))
4235 switch (STMT_VINFO_TYPE (stmt_info))
4237 case type_demotion_vec_info_type:
4238 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4239 gcc_assert (done);
4240 break;
4242 case type_promotion_vec_info_type:
4243 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4244 gcc_assert (done);
4245 break;
4247 case type_conversion_vec_info_type:
4248 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4249 gcc_assert (done);
4250 break;
4252 case op_vec_info_type:
4253 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4254 gcc_assert (done);
4255 break;
4257 case assignment_vec_info_type:
4258 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4259 gcc_assert (done);
4260 break;
4262 case load_vec_info_type:
4263 done = vectorizable_load (stmt, bsi, &vec_stmt);
4264 gcc_assert (done);
4265 break;
4267 case store_vec_info_type:
4268 done = vectorizable_store (stmt, bsi, &vec_stmt);
4269 gcc_assert (done);
4270 if (DR_GROUP_FIRST_DR (stmt_info))
4272 /* In case of interleaving, the whole chain is vectorized when the
4273 last store in the chain is reached. Store stmts before the last
4274 one are skipped, and their stmt_vec_info shouldn't be freed
4275 meanwhile. */
4276 *strided_store = true;
4277 if (STMT_VINFO_VEC_STMT (stmt_info))
4278 is_store = true;
4280 else
4281 is_store = true;
4282 break;
4284 case condition_vec_info_type:
4285 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4286 gcc_assert (done);
4287 break;
4289 case call_vec_info_type:
4290 done = vectorizable_call (stmt, bsi, &vec_stmt);
4291 break;
4293 default:
4294 if (vect_print_dump_info (REPORT_DETAILS))
4295 fprintf (vect_dump, "stmt not supported.");
4296 gcc_unreachable ();
4299 gcc_assert (vec_stmt || *strided_store);
4300 if (vec_stmt)
4302 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4303 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4304 if (orig_stmt_in_pattern)
4306 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4307 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4309 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4311 /* STMT was inserted by the vectorizer to replace a
4312 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
4313 original sequence that computed this idiom. We need to
4314 record a pointer to VEC_STMT in the stmt_info of
4315 ORIG_STMT_IN_PATTERN. See more details in the
4316 documentation of vect_pattern_recog. */
4318 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4324 if (STMT_VINFO_LIVE_P (stmt_info))
4326 switch (STMT_VINFO_TYPE (stmt_info))
4328 case reduc_vec_info_type:
4329 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4330 gcc_assert (done);
4331 break;
4333 default:
4334 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4335 gcc_assert (done);
4339 return is_store;
4343 /* This function builds ni_name = number of iterations the loop executes,
4344 and inserts it on the loop preheader edge. */
4346 static tree
4347 vect_build_loop_niters (loop_vec_info loop_vinfo)
4349 tree ni_name, stmt, var;
4350 edge pe;
4351 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4352 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4354 var = create_tmp_var (TREE_TYPE (ni), "niters");
4355 add_referenced_var (var);
4356 ni_name = force_gimple_operand (ni, &stmt, false, var);
4358 pe = loop_preheader_edge (loop);
4359 if (stmt)
4361 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4362 gcc_assert (!new_bb);
4365 return ni_name;
4369 /* This function generates the following statements:
4371 ni_name = number of iterations the loop executes
4372 ratio = ni_name / vf
4373 ratio_mult_vf_name = ratio * vf
4375 and places them at the loop preheader edge. */
4377 static void
4378 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4379 tree *ni_name_ptr,
4380 tree *ratio_mult_vf_name_ptr,
4381 tree *ratio_name_ptr)
4384 edge pe;
4385 basic_block new_bb;
4386 tree stmt, ni_name;
4387 tree var;
4388 tree ratio_name;
4389 tree ratio_mult_vf_name;
4390 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4391 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4392 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4393 tree log_vf;
4395 pe = loop_preheader_edge (loop);
4397 /* Generate temporary variable that contains
4398 number of iterations loop executes. */
4400 ni_name = vect_build_loop_niters (loop_vinfo);
4401 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4403 /* Create: ratio = ni >> log2(vf) */
4405 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4406 if (!is_gimple_val (ratio_name))
4408 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4409 add_referenced_var (var);
4411 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4412 pe = loop_preheader_edge (loop);
4413 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4414 gcc_assert (!new_bb);
4417 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4419 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4420 ratio_name, log_vf);
4421 if (!is_gimple_val (ratio_mult_vf_name))
4423 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4424 add_referenced_var (var);
4426 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4427 true, var);
4428 pe = loop_preheader_edge (loop);
4429 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4430 gcc_assert (!new_bb);
4433 *ni_name_ptr = ni_name;
4434 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4435 *ratio_name_ptr = ratio_name;
4437 return;
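
/* Editorial illustration - not part of the vectorizer.  The preheader
   arithmetic generated above, on plain integers.  VF is assumed to be a
   power of two, so the division becomes a shift.  E.g., ni = 103 and
   vf = 4 give ratio = 25 and ratio_mult_vf = 100, leaving 3 iterations
   for the epilog loop.  */

static void
sketch_niters_tmps (unsigned ni, unsigned vf,
                    unsigned *ratio, unsigned *ratio_mult_vf)
{
  unsigned log_vf = 0;

  while ((1u << log_vf) < vf)           /* exact_log2 (vf) */
    log_vf++;
  *ratio = ni >> log_vf;                /* ratio = ni / vf */
  *ratio_mult_vf = *ratio << log_vf;    /* ratio_mult_vf = ratio * vf */
}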
4441 /* Function update_vuses_to_preheader.
4443 Input:
4444 STMT - a statement with potential VUSEs.
4445 LOOP - the loop whose preheader will contain STMT.
4447 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4448 appears to be defined in a VDEF in another statement in a loop.
4449 One such case is when the VUSE is at the dereference of a __restricted__
4450 pointer in a load and the VDEF is at the dereference of a different
4451 __restricted__ pointer in a store. Vectorization may result in
4452 copy_virtual_operands being called to copy the problematic VUSE to a new
4453 statement that is being inserted in the loop preheader. This procedure
4454 is called to change the SSA_NAME in the new statement's VUSE from the
4455 SSA_NAME updated in the loop to the related SSA_NAME available on the
4456 path entering the loop.
4458 When this function is called, we have the following situation:
4460 # vuse <name1>
4461 S1: vload
4462 do {
4463 # name1 = phi < name0 , name2>
4465 # vuse <name1>
4466 S2: vload
4468 # name2 = vdef <name1>
4469 S3: vstore
4471 }while...
4473 Stmt S1 was created in the loop preheader block as part of misaligned-load
4474 handling. This function fixes the name of the vuse of S1 from 'name1' to
4475 'name0'. */
4477 static void
4478 update_vuses_to_preheader (tree stmt, struct loop *loop)
4480 basic_block header_bb = loop->header;
4481 edge preheader_e = loop_preheader_edge (loop);
4482 ssa_op_iter iter;
4483 use_operand_p use_p;
4485 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4487 tree ssa_name = USE_FROM_PTR (use_p);
4488 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4489 tree name_var = SSA_NAME_VAR (ssa_name);
4490 basic_block bb = bb_for_stmt (def_stmt);
4492 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4493 if (!IS_EMPTY_STMT (def_stmt)
4494 && flow_bb_inside_loop_p (loop, bb))
4496 /* If the block containing the statement defining the SSA_NAME
4497 is in the loop then it's necessary to find the definition
4498 outside the loop using the PHI nodes of the header. */
4499 tree phi;
4500 bool updated = false;
4502 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4504 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4506 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4507 updated = true;
4508 break;
4511 gcc_assert (updated);
4517 /* Function vect_update_ivs_after_vectorizer.
4519 "Advance" the induction variables of LOOP to the value they should take
4520 after the execution of LOOP. This is currently necessary because the
4521 vectorizer does not handle induction variables that are used after the
4522 loop. Such a situation occurs when the last iterations of LOOP are
4523 peeled, because:
4524 1. We introduced new uses after LOOP for IVs that were not originally used
4525 after LOOP: the IVs of LOOP are now used by an epilog loop.
4526 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4527 times, whereas the loop IVs should be bumped N times.
4529 Input:
4530 - LOOP - a loop that is going to be vectorized. The last few iterations
4531 of LOOP were peeled.
4532 - NITERS - the number of iterations that LOOP executes (before it is
4533 vectorized), i.e., the number of times the ivs should be bumped.
4534 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4535 coming out from LOOP on which there are uses of the LOOP ivs
4536 (this is the path from LOOP->exit to epilog_loop->preheader).
4538 The new definitions of the ivs are placed in LOOP->exit.
4539 The phi args associated with the edge UPDATE_E in the bb
4540 UPDATE_E->dest are updated accordingly.
4542 Assumption 1: Like the rest of the vectorizer, this function assumes
4543 a single loop exit that has a single predecessor.
4545 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4546 organized in the same order.
4548 Assumption 3: The access function of the ivs is simple enough (see
4549 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4551 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4552 coming out of LOOP on which the ivs of LOOP are used (this is the path
4553 that leads to the epilog loop; other paths skip the epilog loop). This
4554 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4555 needs to have its phis updated.
4558 static void
4559 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4560 edge update_e)
4562 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4563 basic_block exit_bb = single_exit (loop)->dest;
4564 tree phi, phi1;
4565 basic_block update_bb = update_e->dest;
4567 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4569 /* Make sure there exists a single-predecessor exit bb: */
4570 gcc_assert (single_pred_p (exit_bb));
4572 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4573 phi && phi1;
4574 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4576 tree access_fn = NULL;
4577 tree evolution_part;
4578 tree init_expr;
4579 tree step_expr;
4580 tree var, stmt, ni, ni_name;
4581 block_stmt_iterator last_bsi;
4583 if (vect_print_dump_info (REPORT_DETAILS))
4585 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4586 print_generic_expr (vect_dump, phi, TDF_SLIM);
4589 /* Skip virtual phis. */
4590 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4592 if (vect_print_dump_info (REPORT_DETAILS))
4593 fprintf (vect_dump, "virtual phi. skip.");
4594 continue;
4597 /* Skip reduction phis. */
4598 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4600 if (vect_print_dump_info (REPORT_DETAILS))
4601 fprintf (vect_dump, "reduc phi. skip.");
4602 continue;
4605 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4606 gcc_assert (access_fn);
4607 evolution_part =
4608 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4609 gcc_assert (evolution_part != NULL_TREE);
4611 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4612 of degree >= 2 or exponential. */
4613 gcc_assert (!tree_is_chrec (evolution_part));
4615 step_expr = evolution_part;
4616 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4617 loop->num));
4619 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4620 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4621 fold_convert (TREE_TYPE (init_expr),
4622 niters),
4623 step_expr),
4624 init_expr);
4626 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4627 add_referenced_var (var);
4629 ni_name = force_gimple_operand (ni, &stmt, false, var);
4631 /* Insert stmt into exit_bb. */
4632 last_bsi = bsi_last (exit_bb);
4633 if (stmt)
4634 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4636 /* Fix phi expressions in the successor bb. */
4637 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
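
/* Editorial illustration - not part of the vectorizer.  The IV update
   generated above: an IV with access function init + step * iter is
   advanced past NITERS iterations by computing ni = init + step * niters
   (the PLUS_EXPR over MULT_EXPR built with fold_build2), and ni then
   feeds the corresponding phi in update_bb.  E.g., init = 0, step = 4,
   niters = 100 gives ni = 400.  */

static long
sketch_advance_iv (long init, long step, long niters)
{
  return init + step * niters;
}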
4642 /* Function vect_do_peeling_for_loop_bound
4644 Peel the last iterations of the loop represented by LOOP_VINFO.
4645 The peeled iterations form a new epilog loop. Given that the loop now
4646 iterates NITERS times, the new epilog loop iterates
4647 NITERS % VECTORIZATION_FACTOR times.
4649 The original loop will later be made to iterate
4650 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4652 static void
4653 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4655 tree ni_name, ratio_mult_vf_name;
4656 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4657 struct loop *new_loop;
4658 edge update_e;
4659 basic_block preheader;
4660 int loop_num;
4661 unsigned int th;
4663 if (vect_print_dump_info (REPORT_DETAILS))
4664 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4666 initialize_original_copy_tables ();
4668 /* Generate the following variables on the preheader of original loop:
4670 ni_name = number of iterations the original loop executes
4671 ratio = ni_name / vf
4672 ratio_mult_vf_name = ratio * vf */
4673 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4674 &ratio_mult_vf_name, ratio);
4676 loop_num = loop->num;
4677 /* Threshold for vectorized loop. */
4678 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4679 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4680 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4681 ratio_mult_vf_name, ni_name, false, th);
4682 gcc_assert (new_loop);
4683 gcc_assert (loop_num == loop->num);
4684 #ifdef ENABLE_CHECKING
4685 slpeel_verify_cfg_after_peeling (loop, new_loop);
4686 #endif
4688 /* A guard that controls whether the new_loop is to be executed or skipped
4689 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4690 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4691 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4692 is on the path where the LOOP IVs are used and need to be updated. */
4694 preheader = loop_preheader_edge (new_loop)->src;
4695 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4696 update_e = EDGE_PRED (preheader, 0);
4697 else
4698 update_e = EDGE_PRED (preheader, 1);
4700 /* Update IVs of original loop as if they were advanced
4701 by ratio_mult_vf_name steps. */
4702 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4704 /* After peeling we have to reset the scalar evolution analyzer. */
4705 scev_reset ();
4707 free_original_copy_tables ();
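
/* Editorial illustration - not part of the vectorizer.  The effect of
   the peeling above on a simple loop, for a hypothetical vectorization
   factor of 4: the original loop becomes the vectorized main loop
   covering ratio_mult_vf iterations, and the peeled copy is the scalar
   epilog that finishes the remaining n % vf iterations.  */

static void
sketch_peel_for_loop_bound (int *a, int n)
{
  int vf = 4;
  int ratio_mult_vf = n & ~(vf - 1);  /* (n / vf) * vf */
  int i, j;

  for (i = 0; i < ratio_mult_vf; i += vf)  /* vectorized main loop */
    for (j = 0; j < vf; j++)               /* stands in for one vector stmt */
      a[i + j] += 1;

  for (; i < n; i++)                       /* scalar epilog loop */
    a[i] += 1;
}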
4711 /* Function vect_gen_niters_for_prolog_loop
4713 Set the number of iterations for the loop represented by LOOP_VINFO
4714 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4715 and the misalignment of DR - the data reference recorded in
4716 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4717 this loop, the data reference DR will refer to an aligned location.
4719 The following computation is generated:
4721 If the misalignment of DR is known at compile time:
4722 addr_mis = DR_MISALIGNMENT (dr);
4723 Else, compute address misalignment in bytes:
4724 addr_mis = addr & (vectype_size - 1)
4726 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4728 (elem_size = element type size; an element is the scalar element
4729 whose type is the inner type of the vectype)
4731 For interleaving,
4733 prolog_niters = min ( LOOP_NITERS ,
4734 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4735 where group_size is the size of the interleaved group.
4738 static tree
4739 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4741 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4742 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4743 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4744 tree var, stmt;
4745 tree iters, iters_name;
4746 edge pe;
4747 basic_block new_bb;
4748 tree dr_stmt = DR_STMT (dr);
4749 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4750 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4751 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4752 tree niters_type = TREE_TYPE (loop_niters);
4753 int group_size = 1;
4754 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4756 if (DR_GROUP_FIRST_DR (stmt_info))
4758 /* For interleaved access element size must be multiplied by the size of
4759 the interleaved group. */
4760 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4761 DR_GROUP_FIRST_DR (stmt_info)));
4762 element_size *= group_size;
4765 pe = loop_preheader_edge (loop);
4767 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4769 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4770 int elem_misalign = byte_misalign / element_size;
4772 if (vect_print_dump_info (REPORT_DETAILS))
4773 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4774 iters = build_int_cst (niters_type,
4775 (vf - elem_misalign)&(vf/group_size-1));
4777 else
4779 tree new_stmts = NULL_TREE;
4780 tree start_addr =
4781 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4782 tree ptr_type = TREE_TYPE (start_addr);
4783 tree size = TYPE_SIZE (ptr_type);
4784 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4785 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4786 tree elem_size_log =
4787 build_int_cst (type, exact_log2 (vectype_align/vf));
4788 tree vf_minus_1 = build_int_cst (type, vf - 1);
4789 tree vf_tree = build_int_cst (type, vf);
4790 tree byte_misalign;
4791 tree elem_misalign;
4793 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4794 gcc_assert (!new_bb);
4796 /* Create: byte_misalign = addr & (vectype_size - 1) */
4797 byte_misalign =
4798 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4800 /* Create: elem_misalign = byte_misalign / element_size */
4801 elem_misalign =
4802 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4804 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4805 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4806 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4807 iters = fold_convert (niters_type, iters);
4810 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4811 /* If the loop bound is known at compile time we already verified that it is
4812 greater than vf; since the misalignment ('iters') is at most vf, there's
4813 no need to generate the MIN_EXPR in this case. */
4814 if (TREE_CODE (loop_niters) != INTEGER_CST)
4815 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4817 if (vect_print_dump_info (REPORT_DETAILS))
4819 fprintf (vect_dump, "niters for prolog loop: ");
4820 print_generic_expr (vect_dump, iters, TDF_SLIM);
4823 var = create_tmp_var (niters_type, "prolog_loop_niters");
4824 add_referenced_var (var);
4825 iters_name = force_gimple_operand (iters, &stmt, false, var);
4827 /* Insert stmt on loop preheader edge. */
4828 if (stmt)
4830 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4831 gcc_assert (!new_bb);
4834 return iters_name;
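
/* Editorial illustration - not part of the vectorizer.  The prolog
   iteration count computed above, for the non-interleaved case
   (group_size == 1) and a runtime address.  E.g., with 4-byte elements,
   a 16-byte vectype (vf = 4) and an address 4 bytes past an alignment
   boundary: byte_misalign = 4, elem_misalign = 1, and the prolog must
   run (4 - 1) & 3 = 3 iterations; an aligned address yields 0.  */

static unsigned
sketch_prolog_niters (unsigned long addr, unsigned vectype_size,
                      unsigned elem_size, unsigned vf,
                      unsigned loop_niters)
{
  unsigned byte_misalign = (unsigned) (addr & (vectype_size - 1));
  unsigned elem_misalign = byte_misalign / elem_size;
  unsigned iters = (vf - elem_misalign) & (vf - 1);

  return iters < loop_niters ? iters : loop_niters;  /* the MIN_EXPR */
}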
4838 /* Function vect_update_init_of_dr
4840 NITERS iterations were peeled from LOOP. DR represents a data reference
4841 in LOOP. This function updates the information recorded in DR to
4842 account for the fact that the first NITERS iterations had already been
4843 executed. Specifically, it updates the OFFSET field of DR. */
4845 static void
4846 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4848 tree offset = DR_OFFSET (dr);
4850 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4851 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4852 DR_OFFSET (dr) = offset;
4856 /* Function vect_update_inits_of_drs
4858 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4859 This function updates the information recorded for the data references in
4860 the loop to account for the fact that the first NITERS iterations had
4861 already been executed. Specifically, it updates the initial_condition of the
4862 access_function of all the data_references in the loop. */
4864 static void
4865 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4867 unsigned int i;
4868 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4869 struct data_reference *dr;
4871 if (vect_dump && (dump_flags & TDF_DETAILS))
4872 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4874 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4875 vect_update_init_of_dr (dr, niters);
4879 /* Function vect_do_peeling_for_alignment
4881 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4882 'niters' is set to the misalignment of one of the data references in the
4883 loop, thereby forcing it to refer to an aligned location at the beginning
4884 of the execution of this loop. The data reference for which we are
4885 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4887 static void
4888 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4890 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4891 tree niters_of_prolog_loop, ni_name;
4892 tree n_iters;
4893 struct loop *new_loop;
4895 if (vect_print_dump_info (REPORT_DETAILS))
4896 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4898 initialize_original_copy_tables ();
4900 ni_name = vect_build_loop_niters (loop_vinfo);
4901 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4903 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4904 new_loop =
4905 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4906 niters_of_prolog_loop, ni_name, true, 0);
4907 gcc_assert (new_loop);
4908 #ifdef ENABLE_CHECKING
4909 slpeel_verify_cfg_after_peeling (new_loop, loop);
4910 #endif
4912 /* Update number of times loop executes. */
4913 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4914 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4915 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4917 /* Update the init conditions of the access functions of all data refs. */
4918 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4920 /* After peeling we have to reset the scalar evolution analyzer. */
4921 scev_reset ();
4923 free_original_copy_tables ();
4927 /* Function vect_create_cond_for_align_checks.
4929 Create a conditional expression that represents the alignment checks for
4930 all of data references (array element references) whose alignment must be
4931 checked at runtime.
4933 Input:
4934 LOOP_VINFO - two fields of the loop information are used.
4935 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4936 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4938 Output:
4939 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4940 expression.
4941 The returned value is the conditional expression to be used in the if
4942 statement that controls which version of the loop gets executed at runtime.
4944 The algorithm makes two assumptions:
4945 1) The number of bytes "n" in a vector is a power of 2.
4946 2) An address "a" is aligned if a%n is zero and that this
4947 test can be done as a&(n-1) == 0. For example, for 16
4948 byte vectors the test is a&0xf == 0. */
4950 static tree
4951 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4952 tree *cond_expr_stmt_list)
4954 VEC(tree,heap) *may_misalign_stmts
4955 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4956 tree ref_stmt;
4957 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4958 tree mask_cst;
4959 unsigned int i;
4960 tree psize;
4961 tree int_ptrsize_type;
4962 char tmp_name[20];
4963 tree or_tmp_name = NULL_TREE;
4964 tree and_tmp, and_tmp_name, and_stmt;
4965 tree ptrsize_zero;
4967 /* Check that mask is one less than a power of 2, i.e., mask is
4968 all zeros followed by all ones. */
4969 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4971 /* CHECKME: what is the best integer or unsigned type to use to hold a
4972 cast from a pointer value? */
4973 psize = TYPE_SIZE (ptr_type_node);
4974 int_ptrsize_type
4975 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4977 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
4978 of the first vector of the i'th data reference. */
4980 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4982 tree new_stmt_list = NULL_TREE;
4983 tree addr_base;
4984 tree addr_tmp, addr_tmp_name, addr_stmt;
4985 tree or_tmp, new_or_tmp_name, or_stmt;
4987 /* create: addr_tmp = (int)(address_of_first_vector) */
4988 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4989 &new_stmt_list,
4990 NULL_TREE);
4992 if (new_stmt_list != NULL_TREE)
4993 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4995 sprintf (tmp_name, "%s%d", "addr2int", i);
4996 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4997 add_referenced_var (addr_tmp);
4998 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4999 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
5000 addr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5001 addr_tmp_name, addr_stmt);
5002 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
5003 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
5005 /* The addresses are ORed together. */
5007 if (or_tmp_name != NULL_TREE)
5009 /* create: or_tmp = or_tmp | addr_tmp */
5010 sprintf (tmp_name, "%s%d", "orptrs", i);
5011 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5012 add_referenced_var (or_tmp);
5013 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5014 or_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5015 new_or_tmp_name,
5016 build2 (BIT_IOR_EXPR, int_ptrsize_type,
5017 or_tmp_name,
5018 addr_tmp_name));
5019 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5020 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5021 or_tmp_name = new_or_tmp_name;
5023 else
5024 or_tmp_name = addr_tmp_name;
5026 } /* end for i */
5028 mask_cst = build_int_cst (int_ptrsize_type, mask);
5030 /* create: and_tmp = or_tmp & mask */
5031 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
5032 add_referenced_var (and_tmp);
5033 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5035 and_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
5036 and_tmp_name,
5037 build2 (BIT_AND_EXPR, int_ptrsize_type,
5038 or_tmp_name, mask_cst));
5039 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5040 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5042 /* Make and_tmp the left operand of the conditional test against zero.
5043 If and_tmp has a nonzero bit then some address is unaligned. */
5044 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5045 return build2 (EQ_EXPR, boolean_type_node,
5046 and_tmp_name, ptrsize_zero);
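
/* Editorial illustration - not part of the vectorizer.  The runtime test
   built above, on plain integers: all candidate start addresses are ORed
   together and the result is ANDed with the alignment mask.  A zero
   result proves every address is aligned, so the vectorized loop version
   may be taken.  */

static int
sketch_all_aligned_p (const unsigned long *addrs, int n, unsigned long mask)
{
  unsigned long or_tmp = 0;
  int i;

  for (i = 0; i < n; i++)
    or_tmp |= addrs[i];           /* or_tmp = or_tmp | addr_i */
  return (or_tmp & mask) == 0;    /* the EQ_EXPR returned above */
}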
5050 /* Function vect_transform_loop.
5052 The analysis phase has determined that the loop is vectorizable.
5053 Vectorize the loop - create vectorized stmts to replace the scalar
5054 stmts in the loop, and update the loop exit condition. */
5056 void
5057 vect_transform_loop (loop_vec_info loop_vinfo)
5059 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5060 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5061 int nbbs = loop->num_nodes;
5062 block_stmt_iterator si, next_si;
5063 int i;
5064 tree ratio = NULL;
5065 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5066 bool strided_store;
5068 if (vect_print_dump_info (REPORT_DETAILS))
5069 fprintf (vect_dump, "=== vect_transform_loop ===");
5071 /* If the loop has data references that may or may not be aligned then
5072 two versions of the loop need to be generated, one which is vectorized
5073 and one which isn't. A test is then generated to control which of the
5074 loops is executed. The test checks for the alignment of all of the
5075 data references that may or may not be aligned. */
5077 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5079 struct loop *nloop;
5080 tree cond_expr;
5081 tree cond_expr_stmt_list = NULL_TREE;
5082 basic_block condition_bb;
5083 block_stmt_iterator cond_exp_bsi;
5084 basic_block merge_bb;
5085 basic_block new_exit_bb;
5086 edge new_exit_e, e;
5087 tree orig_phi, new_phi, arg;
5088 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
5090 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5091 &cond_expr_stmt_list);
5092 initialize_original_copy_tables ();
5093 nloop = loop_version (loop, cond_expr, &condition_bb,
5094 prob, prob, REG_BR_PROB_BASE - prob, true);
5095 free_original_copy_tables();
5097 /** Loop versioning violates an assumption we try to maintain during
5098 vectorization - that the loop exit block has a single predecessor.
5099 After versioning, the exit block of both loop versions is the same
5100 basic block (i.e. it has two predecessors). Just in order to simplify
5101 following transformations in the vectorizer, we fix this situation
5102 here by adding a new (empty) block on the exit-edge of the loop,
5103 with the proper loop-exit phis to maintain loop-closed-form. **/
5105 merge_bb = single_exit (loop)->dest;
5106 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5107 new_exit_bb = split_edge (single_exit (loop));
5108 new_exit_e = single_exit (loop);
5109 e = EDGE_SUCC (new_exit_bb, 0);
5111 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5112 orig_phi = PHI_CHAIN (orig_phi))
5114 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5115 new_exit_bb);
5116 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5117 add_phi_arg (new_phi, arg, new_exit_e);
5118 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
5121 /** end loop-exit-fixes after versioning **/

      update_ssa (TODO_update_ssa);
      cond_exp_bsi = bsi_last (condition_bb);
      bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
    }

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */
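  /* For instance (editor's illustration): with 16-byte vectors of 4-byte
     elements, if the peeled-for access starts 8 bytes past an alignment
     boundary, peeling 2 scalar iterations makes it 16-byte aligned for
     the vectorized loop.  */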

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile-time constant), or it is a constant that is not evenly divisible
     by the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF) iterations.
     (VF is the vectorization factor.)  */
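  /* Worked example (editor's illustration): with n = 103 and VF = 4, the
     vectorized loop runs floor (103/4) = 25 iterations, covering 100 scalar
     iterations, and the scalar epilog runs the remaining 103 % 4 = 3.  */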

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  Once the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will need
     to be reconsidered.  */
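  /* I.e. (editor's illustration) the only loop shape handled for now is

       header:  all the loop's stmts; conditional branch to latch or exit
       latch:   empty; unconditional back edge to header

     so visiting BBS in index order is sufficient.  */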

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = bsi_start (bb); !bsi_end_p (si);)
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info;
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_generic_expr (vect_dump, stmt, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);
          gcc_assert (stmt_info);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              bsi_next (&si);
              continue;
            }

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store);
          if (is_store)
            {
              stmt_ann_t ann;
              if (DR_GROUP_FIRST_DR (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization
                     of the interleaving chain was completed - free all the
                     stores in the chain.  */
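                  /* E.g. (editor's illustration) for three interleaved
                     stores the chain is S1 -> S2 -> S3 -> NULL, linked via
                     DR_GROUP_NEXT_DR; the walk below frees each node's
                     stmt_vec_info and removes its stmt.  */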
                  tree next = DR_GROUP_FIRST_DR (stmt_info);
                  tree tmp;
                  stmt_vec_info next_stmt_info;

                  while (next)
                    {
                      next_si = bsi_for_stmt (next);
                      next_stmt_info = vinfo_for_stmt (next);
                      /* Free the attached stmt_vec_info and remove the
                         stmt.  */
                      ann = stmt_ann (next);
                      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
                      free (next_stmt_info);
                      set_stmt_info (ann, NULL);
                      bsi_remove (&next_si, true);
                      next = tmp;
                    }
                  bsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  ann = stmt_ann (stmt);
                  free (stmt_info);
                  set_stmt_info (ann, NULL);
                  bsi_remove (&si, true);
                  continue;
                }
            }

          bsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
}