2007-05-01 H.J. Lu <hongjiu.lu@intel.com>
[official-gcc.git] / gcc / tree-vect-transform.c
/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends
   the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
   VAR_KIND) to NAME, if NAME is provided; otherwise the prefix alone is
   used.  */
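
/* For example (an illustrative sketch, not part of the original comment):
   a request for a pointer variable (VAR_KIND == vect_pointer_var) with
   NAME == "a" asks create_tmp_var for a temporary named "vect_pa", and a
   scalar temporary (vect_scalar_var) with the same NAME would be named
   "stmp_a"; create_tmp_var then makes the name unique.  */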
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is to be added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
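
/* As an illustration (a sketch with hypothetical names): for a data
   reference a[i] whose DR_BASE_ADDRESS is '&a', DR_OFFSET is 0 and
   DR_INIT is 'init', the statements appended to NEW_STMT_LIST compute
   roughly:

     base_off = 0 + init;
     addr = (vectype *) (&a + base_off);

   and, when OFFSET is supplied, OFFSET * DR_STEP is first added into
   base_off.  */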
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access step we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

        v8hi *vp;
        vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
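
/* A sketch of the overall generated code (hypothetical names; the byte
   increment is TYPE_SIZE_UNIT of the vector type, e.g. 16 for v8hi):

     preheader:
       vp_init = (v8hi *) initial_address;
     loop:
       vp = PHI <vp_init (preheader), vp_next (latch)>
       ...
       vp_next = vp + 16;

   With ONLY_INIT the loop part is omitted and vp_init itself is
   returned.  */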
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, " vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, " vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base.  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop:  **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
/* Function bump_vector_ptr.

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* Function get_initial_def_for_induction.

   Input:
   IV_PHI - the initial value of the induction variable.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
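
/* A worked instance (hypothetical numbers): with X = 0, S = 3, and a
   vectorization factor of 4 held in a single vector (nunits == 4),
   the code built below amounts to:

     preheader:
       vec_init = {0, 3, 6, 9}
       vec_step = {12, 12, 12, 12}     (VF * S in every element)
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv + vec_step;  */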
static tree
get_initial_def_for_induction (tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  tree stmts;
  tree stmt = NULL_TREE;
  block_stmt_iterator si;
  basic_block bb = bb_for_stmt (iv_phi);

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = bsi_after_labels (bb);
  stmt = bsi_stmt (si);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
  add_referenced_var (new_var);

  new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
  if (stmts)
    {
      new_bb = bsi_insert_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  t = NULL_TREE;
  t = tree_cons (NULL_TREE, new_name, t);
  for (i = 1; i < nunits; i++)
    {
      tree tmp;

      /* Create: new_name = new_name + step_expr.  */
      tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
      init_stmt = build_gimple_modify_stmt (new_var, tmp);
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);

  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          tree tmp;

          /* vec_i = vec_prev + vec_{step*nunits}.  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
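
/* For instance (an illustrative sketch): when vectorizing 'x = y + 3'
   with nunits == 4, the constant operand '3' is a vect_constant_def and
   yields a preheader vector {3,3,3,3}, whereas operand 'y', if defined
   by a stmt inside the loop, simply yields the def of that stmt's
   vectorized counterpart (STMT_VINFO_VEC_STMT).  */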
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'.  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'.  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop-header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        return get_initial_def_for_induction (def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy.

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:          STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0: vx.0 = memref0     VS1.1
                        VS1.1: vx.1 = memref1     VS1.2
                        VS1.2: vx.2 = memref2     VS1.3
                        VS1.3: vx.3 = memref3

   S2: z = x + ...      VSnew.0: vz0 = vx.0 + ... VSnew.1
                        VSnew.1: vz1 = vx.1 + ... VSnew.2
                        VSnew.2: vz2 = vx.2 + ... VSnew.3
                        VSnew.3: vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}
/* Function get_initial_def_for_reduction.

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries).
   A cost model should help decide between these two schemes.  */
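
/* Concretely (a hypothetical instance): for a sum reduction with
   init_val == 5 and nunits == 4, the 'adjust in epilog' scheme used
   below returns the vector {0,0,0,0} and sets *ADJUSTMENT_DEF to 5,
   so the epilog code adds 5 back into the final scalar result.  */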
static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree vecdef;
  tree def_for_init;
  tree init_def;
  tree t = NULL_TREE;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
  vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      *adjustment_def = init_val;
      /* Create a vector of zeros for init_def.  */
      if (INTEGRAL_TYPE_P (type))
        def_for_init = build_int_cst (type, 0);
      else
        def_for_init = build_real (type, dconst0);
      for (i = nunits - 1; i >= 0; --i)
        t = tree_cons (NULL_TREE, def_for_init, t);
      vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
      init_def = build_vector (vector_type, t);
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      *adjustment_def = NULL_TREE;
      init_def = vecdef;
      break;

    default:
      gcc_unreachable ();
    }

  return init_def;
}
/* Function vect_create_epilog_for_reduction.

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_OPERAND_LENGTH (operation);
  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }
  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

   ***/
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_after_labels (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction.  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern.  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      tree tmp;

      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
      epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create:  va' = vec_shift <va, offset>
                  Create:  va = vop <va, va'>
                }  */
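
          /* For instance (illustrative numbers only): for a 128-bit
             vector of four 32-bit elements, the loop below performs two
             rounds, with bit_offset = 64 and then 32:
               va' = vec_shift <va, 64>;  va = vop <va, va'>;
               va' = vec_shift <va, 32>;  va = vop <va, va'>;
             after which the full reduction value resides in a single
             element of va, extracted in step 2.4 below.  */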
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
              tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);

              tmp = build2 (code, vectype, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */
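
          /* For the same illustrative four-element vector, this scheme
             emits plain scalar code of the form:
               s = v_out2[0];
               s = op <s, v_out2[1]>;
               s = op <s, v_out2[2]>;
               s = op <s, v_out2[3]>;  */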
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree tmp;
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);

              tmp = build2 (code, scalar_type, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
    }

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
    }

  /* 2.6 Replace uses of s_out0 with uses of s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
1452 bool
1453 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1455 tree vec_dest;
1456 tree scalar_dest;
1457 tree op;
1458 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1459 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1460 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1461 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1462 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1463 tree operation;
1464 enum tree_code code, orig_code, epilog_reduc_code = 0;
1465 enum machine_mode vec_mode;
1466 int op_type;
1467 optab optab, reduc_optab;
1468 tree new_temp = NULL_TREE;
1469 tree def, def_stmt;
1470 enum vect_def_type dt;
1471 tree new_phi;
1472 tree scalar_type;
1473 bool is_simple_use;
1474 tree orig_stmt;
1475 stmt_vec_info orig_stmt_info;
1476 tree expr = NULL_TREE;
1477 int i;
1478 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1479 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1480 stmt_vec_info prev_stmt_info;
1481 tree reduc_def;
1482 tree new_stmt = NULL_TREE;
1483 int j;
1485 gcc_assert (ncopies >= 1);
1487 /* 1. Is vectorizable reduction? */
1489 /* Not supportable if the reduction variable is used in the loop. */
1490 if (STMT_VINFO_RELEVANT_P (stmt_info))
1491 return false;
1493 if (!STMT_VINFO_LIVE_P (stmt_info))
1494 return false;
1496 /* Make sure it was already recognized as a reduction computation. */
1497 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1498 return false;
1500 /* 2. Has this been recognized as a reduction pattern?
1502 Check if STMT represents a pattern that has been recognized
1503 in earlier analysis stages. For stmts that represent a pattern,
1504 the STMT_VINFO_RELATED_STMT field records the last stmt in
1505 the original sequence that constitutes the pattern. */
1507 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1508 if (orig_stmt)
1510 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1511 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1512 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1513 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1516 /* 3. Check the operands of the operation. The first operands are defined
1517 inside the loop body. The last operand is the reduction variable,
1518 which is defined by the loop-header-phi. */
1520 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
1522 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1523 code = TREE_CODE (operation);
1524 op_type = TREE_OPERAND_LENGTH (operation);
1525 if (op_type != binary_op && op_type != ternary_op)
1526 return false;
1527 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1528 scalar_type = TREE_TYPE (scalar_dest);
1530 /* All uses but the last are expected to be defined in the loop.
1531 The last use is the reduction variable. */
1532 for (i = 0; i < op_type-1; i++)
1534 op = TREE_OPERAND (operation, i);
1535 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1536 gcc_assert (is_simple_use);
1537 if (dt != vect_loop_def
1538 && dt != vect_invariant_def
1539 && dt != vect_constant_def
1540 && dt != vect_induction_def)
1541 return false;
1544 op = TREE_OPERAND (operation, i);
1545 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1546 gcc_assert (is_simple_use);
1547 gcc_assert (dt == vect_reduction_def);
1548 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1549 if (orig_stmt)
1550 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1551 else
1552 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1554 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1555 return false;
1557 /* 4. Supportable by target? */
1559 /* 4.1. check support for the operation in the loop */
1560 optab = optab_for_tree_code (code, vectype);
1561 if (!optab)
1563 if (vect_print_dump_info (REPORT_DETAILS))
1564 fprintf (vect_dump, "no optab.");
1565 return false;
1567 vec_mode = TYPE_MODE (vectype);
1568 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1570 if (vect_print_dump_info (REPORT_DETAILS))
1571 fprintf (vect_dump, "op not supported by target.");
1572 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1573 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1574 < vect_min_worthwhile_factor (code))
1575 return false;
1576 if (vect_print_dump_info (REPORT_DETAILS))
1577 fprintf (vect_dump, "proceeding using word mode.");
1580 /* Worthwhile without SIMD support? */
1581 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1582 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1583 < vect_min_worthwhile_factor (code))
1585 if (vect_print_dump_info (REPORT_DETAILS))
1586 fprintf (vect_dump, "not worthwhile without SIMD support.");
1587 return false;
1590 /* 4.2. Check support for the epilog operation.
1592 If STMT represents a reduction pattern, then the type of the
1593 reduction variable may be different than the type of the rest
1594 of the arguments. For example, consider the case of accumulation
1595 of shorts into an int accumulator; The original code:
1596 S1: int_a = (int) short_a;
1597 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1599 was replaced with:
1600 STMT: int_acc = widen_sum <short_a, int_acc>
1602 This means that:
1603 1. The tree-code that is used to create the vector operation in the
1604 epilog code (that reduces the partial results) is not the
1605 tree-code of STMT, but is rather the tree-code of the original
1606 stmt from the pattern that STMT is replacing. I.e, in the example
1607 above we want to use 'widen_sum' in the loop, but 'plus' in the
1608 epilog.
1609 2. The type (mode) we use to check available target support
1610 for the vector operation to be created in the *epilog*, is
1611 determined by the type of the reduction variable (in the example
1612 above we'd check this: plus_optab[vect_int_mode]).
1613 However the type (mode) we use to check available target support
1614 for the vector operation to be created *inside the loop*, is
1615 determined by the type of the other arguments to STMT (in the
1616 example we'd check this: widen_sum_optab[vect_short_mode]).
1618 This is contrary to "regular" reductions, in which the types of all
1619 the arguments are the same as the type of the reduction variable.
1620 For "regular" reductions we can therefore use the same vector type
1621 (and also the same tree-code) when generating the epilog code and
1622 when generating the code inside the loop. */
1624 if (orig_stmt)
1626 /* This is a reduction pattern: get the vectype from the type of the
1627 reduction variable, and get the tree-code from orig_stmt. */
1628 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1629 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1630 vec_mode = TYPE_MODE (vectype);
1632 else
1634 /* Regular reduction: the same vectype and tree-code used for
1635 the vector code inside the loop can be used for the epilog code. */
1636 orig_code = code;
1639 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1640 return false;
1641 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1642 if (!reduc_optab)
1644 if (vect_print_dump_info (REPORT_DETAILS))
1645 fprintf (vect_dump, "no optab for reduction.");
1646 epilog_reduc_code = NUM_TREE_CODES;
1648 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1650 if (vect_print_dump_info (REPORT_DETAILS))
1651 fprintf (vect_dump, "reduc op not supported by target.");
1652 epilog_reduc_code = NUM_TREE_CODES;
1655 if (!vec_stmt) /* transformation not required. */
1657 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1658 return true;
1661 /** Transform. **/
1663 if (vect_print_dump_info (REPORT_DETAILS))
1664 fprintf (vect_dump, "transform reduction.");
1666 /* Create the destination vector */
1667 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1669 /* Create the reduction-phi that defines the reduction-operand. */
1670 new_phi = create_phi_node (vec_dest, loop->header);
1672 /* In case the vectorization factor (VF) is bigger than the number
1673 of elements that we can fit in a vectype (nunits), we have to generate
1674 more than one vector stmt - i.e., we need to "unroll" the
1675 vector stmt by a factor VF/nunits. For more details see documentation
1676 in vectorizable_operation. */
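   /* E.g. (illustrative), with VF=8 and nunits=4, two copies are created
      and the reduction operand is chained through them:
        VS1: vdef0 = vop <vx0, reduc_phi>
        VS2: vdef1 = vop <vx1, vdef0>
      so only the last def (vdef1) is fed to the epilog code below.  */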
1678 prev_stmt_info = NULL;
1679 for (j = 0; j < ncopies; j++)
1681 /* Handle uses. */
1682 if (j == 0)
1684 op = TREE_OPERAND (operation, 0);
1685 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1686 if (op_type == ternary_op)
1688 op = TREE_OPERAND (operation, 1);
1689 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1692 /* Get the vector def for the reduction variable from the phi node */
1693 reduc_def = PHI_RESULT (new_phi);
1695 else
1697 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1698 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1699 if (op_type == ternary_op)
1700 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1702 /* Get the vector def for the reduction variable from the vectorized
1703 reduction operation generated in the previous iteration (j-1) */
1704 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
1707 /* Arguments are ready. Create the new vector stmt. */
1708 if (op_type == binary_op)
1709 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1710 else
1711 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1712 reduc_def);
1713 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
1714 new_temp = make_ssa_name (vec_dest, new_stmt);
1715 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1716 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1718 if (j == 0)
1719 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1720 else
1721 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1722 prev_stmt_info = vinfo_for_stmt (new_stmt);
1725 /* Finalize the reduction-phi (set its arguments) and create the
1726 epilog reduction code. */
1727 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1728 return true;
1731 /* Checks if CALL can be vectorized with input vector type VECTYPE_IN and
1732 output vector type VECTYPE_OUT. Returns a function declaration if the
1733 target has a vectorized version of the function, or NULL_TREE if the function cannot be vectorized. */
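/* E.g. (illustrative): for a call to sqrtf with VECTYPE_OUT == VECTYPE_IN
   == V4SF, a target providing a vector square root can return the decl of
   its builtin from the builtin_vectorized_function target hook, while a
   target without one returns NULL_TREE and the call stays scalar.  */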
1735 tree
1736 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
1738 tree fndecl = get_callee_fndecl (call);
1739 enum built_in_function code;
1741 /* We only handle functions that do not read or clobber memory -- i.e.
1742 const or novops ones. */
1743 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
1744 return NULL_TREE;
1746 if (!fndecl
1747 || TREE_CODE (fndecl) != FUNCTION_DECL
1748 || !DECL_BUILT_IN (fndecl))
1749 return NULL_TREE;
1751 code = DECL_FUNCTION_CODE (fndecl);
1752 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
1753 vectype_in);
1756 /* Function vectorizable_call.
1758 Check if STMT performs a function call that can be vectorized.
1759 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1760 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1761 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1763 bool
1764 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1766 tree vec_dest;
1767 tree scalar_dest;
1768 tree operation;
1769 tree op, type;
1770 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1771 tree vectype_out, vectype_in;
1772 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1773 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1774 enum vect_def_type dt[2];
1775 int ncopies, j, nargs;
1776 call_expr_arg_iterator iter;
1778 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1779 return false;
1781 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
1782 return false;
1784 /* FORNOW: not yet supported. */
1785 if (STMT_VINFO_LIVE_P (stmt_info))
1787 if (vect_print_dump_info (REPORT_DETAILS))
1788 fprintf (vect_dump, "value used after loop.");
1789 return false;
1792 /* Is STMT a vectorizable call? */
1793 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1794 return false;
1796 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1797 return false;
1799 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1800 if (TREE_CODE (operation) != CALL_EXPR)
1801 return false;
1803 /* Process function arguments. */
1804 rhs_type = NULL_TREE;
1805 nargs = 0;
1806 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1808 ++nargs;
1810 /* Bail out if the function has more than two arguments; we
1811 do not have interesting builtin functions to vectorize with
1812 more than two arguments. */
1813 if (nargs > 2)
1814 return false;
1816 /* We can only handle calls with arguments of the same type. */
1817 if (rhs_type
1818 && rhs_type != TREE_TYPE (op))
1820 if (vect_print_dump_info (REPORT_DETAILS))
1821 fprintf (vect_dump, "argument types differ.");
1822 return false;
1824 rhs_type = TREE_TYPE (op);
1826 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
1828 if (vect_print_dump_info (REPORT_DETAILS))
1829 fprintf (vect_dump, "use not simple.");
1830 return false;
1834 /* A call without arguments is not vectorizable either. */
1835 if (nargs == 0)
1836 return false;
1838 vectype_in = get_vectype_for_scalar_type (rhs_type);
1840 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1841 vectype_out = get_vectype_for_scalar_type (lhs_type);
1843 /* Only handle the case of vectors with the same number of elements.
1844 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1845 instruction which converts V2DFmode to V4SImode but only
1846 using the lower half of the V4SImode result. */
1847 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1848 return false;
1850 /* For now, we only vectorize functions if a target specific builtin
1851 is available. TODO -- in some cases, it might be profitable to
1852 insert the calls for pieces of the vector, in order to be able
1853 to vectorize other operations in the loop. */
1854 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1855 if (fndecl == NULL_TREE)
1857 if (vect_print_dump_info (REPORT_DETAILS))
1858 fprintf (vect_dump, "function is not vectorizable.");
1860 return false;
1863 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1865 if (!vec_stmt) /* transformation not required. */
1867 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1868 return true;
1871 /** Transform. **/
1873 if (vect_print_dump_info (REPORT_DETAILS))
1874 fprintf (vect_dump, "transform call.");
1876 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1877 / TYPE_VECTOR_SUBPARTS (vectype_out));
1878 gcc_assert (ncopies >= 1);
1880 /* Handle def. */
1881 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1882 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1884 prev_stmt_info = NULL;
1885 for (j = 0; j < ncopies; ++j)
1887 tree new_stmt, vargs;
1888 tree vec_oprnd[2];
1889 int n;
1891 /* Build argument list for the vectorized call. */
1892 /* FIXME: Rewrite this so that it doesn't construct a temporary
1893 list. */
1894 vargs = NULL_TREE;
1895 n = -1;
1896 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1898 ++n;
1900 if (j == 0)
1901 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1902 else
1903 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1905 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1907 vargs = nreverse (vargs);
1909 rhs = build_function_call_expr (fndecl, vargs);
1910 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
1911 new_temp = make_ssa_name (vec_dest, new_stmt);
1912 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1914 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1916 if (j == 0)
1917 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1918 else
1919 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1920 prev_stmt_info = vinfo_for_stmt (new_stmt);
1923 /* The call in STMT might prevent it from being removed in dce. We, however,
1924 cannot remove it here, due to the way the ssa name it defines is mapped
1925 to the new definition. So just replace the rhs of the statement with
1926 something harmless. */
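   /* E.g. (illustrative), a scalar stmt 'a = sqrtf (x)' whose result was
      vectorized becomes 'a = 0.0f' here, which dce can remove once all
      its uses are gone.  */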
1927 type = TREE_TYPE (scalar_dest);
1928 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1930 return true;
1934 /* Function vectorizable_conversion.
1936 Check if STMT performs a conversion operation, that can be vectorized.
1937 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1938 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1939 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1941 bool
1942 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
1943 tree * vec_stmt)
1945 tree vec_dest;
1946 tree scalar_dest;
1947 tree operation;
1948 tree op0;
1949 tree vec_oprnd0 = NULL_TREE;
1950 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1951 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1952 enum tree_code code;
1953 tree new_temp;
1954 tree def, def_stmt;
1955 enum vect_def_type dt0;
1956 tree new_stmt;
1957 int nunits_in;
1958 int nunits_out;
1959 int ncopies, j;
1960 tree vectype_out, vectype_in;
1961 tree rhs_type, lhs_type;
1962 tree builtin_decl;
1963 stmt_vec_info prev_stmt_info;
1965 /* Is STMT a vectorizable conversion? */
1967 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1968 return false;
1970 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
1971 return false;
1973 if (STMT_VINFO_LIVE_P (stmt_info))
1975 /* FORNOW: not yet supported. */
1976 if (vect_print_dump_info (REPORT_DETAILS))
1977 fprintf (vect_dump, "value used after loop.");
1978 return false;
1981 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1982 return false;
1984 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1985 return false;
1987 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1988 code = TREE_CODE (operation);
1989 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
1990 return false;
1992 /* Check types of lhs and rhs */
1993 op0 = TREE_OPERAND (operation, 0);
1994 rhs_type = TREE_TYPE (op0);
1995 vectype_in = get_vectype_for_scalar_type (rhs_type);
1996 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
1998 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1999 lhs_type = TREE_TYPE (scalar_dest);
2000 vectype_out = get_vectype_for_scalar_type (lhs_type);
2001 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2002 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2004 /* FORNOW: need to extend to support short<->float conversions as well. */
2005 if (nunits_out != nunits_in)
2006 return false;
2008 /* Bail out unless exactly one of the types is integral, i.e. int <-> float. */
2009 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2010 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2011 return false;
2013 /* Sanity check: make sure that at least one copy of the vectorized stmt
2014 needs to be generated. */
2015 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2016 gcc_assert (ncopies >= 1);
2018 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2020 if (vect_print_dump_info (REPORT_DETAILS))
2021 fprintf (vect_dump, "use not simple.");
2022 return false;
2025 /* Supportable by target? */
2026 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2028 if (vect_print_dump_info (REPORT_DETAILS))
2029 fprintf (vect_dump, "op not supported by target.");
2030 return false;
2033 if (!vec_stmt) /* transformation not required. */
2035 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2036 return true;
2039 /** Transform. **/
2041 if (vect_print_dump_info (REPORT_DETAILS))
2042 fprintf (vect_dump, "transform conversion.");
2044 /* Handle def. */
2045 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2047 prev_stmt_info = NULL;
2048 for (j = 0; j < ncopies; j++)
2050 tree sym;
2051 ssa_op_iter iter;
2053 if (j == 0)
2054 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2055 else
2056 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2058 builtin_decl =
2059 targetm.vectorize.builtin_conversion (code, vectype_in);
2060 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2062 /* Arguments are ready. Create the new vector stmt. */
2063 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
2064 new_temp = make_ssa_name (vec_dest, new_stmt);
2065 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2066 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2067 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2069 if (TREE_CODE (sym) == SSA_NAME)
2070 sym = SSA_NAME_VAR (sym);
2071 mark_sym_for_renaming (sym);
2074 if (j == 0)
2075 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2076 else
2077 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2078 prev_stmt_info = vinfo_for_stmt (new_stmt);
2080 return true;
2084 /* Function vectorizable_assignment.
2086 Check if STMT performs an assignment (copy) that can be vectorized.
2087 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2088 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2089 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2091 bool
2092 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2094 tree vec_dest;
2095 tree scalar_dest;
2096 tree op;
2097 tree vec_oprnd;
2098 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2099 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2100 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2101 tree new_temp;
2102 tree def, def_stmt;
2103 enum vect_def_type dt;
2104 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2105 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2107 gcc_assert (ncopies >= 1);
2108 if (ncopies > 1)
2109 return false; /* FORNOW */
2111 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2112 return false;
2114 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2115 return false;
2117 /* FORNOW: not yet supported. */
2118 if (STMT_VINFO_LIVE_P (stmt_info))
2120 if (vect_print_dump_info (REPORT_DETAILS))
2121 fprintf (vect_dump, "value used after loop.");
2122 return false;
2125 /* Is vectorizable assignment? */
2126 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2127 return false;
2129 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2130 if (TREE_CODE (scalar_dest) != SSA_NAME)
2131 return false;
2133 op = GIMPLE_STMT_OPERAND (stmt, 1);
2134 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2136 if (vect_print_dump_info (REPORT_DETAILS))
2137 fprintf (vect_dump, "use not simple.");
2138 return false;
2141 if (!vec_stmt) /* transformation not required. */
2143 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2144 return true;
2147 /** Transform. **/
2148 if (vect_print_dump_info (REPORT_DETAILS))
2149 fprintf (vect_dump, "transform assignment.");
2151 /* Handle def. */
2152 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2154 /* Handle use. */
2155 op = GIMPLE_STMT_OPERAND (stmt, 1);
2156 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2158 /* Arguments are ready. Create the new vector stmt. */
2159 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
2160 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2161 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2162 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2164 return true;
2168 /* Function vect_min_worthwhile_factor.
2170 For a loop where we could vectorize the operation indicated by CODE,
2171 return the minimum vectorization factor that makes it worthwhile
2172 to use generic vectors. */
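/* E.g. (illustrative): when an operation is not supported in a true vector
   mode, the word-mode fallback in vectorizable_operation is taken for a
   PLUS_EXPR only if the loop's vectorization factor is at least 4, whereas
   bitwise operations such as BIT_AND_EXPR already pay off at a factor of 2.  */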
2173 static int
2174 vect_min_worthwhile_factor (enum tree_code code)
2176 switch (code)
2178 case PLUS_EXPR:
2179 case MINUS_EXPR:
2180 case NEGATE_EXPR:
2181 return 4;
2183 case BIT_AND_EXPR:
2184 case BIT_IOR_EXPR:
2185 case BIT_XOR_EXPR:
2186 case BIT_NOT_EXPR:
2187 return 2;
2189 default:
2190 return INT_MAX;
2195 /* Function vectorizable_induction
2197 Check if PHI performs an induction computation that can be vectorized.
2198 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
2199 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
2200 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2202 bool
2203 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
2204 tree *vec_stmt)
2206 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
2207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2208 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2209 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2210 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2211 tree vec_def;
2213 gcc_assert (ncopies >= 1);
2215 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2216 return false;
2218 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
2220 if (STMT_VINFO_LIVE_P (stmt_info))
2222 /* FORNOW: not yet supported. */
2223 if (vect_print_dump_info (REPORT_DETAILS))
2224 fprintf (vect_dump, "value used after loop.");
2225 return false;
2228 if (TREE_CODE (phi) != PHI_NODE)
2229 return false;
2231 if (!vec_stmt) /* transformation not required. */
2233 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
2234 return true;
2237 /** Transform. **/
2239 if (vect_print_dump_info (REPORT_DETAILS))
2240 fprintf (vect_dump, "transform induction phi.");
2242 vec_def = get_initial_def_for_induction (phi);
2243 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
2244 return true;
2248 /* Function vectorizable_operation.
2250 Check if STMT performs a binary or unary operation that can be vectorized.
2251 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2252 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2253 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2255 bool
2256 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2258 tree vec_dest;
2259 tree scalar_dest;
2260 tree operation;
2261 tree op0, op1 = NULL;
2262 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2263 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2264 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2266 enum tree_code code;
2267 enum machine_mode vec_mode;
2268 tree new_temp;
2269 int op_type;
2270 optab optab;
2271 int icode;
2272 enum machine_mode optab_op2_mode;
2273 tree def, def_stmt;
2274 enum vect_def_type dt0, dt1;
2275 tree new_stmt;
2276 stmt_vec_info prev_stmt_info;
2277 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2278 int nunits_out;
2279 tree vectype_out;
2280 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2281 int j;
2283 gcc_assert (ncopies >= 1);
2285 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2286 return false;
2288 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2289 return false;
2291 /* FORNOW: not yet supported. */
2292 if (STMT_VINFO_LIVE_P (stmt_info))
2294 if (vect_print_dump_info (REPORT_DETAILS))
2295 fprintf (vect_dump, "value used after loop.");
2296 return false;
2299 /* Is STMT a vectorizable binary/unary operation? */
2300 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2301 return false;
2303 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2304 return false;
2306 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2307 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2308 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2309 if (nunits_out != nunits_in)
2310 return false;
2312 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2313 code = TREE_CODE (operation);
2314 optab = optab_for_tree_code (code, vectype);
2316 /* Support only unary or binary operations. */
2317 op_type = TREE_OPERAND_LENGTH (operation);
2318 if (op_type != unary_op && op_type != binary_op)
2320 if (vect_print_dump_info (REPORT_DETAILS))
2321 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2322 return false;
2325 op0 = TREE_OPERAND (operation, 0);
2326 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2328 if (vect_print_dump_info (REPORT_DETAILS))
2329 fprintf (vect_dump, "use not simple.");
2330 return false;
2333 if (op_type == binary_op)
2335 op1 = TREE_OPERAND (operation, 1);
2336 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2338 if (vect_print_dump_info (REPORT_DETAILS))
2339 fprintf (vect_dump, "use not simple.");
2340 return false;
2344 /* Supportable by target? */
2345 if (!optab)
2347 if (vect_print_dump_info (REPORT_DETAILS))
2348 fprintf (vect_dump, "no optab.");
2349 return false;
2351 vec_mode = TYPE_MODE (vectype);
2352 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2353 if (icode == CODE_FOR_nothing)
2355 if (vect_print_dump_info (REPORT_DETAILS))
2356 fprintf (vect_dump, "op not supported by target.");
2357 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2358 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2359 < vect_min_worthwhile_factor (code))
2360 return false;
2361 if (vect_print_dump_info (REPORT_DETAILS))
2362 fprintf (vect_dump, "proceeding using word mode.");
2365 /* Worthwhile without SIMD support? */
2366 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2367 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2368 < vect_min_worthwhile_factor (code))
2370 if (vect_print_dump_info (REPORT_DETAILS))
2371 fprintf (vect_dump, "not worthwhile without SIMD support.");
2372 return false;
2375 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2377 /* FORNOW: not yet supported. */
2378 if (!VECTOR_MODE_P (vec_mode))
2379 return false;
2381 /* Invariant argument is needed for a vector shift
2382 by a scalar shift operand. */
2383 optab_op2_mode = insn_data[icode].operand[2].mode;
2384 if (! (VECTOR_MODE_P (optab_op2_mode)
2385 || dt1 == vect_constant_def
2386 || dt1 == vect_invariant_def))
2388 if (vect_print_dump_info (REPORT_DETAILS))
2389 fprintf (vect_dump, "operand mode requires invariant argument.");
2390 return false;
2394 if (!vec_stmt) /* transformation not required. */
2396 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2397 return true;
2400 /** Transform. **/
2402 if (vect_print_dump_info (REPORT_DETAILS))
2403 fprintf (vect_dump, "transform binary/unary operation.");
2405 /* Handle def. */
2406 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2408 /* In case the vectorization factor (VF) is bigger than the number
2409 of elements that we can fit in a vectype (nunits), we have to generate
2410 more than one vector stmt - i.e., we need to "unroll" the
2411 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2412 from one copy of the vector stmt to the next, in the field
2413 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2414 stages to find the correct vector defs to be used when vectorizing
2415 stmts that use the defs of the current stmt. The example below illustrates
2416 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
2417 4 vectorized stmts):
2419 before vectorization:
2420 RELATED_STMT VEC_STMT
2421 S1: x = memref - -
2422 S2: z = x + 1 - -
2424 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2425 there):
2426 RELATED_STMT VEC_STMT
2427 VS1_0: vx0 = memref0 VS1_1 -
2428 VS1_1: vx1 = memref1 VS1_2 -
2429 VS1_2: vx2 = memref2 VS1_3 -
2430 VS1_3: vx3 = memref3 - -
2431 S1: x = load - VS1_0
2432 S2: z = x + 1 - -
2434 step 2: vectorize stmt S2 (done here):
2435 To vectorize stmt S2 we first need to find the relevant vector
2436 def for the first operand 'x'. This is, as usual, obtained from
2437 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2438 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2439 relevant vector def 'vx0'. Having found 'vx0' we can generate
2440 the vector stmt VS2_0, and as usual, record it in the
2441 STMT_VINFO_VEC_STMT of stmt S2.
2442 When creating the second copy (VS2_1), we obtain the relevant vector
2443 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2444 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2445 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2446 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2447 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2448 chain of stmts and pointers:
2449 RELATED_STMT VEC_STMT
2450 VS1_0: vx0 = memref0 VS1_1 -
2451 VS1_1: vx1 = memref1 VS1_2 -
2452 VS1_2: vx2 = memref2 VS1_3 -
2453 VS1_3: vx3 = memref3 - -
2454 S1: x = load - VS1_0
2455 VS2_0: vz0 = vx0 + v1 VS2_1 -
2456 VS2_1: vz1 = vx1 + v1 VS2_2 -
2457 VS2_2: vz2 = vx2 + v1 VS2_3 -
2458 VS2_3: vz3 = vx3 + v1 - -
2459 S2: z = x + 1 - VS2_0 */
2461 prev_stmt_info = NULL;
2462 for (j = 0; j < ncopies; j++)
2464 /* Handle uses. */
2465 if (j == 0)
2467 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2468 if (op_type == binary_op)
2470 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2472 /* Vector shl and shr insn patterns can be defined with
2473 scalar operand 2 (shift operand). In this case, use
2474 constant or loop invariant op1 directly, without
2475 extending it to vector mode first. */
2476 optab_op2_mode = insn_data[icode].operand[2].mode;
2477 if (!VECTOR_MODE_P (optab_op2_mode))
2479 if (vect_print_dump_info (REPORT_DETAILS))
2480 fprintf (vect_dump, "operand 1 using scalar mode.");
2481 vec_oprnd1 = op1;
2484 if (!vec_oprnd1)
2485 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2488 else
2490 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2491 if (op_type == binary_op)
2492 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2495 /* Arguments are ready. Create the new vector stmt. */
2497 if (op_type == binary_op)
2498 new_stmt = build_gimple_modify_stmt (vec_dest,
2499 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2500 else
2501 new_stmt = build_gimple_modify_stmt (vec_dest,
2502 build1 (code, vectype, vec_oprnd0));
2503 new_temp = make_ssa_name (vec_dest, new_stmt);
2504 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2505 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2507 if (j == 0)
2508 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2509 else
2510 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2511 prev_stmt_info = vinfo_for_stmt (new_stmt);
2514 return true;
2518 /* Function vectorizable_type_demotion
2520 Check if STMT performs a binary or unary operation that involves
2521 type demotion, and if it can be vectorized.
2522 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2523 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2524 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2526 bool
2527 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2528 tree *vec_stmt)
2530 tree vec_dest;
2531 tree scalar_dest;
2532 tree operation;
2533 tree op0;
2534 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2535 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2536 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2537 enum tree_code code;
2538 tree new_temp;
2539 tree def, def_stmt;
2540 enum vect_def_type dt0;
2541 tree new_stmt;
2542 stmt_vec_info prev_stmt_info;
2543 int nunits_in;
2544 int nunits_out;
2545 tree vectype_out;
2546 int ncopies;
2547 int j;
2548 tree expr;
2549 tree vectype_in;
2550 tree scalar_type;
2551 optab optab;
2552 enum machine_mode vec_mode;
2554 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2555 return false;
2557 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2558 return false;
2560 /* FORNOW: not yet supported. */
2561 if (STMT_VINFO_LIVE_P (stmt_info))
2563 if (vect_print_dump_info (REPORT_DETAILS))
2564 fprintf (vect_dump, "value used after loop.");
2565 return false;
2568 /* Is STMT a vectorizable type-demotion operation? */
2569 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2570 return false;
2572 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2573 return false;
2575 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2576 code = TREE_CODE (operation);
2577 if (code != NOP_EXPR && code != CONVERT_EXPR)
2578 return false;
2580 op0 = TREE_OPERAND (operation, 0);
2581 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2582 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2584 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2585 scalar_type = TREE_TYPE (scalar_dest);
2586 vectype_out = get_vectype_for_scalar_type (scalar_type);
2587 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2588 if (nunits_in != nunits_out / 2) /* FORNOW */
2589 return false;
2591 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2592 gcc_assert (ncopies >= 1);
2594 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2595 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2596 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
2597 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
2598 && (code == NOP_EXPR || code == CONVERT_EXPR))))
2599 return false;
2601 /* Check the operands of the operation. */
2602 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2604 if (vect_print_dump_info (REPORT_DETAILS))
2605 fprintf (vect_dump, "use not simple.");
2606 return false;
2609 /* Supportable by target? */
2610 code = VEC_PACK_TRUNC_EXPR;
2611 optab = optab_for_tree_code (code, vectype_in);
2612 if (!optab)
2613 return false;
2615 vec_mode = TYPE_MODE (vectype_in);
2616 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2617 return false;
2619 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2621 if (!vec_stmt) /* transformation not required. */
2623 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2624 return true;
2627 /** Transform. **/
2628 if (vect_print_dump_info (REPORT_DETAILS))
2629 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2630 ncopies);
2632 /* Handle def. */
2633 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2635 /* In case the vectorization factor (VF) is bigger than the number
2636 of elements that we can fit in a vectype (nunits), we have to generate
2637 more than one vector stmt - i.e., we need to "unroll" the
2638 vector stmt by a factor VF/nunits. */
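   /* In addition, each vector stmt packs two input defs into one output, so
      every copy consumes the next two defs of op0. E.g. (illustrative), two
      V4SI inputs are demoted into one V8HI result:
        vdef = VEC_PACK_TRUNC_EXPR <vx0, vx1>  */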
2639 prev_stmt_info = NULL;
2640 for (j = 0; j < ncopies; j++)
2642 /* Handle uses. */
2643 if (j == 0)
2645 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2646 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2648 else
2650 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2651 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2654 /* Arguments are ready. Create the new vector stmt. */
2655 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2656 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2657 new_temp = make_ssa_name (vec_dest, new_stmt);
2658 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2659 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2661 if (j == 0)
2662 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2663 else
2664 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2666 prev_stmt_info = vinfo_for_stmt (new_stmt);
2669 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2670 return true;
2674 /* Function vect_gen_widened_results_half
2676 Create a vector stmt whose code, type, number of arguments, and result
2677 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2678 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2679 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2680 needs to be created (DECL is a function-decl of a target-builtin).
2681 STMT is the original scalar stmt that we are vectorizing. */
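/* E.g. (illustrative): when vectorizing a widening multiply of V8HI operands
   into V4SI results, the caller invokes this helper twice, once per half,
   passing either the tree codes VEC_WIDEN_MULT_HI_EXPR and
   VEC_WIDEN_MULT_LO_EXPR or, on targets that prefer builtins, the
   corresponding builtin decl as DECL.  */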
2683 static tree
2684 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2685 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2686 tree vec_dest, block_stmt_iterator *bsi,
2687 tree stmt)
2689 tree expr;
2690 tree new_stmt;
2691 tree new_temp;
2692 tree sym;
2693 ssa_op_iter iter;
2695 /* Generate half of the widened result: */
2696 if (code == CALL_EXPR)
2698 /* Target specific support */
2699 if (op_type == binary_op)
2700 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2701 else
2702 expr = build_call_expr (decl, 1, vec_oprnd0);
2704 else
2706 /* Generic support */
2707 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2708 if (op_type == binary_op)
2709 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2710 else
2711 expr = build1 (code, vectype, vec_oprnd0);
2713 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2714 new_temp = make_ssa_name (vec_dest, new_stmt);
2715 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2716 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2718 if (code == CALL_EXPR)
2720 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2722 if (TREE_CODE (sym) == SSA_NAME)
2723 sym = SSA_NAME_VAR (sym);
2724 mark_sym_for_renaming (sym);
2728 return new_stmt;
2732 /* Function vectorizable_type_promotion
2734 Check if STMT performs a binary or unary operation that involves
2735 type promotion, and if it can be vectorized.
2736 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2737 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2738 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2740 bool
2741 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2742 tree *vec_stmt)
2744 tree vec_dest;
2745 tree scalar_dest;
2746 tree operation;
2747 tree op0, op1 = NULL;
2748 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2749 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2751 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2752 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2753 int op_type;
2754 tree def, def_stmt;
2755 enum vect_def_type dt0, dt1;
2756 tree new_stmt;
2757 stmt_vec_info prev_stmt_info;
2758 int nunits_in;
2759 int nunits_out;
2760 tree vectype_out;
2761 int ncopies;
2762 int j;
2763 tree vectype_in;
2765 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2766 return false;
2768 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2769 return false;
2771 /* FORNOW: not yet supported. */
2772 if (STMT_VINFO_LIVE_P (stmt_info))
2774 if (vect_print_dump_info (REPORT_DETAILS))
2775 fprintf (vect_dump, "value used after loop.");
2776 return false;
2779 /* Is STMT a vectorizable type-promotion operation? */
2780 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2781 return false;
2783 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2784 return false;
2786 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2787 code = TREE_CODE (operation);
2788 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2789 return false;
2791 op0 = TREE_OPERAND (operation, 0);
2792 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2793 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2794 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2795 gcc_assert (ncopies >= 1);
2797 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2798 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2799 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2800 if (nunits_out != nunits_in / 2) /* FORNOW */
2801 return false;
2803 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2804 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2805 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
2806 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
2807 && (code == CONVERT_EXPR || code == NOP_EXPR))))
2808 return false;
2810 /* Check the operands of the operation. */
2811 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2813 if (vect_print_dump_info (REPORT_DETAILS))
2814 fprintf (vect_dump, "use not simple.");
2815 return false;
2818 op_type = TREE_CODE_LENGTH (code);
2819 if (op_type == binary_op)
2821 op1 = TREE_OPERAND (operation, 1);
2822 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2824 if (vect_print_dump_info (REPORT_DETAILS))
2825 fprintf (vect_dump, "use not simple.");
2826 return false;
2830 /* Supportable by target? */
2831 if (!supportable_widening_operation (code, stmt, vectype_in,
2832 &decl1, &decl2, &code1, &code2))
2833 return false;
2835 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2837 if (!vec_stmt) /* transformation not required. */
2839 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2840 return true;
2843 /** Transform. **/
2845 if (vect_print_dump_info (REPORT_DETAILS))
2846 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2847 ncopies);
2849 /* Handle def. */
2850 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2852 /* In case the vectorization factor (VF) is bigger than the number
2853 of elements that we can fit in a vectype (nunits), we have to generate
2854 more than one vector stmt - i.e., we need to "unroll" the
2855 vector stmt by a factor VF/nunits. */
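   /* Moreover, each scalar stmt here expands to two vector stmts per copy
      (the high and low halves of the widened result), so the RELATED_STMT
      chain built below links 2 * ncopies vector stmts.  */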
2857 prev_stmt_info = NULL;
2858 for (j = 0; j < ncopies; j++)
2860 /* Handle uses. */
2861 if (j == 0)
2863 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2864 if (op_type == binary_op)
2865 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2867 else
2869 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2870 if (op_type == binary_op)
2871 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2874 /* Arguments are ready. Create the new vector stmt. We are creating
2875 two vector defs because the widened result does not fit in one vector.
2876 The vectorized stmt can be expressed as a call to a target builtin,
2877 or by using a tree-code. */
2878 /* Generate first half of the widened result: */
2879 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2880 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2881 if (j == 0)
2882 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2883 else
2884 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2885 prev_stmt_info = vinfo_for_stmt (new_stmt);
2887 /* Generate second half of the widened result: */
2888 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2889 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2890 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2891 prev_stmt_info = vinfo_for_stmt (new_stmt);
2895 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2896 return true;
2900 /* Function vect_strided_store_supported.
2902 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2903 and FALSE otherwise. */
2905 static bool
2906 vect_strided_store_supported (tree vectype)
2908 optab interleave_high_optab, interleave_low_optab;
2909 int mode;
2911 mode = (int) TYPE_MODE (vectype);
2913 /* Check that the operation is supported. */
2914 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2915 vectype);
2916 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2917 vectype);
2918 if (!interleave_high_optab || !interleave_low_optab)
2920 if (vect_print_dump_info (REPORT_DETAILS))
2921 fprintf (vect_dump, "no optab for interleave.");
2922 return false;
2925 if (interleave_high_optab->handlers[(int) mode].insn_code
2926 == CODE_FOR_nothing
2927 || interleave_low_optab->handlers[(int) mode].insn_code
2928 == CODE_FOR_nothing)
2930 if (vect_print_dump_info (REPORT_DETAILS))
2931 fprintf (vect_dump, "interleave op not supported by target.");
2932 return false;
2934 return true;
2938 /* Function vect_permute_store_chain.
2940 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2941 a power of 2, generate interleave_high/low stmts to reorder the data
2942 correctly for the stores. Return the final references for stores in
2943 RESULT_CHAIN.
2945 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2946 The input is 4 vectors each containing 8 elements. We assign a number to each
2947 element, the input sequence is:
2949 1st vec: 0 1 2 3 4 5 6 7
2950 2nd vec: 8 9 10 11 12 13 14 15
2951 3rd vec: 16 17 18 19 20 21 22 23
2952 4th vec: 24 25 26 27 28 29 30 31
2954 The output sequence should be:
2956 1st vec: 0 8 16 24 1 9 17 25
2957 2nd vec: 2 10 18 26 3 11 19 27
2958 3rd vec: 4 12 20 28 5 13 21 29
2959 4th vec: 6 14 22 30 7 15 23 31
2961 i.e., we interleave the contents of the four vectors in their order.
2963 We use interleave_high/low instructions to create such output. The input of
2964 each interleave_high/low operation is two vectors:
2965 1st vec 2nd vec
2966 0 1 2 3 4 5 6 7
2967 the even elements of the result vector are obtained left-to-right from the
2968 high/low elements of the first vector. The odd elements of the result are
2969 obtained left-to-right from the high/low elements of the second vector.
2970 The output of interleave_high will be: 0 4 1 5
2971 and of interleave_low: 2 6 3 7
2974 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
2975 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2976 where the first argument is taken from the first half of DR_CHAIN and the
2977 second argument from its second half.
2978 In our example,
2980 I1: interleave_high (1st vec, 3rd vec)
2981 I2: interleave_low (1st vec, 3rd vec)
2982 I3: interleave_high (2nd vec, 4th vec)
2983 I4: interleave_low (2nd vec, 4th vec)
2985 The output for the first stage is:
2987 I1: 0 16 1 17 2 18 3 19
2988 I2: 4 20 5 21 6 22 7 23
2989 I3: 8 24 9 25 10 26 11 27
2990 I4: 12 28 13 29 14 30 15 31
2992 The output of the second stage, i.e. the final result is:
2994 I1: 0 8 16 24 1 9 17 25
2995 I2: 2 10 18 26 3 11 19 27
2996 I3: 4 12 20 28 5 13 21 29
2997 I4: 6 14 22 30 7 15 23 31. */
2999 static bool
3000 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
3001 unsigned int length,
3002 tree stmt,
3003 block_stmt_iterator *bsi,
3004 VEC(tree,heap) **result_chain)
3006 tree perm_dest, perm_stmt, vect1, vect2, high, low;
3007 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3008 tree scalar_dest, tmp;
3009 int i;
3010 unsigned int j;
3011 VEC(tree,heap) *first, *second;
3013 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3014 first = VEC_alloc (tree, heap, length/2);
3015 second = VEC_alloc (tree, heap, length/2);
3017 /* Check that the operation is supported. */
3018 if (!vect_strided_store_supported (vectype))
3019 return false;
3021 *result_chain = VEC_copy (tree, heap, dr_chain);
3023 for (i = 0; i < exact_log2 (length); i++)
3025 for (j = 0; j < length/2; j++)
3027 vect1 = VEC_index (tree, dr_chain, j);
3028 vect2 = VEC_index (tree, dr_chain, j+length/2);
3030 /* Create interleaving stmt:
3031 in the case of big endian:
3032 high = interleave_high (vect1, vect2)
3033 and in the case of little endian:
3034 high = interleave_low (vect1, vect2). */
3035 perm_dest = create_tmp_var (vectype, "vect_inter_high");
3036 DECL_GIMPLE_REG_P (perm_dest) = 1;
3037 add_referenced_var (perm_dest);
3038 if (BYTES_BIG_ENDIAN)
3039 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3040 else
3041 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3042 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3043 high = make_ssa_name (perm_dest, perm_stmt);
3044 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3045 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3046 VEC_replace (tree, *result_chain, 2*j, high);
3048 /* Create interleaving stmt:
3049 in the case of big endian:
3050 low = interleave_low (vect1, vect2)
3051 and in the case of little endian:
3052 low = interleave_high (vect1, vect2). */
3053 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3054 DECL_GIMPLE_REG_P (perm_dest) = 1;
3055 add_referenced_var (perm_dest);
3056 if (BYTES_BIG_ENDIAN)
3057 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3058 else
3059 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3060 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3061 low = make_ssa_name (perm_dest, perm_stmt);
3062 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3063 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3064 VEC_replace (tree, *result_chain, 2*j+1, low);
3066 dr_chain = VEC_copy (tree, heap, *result_chain);
3068 return true;
3072 /* Function vectorizable_store.
3074 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3075 can be vectorized.
3076 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3077 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3078 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3080 bool
3081 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3083 tree scalar_dest;
3084 tree data_ref;
3085 tree op;
3086 tree vec_oprnd = NULL_TREE;
3087 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3088 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3089 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3090 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3091 enum machine_mode vec_mode;
3092 tree dummy;
3093 enum dr_alignment_support alignment_support_scheme;
3094 ssa_op_iter iter;
3095 def_operand_p def_p;
3096 tree def, def_stmt;
3097 enum vect_def_type dt;
3098 stmt_vec_info prev_stmt_info = NULL;
3099 tree dataref_ptr = NULL_TREE;
3100 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3101 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3102 int j;
3103 tree next_stmt, first_stmt;
3104 bool strided_store = false;
3105 unsigned int group_size, i;
3106 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3107 gcc_assert (ncopies >= 1);
3109 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3110 return false;
3112 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3113 return false;
3115 if (STMT_VINFO_LIVE_P (stmt_info))
3117 if (vect_print_dump_info (REPORT_DETAILS))
3118 fprintf (vect_dump, "value used after loop.");
3119 return false;
3122 /* Is vectorizable store? */
3124 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3125 return false;
3127 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3128 if (TREE_CODE (scalar_dest) != ARRAY_REF
3129 && TREE_CODE (scalar_dest) != INDIRECT_REF
3130 && !DR_GROUP_FIRST_DR (stmt_info))
3131 return false;
3133 op = GIMPLE_STMT_OPERAND (stmt, 1);
3134 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3136 if (vect_print_dump_info (REPORT_DETAILS))
3137 fprintf (vect_dump, "use not simple.");
3138 return false;
3141 vec_mode = TYPE_MODE (vectype);
3142 /* FORNOW. In some cases we can vectorize even if the data-type is not
3143 supported (e.g., array initialization with 0). */
3144 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3145 return false;
3147 if (!STMT_VINFO_DATA_REF (stmt_info))
3148 return false;
3150 if (DR_GROUP_FIRST_DR (stmt_info))
3152 strided_store = true;
3153 if (!vect_strided_store_supported (vectype))
3154 return false;
3157 if (!vec_stmt) /* transformation not required. */
3159 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3160 return true;
3163 /** Transform. **/
3165 if (vect_print_dump_info (REPORT_DETAILS))
3166 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
3168 if (strided_store)
3170 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3171 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3172 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3174 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3176 /* We vectorize all the stmts of the interleaving group when we
3177 reach the last stmt in the group. */
3178 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3179 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3181 *vec_stmt = NULL_TREE;
3182 return true;
3185 else
3187 first_stmt = stmt;
3188 first_dr = dr;
3189 group_size = 1;
3192 dr_chain = VEC_alloc (tree, heap, group_size);
3193 oprnds = VEC_alloc (tree, heap, group_size);
3195 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3196 gcc_assert (alignment_support_scheme);
3197 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
3199 /* In case the vectorization factor (VF) is bigger than the number
3200 of elements that we can fit in a vectype (nunits), we have to generate
3201 more than one vector stmt - i.e., we need to "unroll" the
3202 vector stmt by a factor VF/nunits. For more details see documentation in
3203 vect_get_vec_def_for_stmt_copy. */
3205 /* In case of interleaving (non-unit strided access):
3207 S1: &base + 2 = x2
3208 S2: &base = x0
3209 S3: &base + 1 = x1
3210 S4: &base + 3 = x3
3212 We create vectorized stores starting from base address (the access of the
3213 first stmt in the chain (S2 in the above example), when the last store stmt
3214 of the chain (S4) is reached:
3216 VS1: &base = vx2
3217 VS2: &base + vec_size*1 = vx0
3218 VS3: &base + vec_size*2 = vx1
3219 VS4: &base + vec_size*3 = vx3
3221 Then permutation statements are generated:
3223 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3224 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3227 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3228 (the order of the data-refs in the output of vect_permute_store_chain
3229 corresponds to the order of scalar stmts in the interleaving chain - see
3230 the documentation of vect_permute_store_chain()).
3232 In case of both multiple types and interleaving, the above vector stores and
3233 permutation stmts are created for every copy. The result vector stmts are
3234 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3235 STMT_VINFO_RELATED_STMT for the next copies.
3238 prev_stmt_info = NULL;
3239 for (j = 0; j < ncopies; j++)
3241 tree new_stmt;
3242 tree ptr_incr;
3244 if (j == 0)
3246 /* For interleaved stores we collect vectorized defs for all the
3247 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3248 as an input to vect_permute_store_chain(), and OPRNDS as an input
3249 to vect_get_vec_def_for_stmt_copy() for the next copy.
3250 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3251 OPRNDS are of size 1. */
3252 next_stmt = first_stmt;
3253 for (i = 0; i < group_size; i++)
3255 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3256 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3257 can't be NULL_TREE. In case that there is no interleaving,
3258 GROUP_SIZE is 1, and only one iteration of the loop will be
3259 executed. */
3260 gcc_assert (next_stmt);
3261 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3262 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3263 VEC_quick_push(tree, dr_chain, vec_oprnd);
3264 VEC_quick_push(tree, oprnds, vec_oprnd);
3265 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3267 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3268 &dummy, &ptr_incr, false,
3269 TREE_TYPE (vec_oprnd));
3271 else
3273 /* For interleaved stores we created vectorized defs for all the
3274 defs stored in OPRNDS in the previous iteration (previous copy).
3275 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3276 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3277 next copy.
3278 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3279 OPRNDS are of size 1. */
3280 for (i = 0; i < group_size; i++)
3282 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3283 VEC_index (tree, oprnds, i));
3284 VEC_replace(tree, dr_chain, i, vec_oprnd);
3285 VEC_replace(tree, oprnds, i, vec_oprnd);
3287 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3290 if (strided_store)
3292 result_chain = VEC_alloc (tree, heap, group_size);
3293 /* Permute. */
3294 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3295 &result_chain))
3296 return false;
3299 next_stmt = first_stmt;
3300 for (i = 0; i < group_size; i++)
3302 /* For strided stores vectorized defs are interleaved in
3303 vect_permute_store_chain(). */
3304 if (strided_store)
3305 vec_oprnd = VEC_index(tree, result_chain, i);
3307 data_ref = build_fold_indirect_ref (dataref_ptr);
3308 /* Arguments are ready. Create the new vector stmt. */
3309 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
3310 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3312 /* Set the VDEFs for the vector pointer. If this virtual def
3313 has a use outside the loop and a loop peel is performed
3314 then the def may be renamed by the peel. Mark it for
3315 renaming so the later use will also be renamed. */
3316 copy_virtual_operands (new_stmt, next_stmt);
3317 if (j == 0)
3319 /* The original store is deleted so the same SSA_NAMEs
3320 can be used. */
3321 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3323 SSA_NAME_DEF_STMT (def) = new_stmt;
3324 mark_sym_for_renaming (SSA_NAME_VAR (def));
3327 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3329 else
3331 /* Create new names for all the definitions created by COPY and
3332 add replacement mappings for each new name. */
3333 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3335 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3336 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3339 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3342 prev_stmt_info = vinfo_for_stmt (new_stmt);
3343 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3344 if (!next_stmt)
3345 break;
3346 /* Bump the vector pointer. */
3347 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3351 return true;
3355 /* Function vect_setup_realignment
3357 This function is called when vectorizing an unaligned load using
3358 the dr_unaligned_software_pipeline scheme.
3359 This function generates the following code at the loop prolog:
3361 p = initial_addr;
3362 msq_init = *(floor(p)); # prolog load
3363 realignment_token = call target_builtin;
3364 loop:
3365 msq = phi (msq_init, ---)
3367 The code above sets up a new (vector) pointer, pointing to the first
3368 location accessed by STMT, and a "floor-aligned" load using that pointer.
3369 It also generates code to compute the "realignment-token" (if the relevant
3370 target hook was defined), and creates a phi-node at the loop-header bb
3371 whose arguments are the result of the prolog-load (created by this
3372 function) and the result of a load that takes place in the loop (to be
3373 created by the caller to this function).
3374 The caller to this function uses the phi-result (msq) to create the
3375 realignment code inside the loop, and sets up the missing phi argument,
3376 as follows:
3378 loop:
3379 msq = phi (msq_init, lsq)
3380 lsq = *(floor(p')); # load in loop
3381 result = realign_load (msq, lsq, realignment_token);
3383 Input:
3384 STMT - (scalar) load stmt to be vectorized. This load accesses
3385 a memory location that may be unaligned.
3386 BSI - place where new code is to be inserted.
3388 Output:
3389 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3390 target hook, if defined.
3391 Return value - the result of the loop-header phi node. */
3393 static tree
3394 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3395 tree *realignment_token)
3397 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3398 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3399 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3400 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3401 edge pe = loop_preheader_edge (loop);
3402 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3403 tree vec_dest;
3404 tree init_addr;
3405 tree inc;
3406 tree ptr;
3407 tree data_ref;
3408 tree new_stmt;
3409 basic_block new_bb;
3410 tree msq_init;
3411 tree new_temp;
3412 tree phi_stmt;
3413 tree msq;
3415 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3416 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3417 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3418 NULL_TREE);
3419 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3420 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3421 new_temp = make_ssa_name (vec_dest, new_stmt);
3422 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3423 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3424 gcc_assert (!new_bb);
3425 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3426 copy_virtual_operands (new_stmt, stmt);
3427 update_vuses_to_preheader (new_stmt, loop);
3429 /* 2. Create permutation mask, if required, in loop preheader. */
3430 if (targetm.vectorize.builtin_mask_for_load)
3432 tree builtin_decl;
3434 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3435 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
3436 vec_dest = vect_create_destination_var (scalar_dest,
3437 TREE_TYPE (new_stmt));
3438 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3439 new_temp = make_ssa_name (vec_dest, new_stmt);
3440 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3441 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3442 gcc_assert (!new_bb);
3443 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3445 /* The result of the CALL_EXPR to this builtin is determined from
3446 the value of the parameter, and no global variables are touched,
3447 which makes the builtin a "const" function. Requiring the
3448 builtin to have the "const" attribute makes it unnecessary
3449 to call mark_call_clobbered. */
3450 gcc_assert (TREE_READONLY (builtin_decl));
3453 /* 3. Create msq = phi <msq_init, lsq> in loop */
3454 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3455 msq = make_ssa_name (vec_dest, NULL_TREE);
3456 phi_stmt = create_phi_node (msq, loop->header);
3457 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3458 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3460 return msq;
3464 /* Function vect_strided_load_supported.
3466 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3467 and FALSE otherwise. */
3469 static bool
3470 vect_strided_load_supported (tree vectype)
3472 optab perm_even_optab, perm_odd_optab;
3473 int mode;
3475 mode = (int) TYPE_MODE (vectype);
3477 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3478 if (!perm_even_optab)
3480 if (vect_print_dump_info (REPORT_DETAILS))
3481 fprintf (vect_dump, "no optab for perm_even.");
3482 return false;
3485 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3487 if (vect_print_dump_info (REPORT_DETAILS))
3488 fprintf (vect_dump, "perm_even op not supported by target.");
3489 return false;
3492 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3493 if (!perm_odd_optab)
3495 if (vect_print_dump_info (REPORT_DETAILS))
3496 fprintf (vect_dump, "no optab for perm_odd.");
3497 return false;
3500 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3502 if (vect_print_dump_info (REPORT_DETAILS))
3503 fprintf (vect_dump, "perm_odd op not supported by target.");
3504 return false;
3506 return true;
3510 /* Function vect_permute_load_chain.
3512 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3513 a power of 2, generate extract_even/odd stmts to reorder the input data
3514 correctly. Return the final references for loads in RESULT_CHAIN.
3516 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3517 The input is 4 vectors, each containing 8 elements. We assign a number to
3518 each element; the input sequence is:
3520 1st vec: 0 1 2 3 4 5 6 7
3521 2nd vec: 8 9 10 11 12 13 14 15
3522 3rd vec: 16 17 18 19 20 21 22 23
3523 4th vec: 24 25 26 27 28 29 30 31
3525 The output sequence should be:
3527 1st vec: 0 4 8 12 16 20 24 28
3528 2nd vec: 1 5 9 13 17 21 25 29
3529 3rd vec: 2 6 10 14 18 22 26 30
3530 4th vec: 3 7 11 15 19 23 27 31
3532 i.e., the first output vector should contain the first elements of each
3533 interleaving group, etc.
3535 We use extract_even/odd instructions to create such output. The input of each
3536 extract_even/odd operation is two vectors
3537 1st vec 2nd vec
3538 0 1 2 3 4 5 6 7
3540 and the output is the vector of extracted even/odd elements. The output of
3541 extract_even will be: 0 2 4 6
3542 and of extract_odd: 1 3 5 7
3545 The permutation is done in log2 (LENGTH) stages. In each stage extract_even
3546 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
3547 their order. In our example,
3549 E1: extract_even (1st vec, 2nd vec)
3550 E2: extract_odd (1st vec, 2nd vec)
3551 E3: extract_even (3rd vec, 4th vec)
3552 E4: extract_odd (3rd vec, 4th vec)
3554 The output for the first stage will be:
3556 E1: 0 2 4 6 8 10 12 14
3557 E2: 1 3 5 7 9 11 13 15
3558 E3: 16 18 20 22 24 26 28 30
3559 E4: 17 19 21 23 25 27 29 31
3561 In order to proceed and create the correct sequence for the next stage (or
3562 for the correct output, if the second stage is the last one, as in our
3563 example), we first put the output of the extract_even operation and then the
3564 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3565 The input for the second stage is:
3567 1st vec (E1): 0 2 4 6 8 10 12 14
3568 2nd vec (E3): 16 18 20 22 24 26 28 30
3569 3rd vec (E2): 1 3 5 7 9 11 13 15
3570 4th vec (E4): 17 19 21 23 25 27 29 31
3572 The output of the second stage:
3574 E1: 0 4 8 12 16 20 24 28
3575 E2: 2 6 10 14 18 22 26 30
3576 E3: 1 5 9 13 17 21 25 29
3577 E4: 3 7 11 15 19 23 27 31
3579 And RESULT_CHAIN after reordering:
3581 1st vec (E1): 0 4 8 12 16 20 24 28
3582 2nd vec (E3): 1 5 9 13 17 21 25 29
3583 3rd vec (E2): 2 6 10 14 18 22 26 30
3584 4th vec (E4): 3 7 11 15 19 23 27 31. */
3586 static bool
3587 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3588 unsigned int length,
3589 tree stmt,
3590 block_stmt_iterator *bsi,
3591 VEC(tree,heap) **result_chain)
3593 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3594 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3595 tree tmp;
3596 int i;
3597 unsigned int j;
3599 /* Check that the operation is supported. */
3600 if (!vect_strided_load_supported (vectype))
3601 return false;
3603 *result_chain = VEC_copy (tree, heap, dr_chain);
3604 for (i = 0; i < exact_log2 (length); i++)
3606 for (j = 0; j < length; j += 2)
3608 first_vect = VEC_index (tree, dr_chain, j);
3609 second_vect = VEC_index (tree, dr_chain, j+1);
3611 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3612 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3613 DECL_GIMPLE_REG_P (perm_dest) = 1;
3614 add_referenced_var (perm_dest);
3616 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3617 first_vect, second_vect);
3618 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3620 data_ref = make_ssa_name (perm_dest, perm_stmt);
3621 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3622 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3623 mark_symbols_for_renaming (perm_stmt);
3625 VEC_replace (tree, *result_chain, j/2, data_ref);
3627 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3628 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3629 DECL_GIMPLE_REG_P (perm_dest) = 1;
3630 add_referenced_var (perm_dest);
3632 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3633 first_vect, second_vect);
3634 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3635 data_ref = make_ssa_name (perm_dest, perm_stmt);
3636 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3637 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3638 mark_symbols_for_renaming (perm_stmt);
3640 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3642 dr_chain = VEC_copy (tree, heap, *result_chain);
3644 return true;
3648 /* Function vect_transform_strided_load.
3650 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3651 to perform their permutation, and assign the resulting vectorized statements
3652 to the scalar statements. */
3655 static bool
3656 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3657 block_stmt_iterator *bsi)
3659 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3660 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3661 tree next_stmt, new_stmt;
3662 VEC(tree,heap) *result_chain = NULL;
3663 unsigned int i, gap_count;
3664 tree tmp_data_ref;
3666 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3667 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
3668 permuted vectors, which are ready for vector computation. */
3669 result_chain = VEC_alloc (tree, heap, size);
3670 /* Permute. */
3671 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3672 return false;
3674 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3675 Since we scan the chain starting from its first node, their order
3676 corresponds to the order of data-refs in RESULT_CHAIN. */
3677 next_stmt = first_stmt;
3678 gap_count = 1;
3679 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3681 if (!next_stmt)
3682 break;
3684 /* Skip the gaps. Loads created for the gaps will be removed by the dead
3685 code elimination pass later.
3686 DR_GROUP_GAP is the number of steps in elements from the previous
3687 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3688 correspond to the gaps. */
3690 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3692 gap_count++;
3693 continue;
3696 while (next_stmt)
3698 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3699 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3700 copies, and we put the new vector statement in the first available
3701 RELATED_STMT. */
3702 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3703 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3704 else
3706 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3707 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3708 vinfo_for_stmt (prev_stmt));
3709 while (rel_stmt)
3711 prev_stmt = rel_stmt;
3712 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3714 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3716 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3717 gap_count = 1;
3718 /* If NEXT_STMT accesses the same DR as the previous statement,
3719 put the same TMP_DATA_REF as its vectorized statement; otherwise
3720 get the next data-ref from RESULT_CHAIN. */
3721 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3722 break;
3725 return true;
3729 /* vectorizable_load.
3731 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3732 can be vectorized.
3733 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3734 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3735 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3737 bool
3738 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3740 tree scalar_dest;
3741 tree vec_dest = NULL;
3742 tree data_ref = NULL;
3743 tree op;
3744 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3745 stmt_vec_info prev_stmt_info;
3746 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3747 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3748 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3750 tree new_temp;
3751 int mode;
3752 tree new_stmt = NULL_TREE;
3753 tree dummy;
3754 enum dr_alignment_support alignment_support_scheme;
3755 tree dataref_ptr = NULL_TREE;
3756 tree ptr_incr;
3757 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3758 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3759 int i, j, group_size;
3760 tree msq = NULL_TREE, lsq;
3761 tree offset = NULL_TREE;
3762 tree realignment_token = NULL_TREE;
3763 tree phi_stmt = NULL_TREE;
3764 VEC(tree,heap) *dr_chain = NULL;
3765 bool strided_load = false;
3766 tree first_stmt;
3768 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3769 return false;
3771 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3772 return false;
3774 /* FORNOW: not yet supported. */
3775 if (STMT_VINFO_LIVE_P (stmt_info))
3777 if (vect_print_dump_info (REPORT_DETAILS))
3778 fprintf (vect_dump, "value used after loop.");
3779 return false;
3782 /* Is vectorizable load? */
3783 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3784 return false;
3786 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3787 if (TREE_CODE (scalar_dest) != SSA_NAME)
3788 return false;
3790 op = GIMPLE_STMT_OPERAND (stmt, 1);
3791 if (TREE_CODE (op) != ARRAY_REF
3792 && TREE_CODE (op) != INDIRECT_REF
3793 && !DR_GROUP_FIRST_DR (stmt_info))
3794 return false;
3796 if (!STMT_VINFO_DATA_REF (stmt_info))
3797 return false;
3799 mode = (int) TYPE_MODE (vectype);
3801 /* FORNOW. In some cases we can vectorize even if the data-type is not
3802 supported (e.g. data copies). */
3803 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3805 if (vect_print_dump_info (REPORT_DETAILS))
3806 fprintf (vect_dump, "Aligned load, but unsupported type.");
3807 return false;
3810 /* Check if the load is a part of an interleaving chain. */
3811 if (DR_GROUP_FIRST_DR (stmt_info))
3813 strided_load = true;
3815 /* Check if interleaving is supported. */
3816 if (!vect_strided_load_supported (vectype))
3817 return false;
3820 if (!vec_stmt) /* transformation not required. */
3822 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3823 return true;
3826 /** Transform. **/
3828 if (vect_print_dump_info (REPORT_DETAILS))
3829 fprintf (vect_dump, "transform load.");
3831 if (strided_load)
3833 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3834 /* Check if the chain of loads is already vectorized. */
3835 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3837 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3838 return true;
3840 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3841 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3842 dr_chain = VEC_alloc (tree, heap, group_size);
3844 else
3846 first_stmt = stmt;
3847 first_dr = dr;
3848 group_size = 1;
3851 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3852 gcc_assert (alignment_support_scheme);
3855 /* In case the vectorization factor (VF) is bigger than the number
3856 of elements that we can fit in a vectype (nunits), we have to generate
3857 more than one vector stmt - i.e - we need to "unroll" the
3858 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3859 from one copy of the vector stmt to the next, in the field
3860 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3861 stages to find the correct vector defs to be used when vectorizing
3862 stmts that use the defs of the current stmt. The example below illustrates
3863 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3864 4 vectorized stmts):
3866 before vectorization:
3867 RELATED_STMT VEC_STMT
3868 S1: x = memref - -
3869 S2: z = x + 1 - -
3871 step 1: vectorize stmt S1:
3872 We first create the vector stmt VS1_0, and, as usual, record a
3873 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3874 Next, we create the vector stmt VS1_1, and record a pointer to
3875 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3876 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3877 stmts and pointers:
3878 RELATED_STMT VEC_STMT
3879 VS1_0: vx0 = memref0 VS1_1 -
3880 VS1_1: vx1 = memref1 VS1_2 -
3881 VS1_2: vx2 = memref2 VS1_3 -
3882 VS1_3: vx3 = memref3 - -
3883 S1: x = load - VS1_0
3884 S2: z = x + 1 - -
3886 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3887 information we recorded in RELATED_STMT field is used to vectorize
3888 stmt S2. */
3890 /* In case of interleaving (non-unit strided access):
3892 S1: x2 = &base + 2
3893 S2: x0 = &base
3894 S3: x1 = &base + 1
3895 S4: x3 = &base + 3
3897 Vectorized loads are created in the order of memory accesses
3898 starting from the access of the first stmt of the chain:
3900 VS1: vx0 = &base
3901 VS2: vx1 = &base + vec_size*1
3902 VS3: vx2 = &base + vec_size*2
3903 VS4: vx3 = &base + vec_size*3
3905 Then permutation statements are generated:
3907 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3908 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3911 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3912 (the order of the data-refs in the output of vect_permute_load_chain
3913 corresponds to the order of scalar stmts in the interleaving chain - see
3914 the documentation of vect_permute_load_chain()).
3915 The generation of permutation stmts and recording them in
3916 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3918 In case of both multiple types and interleaving, the vector loads and
3919 permutation stmts above are created for every copy. The result vector stmts
3920 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3921 STMT_VINFO_RELATED_STMT for the next copies. */
3923 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3924 on a target that supports unaligned accesses (dr_unaligned_supported)
3925 we generate the following code:
3926 p = initial_addr;
3927 indx = 0;
3928 loop {
3929 p = p + indx * vectype_size;
3930 vec_dest = *(p);
3931 indx = indx + 1;
3934 Otherwise, the data reference is potentially unaligned on a target that
3935 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3936 then generate the following code, in which the data in each iteration is
3937 obtained by two vector loads, one from the previous iteration, and one
3938 from the current iteration:
3939 p1 = initial_addr;
3940 msq_init = *(floor(p1))
3941 p2 = initial_addr + VS - 1;
3942 realignment_token = call target_builtin;
3943 indx = 0;
3944 loop {
3945 p2 = p2 + indx * vectype_size
3946 lsq = *(floor(p2))
3947 vec_dest = realign_load (msq, lsq, realignment_token)
3948 indx = indx + 1;
3949 msq = lsq;
3950 } */
3952 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3954 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3955 phi_stmt = SSA_NAME_DEF_STMT (msq);
3956 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3959 prev_stmt_info = NULL;
3960 for (j = 0; j < ncopies; j++)
3962 /* 1. Create the vector pointer update chain. */
3963 if (j == 0)
3964 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3965 &ptr_incr, false, NULL_TREE);
3966 else
3967 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3969 for (i = 0; i < group_size; i++)
3971 /* 2. Create the vector-load in the loop. */
3972 switch (alignment_support_scheme)
3974 case dr_aligned:
3975 gcc_assert (aligned_access_p (first_dr));
3976 data_ref = build_fold_indirect_ref (dataref_ptr);
3977 break;
3978 case dr_unaligned_supported:
3980 int mis = DR_MISALIGNMENT (first_dr);
3981 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3983 gcc_assert (!aligned_access_p (first_dr));
3984 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
3985 data_ref =
3986 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3987 break;
3989 case dr_unaligned_software_pipeline:
3990 gcc_assert (!aligned_access_p (first_dr));
3991 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3992 break;
3993 default:
3994 gcc_unreachable ();
3996 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3997 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3998 new_temp = make_ssa_name (vec_dest, new_stmt);
3999 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4000 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4001 copy_virtual_operands (new_stmt, stmt);
4002 mark_symbols_for_renaming (new_stmt);
4004 /* 3. Handle explicit realignment if necessary/supported. */
4005 if (alignment_support_scheme == dr_unaligned_software_pipeline)
4007 /* Create in loop:
4008 <vec_dest = realign_load (msq, lsq, realignment_token)> */
4009 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
4010 if (!realignment_token)
4011 realignment_token = dataref_ptr;
4012 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4013 new_stmt =
4014 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
4015 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
4016 new_temp = make_ssa_name (vec_dest, new_stmt);
4017 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4018 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4019 if (i == group_size - 1 && j == ncopies - 1)
4020 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
4021 msq = lsq;
4023 if (strided_load)
4024 VEC_quick_push (tree, dr_chain, new_temp);
4025 if (i < group_size - 1)
4026 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
4029 if (strided_load)
4031 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
4032 return false;
4033 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4034 dr_chain = VEC_alloc (tree, heap, group_size);
4036 else
4038 if (j == 0)
4039 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4040 else
4041 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4042 prev_stmt_info = vinfo_for_stmt (new_stmt);
4046 return true;
4050 /* Function vectorizable_live_operation.
4052 STMT computes a value that is used outside the loop. Check if
4053 it can be supported. */
4055 bool
4056 vectorizable_live_operation (tree stmt,
4057 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
4058 tree *vec_stmt ATTRIBUTE_UNUSED)
4060 tree operation;
4061 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4062 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4063 int i;
4064 int op_type;
4065 tree op;
4066 tree def, def_stmt;
4067 enum vect_def_type dt;
4069 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4071 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4072 return false;
4074 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4075 return false;
4077 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4078 return false;
4080 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4081 op_type = TREE_OPERAND_LENGTH (operation);
4083 /* FORNOW: support only if all uses are invariant. This means
4084 that the scalar operations can remain in place, unvectorized.
4085 The original last scalar value that they compute will be used. */
4087 for (i = 0; i < op_type; i++)
4089 op = TREE_OPERAND (operation, i);
4090 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4092 if (vect_print_dump_info (REPORT_DETAILS))
4093 fprintf (vect_dump, "use not simple.");
4094 return false;
4097 if (dt != vect_invariant_def && dt != vect_constant_def)
4098 return false;
4101 /* No transformation is required for the cases we currently support. */
4102 return true;
4106 /* Function vect_is_simple_cond.
4108 Input:
4109 LOOP - the loop that is being vectorized.
4110 COND - Condition that is checked for simple use.
4112 Returns whether a COND can be vectorized. Checks whether
4113 the condition operands are supportable using vect_is_simple_use. */
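/* For example, COND may be "a_1 < b_2", where a_1 and b_2 are SSA names
   whose defs must pass vect_is_simple_use, or "x_3 != 5", comparing an SSA
   name against an INTEGER_CST (or a REAL_CST for floating point); operands
   of any other form are rejected.  */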
4115 static bool
4116 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4118 tree lhs, rhs;
4119 tree def;
4120 enum vect_def_type dt;
4122 if (!COMPARISON_CLASS_P (cond))
4123 return false;
4125 lhs = TREE_OPERAND (cond, 0);
4126 rhs = TREE_OPERAND (cond, 1);
4128 if (TREE_CODE (lhs) == SSA_NAME)
4130 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4131 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4132 return false;
4134 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
4135 return false;
4137 if (TREE_CODE (rhs) == SSA_NAME)
4139 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4140 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4141 return false;
4143 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4144 return false;
4146 return true;
4149 /* vectorizable_condition.
4151 Check if STMT is a conditional modify expression that can be vectorized.
4152 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4153 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
4154 at BSI.
4156 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
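/* For example, the scalar statement

     x = a < b ? c : d;

   is transformed into

     vx = VEC_COND_EXPR <va < vb, vc, vd>;

   where va, vb, vc and vd are the vector defs created for the
   corresponding scalar operands.  */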
4158 bool
4159 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4161 tree scalar_dest = NULL_TREE;
4162 tree vec_dest = NULL_TREE;
4163 tree op = NULL_TREE;
4164 tree cond_expr, then_clause, else_clause;
4165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4166 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4167 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4168 tree vec_compare, vec_cond_expr;
4169 tree new_temp;
4170 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4171 enum machine_mode vec_mode;
4172 tree def;
4173 enum vect_def_type dt;
4174 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4175 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4177 gcc_assert (ncopies >= 1);
4178 if (ncopies > 1)
4179 return false; /* FORNOW */
4181 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4182 return false;
4184 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4185 return false;
4187 /* FORNOW: not yet supported. */
4188 if (STMT_VINFO_LIVE_P (stmt_info))
4190 if (vect_print_dump_info (REPORT_DETAILS))
4191 fprintf (vect_dump, "value used after loop.");
4192 return false;
4195 /* Is vectorizable conditional operation? */
4196 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4197 return false;
4199 op = GIMPLE_STMT_OPERAND (stmt, 1);
4201 if (TREE_CODE (op) != COND_EXPR)
4202 return false;
4204 cond_expr = TREE_OPERAND (op, 0);
4205 then_clause = TREE_OPERAND (op, 1);
4206 else_clause = TREE_OPERAND (op, 2);
4208 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4209 return false;
4211 /* We do not handle two different vector types for the condition
4212 and the values. */
4213 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4214 return false;
4216 if (TREE_CODE (then_clause) == SSA_NAME)
4218 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4219 if (!vect_is_simple_use (then_clause, loop_vinfo,
4220 &then_def_stmt, &def, &dt))
4221 return false;
4223 else if (TREE_CODE (then_clause) != INTEGER_CST
4224 && TREE_CODE (then_clause) != REAL_CST)
4225 return false;
4227 if (TREE_CODE (else_clause) == SSA_NAME)
4229 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4230 if (!vect_is_simple_use (else_clause, loop_vinfo,
4231 &else_def_stmt, &def, &dt))
4232 return false;
4234 else if (TREE_CODE (else_clause) != INTEGER_CST
4235 && TREE_CODE (else_clause) != REAL_CST)
4236 return false;
4239 vec_mode = TYPE_MODE (vectype);
4241 if (!vec_stmt)
4243 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4244 return expand_vec_cond_expr_p (op, vec_mode);
4247 /* Transform */
4249 /* Handle def. */
4250 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4251 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4253 /* Handle cond expr. */
4254 vec_cond_lhs =
4255 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4256 vec_cond_rhs =
4257 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4258 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4259 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4261 /* Arguments are ready. Create the new vector stmt. */
4262 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4263 vec_cond_lhs, vec_cond_rhs);
4264 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4265 vec_compare, vec_then_clause, vec_else_clause);
4267 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
4268 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4269 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4270 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4272 return true;
4275 /* Function vect_transform_stmt.
4277 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4279 bool
4280 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4282 bool is_store = false;
4283 tree vec_stmt = NULL_TREE;
4284 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4285 tree orig_stmt_in_pattern;
4286 bool done;
4288 switch (STMT_VINFO_TYPE (stmt_info))
4290 case type_demotion_vec_info_type:
4291 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4292 gcc_assert (done);
4293 break;
4295 case type_promotion_vec_info_type:
4296 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4297 gcc_assert (done);
4298 break;
4300 case type_conversion_vec_info_type:
4301 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4302 gcc_assert (done);
4303 break;
4305 case induc_vec_info_type:
4306 done = vectorizable_induction (stmt, bsi, &vec_stmt);
4307 gcc_assert (done);
4308 break;
4310 case op_vec_info_type:
4311 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4312 gcc_assert (done);
4313 break;
4315 case assignment_vec_info_type:
4316 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4317 gcc_assert (done);
4318 break;
4320 case load_vec_info_type:
4321 done = vectorizable_load (stmt, bsi, &vec_stmt);
4322 gcc_assert (done);
4323 break;
4325 case store_vec_info_type:
4326 done = vectorizable_store (stmt, bsi, &vec_stmt);
4327 gcc_assert (done);
4328 if (DR_GROUP_FIRST_DR (stmt_info))
4330 /* In case of interleaving, the whole chain is vectorized when the
4331 last store in the chain is reached. Store stmts before the last
4332 one are skipped, and their stmt_vec_info shouldn't be freed
4333 meanwhile. */
4334 *strided_store = true;
4335 if (STMT_VINFO_VEC_STMT (stmt_info))
4336 is_store = true;
4338 else
4339 is_store = true;
4340 break;
4342 case condition_vec_info_type:
4343 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4344 gcc_assert (done);
4345 break;
4347 case call_vec_info_type:
4348 done = vectorizable_call (stmt, bsi, &vec_stmt);
4349 break;
4351 case reduc_vec_info_type:
4352 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4353 gcc_assert (done);
4354 break;
4356 default:
4357 if (!STMT_VINFO_LIVE_P (stmt_info))
4359 if (vect_print_dump_info (REPORT_DETAILS))
4360 fprintf (vect_dump, "stmt not supported.");
4361 gcc_unreachable ();
4365 if (STMT_VINFO_LIVE_P (stmt_info)
4366 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
4368 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4369 gcc_assert (done);
4372 if (vec_stmt)
4374 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4375 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4376 if (orig_stmt_in_pattern)
4378 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4379 /* STMT was inserted by the vectorizer to replace a computation idiom.
4380 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
4381 computed this idiom. We need to record a pointer to VEC_STMT in
4382 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
4383 documentation of vect_pattern_recog. */
4384 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4386 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4387 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4392 return is_store;
4396 /* This function builds ni_name = number of iterations the loop executes,
4397 inserting the computation on the loop preheader edge. */
4399 static tree
4400 vect_build_loop_niters (loop_vec_info loop_vinfo)
4402 tree ni_name, stmt, var;
4403 edge pe;
4404 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4405 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4407 var = create_tmp_var (TREE_TYPE (ni), "niters");
4408 add_referenced_var (var);
4409 ni_name = force_gimple_operand (ni, &stmt, false, var);
4411 pe = loop_preheader_edge (loop);
4412 if (stmt)
4414 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4415 gcc_assert (!new_bb);
4418 return ni_name;
4422 /* This function generates the following statements:
4424 ni_name = number of iterations loop executes
4425 ratio = ni_name / vf
4426 ratio_mult_vf_name = ratio * vf
4428 and places them at the loop preheader edge. */
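/* For example, with vf = 4 and ni_name = 103 the code below computes
   ratio = 103 >> 2 = 25 and ratio_mult_vf_name = 25 << 2 = 100, so the
   vectorized loop will cover the first 100 scalar iterations.  */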
4430 static void
4431 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4432 tree *ni_name_ptr,
4433 tree *ratio_mult_vf_name_ptr,
4434 tree *ratio_name_ptr)
4437 edge pe;
4438 basic_block new_bb;
4439 tree stmt, ni_name;
4440 tree var;
4441 tree ratio_name;
4442 tree ratio_mult_vf_name;
4443 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4444 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4445 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4446 tree log_vf;
4448 pe = loop_preheader_edge (loop);
4450 /* Generate a temporary variable that contains the
4451 number of iterations the loop executes. */
4453 ni_name = vect_build_loop_niters (loop_vinfo);
4454 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4456 /* Create: ratio = ni >> log2(vf) */
4458 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4459 if (!is_gimple_val (ratio_name))
4461 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4462 add_referenced_var (var);
4464 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4465 pe = loop_preheader_edge (loop);
4466 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4467 gcc_assert (!new_bb);
4470 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4472 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4473 ratio_name, log_vf);
4474 if (!is_gimple_val (ratio_mult_vf_name))
4476 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4477 add_referenced_var (var);
4479 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4480 true, var);
4481 pe = loop_preheader_edge (loop);
4482 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4483 gcc_assert (!new_bb);
4486 *ni_name_ptr = ni_name;
4487 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4488 *ratio_name_ptr = ratio_name;
4490 return;
4494 /* Function update_vuses_to_preheader.
4496 Input:
4497 STMT - a statement with potential VUSEs.
4498 LOOP - the loop whose preheader will contain STMT.
4500 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4501 appears to be defined in a VDEF in another statement in a loop.
4502 One such case is when the VUSE is at the dereference of a __restricted__
4503 pointer in a load and the VDEF is at the dereference of a different
4504 __restricted__ pointer in a store. Vectorization may result in
4505 copy_virtual_operands being called to copy the problematic VUSE to a new
4506 statement that is being inserted in the loop preheader. This procedure
4507 is called to change the SSA_NAME in the new statement's VUSE from the
4508 SSA_NAME updated in the loop to the related SSA_NAME available on the
4509 path entering the loop.
4511 When this function is called, we have the following situation:
4513 # vuse <name1>
4514 S1: vload
4515 do {
4516 # name1 = phi < name0 , name2>
4518 # vuse <name1>
4519 S2: vload
4521 # name2 = vdef <name1>
4522 S3: vstore
4524 }while...
4526 Stmt S1 was created in the loop preheader block as part of misaligned-load
4527 handling. This function fixes the name of the vuse of S1 from 'name1' to
4528 'name0'. */
4530 static void
4531 update_vuses_to_preheader (tree stmt, struct loop *loop)
4533 basic_block header_bb = loop->header;
4534 edge preheader_e = loop_preheader_edge (loop);
4535 ssa_op_iter iter;
4536 use_operand_p use_p;
4538 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4540 tree ssa_name = USE_FROM_PTR (use_p);
4541 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4542 tree name_var = SSA_NAME_VAR (ssa_name);
4543 basic_block bb = bb_for_stmt (def_stmt);
4545 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4546 if (!IS_EMPTY_STMT (def_stmt)
4547 && flow_bb_inside_loop_p (loop, bb))
4549 /* If the block containing the statement defining the SSA_NAME
4550 is in the loop then it's necessary to find the definition
4551 outside the loop using the PHI nodes of the header. */
4552 tree phi;
4553 bool updated = false;
4555 for (phi = phi_nodes (header_bb); phi; phi = PHI_CHAIN (phi))
4557 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4559 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4560 updated = true;
4561 break;
4564 gcc_assert (updated);
4570 /* Function vect_update_ivs_after_vectorizer.
4572 "Advance" the induction variables of LOOP to the value they should take
4573 after the execution of LOOP. This is currently necessary because the
4574 vectorizer does not handle induction variables that are used after the
4575 loop. Such a situation occurs when the last iterations of LOOP are
4576 peeled, because:
4577 1. We introduced new uses after LOOP for IVs that were not originally used
4578 after LOOP: the IVs of LOOP are now used by an epilog loop.
4579 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4580 times, whereas the loop IVs should be bumped N times.
4582 Input:
4583 - LOOP - a loop that is going to be vectorized. The last few iterations
4584 of LOOP were peeled.
4585 - NITERS - the number of iterations that LOOP executes (before it is
4586 vectorized). i.e, the number of times the ivs should be bumped.
4587 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4588 coming out from LOOP on which there are uses of the LOOP ivs
4589 (this is the path from LOOP->exit to epilog_loop->preheader).
4591 The new definitions of the ivs are placed in LOOP->exit.
4592 The phi args associated with the edge UPDATE_E in the bb
4593 UPDATE_E->dest are updated accordingly.
4595 Assumption 1: Like the rest of the vectorizer, this function assumes
4596 a single loop exit that has a single predecessor.
4598 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4599 organized in the same order.
4601 Assumption 3: The access function of the ivs is simple enough (see
4602 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4604 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4605 coming out of LOOP on which the ivs of LOOP are used (this is the path
4606 that leads to the epilog loop; other paths skip the epilog loop). This
4607 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4608 needs to have its phis updated. */
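/* For example, for an iv with initial value INIT and step STEP, and with
   NITERS = n, the code below computes

     ni_name = INIT + n * STEP

   inserts the computation in the loop exit bb, and makes the matching phi
   in update_bb use ni_name, so the epilog loop resumes the iv at the value
   it would have had after n scalar iterations.  */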
4611 static void
4612 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4613 edge update_e)
4615 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4616 basic_block exit_bb = single_exit (loop)->dest;
4617 tree phi, phi1;
4618 basic_block update_bb = update_e->dest;
4620 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4622 /* Make sure there exists a single-predecessor exit bb: */
4623 gcc_assert (single_pred_p (exit_bb));
4625 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4626 phi && phi1;
4627 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4629 tree access_fn = NULL;
4630 tree evolution_part;
4631 tree init_expr;
4632 tree step_expr;
4633 tree var, stmt, ni, ni_name;
4634 block_stmt_iterator last_bsi;
4636 if (vect_print_dump_info (REPORT_DETAILS))
4638 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4639 print_generic_expr (vect_dump, phi, TDF_SLIM);
4642 /* Skip virtual phi's. */
4643 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4645 if (vect_print_dump_info (REPORT_DETAILS))
4646 fprintf (vect_dump, "virtual phi. skip.");
4647 continue;
4650 /* Skip reduction phis. */
4651 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4653 if (vect_print_dump_info (REPORT_DETAILS))
4654 fprintf (vect_dump, "reduc phi. skip.");
4655 continue;
4658 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4659 gcc_assert (access_fn);
4660 evolution_part =
4661 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4662 gcc_assert (evolution_part != NULL_TREE);
4664 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4665 of degree >= 2 or exponential. */
4666 gcc_assert (!tree_is_chrec (evolution_part));
4668 step_expr = evolution_part;
4669 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4670 loop->num));
4672 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4673 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4674 fold_convert (TREE_TYPE (init_expr),
4675 niters),
4676 step_expr),
4677 init_expr);
4679 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4680 add_referenced_var (var);
4682 ni_name = force_gimple_operand (ni, &stmt, false, var);
4684 /* Insert stmt into exit_bb. */
4685 last_bsi = bsi_last (exit_bb);
4686 if (stmt)
4687 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4689 /* Fix phi expressions in the successor bb. */
4690 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4695 /* Function vect_do_peeling_for_loop_bound.
4697 Peel the last iterations of the loop represented by LOOP_VINFO.
4698 The peeled iterations form a new epilog loop. Given that the loop now
4699 iterates NITERS times, the new epilog loop iterates
4700 NITERS % VECTORIZATION_FACTOR times.
4702 The original loop will later be made to iterate
4703 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
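/* For example, if the loop originally executes NITERS = 103 iterations and
   the vectorization factor is 4, the vectorized loop will iterate
   103/4 = 25 times and the epilog loop 103%4 = 3 times.  */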
4705 static void
4706 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4708 tree ni_name, ratio_mult_vf_name;
4709 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4710 struct loop *new_loop;
4711 edge update_e;
4712 basic_block preheader;
4713 int loop_num;
4714 unsigned int th;
4716 if (vect_print_dump_info (REPORT_DETAILS))
4717 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4719 initialize_original_copy_tables ();
4721 /* Generate the following variables on the preheader of original loop:
4723 ni_name = number of iteration the original loop executes
4724 ratio = ni_name / vf
4725 ratio_mult_vf_name = ratio * vf */
4726 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4727 &ratio_mult_vf_name, ratio);
4729 loop_num = loop->num;
4730 /* Threshold for vectorized loop. */
4731 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4732 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4733 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4734 ratio_mult_vf_name, ni_name, false, th);
4735 gcc_assert (new_loop);
4736 gcc_assert (loop_num == loop->num);
4737 #ifdef ENABLE_CHECKING
4738 slpeel_verify_cfg_after_peeling (loop, new_loop);
4739 #endif
4741 /* A guard that controls whether the new_loop is to be executed or skipped
4742 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4743 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4744 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4745 is on the path where the LOOP IVs are used and need to be updated. */
4747 preheader = loop_preheader_edge (new_loop)->src;
4748 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4749 update_e = EDGE_PRED (preheader, 0);
4750 else
4751 update_e = EDGE_PRED (preheader, 1);
4753 /* Update IVs of original loop as if they were advanced
4754 by ratio_mult_vf_name steps. */
4755 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4757 /* After peeling we have to reset scalar evolution analyzer. */
4758 scev_reset ();
4760 free_original_copy_tables ();
4764 /* Function vect_gen_niters_for_prolog_loop.
4766 Set the number of iterations for the loop represented by LOOP_VINFO
4767 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4768 and the misalignment of DR - the data reference recorded in
4769 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4770 this loop, the data reference DR will refer to an aligned location.
4772 The following computation is generated:
4774 If the misalignment of DR is known at compile time:
4775 addr_mis = int mis = DR_MISALIGNMENT (dr);
4776 Else, compute address misalignment in bytes:
4777 addr_mis = addr & (vectype_size - 1)
4779 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4781 (elem_size = element type size; an element is the scalar element
4782 whose type is the inner type of the vectype)
4784 For interleaving,
4786 prolog_niters = min ( LOOP_NITERS ,
4787 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4788 where group_size is the size of the interleaved group. */
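/* For example, with VF = 4, 4-byte elements, no interleaving, and a known
   misalignment of 8 bytes (i.e. 2 elements):

     prolog_niters = min (LOOP_NITERS, (4 - 8/4) & (4 - 1))
                   = min (LOOP_NITERS, 2)

   so peeling 2 scalar iterations advances the address by 8 bytes, after
   which DR refers to a 16-byte-aligned location.  */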
4791 static tree
4792 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4794 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4795 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4796 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4797 tree var, stmt;
4798 tree iters, iters_name;
4799 edge pe;
4800 basic_block new_bb;
4801 tree dr_stmt = DR_STMT (dr);
4802 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4803 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4804 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4805 tree niters_type = TREE_TYPE (loop_niters);
4806 int group_size = 1;
4807 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4809 if (DR_GROUP_FIRST_DR (stmt_info))
4811 /* For interleaved access, the element size must be multiplied by the
4812 size of the interleaved group. */
4813 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4814 DR_GROUP_FIRST_DR (stmt_info)));
4815 element_size *= group_size;
4818 pe = loop_preheader_edge (loop);
4820 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4822 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4823 int elem_misalign = byte_misalign / element_size;
4825 if (vect_print_dump_info (REPORT_DETAILS))
4826 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4827 iters = build_int_cst (niters_type,
4828 (vf - elem_misalign) & (vf / group_size - 1));
4830 else
4832 tree new_stmts = NULL_TREE;
4833 tree start_addr =
4834 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4835 tree ptr_type = TREE_TYPE (start_addr);
4836 tree size = TYPE_SIZE (ptr_type);
4837 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4838 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4839 tree elem_size_log =
4840 build_int_cst (type, exact_log2 (vectype_align/vf));
4841 tree vf_minus_1 = build_int_cst (type, vf - 1);
4842 tree vf_tree = build_int_cst (type, vf);
4843 tree byte_misalign;
4844 tree elem_misalign;
4846 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4847 gcc_assert (!new_bb);
4849 /* Create: byte_misalign = addr & (vectype_size - 1) */
4850 byte_misalign =
4851 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4853 /* Create: elem_misalign = byte_misalign / element_size */
4854 elem_misalign =
4855 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4857 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4858 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4859 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4860 iters = fold_convert (niters_type, iters);
4863 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4864 /* If the loop bound is known at compile time we already verified that it is
4865 greater than vf; since the misalignment ('iters') is at most vf, there's
4866 no need to generate the MIN_EXPR in this case. */
4867 if (TREE_CODE (loop_niters) != INTEGER_CST)
4868 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4870 if (vect_print_dump_info (REPORT_DETAILS))
4872 fprintf (vect_dump, "niters for prolog loop: ");
4873 print_generic_expr (vect_dump, iters, TDF_SLIM);
4876 var = create_tmp_var (niters_type, "prolog_loop_niters");
4877 add_referenced_var (var);
4878 iters_name = force_gimple_operand (iters, &stmt, false, var);
4880 /* Insert stmt on loop preheader edge. */
4881 if (stmt)
4883 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4884 gcc_assert (!new_bb);
4887 return iters_name;
4891 /* Function vect_update_init_of_dr.
4893 NITERS iterations were peeled from LOOP. DR represents a data reference
4894 in LOOP. This function updates the information recorded in DR to
4895 account for the fact that the first NITERS iterations had already been
4896 executed. Specifically, it updates the OFFSET field of DR. */
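/* For example, if DR is a unit-stride access to 4-byte elements
   (DR_STEP = 4) and NITERS = 3 iterations were peeled, the code below
   adds 3 * 4 = 12 to DR_OFFSET.  */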
4898 static void
4899 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4901 tree offset = DR_OFFSET (dr);
4903 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4904 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4905 DR_OFFSET (dr) = offset;
4909 /* Function vect_update_inits_of_drs.
4911 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4912 This function updates the information recorded for the data references in
4913 the loop to account for the fact that the first NITERS iterations had
4914 already been executed. Specifically, it updates the initial_condition of the
4915 access_function of all the data_references in the loop. */
4917 static void
4918 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4920 unsigned int i;
4921 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4922 struct data_reference *dr;
4924 if (vect_dump && (dump_flags & TDF_DETAILS))
4925 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4927 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4928 vect_update_init_of_dr (dr, niters);
4932 /* Function vect_do_peeling_for_alignment.
4934 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4935 'niters' is set to the misalignment of one of the data references in the
4936 loop, thereby forcing it to refer to an aligned location at the beginning
4937 of the execution of this loop. The data reference for which we are
4938 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4940 static void
4941 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4943 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4944 tree niters_of_prolog_loop, ni_name;
4945 tree n_iters;
4946 struct loop *new_loop;
4948 if (vect_print_dump_info (REPORT_DETAILS))
4949 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4951 initialize_original_copy_tables ();
4953 ni_name = vect_build_loop_niters (loop_vinfo);
4954 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4956 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4957 new_loop =
4958 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4959 niters_of_prolog_loop, ni_name, true, 0);
4960 gcc_assert (new_loop);
4961 #ifdef ENABLE_CHECKING
4962 slpeel_verify_cfg_after_peeling (new_loop, loop);
4963 #endif
4965 /* Update number of times loop executes. */
4966 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4967 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4968 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4970 /* Update the init conditions of the access functions of all data refs. */
4971 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4973 /* After peeling we have to reset scalar evolution analyzer. */
4974 scev_reset ();
4976 free_original_copy_tables ();
4980 /* Function vect_create_cond_for_align_checks.
4982 Create a conditional expression that represents the alignment checks for
4983 all of data references (array element references) whose alignment must be
4984 checked at runtime.
4986 Input:
4987 LOOP_VINFO - two fields of the loop information are used.
4988 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4989 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4991 Output:
4992 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4993 expression.
4994 The returned value is the conditional expression to be used in the if
4995 statement that controls which version of the loop gets executed at runtime.
4997 The algorithm makes two assumptions:
4998 1) The number of bytes "n" in a vector is a power of 2.
4999 2) An address "a" is aligned if a%n is zero and that this
5000 test can be done as a&(n-1) == 0. For example, for 16
5001 byte vectors the test is a&0xf == 0. */
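/* For example, for two data references with addresses addr_a and addr_b,
   and 16-byte vectors (mask = 0xf), the generated sequence is roughly:

     addr2int0 = (int_ptrsize_type) addr_a
     addr2int1 = (int_ptrsize_type) addr_b
     orptrs1 = addr2int0 | addr2int1
     andmask = orptrs1 & 0xf

   and the returned condition is "andmask == 0".  */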
5003 static tree
5004 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
5005 tree *cond_expr_stmt_list)
5007 VEC(tree,heap) *may_misalign_stmts
5008 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
5009 tree ref_stmt, tmp;
5010 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
5011 tree mask_cst;
5012 unsigned int i;
5013 tree psize;
5014 tree int_ptrsize_type;
5015 char tmp_name[20];
5016 tree or_tmp_name = NULL_TREE;
5017 tree and_tmp, and_tmp_name, and_stmt;
5018 tree ptrsize_zero;
5020 /* Check that mask is one less than a power of 2, i.e., mask is
5021 all zeros followed by all ones. */
5022 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
5024 /* CHECKME: what is the best integer or unsigned type to use to hold a
5025 cast from a pointer value? */
5026 psize = TYPE_SIZE (ptr_type_node);
5027 int_ptrsize_type
5028 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
5030 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
5031 of the first vector of the i'th data reference. */
5033 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
5035 tree new_stmt_list = NULL_TREE;
5036 tree addr_base;
5037 tree addr_tmp, addr_tmp_name, addr_stmt;
5038 tree or_tmp, new_or_tmp_name, or_stmt;
5040 /* create: addr_tmp = (int)(address_of_first_vector) */
5041 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
5042 &new_stmt_list,
5043 NULL_TREE);
5045 if (new_stmt_list != NULL_TREE)
5046 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
5048 sprintf (tmp_name, "%s%d", "addr2int", i);
5049 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5050 add_referenced_var (addr_tmp);
5051 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
5052 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
5053 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
5054 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
5055 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
5057 /* The addresses are ORed together. */
5059 if (or_tmp_name != NULL_TREE)
5061 /* create: or_tmp = or_tmp | addr_tmp */
5062 sprintf (tmp_name, "%s%d", "orptrs", i);
5063 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5064 add_referenced_var (or_tmp);
5065 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5066 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
5067 or_tmp_name, addr_tmp_name);
5068 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
5069 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5070 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5071 or_tmp_name = new_or_tmp_name;
5073 else
5074 or_tmp_name = addr_tmp_name;
5076 } /* end for i */
5078 mask_cst = build_int_cst (int_ptrsize_type, mask);
5080 /* create: and_tmp = or_tmp & mask */
5081 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
5082 add_referenced_var (and_tmp);
5083 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5085 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
5086 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
5087 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5088 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5090 /* Make and_tmp the left operand of the conditional test against zero.
5091 If and_tmp has a nonzero bit then some address is unaligned. */
5092 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5093 return build2 (EQ_EXPR, boolean_type_node,
5094 and_tmp_name, ptrsize_zero);
5098 /* Function vect_transform_loop.
5100 The analysis phase has determined that the loop is vectorizable.
5101 Vectorize the loop - create vectorized stmts to replace the scalar
5102 stmts in the loop, and update the loop exit condition. */
5104 void
5105 vect_transform_loop (loop_vec_info loop_vinfo)
5107 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5108 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5109 int nbbs = loop->num_nodes;
5110 block_stmt_iterator si, next_si;
5111 int i;
5112 tree ratio = NULL;
5113 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5114 bool strided_store;
5116 if (vect_print_dump_info (REPORT_DETAILS))
5117 fprintf (vect_dump, "=== vect_transform_loop ===");
5119 /* If the loop has data references that may or may not be aligned then
5120 two versions of the loop need to be generated, one which is vectorized
5121 and one which isn't. A test is then generated to control which of the
5122 loops is executed. The test checks for the alignment of all of the
5123 data references that may or may not be aligned. */
5125 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5127 struct loop *nloop;
5128 tree cond_expr;
5129 tree cond_expr_stmt_list = NULL_TREE;
5130 basic_block condition_bb;
5131 block_stmt_iterator cond_exp_bsi;
5132 basic_block merge_bb;
5133 basic_block new_exit_bb;
5134 edge new_exit_e, e;
5135 tree orig_phi, new_phi, arg;
5136 unsigned prob = 4 * REG_BR_PROB_BASE / 5; /* Predict the alignment test to pass, and hence the vectorized version to run, 80% of the time. */
5138 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5139 &cond_expr_stmt_list);
5140 initialize_original_copy_tables ();
5141 nloop = loop_version (loop, cond_expr, &condition_bb,
5142 prob, prob, REG_BR_PROB_BASE - prob, true);
5143 free_original_copy_tables ();
5145 /** Loop versioning violates an assumption we try to maintain during
5146 vectorization - that the loop exit block has a single predecessor.
5147 After versioning, the exit block of both loop versions is the same
5148 basic block (i.e. it has two predecessors). To simplify the
5149 transformations that follow, we fix this situation here by adding
5150 a new (empty) block on the exit-edge of the loop, with the proper
5151 loop-exit phis to maintain loop-closed-form. **/
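/* Schematically: before the fix, both loop versions exit directly
   into merge_bb (which therefore has two predecessors); after it, the
   vectorized loop exits into the new (empty) new_exit_bb, whose fresh
   phis feed the corresponding phis in merge_bb, restoring a
   single-predecessor exit block for the loop.  */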
5153 merge_bb = single_exit (loop)->dest;
5154 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5155 new_exit_bb = split_edge (single_exit (loop));
5156 new_exit_e = single_exit (loop);
5157 e = EDGE_SUCC (new_exit_bb, 0);
5159 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5160 orig_phi = PHI_CHAIN (orig_phi))
5161 {
5162 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5163 new_exit_bb);
5164 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5165 add_phi_arg (new_phi, arg, new_exit_e);
5166 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
5167 }
5169 /** end loop-exit-fixes after versioning **/
5171 update_ssa (TODO_update_ssa);
5172 cond_exp_bsi = bsi_last (condition_bb);
5173 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
5174 }
5176 /* CHECKME: we wouldn't need this if we called update_ssa once
5177 for all loops. */
5178 bitmap_zero (vect_memsyms_to_rename);
5180 /* Peel the loop if there are data refs with unknown alignment.
5181 Only one store with unknown alignment is allowed. */
5183 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5184 vect_do_peeling_for_alignment (loop_vinfo);
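/* E.g., if the data ref chosen by the analysis phase starts 8 bytes
   past a 16-byte boundary and accesses 4-byte elements, the prolog
   loop generated here would run (16 - 8) / 4 = 2 scalar iterations,
   so that the vectorized loop starts at an aligned address (the
   actual count comes from vect_gen_niters_for_prolog_loop).  */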
5186 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5187 compile time constant), or 'n' is a constant that is not divisible by
5188 the vectorization factor, then an epilog loop needs to be created.
5189 We therefore duplicate the loop: the original loop will be vectorized,
5190 and will compute the first (n/VF) iterations. The second copy of the loop
5191 will remain scalar and will compute the remaining (n%VF) iterations.
5192 (VF is the vectorization factor). */
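/* E.g., with n = 103 and VF = 4, the vectorized loop would execute
   ratio = 103 / 4 = 25 iterations (covering the first 100 scalar
   iterations) and the scalar epilog loop the remaining 103 % 4 = 3.  */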
5194 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5195 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5196 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
5197 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
5198 else
5199 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5200 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5202 /* 1) Make sure the loop header has exactly two entries
5203 2) Make sure we have a preheader basic block. */
5205 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5207 split_edge (loop_preheader_edge (loop));
5209 /* FORNOW: the vectorizer supports only loops whose body consists of
5210 one basic block (header + empty latch). When the vectorizer supports
5211 more involved loop forms, the order in which the BBs are traversed
5212 will need to be reconsidered. */
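/* I.e., the only loop shape handled here is, roughly:
       for (i = 0; i < n; i++)
         a[i] = b[i] + c[i];
   a single-bb body plus an empty latch, with "a", "b" and "c" being
   placeholders for whatever the loop actually accesses.  */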
5214 for (i = 0; i < nbbs; i++)
5215 {
5216 basic_block bb = bbs[i];
5217 stmt_vec_info stmt_info;
5218 tree phi;
5220 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
5221 {
5222 if (vect_print_dump_info (REPORT_DETAILS))
5223 {
5224 fprintf (vect_dump, "------>vectorizing phi: ");
5225 print_generic_expr (vect_dump, phi, TDF_SLIM);
5226 }
5227 stmt_info = vinfo_for_stmt (phi);
5228 if (!stmt_info)
5229 continue;
5230 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5231 && !STMT_VINFO_LIVE_P (stmt_info))
5232 continue;
5234 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5235 != (unsigned HOST_WIDE_INT) vectorization_factor)
5236 && vect_print_dump_info (REPORT_DETAILS))
5237 fprintf (vect_dump, "multiple-types.");
5239 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5240 {
5241 if (vect_print_dump_info (REPORT_DETAILS))
5242 fprintf (vect_dump, "transform phi.");
5243 vect_transform_stmt (phi, NULL, NULL);
5244 }
5245 }
5247 for (si = bsi_start (bb); !bsi_end_p (si);)
5248 {
5249 tree stmt = bsi_stmt (si);
5250 bool is_store;
5252 if (vect_print_dump_info (REPORT_DETAILS))
5253 {
5254 fprintf (vect_dump, "------>vectorizing statement: ");
5255 print_generic_expr (vect_dump, stmt, TDF_SLIM);
5256 }
5257 stmt_info = vinfo_for_stmt (stmt);
5258 gcc_assert (stmt_info);
5259 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5260 && !STMT_VINFO_LIVE_P (stmt_info))
5261 {
5262 bsi_next (&si);
5263 continue;
5264 }
5266 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5267 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5268 != (unsigned HOST_WIDE_INT) vectorization_factor)
5269 && vect_print_dump_info (REPORT_DETAILS))
5270 fprintf (vect_dump, "multiple-types.");
5272 /* -------- vectorize statement ------------ */
5273 if (vect_print_dump_info (REPORT_DETAILS))
5274 fprintf (vect_dump, "transform statement.");
5276 strided_store = false;
5277 is_store = vect_transform_stmt (stmt, &si, &strided_store);
5278 if (is_store)
5279 {
5280 stmt_ann_t ann;
5281 if (DR_GROUP_FIRST_DR (stmt_info))
5282 {
5283 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5284 interleaving chain was completed - free all the stores in
5285 the chain. */
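/* E.g., for an interleaved group of stores S1 -> S2 -> S3 chained
   through DR_GROUP_NEXT_DR, the walk below starts at
   DR_GROUP_FIRST_DR and removes each scalar store in turn, since the
   vectorized code emitted for the whole chain replaces them all.  */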
5286 tree next = DR_GROUP_FIRST_DR (stmt_info);
5287 tree tmp;
5288 stmt_vec_info next_stmt_info;
5290 while (next)
5291 {
5292 next_si = bsi_for_stmt (next);
5293 next_stmt_info = vinfo_for_stmt (next);
5294 /* Free the attached stmt_vec_info and remove the stmt. */
5295 ann = stmt_ann (next);
5296 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
5297 free (next_stmt_info);
5298 set_stmt_info (ann, NULL);
5299 bsi_remove (&next_si, true);
5300 next = tmp;
5301 }
5302 bsi_remove (&si, true);
5303 continue;
5304 }
5305 else
5306 {
5307 /* Free the attached stmt_vec_info and remove the stmt. */
5308 ann = stmt_ann (stmt);
5309 free (stmt_info);
5310 set_stmt_info (ann, NULL);
5311 bsi_remove (&si, true);
5312 continue;
5313 }
5314 }
5315 bsi_next (&si);
5316 } /* stmts in BB */
5317 } /* BBs in loop */
5319 slpeel_make_loop_iterate_ntimes (loop, ratio);
5321 mark_set_for_renaming (vect_memsyms_to_rename);
5323 /* The memory tags and pointers in vectorized statements need to
5324 have their SSA forms updated. FIXME: why can't this be delayed
5325 until all the loops have been transformed? */
5326 update_ssa (TODO_update_ssa);
5328 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
5329 fprintf (vect_dump, "LOOP VECTORIZED.");
5330 }