[official-gcc.git] / gcc / tree-vect-stmts.cc
1 /* Statement Analysis and Transformation for Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "ssa.h"
31 #include "optabs-tree.h"
32 #include "insn-config.h"
33 #include "recog.h" /* FIXME: for insn_data */
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-cfg.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "cfgloop.h"
46 #include "explow.h"
47 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "builtins.h"
51 #include "internal-fn.h"
52 #include "tree-vector-builder.h"
53 #include "vec-perm-indices.h"
54 #include "gimple-range.h"
55 #include "tree-ssa-loop-niter.h"
56 #include "gimple-fold.h"
57 #include "regs.h"
58 #include "attribs.h"
59 #include "optabs-libfuncs.h"
61 /* For lang_hooks.types.type_for_mode. */
62 #include "langhooks.h"
64 /* Return the vectorized type for the given statement. */
66 tree
67 stmt_vectype (class _stmt_vec_info *stmt_info)
69 return STMT_VINFO_VECTYPE (stmt_info);
72 /* Return TRUE iff the given statement is in an inner loop relative to
73 the loop being vectorized. */
74 bool
75 stmt_in_inner_loop_p (vec_info *vinfo, class _stmt_vec_info *stmt_info)
77 gimple *stmt = STMT_VINFO_STMT (stmt_info);
78 basic_block bb = gimple_bb (stmt);
79 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
80 class loop* loop;
82 if (!loop_vinfo)
83 return false;
85 loop = LOOP_VINFO_LOOP (loop_vinfo);
87 return (bb->loop_father == loop->inner);
90 /* Record the cost of a statement, either by directly informing the
91 target model or by saving it in a vector for later processing.
92 Return a preliminary estimate of the statement's cost. */
94 static unsigned
95 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
96 enum vect_cost_for_stmt kind,
97 stmt_vec_info stmt_info, slp_tree node,
98 tree vectype, int misalign,
99 enum vect_cost_model_location where)
101 if ((kind == vector_load || kind == unaligned_load)
102 && (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
103 kind = vector_gather_load;
104 if ((kind == vector_store || kind == unaligned_store)
105 && (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
106 kind = vector_scatter_store;
108 stmt_info_for_cost si
109 = { count, kind, where, stmt_info, node, vectype, misalign };
110 body_cost_vec->safe_push (si);
112 return (unsigned)
113 (builtin_vectorization_cost (kind, vectype, misalign) * count);
116 unsigned
117 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
118 enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
119 tree vectype, int misalign,
120 enum vect_cost_model_location where)
122 return record_stmt_cost (body_cost_vec, count, kind, stmt_info, NULL,
123 vectype, misalign, where);
126 unsigned
127 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
128 enum vect_cost_for_stmt kind, slp_tree node,
129 tree vectype, int misalign,
130 enum vect_cost_model_location where)
132 return record_stmt_cost (body_cost_vec, count, kind, NULL, node,
133 vectype, misalign, where);
136 unsigned
137 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
138 enum vect_cost_for_stmt kind,
139 enum vect_cost_model_location where)
141 gcc_assert (kind == cond_branch_taken || kind == cond_branch_not_taken
142 || kind == scalar_stmt);
143 return record_stmt_cost (body_cost_vec, count, kind, NULL, NULL,
144 NULL_TREE, 0, where);
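/* A minimal usage sketch of the overloads above (cost_vec, stmt_info
   and vectype are placeholder names, not definitions from this file):
   costing one vector statement in the loop body and one broadcast in
   the prologue would look like

     unsigned inside_cost
       = record_stmt_cost (cost_vec, 1, vector_stmt, stmt_info,
                           vectype, 0, vect_body);
     unsigned prologue_cost
       = record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
                           vectype, 0, vect_prologue);

   Both calls forward to the static worker above, which also rewrites
   plain load/store kinds into gather/scatter kinds for statements
   marked STMT_VINFO_GATHER_SCATTER_P.  */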
147 /* Return a variable of type ELEM_TYPE[NELEMS]. */
149 static tree
150 create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems)
152 return create_tmp_var (build_array_type_nelts (elem_type, nelems),
153 "vect_array");
156 /* ARRAY is an array of vectors created by create_vector_array.
157 Return an SSA_NAME for the vector in index N. The reference
158 is part of the vectorization of STMT_INFO and the vector is associated
159 with scalar destination SCALAR_DEST. */
161 static tree
162 read_vector_array (vec_info *vinfo,
163 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
164 tree scalar_dest, tree array, unsigned HOST_WIDE_INT n)
166 tree vect_type, vect, vect_name, array_ref;
167 gimple *new_stmt;
169 gcc_assert (TREE_CODE (TREE_TYPE (array)) == ARRAY_TYPE);
170 vect_type = TREE_TYPE (TREE_TYPE (array));
171 vect = vect_create_destination_var (scalar_dest, vect_type);
172 array_ref = build4 (ARRAY_REF, vect_type, array,
173 build_int_cst (size_type_node, n),
174 NULL_TREE, NULL_TREE);
176 new_stmt = gimple_build_assign (vect, array_ref);
177 vect_name = make_ssa_name (vect, new_stmt);
178 gimple_assign_set_lhs (new_stmt, vect_name);
179 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
181 return vect_name;
184 /* ARRAY is an array of vectors created by create_vector_array.
185 Emit code to store SSA_NAME VECT in index N of the array.
186 The store is part of the vectorization of STMT_INFO. */
188 static void
189 write_vector_array (vec_info *vinfo,
190 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
191 tree vect, tree array, unsigned HOST_WIDE_INT n)
193 tree array_ref;
194 gimple *new_stmt;
196 array_ref = build4 (ARRAY_REF, TREE_TYPE (vect), array,
197 build_int_cst (size_type_node, n),
198 NULL_TREE, NULL_TREE);
200 new_stmt = gimple_build_assign (array_ref, vect);
201 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
204 /* PTR is a pointer to an array of type TYPE. Return a representation
205 of *PTR. The memory reference replaces those in FIRST_DR
206 (and its group). */
208 static tree
209 create_array_ref (tree type, tree ptr, tree alias_ptr_type)
211 tree mem_ref;
213 mem_ref = build2 (MEM_REF, type, ptr, build_int_cst (alias_ptr_type, 0));
214 /* Arrays have the same alignment as their type. */
215 set_ptr_info_alignment (get_ptr_info (ptr), TYPE_ALIGN_UNIT (type), 0);
216 return mem_ref;
219 /* Add a clobber of variable VAR to the vectorization of STMT_INFO.
220 Emit the clobber before *GSI. */
222 static void
223 vect_clobber_variable (vec_info *vinfo, stmt_vec_info stmt_info,
224 gimple_stmt_iterator *gsi, tree var)
226 tree clobber = build_clobber (TREE_TYPE (var));
227 gimple *new_stmt = gimple_build_assign (var, clobber);
228 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
231 /* Utility functions used by vect_mark_stmts_to_be_vectorized. */
233 /* Function vect_mark_relevant.
235 Mark STMT_INFO as "relevant for vectorization" and add it to WORKLIST. */
237 static void
238 vect_mark_relevant (vec<stmt_vec_info> *worklist, stmt_vec_info stmt_info,
239 enum vect_relevant relevant, bool live_p)
241 enum vect_relevant save_relevant = STMT_VINFO_RELEVANT (stmt_info);
242 bool save_live_p = STMT_VINFO_LIVE_P (stmt_info);
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "mark relevant %d, live %d: %G", relevant, live_p,
247 stmt_info->stmt);
249 /* If this stmt is an original stmt in a pattern, we might need to mark its
250 related pattern stmt instead of the original stmt. However, such stmts
251 may have their own uses that are not in any pattern; in such cases the
252 stmt itself should be marked. */
253 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
255 /* This is the last stmt in a sequence that was detected as a
256 pattern that can potentially be vectorized. Don't mark the stmt
257 as relevant/live because it's not going to be vectorized.
258 Instead mark the pattern-stmt that replaces it. */
260 if (dump_enabled_p ())
261 dump_printf_loc (MSG_NOTE, vect_location,
262 "last stmt in pattern. don't mark"
263 " relevant/live.\n");
265 stmt_vec_info old_stmt_info = stmt_info;
266 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
267 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == old_stmt_info);
268 save_relevant = STMT_VINFO_RELEVANT (stmt_info);
269 save_live_p = STMT_VINFO_LIVE_P (stmt_info);
271 if (live_p && relevant == vect_unused_in_scope)
273 if (dump_enabled_p ())
274 dump_printf_loc (MSG_NOTE, vect_location,
275 "vec_stmt_relevant_p: forcing live pattern stmt "
276 "relevant.\n");
277 relevant = vect_used_only_live;
280 if (dump_enabled_p ())
281 dump_printf_loc (MSG_NOTE, vect_location,
282 "mark relevant %d, live %d: %G", relevant, live_p,
283 stmt_info->stmt);
286 STMT_VINFO_LIVE_P (stmt_info) |= live_p;
287 if (relevant > STMT_VINFO_RELEVANT (stmt_info))
288 STMT_VINFO_RELEVANT (stmt_info) = relevant;
290 if (STMT_VINFO_RELEVANT (stmt_info) == save_relevant
291 && STMT_VINFO_LIVE_P (stmt_info) == save_live_p)
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location,
295 "already marked relevant/live.\n");
296 return;
299 worklist->safe_push (stmt_info);
303 /* Function is_simple_and_all_uses_invariant
305 Return true if STMT_INFO is simple and all uses of it are invariant. */
307 bool
308 is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
309 loop_vec_info loop_vinfo)
311 tree op;
312 ssa_op_iter iter;
314 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
315 if (!stmt)
316 return false;
318 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
320 enum vect_def_type dt = vect_uninitialized_def;
322 if (!vect_is_simple_use (op, loop_vinfo, &dt))
324 if (dump_enabled_p ())
325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
326 "use not simple.\n");
327 return false;
330 if (dt != vect_external_def && dt != vect_constant_def)
331 return false;
333 return true;
336 /* Function vect_stmt_relevant_p.
338 Return true if STMT_INFO, in the loop that is represented by LOOP_VINFO,
339 is "relevant for vectorization".
341 A stmt is considered "relevant for vectorization" if:
342 - it has uses outside the loop.
343 - it has vdefs (it alters memory).
344 - it is a control stmt in the loop (except for the exit condition).
345 - it is an induction and we have multiple exits.
347 CHECKME: what other side effects would the vectorizer allow? */
349 static bool
350 vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
351 enum vect_relevant *relevant, bool *live_p)
353 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
354 ssa_op_iter op_iter;
355 imm_use_iterator imm_iter;
356 use_operand_p use_p;
357 def_operand_p def_p;
359 *relevant = vect_unused_in_scope;
360 *live_p = false;
362 /* cond stmt other than loop exit cond. */
363 gimple *stmt = STMT_VINFO_STMT (stmt_info);
364 if (is_ctrl_stmt (stmt)
365 && LOOP_VINFO_LOOP_IV_COND (loop_vinfo) != stmt
366 && (!loop->inner || gimple_bb (stmt)->loop_father == loop))
367 *relevant = vect_used_in_scope;
369 /* changing memory. */
370 if (gimple_code (stmt_info->stmt) != GIMPLE_PHI)
371 if (gimple_vdef (stmt_info->stmt)
372 && !gimple_clobber_p (stmt_info->stmt))
374 if (dump_enabled_p ())
375 dump_printf_loc (MSG_NOTE, vect_location,
376 "vec_stmt_relevant_p: stmt has vdefs.\n");
377 *relevant = vect_used_in_scope;
380 /* uses outside the loop. */
381 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
383 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, DEF_FROM_PTR (def_p))
385 basic_block bb = gimple_bb (USE_STMT (use_p));
386 if (!flow_bb_inside_loop_p (loop, bb))
388 if (is_gimple_debug (USE_STMT (use_p)))
389 continue;
391 if (dump_enabled_p ())
392 dump_printf_loc (MSG_NOTE, vect_location,
393 "vec_stmt_relevant_p: used out of loop.\n");
395 /* We expect all such uses to be in the loop exit phis
396 (because of loop closed form) */
397 gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);
399 *live_p = true;
404 /* Check if it is an induction and we have multiple exits. In this case there
405 will be a use later on, after peeling, which is needed for the alternate exit. */
406 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
407 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
409 if (dump_enabled_p ())
410 dump_printf_loc (MSG_NOTE, vect_location,
411 "vec_stmt_relevant_p: induction forced for "
412 "early break.\n");
413 *live_p = true;
417 if (*live_p && *relevant == vect_unused_in_scope
418 && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_NOTE, vect_location,
422 "vec_stmt_relevant_p: stmt live but not relevant.\n");
423 *relevant = vect_used_only_live;
426 return (*live_p || *relevant);
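/* An assumed scalar loop (placeholder names) illustrating what this
   predicate initially flags:

     for (i = 0; i < n; i++)
       {
         t = a[i] + b[i];    not flagged here; picked up later through
                             the worklist because the store uses it
         c[i] = t;           relevant: it has a vdef (alters memory)
         last = a[i];        live: its value is used after the loop
       }
     ... = last;

   The loop exit condition and pure address computations are not
   flagged; with early breaks an induction is additionally forced live
   as described above.  */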
430 /* Function exist_non_indexing_operands_for_use_p
432 USE is one of the uses attached to STMT_INFO. Check if USE is
433 used in STMT_INFO for anything other than indexing an array. */
435 static bool
436 exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
438 tree operand;
440 /* USE corresponds to some operand in STMT. If there is no data
441 reference in STMT, then any operand that corresponds to USE
442 is not indexing an array. */
443 if (!STMT_VINFO_DATA_REF (stmt_info))
444 return true;
446 /* STMT has a data_ref. FORNOW this means that it is of one of
447 the following forms:
448 -1- ARRAY_REF = var
449 -2- var = ARRAY_REF
450 (This should have been verified in analyze_data_refs).
452 'var' in the second case corresponds to a def, not a use,
453 so USE cannot correspond to any operands that are not used
454 for array indexing.
456 Therefore, all we need to check is if STMT falls into the
457 first case, and whether var corresponds to USE. */
459 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
460 if (!assign || !gimple_assign_copy_p (assign))
462 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
463 if (call && gimple_call_internal_p (call))
465 internal_fn ifn = gimple_call_internal_fn (call);
466 int mask_index = internal_fn_mask_index (ifn);
467 if (mask_index >= 0
468 && use == gimple_call_arg (call, mask_index))
469 return true;
470 int stored_value_index = internal_fn_stored_value_index (ifn);
471 if (stored_value_index >= 0
472 && use == gimple_call_arg (call, stored_value_index))
473 return true;
474 if (internal_gather_scatter_fn_p (ifn)
475 && use == gimple_call_arg (call, 1))
476 return true;
478 return false;
481 if (TREE_CODE (gimple_assign_lhs (assign)) == SSA_NAME)
482 return false;
483 operand = gimple_assign_rhs1 (assign);
484 if (TREE_CODE (operand) != SSA_NAME)
485 return false;
487 if (operand == use)
488 return true;
490 return false;
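/* An assumed example of the two forms above: for "a[i_3] = x_5"
   (form -1-), a USE of x_5 makes this function return true, while a
   USE of i_3 only feeds the array indexing and yields false.  For
   masked internal calls the mask and stored-value arguments, and the
   offset operand of gather/scatter calls, are the analogous
   non-indexing uses.  */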
495 Function process_use.
497 Inputs:
498 - a USE in STMT_VINFO in a loop represented by LOOP_VINFO
499 - RELEVANT - enum value to be set in the STMT_VINFO of the stmt
500 that defined USE. This is done by calling mark_relevant and passing it
501 the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
502 - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't
503 be performed.
505 Outputs:
506 Generally, LIVE_P and RELEVANT are used to define the liveness and
507 relevance info of the DEF_STMT of this USE:
508 STMT_VINFO_LIVE_P (DEF_stmt_vinfo) <-- live_p
509 STMT_VINFO_RELEVANT (DEF_stmt_vinfo) <-- relevant
510 Exceptions:
511 - case 1: If USE is used only for address computations (e.g. array indexing),
512 which does not need to be directly vectorized, then the liveness/relevance
513 of the respective DEF_STMT is left unchanged.
514 - case 2: If STMT_VINFO is a reduction phi and DEF_STMT is a reduction stmt,
515 we skip DEF_STMT because it has already been processed.
516 - case 3: If DEF_STMT and STMT_VINFO are in different nests, then
517 "relevant" will be modified accordingly.
519 Return true if everything is as expected. Return false otherwise. */
521 static opt_result
522 process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo,
523 enum vect_relevant relevant, vec<stmt_vec_info> *worklist,
524 bool force)
526 stmt_vec_info dstmt_vinfo;
527 enum vect_def_type dt;
529 /* case 1: we are only interested in uses that need to be vectorized. Uses
530 that are used for address computation are not considered relevant. */
531 if (!force && !exist_non_indexing_operands_for_use_p (use, stmt_vinfo))
532 return opt_result::success ();
534 if (!vect_is_simple_use (use, loop_vinfo, &dt, &dstmt_vinfo))
535 return opt_result::failure_at (stmt_vinfo->stmt,
536 "not vectorized:"
537 " unsupported use in stmt.\n");
539 if (!dstmt_vinfo)
540 return opt_result::success ();
542 basic_block def_bb = gimple_bb (dstmt_vinfo->stmt);
543 basic_block bb = gimple_bb (stmt_vinfo->stmt);
545 /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO).
546 We have to force the stmt live since the epilogue loop needs it to
547 continue computing the reduction. */
548 if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI
549 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
550 && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI
551 && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def
552 && bb->loop_father == def_bb->loop_father)
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location,
556 "reduc-stmt defining reduc-phi in the same nest.\n");
557 vect_mark_relevant (worklist, dstmt_vinfo, relevant, true);
558 return opt_result::success ();
561 /* case 3a: outer-loop stmt defining an inner-loop stmt:
562 outer-loop-header-bb:
563 d = dstmt_vinfo
564 inner-loop:
565 stmt # use (d)
566 outer-loop-tail-bb:
567 ... */
568 if (flow_loop_nested_p (def_bb->loop_father, bb->loop_father))
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "outer-loop def-stmt defining inner-loop stmt.\n");
574 switch (relevant)
576 case vect_unused_in_scope:
577 relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_nested_cycle) ?
578 vect_used_in_scope : vect_unused_in_scope;
579 break;
581 case vect_used_in_outer_by_reduction:
582 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def);
583 relevant = vect_used_by_reduction;
584 break;
586 case vect_used_in_outer:
587 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def);
588 relevant = vect_used_in_scope;
589 break;
591 case vect_used_in_scope:
592 break;
594 default:
595 gcc_unreachable ();
599 /* case 3b: inner-loop stmt defining an outer-loop stmt:
600 outer-loop-header-bb:
602 inner-loop:
603 d = dstmt_vinfo
604 outer-loop-tail-bb (or outer-loop-exit-bb in double reduction):
605 stmt # use (d) */
606 else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father))
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_NOTE, vect_location,
610 "inner-loop def-stmt defining outer-loop stmt.\n");
612 switch (relevant)
614 case vect_unused_in_scope:
615 relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
616 || STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_double_reduction_def) ?
617 vect_used_in_outer_by_reduction : vect_unused_in_scope;
618 break;
620 case vect_used_by_reduction:
621 case vect_used_only_live:
622 relevant = vect_used_in_outer_by_reduction;
623 break;
625 case vect_used_in_scope:
626 relevant = vect_used_in_outer;
627 break;
629 default:
630 gcc_unreachable ();
633 /* We are also not interested in uses on loop PHI backedges that are
634 inductions. Otherwise we'll needlessly vectorize the IV increment
635 and cause hybrid SLP for SLP inductions. Unless the PHI is live
636 of course. */
637 else if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI
638 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_induction_def
639 && ! STMT_VINFO_LIVE_P (stmt_vinfo)
640 && (PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
641 loop_latch_edge (bb->loop_father))
642 == use))
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE, vect_location,
646 "induction value on backedge.\n");
647 return opt_result::success ();
651 vect_mark_relevant (worklist, dstmt_vinfo, relevant, false);
652 return opt_result::success ();
656 /* Function vect_mark_stmts_to_be_vectorized.
658 Not all stmts in the loop need to be vectorized. For example:
660 for i...
661 for j...
662 1. T0 = i + j
663 2. T1 = a[T0]
665 3. j = j + 1
667 Stmts 1 and 3 do not need to be vectorized, because loop control and
668 addressing of vectorized data-refs are handled differently.
670 This pass detects such stmts. */
672 opt_result
673 vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal)
675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
676 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
677 unsigned int nbbs = loop->num_nodes;
678 gimple_stmt_iterator si;
679 unsigned int i;
680 basic_block bb;
681 bool live_p;
682 enum vect_relevant relevant;
684 DUMP_VECT_SCOPE ("vect_mark_stmts_to_be_vectorized");
686 auto_vec<stmt_vec_info, 64> worklist;
688 /* 1. Init worklist. */
689 for (i = 0; i < nbbs; i++)
691 bb = bbs[i];
692 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
694 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
695 if (dump_enabled_p ())
696 dump_printf_loc (MSG_NOTE, vect_location, "init: phi relevant? %G",
697 phi_info->stmt);
699 if (vect_stmt_relevant_p (phi_info, loop_vinfo, &relevant, &live_p))
700 vect_mark_relevant (&worklist, phi_info, relevant, live_p);
702 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
704 if (is_gimple_debug (gsi_stmt (si)))
705 continue;
706 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_NOTE, vect_location,
709 "init: stmt relevant? %G", stmt_info->stmt);
711 if (vect_stmt_relevant_p (stmt_info, loop_vinfo, &relevant, &live_p))
712 vect_mark_relevant (&worklist, stmt_info, relevant, live_p);
716 /* 2. Process_worklist */
717 while (worklist.length () > 0)
719 use_operand_p use_p;
720 ssa_op_iter iter;
722 stmt_vec_info stmt_vinfo = worklist.pop ();
723 if (dump_enabled_p ())
724 dump_printf_loc (MSG_NOTE, vect_location,
725 "worklist: examine stmt: %G", stmt_vinfo->stmt);
727 /* Examine the USEs of STMT. For each USE, mark the stmt that defines it
728 (DEF_STMT) as relevant/irrelevant according to the relevance property
729 of STMT. */
730 relevant = STMT_VINFO_RELEVANT (stmt_vinfo);
732 /* Generally, the relevance property of STMT (in STMT_VINFO_RELEVANT) is
733 propagated as is to the DEF_STMTs of its USEs.
735 One exception is when STMT has been identified as defining a reduction
736 variable; in this case we set the relevance to vect_used_by_reduction.
737 This is because we distinguish between two kinds of relevant stmts -
738 those that are used by a reduction computation, and those that are
739 (also) used by a regular computation. This allows us later on to
740 identify stmts that are used solely by a reduction, and therefore the
741 order of the results that they produce does not have to be kept. */
743 switch (STMT_VINFO_DEF_TYPE (stmt_vinfo))
745 case vect_reduction_def:
746 gcc_assert (relevant != vect_unused_in_scope);
747 if (relevant != vect_unused_in_scope
748 && relevant != vect_used_in_scope
749 && relevant != vect_used_by_reduction
750 && relevant != vect_used_only_live)
751 return opt_result::failure_at
752 (stmt_vinfo->stmt, "unsupported use of reduction.\n");
753 break;
755 case vect_nested_cycle:
756 if (relevant != vect_unused_in_scope
757 && relevant != vect_used_in_outer_by_reduction
758 && relevant != vect_used_in_outer)
759 return opt_result::failure_at
760 (stmt_vinfo->stmt, "unsupported use of nested cycle.\n");
761 break;
763 case vect_double_reduction_def:
764 if (relevant != vect_unused_in_scope
765 && relevant != vect_used_by_reduction
766 && relevant != vect_used_only_live)
767 return opt_result::failure_at
768 (stmt_vinfo->stmt, "unsupported use of double reduction.\n");
769 break;
771 default:
772 break;
775 if (is_pattern_stmt_p (stmt_vinfo))
777 /* Pattern statements are not inserted into the code, so
778 FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we
779 have to scan the RHS or function arguments instead. */
780 if (gassign *assign = dyn_cast <gassign *> (stmt_vinfo->stmt))
782 enum tree_code rhs_code = gimple_assign_rhs_code (assign);
783 tree op = gimple_assign_rhs1 (assign);
785 i = 1;
786 if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op))
788 opt_result res
789 = process_use (stmt_vinfo, TREE_OPERAND (op, 0),
790 loop_vinfo, relevant, &worklist, false);
791 if (!res)
792 return res;
793 res = process_use (stmt_vinfo, TREE_OPERAND (op, 1),
794 loop_vinfo, relevant, &worklist, false);
795 if (!res)
796 return res;
797 i = 2;
799 for (; i < gimple_num_ops (assign); i++)
801 op = gimple_op (assign, i);
802 if (TREE_CODE (op) == SSA_NAME)
804 opt_result res
805 = process_use (stmt_vinfo, op, loop_vinfo, relevant,
806 &worklist, false);
807 if (!res)
808 return res;
812 else if (gcond *cond = dyn_cast <gcond *> (stmt_vinfo->stmt))
814 tree_code rhs_code = gimple_cond_code (cond);
815 gcc_assert (TREE_CODE_CLASS (rhs_code) == tcc_comparison);
816 opt_result res
817 = process_use (stmt_vinfo, gimple_cond_lhs (cond),
818 loop_vinfo, relevant, &worklist, false);
819 if (!res)
820 return res;
821 res = process_use (stmt_vinfo, gimple_cond_rhs (cond),
822 loop_vinfo, relevant, &worklist, false);
823 if (!res)
824 return res;
826 else if (gcall *call = dyn_cast <gcall *> (stmt_vinfo->stmt))
828 for (i = 0; i < gimple_call_num_args (call); i++)
830 tree arg = gimple_call_arg (call, i);
831 opt_result res
832 = process_use (stmt_vinfo, arg, loop_vinfo, relevant,
833 &worklist, false);
834 if (!res)
835 return res;
838 else
839 gcc_unreachable ();
841 else
842 FOR_EACH_PHI_OR_STMT_USE (use_p, stmt_vinfo->stmt, iter, SSA_OP_USE)
844 tree op = USE_FROM_PTR (use_p);
845 opt_result res
846 = process_use (stmt_vinfo, op, loop_vinfo, relevant,
847 &worklist, false);
848 if (!res)
849 return res;
852 if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
854 gather_scatter_info gs_info;
855 if (!vect_check_gather_scatter (stmt_vinfo, loop_vinfo, &gs_info))
856 gcc_unreachable ();
857 opt_result res
858 = process_use (stmt_vinfo, gs_info.offset, loop_vinfo, relevant,
859 &worklist, true);
860 if (!res)
862 if (fatal)
863 *fatal = false;
864 return res;
867 } /* while worklist */
869 return opt_result::success ();
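/* Propagation sketch for the worklist above (assumed scalar loop with
   placeholder names): starting from a relevant store "c[i] = t",
   popping it calls process_use on 't', which marks the statement
   "t = _1 + _2" relevant and pushes it; popping that in turn reaches
   the loads feeding _1 and _2, while uses that only compute addresses
   are filtered out by exist_non_indexing_operands_for_use_p.  */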
872 /* Function vect_model_simple_cost.
874 Models cost for simple operations, i.e. those that only emit ncopies of a
875 single op. Right now, this does not account for multiple insns that could
876 be generated for the single vector op. We will handle that shortly. */
878 static void
879 vect_model_simple_cost (vec_info *,
880 stmt_vec_info stmt_info, int ncopies,
881 enum vect_def_type *dt,
882 int ndts,
883 slp_tree node,
884 stmt_vector_for_cost *cost_vec,
885 vect_cost_for_stmt kind = vector_stmt)
887 int inside_cost = 0, prologue_cost = 0;
889 gcc_assert (cost_vec != NULL);
891 /* ??? Somehow we need to fix this at the callers. */
892 if (node)
893 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
895 if (!node)
896 /* Cost the "broadcast" of a scalar operand in to a vector operand.
897 Use scalar_to_vec to cost the broadcast, as elsewhere in the vector
898 cost model. */
899 for (int i = 0; i < ndts; i++)
900 if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
901 prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
902 stmt_info, 0, vect_prologue);
904 /* Pass the inside-of-loop statements to the target-specific cost model. */
905 inside_cost += record_stmt_cost (cost_vec, ncopies, kind,
906 stmt_info, 0, vect_body);
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_NOTE, vect_location,
910 "vect_model_simple_cost: inside_cost = %d, "
911 "prologue_cost = %d .\n", inside_cost, prologue_cost);
915 /* Model cost for type demotion and promotion operations. PWR is
916 normally zero for single-step promotions and demotions. It will be
917 one if two-step promotion/demotion is required, and so on. NCOPIES
918 is the number of vector results (and thus number of instructions)
919 for the narrowest end of the operation chain. Each additional
920 step doubles the number of instructions required. If WIDEN_ARITH
921 is true the stmt is doing widening arithmetic. */
923 static void
924 vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
925 enum vect_def_type *dt,
926 unsigned int ncopies, int pwr,
927 stmt_vector_for_cost *cost_vec,
928 bool widen_arith)
930 int i;
931 int inside_cost = 0, prologue_cost = 0;
933 for (i = 0; i < pwr + 1; i++)
935 inside_cost += record_stmt_cost (cost_vec, ncopies,
936 widen_arith
937 ? vector_stmt : vec_promote_demote,
938 stmt_info, 0, vect_body);
939 ncopies *= 2;
942 /* FORNOW: Assuming maximum 2 args per stmts. */
943 for (i = 0; i < 2; i++)
944 if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
945 prologue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
946 stmt_info, 0, vect_prologue);
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_NOTE, vect_location,
950 "vect_model_promotion_demotion_cost: inside_cost = %d, "
951 "prologue_cost = %d .\n", inside_cost, prologue_cost);
954 /* Returns true if the current function returns DECL. */
956 static bool
957 cfun_returns (tree decl)
959 edge_iterator ei;
960 edge e;
961 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
963 greturn *ret = safe_dyn_cast <greturn *> (*gsi_last_bb (e->src));
964 if (!ret)
965 continue;
966 if (gimple_return_retval (ret) == decl)
967 return true;
968 /* We often end up with an aggregate copy to the result decl;
969 handle that case as well, but first skip any intermediate
970 clobbers. */
971 gimple *def = ret;
974 def = SSA_NAME_DEF_STMT (gimple_vuse (def));
976 while (gimple_clobber_p (def));
977 if (is_a <gassign *> (def)
978 && gimple_assign_lhs (def) == gimple_return_retval (ret)
979 && gimple_assign_rhs1 (def) == decl)
980 return true;
982 return false;
985 /* Calculate cost of DR's memory access. */
986 void
987 vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
988 dr_alignment_support alignment_support_scheme,
989 int misalignment,
990 unsigned int *inside_cost,
991 stmt_vector_for_cost *body_cost_vec)
993 switch (alignment_support_scheme)
995 case dr_aligned:
997 *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
998 vector_store, stmt_info, 0,
999 vect_body);
1001 if (dump_enabled_p ())
1002 dump_printf_loc (MSG_NOTE, vect_location,
1003 "vect_model_store_cost: aligned.\n");
1004 break;
1007 case dr_unaligned_supported:
1009 /* Here, we assign an additional cost for the unaligned store. */
1010 *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
1011 unaligned_store, stmt_info,
1012 misalignment, vect_body);
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_NOTE, vect_location,
1015 "vect_model_store_cost: unaligned supported by "
1016 "hardware.\n");
1017 break;
1020 case dr_unaligned_unsupported:
1022 *inside_cost = VECT_MAX_COST;
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "vect_model_store_cost: unsupported access.\n");
1027 break;
1030 default:
1031 gcc_unreachable ();
1035 /* Calculate cost of DR's memory access. */
1036 void
1037 vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
1038 dr_alignment_support alignment_support_scheme,
1039 int misalignment,
1040 bool add_realign_cost, unsigned int *inside_cost,
1041 unsigned int *prologue_cost,
1042 stmt_vector_for_cost *prologue_cost_vec,
1043 stmt_vector_for_cost *body_cost_vec,
1044 bool record_prologue_costs)
1046 switch (alignment_support_scheme)
1048 case dr_aligned:
1050 *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
1051 stmt_info, 0, vect_body);
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_NOTE, vect_location,
1055 "vect_model_load_cost: aligned.\n");
1057 break;
1059 case dr_unaligned_supported:
1061 /* Here, we assign an additional cost for the unaligned load. */
1062 *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
1063 unaligned_load, stmt_info,
1064 misalignment, vect_body);
1066 if (dump_enabled_p ())
1067 dump_printf_loc (MSG_NOTE, vect_location,
1068 "vect_model_load_cost: unaligned supported by "
1069 "hardware.\n");
1071 break;
1073 case dr_explicit_realign:
1075 *inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
1076 vector_load, stmt_info, 0, vect_body);
1077 *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
1078 vec_perm, stmt_info, 0, vect_body);
1080 /* FIXME: If the misalignment remains fixed across the iterations of
1081 the containing loop, the following cost should be added to the
1082 prologue costs. */
1083 if (targetm.vectorize.builtin_mask_for_load)
1084 *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
1085 stmt_info, 0, vect_body);
1087 if (dump_enabled_p ())
1088 dump_printf_loc (MSG_NOTE, vect_location,
1089 "vect_model_load_cost: explicit realign\n");
1091 break;
1093 case dr_explicit_realign_optimized:
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_NOTE, vect_location,
1097 "vect_model_load_cost: unaligned software "
1098 "pipelined.\n");
1100 /* Unaligned software pipeline has a load of an address, an initial
1101 load, and possibly a mask operation to "prime" the loop. However,
1102 if this is an access in a group of loads, which provide grouped
1103 access, then the above cost should only be considered for one
1104 access in the group. Inside the loop, there is a load op
1105 and a realignment op. */
1107 if (add_realign_cost && record_prologue_costs)
1109 *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
1110 vector_stmt, stmt_info,
1111 0, vect_prologue);
1112 if (targetm.vectorize.builtin_mask_for_load)
1113 *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
1114 vector_stmt, stmt_info,
1115 0, vect_prologue);
1118 *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
1119 stmt_info, 0, vect_body);
1120 *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
1121 stmt_info, 0, vect_body);
1123 if (dump_enabled_p ())
1124 dump_printf_loc (MSG_NOTE, vect_location,
1125 "vect_model_load_cost: explicit realign optimized"
1126 "\n");
1128 break;
1131 case dr_unaligned_unsupported:
1133 *inside_cost = VECT_MAX_COST;
1135 if (dump_enabled_p ())
1136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137 "vect_model_load_cost: unsupported access.\n");
1138 break;
1141 default:
1142 gcc_unreachable ();
1146 /* Insert the new stmt NEW_STMT at *GSI or at the appropriate place in
1147 the loop preheader for the vectorized stmt STMT_VINFO. */
1149 static void
1150 vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
1151 gimple_stmt_iterator *gsi)
1153 if (gsi)
1154 vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi);
1155 else
1156 vinfo->insert_on_entry (stmt_vinfo, new_stmt);
1158 if (dump_enabled_p ())
1159 dump_printf_loc (MSG_NOTE, vect_location,
1160 "created new init_stmt: %G", new_stmt);
1163 /* Function vect_init_vector.
1165 Insert a new stmt (INIT_STMT) that initializes a new variable of type
1166 TYPE with the value VAL. If TYPE is a vector type and VAL does not have
1167 vector type, a vector with all elements equal to VAL is created first.
1168 Place the initialization at GSI if it is not NULL. Otherwise, place the
1169 initialization at the loop preheader.
1170 Return the DEF of INIT_STMT.
1171 It will be used in the vectorization of STMT_INFO. */
1173 tree
1174 vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
1175 gimple_stmt_iterator *gsi)
1177 gimple *init_stmt;
1178 tree new_temp;
1180 /* We abuse this function to push something to an SSA name with initial value 'val'. */
1181 if (! useless_type_conversion_p (type, TREE_TYPE (val)))
1183 gcc_assert (VECTOR_TYPE_P (type));
1184 if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
1186 /* Scalar boolean value should be transformed into
1187 all zeros or all ones value before building a vector. */
1188 if (VECTOR_BOOLEAN_TYPE_P (type))
1190 tree true_val = build_all_ones_cst (TREE_TYPE (type));
1191 tree false_val = build_zero_cst (TREE_TYPE (type));
1193 if (CONSTANT_CLASS_P (val))
1194 val = integer_zerop (val) ? false_val : true_val;
1195 else
1197 new_temp = make_ssa_name (TREE_TYPE (type));
1198 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
1199 val, true_val, false_val);
1200 vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
1201 val = new_temp;
1204 else
1206 gimple_seq stmts = NULL;
1207 if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
1208 val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
1209 TREE_TYPE (type), val);
1210 else
1211 /* ??? Condition vectorization expects us to do
1212 promotion of invariant/external defs. */
1213 val = gimple_convert (&stmts, TREE_TYPE (type), val);
1214 for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
1215 !gsi_end_p (gsi2); )
1217 init_stmt = gsi_stmt (gsi2);
1218 gsi_remove (&gsi2, false);
1219 vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
1223 val = build_vector_from_val (type, val);
1226 new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_");
1227 init_stmt = gimple_build_assign (new_temp, val);
1228 vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
1229 return new_temp;
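/* Sketch of the code generated above (GIMPLE shown informally with
   placeholder names): for a scalar VAL and a four-element vector TYPE
   the loop preheader receives roughly

     cst_1 = {val, val, val, val};

   and cst_1 is returned; a scalar boolean VAL is first canonicalised
   to all-zeros/all-ones (via a COND_EXPR when it is not a constant)
   before being broadcast.  */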
1233 /* Function vect_get_vec_defs_for_operand.
1235 OP is an operand in STMT_VINFO. This function returns a vector of
1236 NCOPIES defs that will be used in the vectorized stmts for STMT_VINFO.
1238 In the case that OP is an SSA_NAME which is defined in the loop, then
1239 STMT_VINFO_VEC_STMTS of the defining stmt holds the relevant defs.
1241 In case OP is an invariant or constant, a new stmt that creates a vector def
1242 needs to be introduced. VECTYPE may be used to specify a required type for
1243 vector invariant. */
1245 void
1246 vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
1247 unsigned ncopies,
1248 tree op, vec<tree> *vec_oprnds, tree vectype)
1250 gimple *def_stmt;
1251 enum vect_def_type dt;
1252 bool is_simple_use;
1253 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_NOTE, vect_location,
1257 "vect_get_vec_defs_for_operand: %T\n", op);
1259 stmt_vec_info def_stmt_info;
1260 is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt,
1261 &def_stmt_info, &def_stmt);
1262 gcc_assert (is_simple_use);
1263 if (def_stmt && dump_enabled_p ())
1264 dump_printf_loc (MSG_NOTE, vect_location, " def_stmt = %G", def_stmt);
1266 vec_oprnds->create (ncopies);
1267 if (dt == vect_constant_def || dt == vect_external_def)
1269 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1270 tree vector_type;
1272 if (vectype)
1273 vector_type = vectype;
1274 else if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
1275 && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
1276 vector_type = truth_type_for (stmt_vectype);
1277 else
1278 vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));
1280 gcc_assert (vector_type);
1281 tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
1282 while (ncopies--)
1283 vec_oprnds->quick_push (vop);
1285 else
1287 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
1288 gcc_assert (STMT_VINFO_VEC_STMTS (def_stmt_info).length () == ncopies);
1289 for (unsigned i = 0; i < ncopies; ++i)
1290 vec_oprnds->quick_push (gimple_get_lhs
1291 (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]));
1296 /* Get vectorized definitions for operands OP0 to OP3 (each may be NULL). */
1298 void
1299 vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node,
1300 unsigned ncopies,
1301 tree op0, tree vectype0, vec<tree> *vec_oprnds0,
1302 tree op1, tree vectype1, vec<tree> *vec_oprnds1,
1303 tree op2, tree vectype2, vec<tree> *vec_oprnds2,
1304 tree op3, tree vectype3, vec<tree> *vec_oprnds3)
1306 if (slp_node)
1308 if (op0)
1309 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_oprnds0);
1310 if (op1)
1311 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[1], vec_oprnds1);
1312 if (op2)
1313 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[2], vec_oprnds2);
1314 if (op3)
1315 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[3], vec_oprnds3);
1317 else
1319 if (op0)
1320 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
1321 op0, vec_oprnds0, vectype0);
1322 if (op1)
1323 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
1324 op1, vec_oprnds1, vectype1);
1325 if (op2)
1326 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
1327 op2, vec_oprnds2, vectype2);
1328 if (op3)
1329 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
1330 op3, vec_oprnds3, vectype3);
1334 void
1335 vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node,
1336 unsigned ncopies,
1337 tree op0, vec<tree> *vec_oprnds0,
1338 tree op1, vec<tree> *vec_oprnds1,
1339 tree op2, vec<tree> *vec_oprnds2,
1340 tree op3, vec<tree> *vec_oprnds3)
1342 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
1343 op0, NULL_TREE, vec_oprnds0,
1344 op1, NULL_TREE, vec_oprnds1,
1345 op2, NULL_TREE, vec_oprnds2,
1346 op3, NULL_TREE, vec_oprnds3);
1349 /* Helper function called by vect_finish_replace_stmt and
1350 vect_finish_stmt_generation. Set the location of the new
1351 statement and create and return a stmt_vec_info for it. */
1353 static void
1354 vect_finish_stmt_generation_1 (vec_info *,
1355 stmt_vec_info stmt_info, gimple *vec_stmt)
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt);
1360 if (stmt_info)
1362 gimple_set_location (vec_stmt, gimple_location (stmt_info->stmt));
1364 /* While EH edges will generally prevent vectorization, stmt might
1365 e.g. be in a must-not-throw region. Ensure newly created stmts
1366 that could throw are part of the same region. */
1367 int lp_nr = lookup_stmt_eh_lp (stmt_info->stmt);
1368 if (lp_nr != 0 && stmt_could_throw_p (cfun, vec_stmt))
1369 add_stmt_to_eh_lp (vec_stmt, lp_nr);
1371 else
1372 gcc_assert (!stmt_could_throw_p (cfun, vec_stmt));
1375 /* Replace the scalar statement STMT_INFO with a new vector statement VEC_STMT,
1376 which sets the same scalar result as STMT_INFO did. Create and return a
1377 stmt_vec_info for VEC_STMT. */
1379 void
1380 vect_finish_replace_stmt (vec_info *vinfo,
1381 stmt_vec_info stmt_info, gimple *vec_stmt)
1383 gimple *scalar_stmt = vect_orig_stmt (stmt_info)->stmt;
1384 gcc_assert (gimple_get_lhs (scalar_stmt) == gimple_get_lhs (vec_stmt));
1386 gimple_stmt_iterator gsi = gsi_for_stmt (scalar_stmt);
1387 gsi_replace (&gsi, vec_stmt, true);
1389 vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
1392 /* Add VEC_STMT to the vectorized implementation of STMT_INFO and insert it
1393 before *GSI. Create and return a stmt_vec_info for VEC_STMT. */
1395 void
1396 vect_finish_stmt_generation (vec_info *vinfo,
1397 stmt_vec_info stmt_info, gimple *vec_stmt,
1398 gimple_stmt_iterator *gsi)
1400 gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);
1402 if (!gsi_end_p (*gsi)
1403 && gimple_has_mem_ops (vec_stmt))
1405 gimple *at_stmt = gsi_stmt (*gsi);
1406 tree vuse = gimple_vuse (at_stmt);
1407 if (vuse && TREE_CODE (vuse) == SSA_NAME)
1409 tree vdef = gimple_vdef (at_stmt);
1410 gimple_set_vuse (vec_stmt, gimple_vuse (at_stmt));
1411 gimple_set_modified (vec_stmt, true);
1412 /* If we have an SSA vuse and insert a store, update virtual
1413 SSA form to avoid triggering the renamer. Do so only
1414 if we can easily see all uses - which is what almost always
1415 happens with the way vectorized stmts are inserted. */
1416 if ((vdef && TREE_CODE (vdef) == SSA_NAME)
1417 && ((is_gimple_assign (vec_stmt)
1418 && !is_gimple_reg (gimple_assign_lhs (vec_stmt)))
1419 || (is_gimple_call (vec_stmt)
1420 && (!(gimple_call_flags (vec_stmt)
1421 & (ECF_CONST|ECF_PURE|ECF_NOVOPS))
1422 || (gimple_call_lhs (vec_stmt)
1423 && !is_gimple_reg (gimple_call_lhs (vec_stmt)))))))
1425 tree new_vdef = copy_ssa_name (vuse, vec_stmt);
1426 gimple_set_vdef (vec_stmt, new_vdef);
1427 SET_USE (gimple_vuse_op (at_stmt), new_vdef);
1431 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
1432 vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
1435 /* We want to vectorize a call to combined function CFN with function
1436 decl FNDECL, using VECTYPE_OUT as the type of the output and VECTYPE_IN
1437 as the types of all inputs. Check whether this is possible using
1438 an internal function, returning its code if so or IFN_LAST if not. */
1440 static internal_fn
1441 vectorizable_internal_function (combined_fn cfn, tree fndecl,
1442 tree vectype_out, tree vectype_in)
1444 internal_fn ifn;
1445 if (internal_fn_p (cfn))
1446 ifn = as_internal_fn (cfn);
1447 else
1448 ifn = associated_internal_fn (fndecl);
1449 if (ifn != IFN_LAST && direct_internal_fn_p (ifn))
1451 const direct_internal_fn_info &info = direct_internal_fn (ifn);
1452 if (info.vectorizable)
1454 bool same_size_p = TYPE_SIZE (vectype_in) == TYPE_SIZE (vectype_out);
1455 tree type0 = (info.type0 < 0 ? vectype_out : vectype_in);
1456 tree type1 = (info.type1 < 0 ? vectype_out : vectype_in);
1458 /* The type sizes of vectype_in and vectype_out should be exactly
1459 the same when vectype_out does not participate in the optab query;
1460 there is no restriction on the type size when vectype_out is part
1461 of the optab query. */
1462 if (type0 != vectype_out && type1 != vectype_out && !same_size_p)
1463 return IFN_LAST;
1465 if (direct_internal_fn_supported_p (ifn, tree_pair (type0, type1),
1466 OPTIMIZE_FOR_SPEED))
1467 return ifn;
1470 return IFN_LAST;
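/* Hedged example of the mapping above: a call to sqrt (CFN_SQRT) with
   a V2DF result and V2DF input would be queried as IFN_SQRT via
   direct_internal_fn_supported_p on the pair (V2DF, V2DF); if the
   target has no matching optab the answer is IFN_LAST and the caller
   must fall back to another strategy.  */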
1474 static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
1475 gimple_stmt_iterator *);
1477 /* Check whether a load or store statement in the loop described by
1478 LOOP_VINFO is possible in a loop using partial vectors. This is
1479 testing whether the vectorizer pass has the appropriate support,
1480 as well as whether the target does.
1482 VLS_TYPE says whether the statement is a load or store and VECTYPE
1483 is the type of the vector being loaded or stored. SLP_NODE is the SLP
1484 node that contains the statement, or null if none. MEMORY_ACCESS_TYPE
1485 says how the load or store is going to be implemented and GROUP_SIZE
1486 is the number of load or store statements in the containing group.
1487 If the access is a gather load or scatter store, GS_INFO describes
1488 its arguments. If the load or store is conditional, SCALAR_MASK is the
1489 condition under which it occurs.
1491 Clear LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P if a loop using partial
1492 vectors is not supported, otherwise record the required rgroup control
1493 types. */
1495 static void
1496 check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
1497 slp_tree slp_node,
1498 vec_load_store_type vls_type,
1499 int group_size,
1500 vect_memory_access_type
1501 memory_access_type,
1502 gather_scatter_info *gs_info,
1503 tree scalar_mask)
1505 /* Invariant loads need no special support. */
1506 if (memory_access_type == VMAT_INVARIANT)
1507 return;
1509 unsigned int nvectors;
1510 if (slp_node)
1511 nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1512 else
1513 nvectors = vect_get_num_copies (loop_vinfo, vectype);
1515 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1516 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
1517 machine_mode vecmode = TYPE_MODE (vectype);
1518 bool is_load = (vls_type == VLS_LOAD);
1519 if (memory_access_type == VMAT_LOAD_STORE_LANES)
1521 internal_fn ifn
1522 = (is_load ? vect_load_lanes_supported (vectype, group_size, true)
1523 : vect_store_lanes_supported (vectype, group_size, true));
1524 if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
1525 vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
1526 else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
1527 vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
1528 scalar_mask);
1529 else
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "can't operate on partial vectors because"
1534 " the target doesn't have an appropriate"
1535 " load/store-lanes instruction.\n");
1536 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1538 return;
1541 if (memory_access_type == VMAT_GATHER_SCATTER)
1543 internal_fn ifn = (is_load
1544 ? IFN_MASK_GATHER_LOAD
1545 : IFN_MASK_SCATTER_STORE);
1546 internal_fn len_ifn = (is_load
1547 ? IFN_MASK_LEN_GATHER_LOAD
1548 : IFN_MASK_LEN_SCATTER_STORE);
1549 if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
1550 gs_info->memory_type,
1551 gs_info->offset_vectype,
1552 gs_info->scale))
1553 vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
1554 else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
1555 gs_info->memory_type,
1556 gs_info->offset_vectype,
1557 gs_info->scale))
1558 vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
1559 scalar_mask);
1560 else
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1564 "can't operate on partial vectors because"
1565 " the target doesn't have an appropriate"
1566 " gather load or scatter store instruction.\n");
1567 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1569 return;
1572 if (memory_access_type != VMAT_CONTIGUOUS
1573 && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
1575 /* Element X of the data must come from iteration i * VF + X of the
1576 scalar loop. We need more work to support other mappings. */
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "can't operate on partial vectors because an"
1580 " access isn't contiguous.\n");
1581 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1582 return;
1585 if (!VECTOR_MODE_P (vecmode))
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589 "can't operate on partial vectors when emulating"
1590 " vector operations.\n");
1591 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1592 return;
1595 /* We might load more scalars than we need for permuting SLP loads.
1596 We checked in get_group_load_store_type that the extra elements
1597 don't leak into a new vector. */
1598 auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits)
1600 unsigned int nvectors;
1601 if (can_div_away_from_zero_p (size, nunits, &nvectors))
1602 return nvectors;
1603 gcc_unreachable ();
1606 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1607 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1608 machine_mode mask_mode;
1609 machine_mode vmode;
1610 bool using_partial_vectors_p = false;
1611 if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
1613 nvectors = group_memory_nvectors (group_size * vf, nunits);
1614 unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
1615 vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
1616 using_partial_vectors_p = true;
1618 else if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
1619 && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
1621 nvectors = group_memory_nvectors (group_size * vf, nunits);
1622 vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
1623 using_partial_vectors_p = true;
1626 if (!using_partial_vectors_p)
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "can't operate on partial vectors because the"
1631 " target doesn't have the appropriate partial"
1632 " vectorization load or store.\n");
1633 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1637 /* Return the mask input to a masked load or store. VEC_MASK is the vectorized
1638 form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
1639 that needs to be applied to all loads and stores in a vectorized loop.
1640 Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
1641 otherwise return VEC_MASK & LOOP_MASK.
1643 MASK_TYPE is the type of both masks. If new statements are needed,
1644 insert them before GSI. */
1646 static tree
1647 prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type, tree loop_mask,
1648 tree vec_mask, gimple_stmt_iterator *gsi)
1650 gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
1651 if (!loop_mask)
1652 return vec_mask;
1654 gcc_assert (TREE_TYPE (loop_mask) == mask_type);
1656 if (loop_vinfo->vec_cond_masked_set.contains ({ vec_mask, loop_mask }))
1657 return vec_mask;
1659 tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
1660 gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
1661 vec_mask, loop_mask);
1663 gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
1664 return and_res;
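/* Minimal caller-side sketch (loop_mask and vec_mask are assumed to be
   supplied by the caller): a masked load or store computes its final
   mask as

     tree final_mask = prepare_vec_mask (loop_vinfo, mask_type,
                                         loop_mask, vec_mask, gsi);

   which emits final_mask = vec_mask & loop_mask before GSI, unless
   vec_mask is already known to be masked by loop_mask (recorded in
   vec_cond_masked_set), in which case it is returned unchanged.  */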
1667 /* Determine whether we can use a gather load or scatter store to vectorize
1668 strided load or store STMT_INFO by truncating the current offset to a
1669 smaller width. We need to be able to construct an offset vector:
1671 { 0, X, X*2, X*3, ... }
1673 without loss of precision, where X is STMT_INFO's DR_STEP.
1675 Return true if this is possible, describing the gather load or scatter
1676 store in GS_INFO. MASKED_P is true if the load or store is conditional. */
1678 static bool
1679 vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
1680 loop_vec_info loop_vinfo, bool masked_p,
1681 gather_scatter_info *gs_info)
1683 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1684 data_reference *dr = dr_info->dr;
1685 tree step = DR_STEP (dr);
1686 if (TREE_CODE (step) != INTEGER_CST)
1688 /* ??? Perhaps we could use range information here? */
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_NOTE, vect_location,
1691 "cannot truncate variable step.\n");
1692 return false;
1695 /* Get the number of bits in an element. */
1696 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1697 scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
1698 unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
1700 /* Set COUNT to the upper limit on the number of elements - 1.
1701 Start with the maximum vectorization factor. */
1702 unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;
1704 /* Try lowering COUNT to the number of scalar latch iterations. */
1705 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1706 widest_int max_iters;
1707 if (max_loop_iterations (loop, &max_iters)
1708 && max_iters < count)
1709 count = max_iters.to_shwi ();
1711 /* Try scales of 1 and the element size. */
1712 int scales[] = { 1, vect_get_scalar_dr_size (dr_info) };
1713 wi::overflow_type overflow = wi::OVF_NONE;
1714 for (int i = 0; i < 2; ++i)
1716 int scale = scales[i];
1717 widest_int factor;
1718 if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
1719 continue;
1721 /* Determine the minimum precision of (COUNT - 1) * STEP / SCALE. */
1722 widest_int range = wi::mul (count, factor, SIGNED, &overflow);
1723 if (overflow)
1724 continue;
1725 signop sign = range >= 0 ? UNSIGNED : SIGNED;
1726 unsigned int min_offset_bits = wi::min_precision (range, sign);
1728 /* Find the narrowest viable offset type. */
1729 unsigned int offset_bits = 1U << ceil_log2 (min_offset_bits);
1730 tree offset_type = build_nonstandard_integer_type (offset_bits,
1731 sign == UNSIGNED);
1733 /* See whether the target supports the operation with an offset
1734 no narrower than OFFSET_TYPE. */
1735 tree memory_type = TREE_TYPE (DR_REF (dr));
1736 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
1737 vectype, memory_type, offset_type, scale,
1738 &gs_info->ifn, &gs_info->offset_vectype)
1739 || gs_info->ifn == IFN_LAST)
1740 continue;
1742 gs_info->decl = NULL_TREE;
1743 /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
1744 but we don't need to store that here. */
1745 gs_info->base = NULL_TREE;
1746 gs_info->element_type = TREE_TYPE (vectype);
1747 gs_info->offset = fold_convert (offset_type, step);
1748 gs_info->offset_dt = vect_constant_def;
1749 gs_info->scale = scale;
1750 gs_info->memory_type = memory_type;
1751 return true;
1754 if (overflow && dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "truncating gather/scatter offset to %d bits"
1757 " might change its value.\n", element_bits);
1759 return false;
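/* As a rough illustration of the search above (numbers invented for the
   example, not taken from any particular target): with DR_STEP == 4, at
   most 255 scalar latch iterations and a maximum VF that does not impose
   a smaller limit, trying SCALE == 4 gives FACTOR == 1, RANGE == 255 and
   MIN_OFFSET_BITS == 8, so vect_gather_scatter_fn_p is asked whether the
   target supports a gather/scatter variant whose offsets are 8-bit
   unsigned values scaled by 4.  */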
1762 /* Return true if we can use gather/scatter internal functions to
1763 vectorize STMT_INFO, which is a grouped or strided load or store.
1764    MASKED_P is true if the load or store is conditional.  When returning
1765 true, fill in GS_INFO with the information required to perform the
1766 operation. */
1768 static bool
1769 vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
1770 loop_vec_info loop_vinfo, bool masked_p,
1771 gather_scatter_info *gs_info)
1773 if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info)
1774 || gs_info->ifn == IFN_LAST)
1775 return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
1776 masked_p, gs_info);
1778 tree old_offset_type = TREE_TYPE (gs_info->offset);
1779 tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
1781 gcc_assert (TYPE_PRECISION (new_offset_type)
1782 >= TYPE_PRECISION (old_offset_type));
1783 gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_NOTE, vect_location,
1787 "using gather/scatter for strided/grouped access,"
1788 " scale = %d\n", gs_info->scale);
1790 return true;
1793 /* STMT_INFO is a non-strided load or store, meaning that it accesses
1794 elements with a known constant step. Return -1 if that step
1795 is negative, 0 if it is zero, and 1 if it is greater than zero. */
1797 static int
1798 compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
1800 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1801 return tree_int_cst_compare (vect_dr_behavior (vinfo, dr_info)->step,
1802 size_zero_node);
1805 /* If the target supports a permute mask that reverses the elements in
1806 a vector of type VECTYPE, return that mask, otherwise return null. */
1808 tree
1809 perm_mask_for_reverse (tree vectype)
1811 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1813 /* The encoding has a single stepped pattern. */
1814 vec_perm_builder sel (nunits, 1, 3);
1815 for (int i = 0; i < 3; ++i)
1816 sel.quick_push (nunits - 1 - i);
1818 vec_perm_indices indices (sel, 1, nunits);
1819 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
1820 indices))
1821 return NULL_TREE;
1822 return vect_gen_perm_mask_checked (vectype, indices);
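/* For instance, with a four-element vector the single stepped pattern
   built above encodes { 3, 2, 1 }; its implicit step of -1 extends it to
   the full reversal selector { 3, 2, 1, 0 }.  With eight elements the
   same encoding yields { 7, 6, 5, 4, 3, 2, 1, 0 }.  */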
1825 /* A subroutine of get_load_store_type, with a subset of the same
1826 arguments. Handle the case where STMT_INFO is a load or store that
1827 accesses consecutive elements with a negative step. Sets *POFFSET
1828 to the offset to be applied to the DR for the first access. */
1830 static vect_memory_access_type
1831 get_negative_load_store_type (vec_info *vinfo,
1832 stmt_vec_info stmt_info, tree vectype,
1833 vec_load_store_type vls_type,
1834 unsigned int ncopies, poly_int64 *poffset)
1836 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1837 dr_alignment_support alignment_support_scheme;
1839 if (ncopies > 1)
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "multiple types with negative step.\n");
1844 return VMAT_ELEMENTWISE;
1847 /* For backward running DRs the first access in vectype actually is
1848 N-1 elements before the address of the DR. */
1849 *poffset = ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1850 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
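  /* For example, with V4SI (four 4-byte elements) this gives
     *POFFSET = (-4 + 1) * 4 = -12: the first vector access covers the DR
     element and the three elements before it, and the lanes are reversed
     afterwards unless the stored value is invariant.  */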
1852 int misalignment = dr_misalignment (dr_info, vectype, *poffset);
1853 alignment_support_scheme
1854 = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);
1855 if (alignment_support_scheme != dr_aligned
1856 && alignment_support_scheme != dr_unaligned_supported)
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "negative step but alignment required.\n");
1861 *poffset = 0;
1862 return VMAT_ELEMENTWISE;
1865 if (vls_type == VLS_STORE_INVARIANT)
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_NOTE, vect_location,
1869 "negative step with invariant source;"
1870 " no permute needed.\n");
1871 return VMAT_CONTIGUOUS_DOWN;
1874 if (!perm_mask_for_reverse (vectype))
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 "negative step and reversing not supported.\n");
1879 *poffset = 0;
1880 return VMAT_ELEMENTWISE;
1883 return VMAT_CONTIGUOUS_REVERSE;
1886 /* STMT_INFO is either a masked or unconditional store. Return the value
1887 being stored. */
1889 tree
1890 vect_get_store_rhs (stmt_vec_info stmt_info)
1892 if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
1894 gcc_assert (gimple_assign_single_p (assign));
1895 return gimple_assign_rhs1 (assign);
1897 if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
1899 internal_fn ifn = gimple_call_internal_fn (call);
1900 int index = internal_fn_stored_value_index (ifn);
1901 gcc_assert (index >= 0);
1902 return gimple_call_arg (call, index);
1904 gcc_unreachable ();
1907 /* Function VECTOR_VECTOR_COMPOSITION_TYPE
1909    This function returns a vector type which can be composed from NELTS pieces,
1910    whose type is recorded in PTYPE.  VTYPE should be a vector type with the
1911    same vector size as the returned vector.  It first checks whether the target
1912    supports a pieces-sized vector mode for the construction and, if not, then
1913    checks a pieces-sized scalar mode.  It returns NULL_TREE if no suitable
1914    composition can be found.
1916 For example, for (vtype=V16QI, nelts=4), we can probably get:
1917 - V16QI with PTYPE V4QI.
1918 - V4SI with PTYPE SI.
1919 - NULL_TREE. */
1921 static tree
1922 vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
1924 gcc_assert (VECTOR_TYPE_P (vtype));
1925 gcc_assert (known_gt (nelts, 0U));
1927 machine_mode vmode = TYPE_MODE (vtype);
1928 if (!VECTOR_MODE_P (vmode))
1929 return NULL_TREE;
1931   /* When we are asked to compose the vector from its components, let
1932      that happen directly.  */
1933 if (known_eq (TYPE_VECTOR_SUBPARTS (vtype), nelts))
1935 *ptype = TREE_TYPE (vtype);
1936 return vtype;
1939 poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
1940 unsigned int pbsize;
1941 if (constant_multiple_p (vbsize, nelts, &pbsize))
1943 /* First check if vec_init optab supports construction from
1944 vector pieces directly. */
1945 scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
1946 poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
1947 machine_mode rmode;
1948 if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
1949 && (convert_optab_handler (vec_init_optab, vmode, rmode)
1950 != CODE_FOR_nothing))
1952 *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
1953 return vtype;
1956      /* Otherwise check whether an integer type of the same piece size exists
1957         and whether the vec_init optab supports construction from it directly.  */
1958 if (int_mode_for_size (pbsize, 0).exists (&elmode)
1959 && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
1960 && (convert_optab_handler (vec_init_optab, rmode, elmode)
1961 != CODE_FOR_nothing))
1963 *ptype = build_nonstandard_integer_type (pbsize, 1);
1964 return build_vector_type (*ptype, nelts);
1968 return NULL_TREE;
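/* A further illustration, relevant to the half-vector case used by
   get_group_load_store_type below: vector_vector_composition_type
   (V8HI, 2, &ptype) would probably return either V8HI with PTYPE V4HI
   (if the target can build a V8HI from two V4HI pieces) or V2DI with
   PTYPE a 64-bit integer type (if it can build a V2DI from two DImode
   pieces), and NULL_TREE if neither construction is supported.  */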
1971 /* A subroutine of get_load_store_type, with a subset of the same
1972 arguments. Handle the case where STMT_INFO is part of a grouped load
1973 or store.
1975 For stores, the statements in the group are all consecutive
1976 and there is no gap at the end. For loads, the statements in the
1977 group might not be consecutive; there can be gaps between statements
1978 as well as at the end. */
1980 static bool
1981 get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
1982 tree vectype, slp_tree slp_node,
1983 bool masked_p, vec_load_store_type vls_type,
1984 vect_memory_access_type *memory_access_type,
1985 poly_int64 *poffset,
1986 dr_alignment_support *alignment_support_scheme,
1987 int *misalignment,
1988 gather_scatter_info *gs_info,
1989 internal_fn *lanes_ifn)
1991 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1992 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1993 stmt_vec_info first_stmt_info;
1994 unsigned int group_size;
1995 unsigned HOST_WIDE_INT gap;
1996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1998 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1999 group_size = DR_GROUP_SIZE (first_stmt_info);
2000 gap = DR_GROUP_GAP (first_stmt_info);
2002 else
2004 first_stmt_info = stmt_info;
2005 group_size = 1;
2006 gap = 0;
2008 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2009 bool single_element_p = (stmt_info == first_stmt_info
2010 && !DR_GROUP_NEXT_ELEMENT (stmt_info));
2011 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2013 /* True if the vectorized statements would access beyond the last
2014 statement in the group. */
2015 bool overrun_p = false;
2017 /* True if we can cope with such overrun by peeling for gaps, so that
2018 there is at least one final scalar iteration after the vector loop. */
2019 bool can_overrun_p = (!masked_p
2020 && vls_type == VLS_LOAD
2021 && loop_vinfo
2022 && !loop->inner);
2024 /* There can only be a gap at the end of the group if the stride is
2025 known at compile time. */
2026 gcc_assert (!STMT_VINFO_STRIDED_P (first_stmt_info) || gap == 0);
2028 /* Stores can't yet have gaps. */
2029 gcc_assert (slp_node || vls_type == VLS_LOAD || gap == 0);
2031 if (slp_node)
2033 /* For SLP vectorization we directly vectorize a subchain
2034 without permutation. */
2035 if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
2036 first_dr_info
2037 = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
2038 if (STMT_VINFO_STRIDED_P (first_stmt_info))
2040 /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
2041 separated by the stride, until we have a complete vector.
2042 Fall back to scalar accesses if that isn't possible. */
2043 if (multiple_p (nunits, group_size))
2044 *memory_access_type = VMAT_STRIDED_SLP;
2045 else
2046 *memory_access_type = VMAT_ELEMENTWISE;
2048 else
2050 overrun_p = loop_vinfo && gap != 0;
2051 if (overrun_p && vls_type != VLS_LOAD)
2053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2054 "Grouped store with gaps requires"
2055 " non-consecutive accesses\n");
2056 return false;
2058 /* An overrun is fine if the trailing elements are smaller
2059 than the alignment boundary B. Every vector access will
2060 be a multiple of B and so we are guaranteed to access a
2061 non-gap element in the same B-sized block. */
2062 if (overrun_p
2063 && gap < (vect_known_alignment_in_bytes (first_dr_info,
2064 vectype)
2065 / vect_get_scalar_dr_size (first_dr_info)))
2066 overrun_p = false;
2068 /* If the gap splits the vector in half and the target
2069 can do half-vector operations avoid the epilogue peeling
2070 by simply loading half of the vector only. Usually
2071 the construction with an upper zero half will be elided. */
2072 dr_alignment_support alss;
2073 int misalign = dr_misalignment (first_dr_info, vectype);
2074 tree half_vtype;
2075 if (overrun_p
2076 && !masked_p
2077 && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
2078 vectype, misalign)))
2079 == dr_aligned
2080 || alss == dr_unaligned_supported)
2081 && known_eq (nunits, (group_size - gap) * 2)
2082 && known_eq (nunits, group_size)
2083 && (vector_vector_composition_type (vectype, 2, &half_vtype)
2084 != NULL_TREE))
2085 overrun_p = false;
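	  /* For instance, with a group of size 4, a trailing gap of 2 and a
	     4-lane vector, only the first two lanes are live, so the access
	     can be done as a single half-vector (see
	     vector_vector_composition_type above) and no epilogue peeling
	     for gaps is needed.  */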
2087 if (overrun_p && !can_overrun_p)
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091 "Peeling for outer loop is not supported\n");
2092 return false;
2094 int cmp = compare_step_with_zero (vinfo, stmt_info);
2095 if (cmp < 0)
2097 if (single_element_p)
2098 /* ??? The VMAT_CONTIGUOUS_REVERSE code generation is
2099 only correct for single element "interleaving" SLP. */
2100 *memory_access_type = get_negative_load_store_type
2101 (vinfo, stmt_info, vectype, vls_type, 1, poffset);
2102 else
2104 /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
2105 separated by the stride, until we have a complete vector.
2106 Fall back to scalar accesses if that isn't possible. */
2107 if (multiple_p (nunits, group_size))
2108 *memory_access_type = VMAT_STRIDED_SLP;
2109 else
2110 *memory_access_type = VMAT_ELEMENTWISE;
2113 else if (cmp == 0 && loop_vinfo)
2115 gcc_assert (vls_type == VLS_LOAD);
2116 *memory_access_type = VMAT_INVARIANT;
2117 /* Invariant accesses perform only component accesses, alignment
2118 is irrelevant for them. */
2119 *alignment_support_scheme = dr_unaligned_supported;
2121 else
2122 *memory_access_type = VMAT_CONTIGUOUS;
2124 /* When we have a contiguous access across loop iterations
2125 but the access in the loop doesn't cover the full vector
2126 we can end up with no gap recorded but still excess
2127 elements accessed, see PR103116. Make sure we peel for
2128 gaps if necessary and sufficient and give up if not.
2130 If there is a combination of the access not covering the full
2131 vector and a gap recorded then we may need to peel twice. */
2132 if (loop_vinfo
2133 && *memory_access_type == VMAT_CONTIGUOUS
2134 && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2135 && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2136 nunits))
2138 unsigned HOST_WIDE_INT cnunits, cvf;
2139 if (!can_overrun_p
2140 || !nunits.is_constant (&cnunits)
2141 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
2142 /* Peeling for gaps assumes that a single scalar iteration
2143 is enough to make sure the last vector iteration doesn't
2144 access excess elements.
2145 ??? Enhancements include peeling multiple iterations
2146 or using masked loads with a static mask. */
2147 || (group_size * cvf) % cnunits + group_size - gap < cnunits)
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "peeling for gaps insufficient for "
2152 "access\n");
2153 return false;
2155 overrun_p = true;
2159 else
2161 /* We can always handle this case using elementwise accesses,
2162 but see if something more efficient is available. */
2163 *memory_access_type = VMAT_ELEMENTWISE;
2165 /* If there is a gap at the end of the group then these optimizations
2166 would access excess elements in the last iteration. */
2167 bool would_overrun_p = (gap != 0);
2168 /* An overrun is fine if the trailing elements are smaller than the
2169 alignment boundary B. Every vector access will be a multiple of B
2170 and so we are guaranteed to access a non-gap element in the
2171 same B-sized block. */
2172 if (would_overrun_p
2173 && !masked_p
2174 && gap < (vect_known_alignment_in_bytes (first_dr_info, vectype)
2175 / vect_get_scalar_dr_size (first_dr_info)))
2176 would_overrun_p = false;
2178 if (!STMT_VINFO_STRIDED_P (first_stmt_info)
2179 && (can_overrun_p || !would_overrun_p)
2180 && compare_step_with_zero (vinfo, stmt_info) > 0)
2182 /* First cope with the degenerate case of a single-element
2183 vector. */
2184 if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
2187 else
2189 /* Otherwise try using LOAD/STORE_LANES. */
2190 *lanes_ifn
2191 = vls_type == VLS_LOAD
2192 ? vect_load_lanes_supported (vectype, group_size, masked_p)
2193 : vect_store_lanes_supported (vectype, group_size,
2194 masked_p);
2195 if (*lanes_ifn != IFN_LAST)
2197 *memory_access_type = VMAT_LOAD_STORE_LANES;
2198 overrun_p = would_overrun_p;
2201 /* If that fails, try using permuting loads. */
2202 else if (vls_type == VLS_LOAD
2203 ? vect_grouped_load_supported (vectype,
2204 single_element_p,
2205 group_size)
2206 : vect_grouped_store_supported (vectype, group_size))
2208 *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
2209 overrun_p = would_overrun_p;
2214   /* As a last resort, try using a gather load or scatter store.
2216 ??? Although the code can handle all group sizes correctly,
2217 it probably isn't a win to use separate strided accesses based
2218 on nearby locations. Or, even if it's a win over scalar code,
2219 it might not be a win over vectorizing at a lower VF, if that
2220 allows us to use contiguous accesses. */
2221 if (*memory_access_type == VMAT_ELEMENTWISE
2222 && single_element_p
2223 && loop_vinfo
2224 && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
2225 masked_p, gs_info))
2226 *memory_access_type = VMAT_GATHER_SCATTER;
2229 if (*memory_access_type == VMAT_GATHER_SCATTER
2230 || *memory_access_type == VMAT_ELEMENTWISE)
2232 *alignment_support_scheme = dr_unaligned_supported;
2233 *misalignment = DR_MISALIGNMENT_UNKNOWN;
2235 else
2237 *misalignment = dr_misalignment (first_dr_info, vectype, *poffset);
2238 *alignment_support_scheme
2239 = vect_supportable_dr_alignment (vinfo, first_dr_info, vectype,
2240 *misalignment);
2243 if (vls_type != VLS_LOAD && first_stmt_info == stmt_info)
2245 /* STMT is the leader of the group. Check the operands of all the
2246 stmts of the group. */
2247 stmt_vec_info next_stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2248 while (next_stmt_info)
2250 tree op = vect_get_store_rhs (next_stmt_info);
2251 enum vect_def_type dt;
2252 if (!vect_is_simple_use (op, vinfo, &dt))
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "use not simple.\n");
2257 return false;
2259 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
2263 if (overrun_p)
2265 gcc_assert (can_overrun_p);
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 "Data access with gaps requires scalar "
2269 "epilogue loop\n");
2270 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2273 return true;
2276 /* Analyze load or store statement STMT_INFO of type VLS_TYPE. Return true
2277 if there is a memory access type that the vectorized form can use,
2278 storing it in *MEMORY_ACCESS_TYPE if so. If we decide to use gathers
2279 or scatters, fill in GS_INFO accordingly. In addition
2280 *ALIGNMENT_SUPPORT_SCHEME is filled out and false is returned if
2281 the target does not support the alignment scheme. *MISALIGNMENT
2282 is set according to the alignment of the access (including
2283 DR_MISALIGNMENT_UNKNOWN when it is unknown).
2285 SLP says whether we're performing SLP rather than loop vectorization.
2286 MASKED_P is true if the statement is conditional on a vectorized mask.
2287 VECTYPE is the vector type that the vectorized statements will use.
2288 NCOPIES is the number of vector statements that will be needed. */
2290 static bool
2291 get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
2292 tree vectype, slp_tree slp_node,
2293 bool masked_p, vec_load_store_type vls_type,
2294 unsigned int ncopies,
2295 vect_memory_access_type *memory_access_type,
2296 poly_int64 *poffset,
2297 dr_alignment_support *alignment_support_scheme,
2298 int *misalignment,
2299 gather_scatter_info *gs_info,
2300 internal_fn *lanes_ifn)
2302 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2303 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2304 *misalignment = DR_MISALIGNMENT_UNKNOWN;
2305 *poffset = 0;
2306 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2308 *memory_access_type = VMAT_GATHER_SCATTER;
2309 if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info))
2310 gcc_unreachable ();
2311 /* When using internal functions, we rely on pattern recognition
2312 to convert the type of the offset to the type that the target
2313 requires, with the result being a call to an internal function.
2314 If that failed for some reason (e.g. because another pattern
2315 took priority), just handle cases in which the offset already
2316 has the right type. */
2317 else if (gs_info->ifn != IFN_LAST
2318 && !is_gimple_call (stmt_info->stmt)
2319 && !tree_nop_conversion_p (TREE_TYPE (gs_info->offset),
2320 TREE_TYPE (gs_info->offset_vectype)))
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2324 "%s offset requires a conversion\n",
2325 vls_type == VLS_LOAD ? "gather" : "scatter");
2326 return false;
2328 else if (!vect_is_simple_use (gs_info->offset, vinfo,
2329 &gs_info->offset_dt,
2330 &gs_info->offset_vectype))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "%s index use not simple.\n",
2335 vls_type == VLS_LOAD ? "gather" : "scatter");
2336 return false;
2338 else if (gs_info->ifn == IFN_LAST && !gs_info->decl)
2340 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
2341 || !TYPE_VECTOR_SUBPARTS (gs_info->offset_vectype).is_constant ()
2342 || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
2343 (gs_info->offset_vectype),
2344 TYPE_VECTOR_SUBPARTS (vectype)))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "unsupported vector types for emulated "
2349 "gather.\n");
2350 return false;
2353 /* Gather-scatter accesses perform only component accesses, alignment
2354 is irrelevant for them. */
2355 *alignment_support_scheme = dr_unaligned_supported;
2357 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node)
2359 if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node,
2360 masked_p,
2361 vls_type, memory_access_type, poffset,
2362 alignment_support_scheme,
2363 misalignment, gs_info, lanes_ifn))
2364 return false;
2366 else if (STMT_VINFO_STRIDED_P (stmt_info))
2368 gcc_assert (!slp_node);
2369 if (loop_vinfo
2370 && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
2371 masked_p, gs_info))
2372 *memory_access_type = VMAT_GATHER_SCATTER;
2373 else
2374 *memory_access_type = VMAT_ELEMENTWISE;
2375 /* Alignment is irrelevant here. */
2376 *alignment_support_scheme = dr_unaligned_supported;
2378 else
2380 int cmp = compare_step_with_zero (vinfo, stmt_info);
2381 if (cmp == 0)
2383 gcc_assert (vls_type == VLS_LOAD);
2384 *memory_access_type = VMAT_INVARIANT;
2385 /* Invariant accesses perform only component accesses, alignment
2386 is irrelevant for them. */
2387 *alignment_support_scheme = dr_unaligned_supported;
2389 else
2391 if (cmp < 0)
2392 *memory_access_type = get_negative_load_store_type
2393 (vinfo, stmt_info, vectype, vls_type, ncopies, poffset);
2394 else
2395 *memory_access_type = VMAT_CONTIGUOUS;
2396 *misalignment = dr_misalignment (STMT_VINFO_DR_INFO (stmt_info),
2397 vectype, *poffset);
2398 *alignment_support_scheme
2399 = vect_supportable_dr_alignment (vinfo,
2400 STMT_VINFO_DR_INFO (stmt_info),
2401 vectype, *misalignment);
2405 if ((*memory_access_type == VMAT_ELEMENTWISE
2406 || *memory_access_type == VMAT_STRIDED_SLP)
2407 && !nunits.is_constant ())
2409 if (dump_enabled_p ())
2410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2411 "Not using elementwise accesses due to variable "
2412 "vectorization factor.\n");
2413 return false;
2416 if (*alignment_support_scheme == dr_unaligned_unsupported)
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2420 "unsupported unaligned access\n");
2421 return false;
2424 /* FIXME: At the moment the cost model seems to underestimate the
2425 cost of using elementwise accesses. This check preserves the
2426 traditional behavior until that can be fixed. */
2427 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
2428 if (!first_stmt_info)
2429 first_stmt_info = stmt_info;
2430 if (*memory_access_type == VMAT_ELEMENTWISE
2431 && !STMT_VINFO_STRIDED_P (first_stmt_info)
2432 && !(stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info)
2433 && !DR_GROUP_NEXT_ELEMENT (stmt_info)
2434 && !pow2p_hwi (DR_GROUP_SIZE (stmt_info))))
2436 if (dump_enabled_p ())
2437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2438 "not falling back to elementwise accesses\n");
2439 return false;
2441 return true;
2444 /* Return true if boolean argument at MASK_INDEX is suitable for vectorizing
2445 conditional operation STMT_INFO. When returning true, store the mask
2446 in *MASK, the type of its definition in *MASK_DT_OUT, the type of the
2447 vectorized mask in *MASK_VECTYPE_OUT and the SLP node corresponding
2448 to the mask in *MASK_NODE if MASK_NODE is not NULL. */
2450 static bool
2451 vect_check_scalar_mask (vec_info *vinfo, stmt_vec_info stmt_info,
2452 slp_tree slp_node, unsigned mask_index,
2453 tree *mask, slp_tree *mask_node,
2454 vect_def_type *mask_dt_out, tree *mask_vectype_out)
2456 enum vect_def_type mask_dt;
2457 tree mask_vectype;
2458 slp_tree mask_node_1;
2459 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, mask_index,
2460 mask, &mask_node_1, &mask_dt, &mask_vectype))
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "mask use not simple.\n");
2465 return false;
2468 if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (*mask)))
2470 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2472 "mask argument is not a boolean.\n");
2473 return false;
2476 /* If the caller is not prepared for adjusting an external/constant
2477 SLP mask vector type fail. */
2478 if (slp_node
2479 && !mask_node
2480 && SLP_TREE_DEF_TYPE (mask_node_1) != vect_internal_def)
2482 if (dump_enabled_p ())
2483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2484 "SLP mask argument is not vectorized.\n");
2485 return false;
2488 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2489 if (!mask_vectype)
2490 mask_vectype = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (vectype),
2491 mask_node_1);
2493 if (!mask_vectype || !VECTOR_BOOLEAN_TYPE_P (mask_vectype))
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2497 "could not find an appropriate vector mask type.\n");
2498 return false;
2501 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_vectype),
2502 TYPE_VECTOR_SUBPARTS (vectype)))
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "vector mask type %T"
2507 " does not match vector data type %T.\n",
2508 mask_vectype, vectype);
2510 return false;
2513 *mask_dt_out = mask_dt;
2514 *mask_vectype_out = mask_vectype;
2515 if (mask_node)
2516 *mask_node = mask_node_1;
2517 return true;
2520 /* Return true if stored value is suitable for vectorizing store
2521 statement STMT_INFO. When returning true, store the scalar stored
2522 in *RHS and *RHS_NODE, the type of the definition in *RHS_DT_OUT,
2523 the type of the vectorized store value in
2524 *RHS_VECTYPE_OUT and the type of the store in *VLS_TYPE_OUT. */
2526 static bool
2527 vect_check_store_rhs (vec_info *vinfo, stmt_vec_info stmt_info,
2528 slp_tree slp_node, tree *rhs, slp_tree *rhs_node,
2529 vect_def_type *rhs_dt_out, tree *rhs_vectype_out,
2530 vec_load_store_type *vls_type_out)
2532 int op_no = 0;
2533 if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2535 if (gimple_call_internal_p (call)
2536 && internal_store_fn_p (gimple_call_internal_fn (call)))
2537 op_no = internal_fn_stored_value_index (gimple_call_internal_fn (call));
2539 if (slp_node)
2540 op_no = vect_slp_child_index_for_operand
2541 (stmt_info->stmt, op_no, STMT_VINFO_GATHER_SCATTER_P (stmt_info));
2543 enum vect_def_type rhs_dt;
2544 tree rhs_vectype;
2545 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, op_no,
2546 rhs, rhs_node, &rhs_dt, &rhs_vectype))
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2550 "use not simple.\n");
2551 return false;
2554 /* In the case this is a store from a constant make sure
2555 native_encode_expr can handle it. */
2556 if (CONSTANT_CLASS_P (*rhs) && native_encode_expr (*rhs, NULL, 64) == 0)
2558 if (dump_enabled_p ())
2559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2560 "cannot encode constant as a byte sequence.\n");
2561 return false;
2564 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2565 if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "incompatible vector types.\n");
2570 return false;
2573 *rhs_dt_out = rhs_dt;
2574 *rhs_vectype_out = rhs_vectype;
2575 if (rhs_dt == vect_constant_def || rhs_dt == vect_external_def)
2576 *vls_type_out = VLS_STORE_INVARIANT;
2577 else
2578 *vls_type_out = VLS_STORE;
2579 return true;
2582 /* Build an all-ones vector mask of type MASKTYPE while vectorizing STMT_INFO.
2583 Note that we support masks with floating-point type, in which case the
2584 floats are interpreted as a bitmask. */
2586 static tree
2587 vect_build_all_ones_mask (vec_info *vinfo,
2588 stmt_vec_info stmt_info, tree masktype)
2590 if (TREE_CODE (masktype) == INTEGER_TYPE)
2591 return build_int_cst (masktype, -1);
2592 else if (VECTOR_BOOLEAN_TYPE_P (masktype)
2593 || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
2595 tree mask = build_int_cst (TREE_TYPE (masktype), -1);
2596 mask = build_vector_from_val (masktype, mask);
2597 return vect_init_vector (vinfo, stmt_info, mask, masktype, NULL);
2599 else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
2601 REAL_VALUE_TYPE r;
2602 long tmp[6];
2603 for (int j = 0; j < 6; ++j)
2604 tmp[j] = -1;
2605 real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
2606 tree mask = build_real (TREE_TYPE (masktype), r);
2607 mask = build_vector_from_val (masktype, mask);
2608 return vect_init_vector (vinfo, stmt_info, mask, masktype, NULL);
2610 gcc_unreachable ();
2613 /* Build an all-zero merge value of type VECTYPE while vectorizing
2614 STMT_INFO as a gather load. */
2616 static tree
2617 vect_build_zero_merge_argument (vec_info *vinfo,
2618 stmt_vec_info stmt_info, tree vectype)
2620 tree merge;
2621 if (TREE_CODE (TREE_TYPE (vectype)) == INTEGER_TYPE)
2622 merge = build_int_cst (TREE_TYPE (vectype), 0);
2623 else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (vectype)))
2625 REAL_VALUE_TYPE r;
2626 long tmp[6];
2627 for (int j = 0; j < 6; ++j)
2628 tmp[j] = 0;
2629 real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (vectype)));
2630 merge = build_real (TREE_TYPE (vectype), r);
2632 else
2633 gcc_unreachable ();
2634 merge = build_vector_from_val (vectype, merge);
2635 return vect_init_vector (vinfo, stmt_info, merge, vectype, NULL);
2638 /* Build a gather load call while vectorizing STMT_INFO. Insert new
2639 instructions before GSI and add them to VEC_STMT. GS_INFO describes
2640 the gather load operation. If the load is conditional, MASK is the
2641 vectorized condition, otherwise MASK is null. PTR is the base
2642 pointer and OFFSET is the vectorized offset. */
2644 static gimple *
2645 vect_build_one_gather_load_call (vec_info *vinfo, stmt_vec_info stmt_info,
2646 gimple_stmt_iterator *gsi,
2647 gather_scatter_info *gs_info,
2648 tree ptr, tree offset, tree mask)
2650 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2651 tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info->decl));
2652 tree rettype = TREE_TYPE (TREE_TYPE (gs_info->decl));
2653 tree srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2654 /* ptrtype */ arglist = TREE_CHAIN (arglist);
2655 tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2656 tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2657 tree scaletype = TREE_VALUE (arglist);
2658 tree var;
2659 gcc_checking_assert (types_compatible_p (srctype, rettype)
2660 && (!mask
2661 || TREE_CODE (masktype) == INTEGER_TYPE
2662 || types_compatible_p (srctype, masktype)));
2664 tree op = offset;
2665 if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
2667 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)),
2668 TYPE_VECTOR_SUBPARTS (idxtype)));
2669 var = vect_get_new_ssa_name (idxtype, vect_simple_var);
2670 op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
2671 gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
2672 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2673 op = var;
2676 tree src_op = NULL_TREE;
2677 tree mask_op = NULL_TREE;
2678 if (mask)
2680 if (!useless_type_conversion_p (masktype, TREE_TYPE (mask)))
2682 tree utype, optype = TREE_TYPE (mask);
2683 if (VECTOR_TYPE_P (masktype)
2684 || TYPE_MODE (masktype) == TYPE_MODE (optype))
2685 utype = masktype;
2686 else
2687 utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
2688 var = vect_get_new_ssa_name (utype, vect_scalar_var);
2689 tree mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask);
2690 gassign *new_stmt
2691 = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
2692 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2693 mask_arg = var;
2694 if (!useless_type_conversion_p (masktype, utype))
2696 gcc_assert (TYPE_PRECISION (utype)
2697 <= TYPE_PRECISION (masktype));
2698 var = vect_get_new_ssa_name (masktype, vect_scalar_var);
2699 new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
2700 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2701 mask_arg = var;
2703 src_op = build_zero_cst (srctype);
2704 mask_op = mask_arg;
2706 else
2708 src_op = mask;
2709 mask_op = mask;
2712 else
2714 src_op = vect_build_zero_merge_argument (vinfo, stmt_info, rettype);
2715 mask_op = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
2718 tree scale = build_int_cst (scaletype, gs_info->scale);
2719 gimple *new_stmt = gimple_build_call (gs_info->decl, 5, src_op, ptr, op,
2720 mask_op, scale);
2722 if (!useless_type_conversion_p (vectype, rettype))
2724 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype),
2725 TYPE_VECTOR_SUBPARTS (rettype)));
2726 op = vect_get_new_ssa_name (rettype, vect_simple_var);
2727 gimple_call_set_lhs (new_stmt, op);
2728 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2729 op = build1 (VIEW_CONVERT_EXPR, vectype, op);
2730 new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR, op);
2733 return new_stmt;
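/* The call built above always has the five-argument builtin form (shown
   here schematically, with placeholder names):

     vect_tmp = GATHER_DECL (src_op, ptr, offset, mask_op, scale);

   followed, when the builtin's return type differs from VECTYPE, by a
   VIEW_CONVERT_EXPR of the result back to VECTYPE.  */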
2736 /* Build a scatter store call while vectorizing STMT_INFO. Insert new
2737 instructions before GSI. GS_INFO describes the scatter store operation.
2738 PTR is the base pointer, OFFSET the vectorized offsets and OPRND the
2739 vectorized data to store.
2740 If the store is conditional, MASK is the vectorized condition, otherwise
2741 MASK is null. */
2743 static gimple *
2744 vect_build_one_scatter_store_call (vec_info *vinfo, stmt_vec_info stmt_info,
2745 gimple_stmt_iterator *gsi,
2746 gather_scatter_info *gs_info,
2747 tree ptr, tree offset, tree oprnd, tree mask)
2749 tree rettype = TREE_TYPE (TREE_TYPE (gs_info->decl));
2750 tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info->decl));
2751 /* tree ptrtype = TREE_VALUE (arglist); */ arglist = TREE_CHAIN (arglist);
2752 tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2753 tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2754 tree srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2755 tree scaletype = TREE_VALUE (arglist);
2756 gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
2757 && TREE_CODE (rettype) == VOID_TYPE);
2759 tree mask_arg = NULL_TREE;
2760 if (mask)
2762 mask_arg = mask;
2763 tree optype = TREE_TYPE (mask_arg);
2764 tree utype;
2765 if (TYPE_MODE (masktype) == TYPE_MODE (optype))
2766 utype = masktype;
2767 else
2768 utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
2769 tree var = vect_get_new_ssa_name (utype, vect_scalar_var);
2770 mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg);
2771 gassign *new_stmt
2772 = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
2773 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2774 mask_arg = var;
2775 if (!useless_type_conversion_p (masktype, utype))
2777 gcc_assert (TYPE_PRECISION (utype) <= TYPE_PRECISION (masktype));
2778 tree var = vect_get_new_ssa_name (masktype, vect_scalar_var);
2779 new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
2780 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2781 mask_arg = var;
2784 else
2786 mask_arg = build_int_cst (masktype, -1);
2787 mask_arg = vect_init_vector (vinfo, stmt_info, mask_arg, masktype, NULL);
2790 tree src = oprnd;
2791 if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
2793 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src)),
2794 TYPE_VECTOR_SUBPARTS (srctype)));
2795 tree var = vect_get_new_ssa_name (srctype, vect_simple_var);
2796 src = build1 (VIEW_CONVERT_EXPR, srctype, src);
2797 gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, src);
2798 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2799 src = var;
2802 tree op = offset;
2803 if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
2805 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)),
2806 TYPE_VECTOR_SUBPARTS (idxtype)));
2807 tree var = vect_get_new_ssa_name (idxtype, vect_simple_var);
2808 op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
2809 gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
2810 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2811 op = var;
2814 tree scale = build_int_cst (scaletype, gs_info->scale);
2815 gcall *new_stmt
2816 = gimple_build_call (gs_info->decl, 5, ptr, mask_arg, op, src, scale);
2817 return new_stmt;
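/* Likewise the scatter call has the five-argument form (schematic,
   placeholder names):

     SCATTER_DECL (ptr, mask_arg, offset, src, scale);

   where MASK_ARG is an all-ones integer mask when the store is
   unconditional.  */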
2820 /* Prepare the base and offset in GS_INFO for vectorization.
2821 Set *DATAREF_PTR to the loop-invariant base address and *VEC_OFFSET
2822 to the vectorized offset argument for the first copy of STMT_INFO.
2823 STMT_INFO is the statement described by GS_INFO and LOOP is the
2824 containing loop. */
2826 static void
2827 vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
2828 class loop *loop, stmt_vec_info stmt_info,
2829 slp_tree slp_node, gather_scatter_info *gs_info,
2830 tree *dataref_ptr, vec<tree> *vec_offset)
2832 gimple_seq stmts = NULL;
2833 *dataref_ptr = force_gimple_operand (gs_info->base, &stmts, true, NULL_TREE);
2834 if (stmts != NULL)
2836 basic_block new_bb;
2837 edge pe = loop_preheader_edge (loop);
2838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2839 gcc_assert (!new_bb);
2841 if (slp_node)
2842 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_offset);
2843 else
2845 unsigned ncopies
2846 = vect_get_num_copies (loop_vinfo, gs_info->offset_vectype);
2847 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, ncopies,
2848 gs_info->offset, vec_offset,
2849 gs_info->offset_vectype);
2853 /* Prepare to implement a grouped or strided load or store using
2854 the gather load or scatter store operation described by GS_INFO.
2855 STMT_INFO is the load or store statement.
2857 Set *DATAREF_BUMP to the amount that should be added to the base
2858 address after each copy of the vectorized statement. Set *VEC_OFFSET
2859 to an invariant offset vector in which element I has the value
2860 I * DR_STEP / SCALE. */
2862 static void
2863 vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
2864 loop_vec_info loop_vinfo,
2865 gimple_stmt_iterator *gsi,
2866 gather_scatter_info *gs_info,
2867 tree *dataref_bump, tree *vec_offset,
2868 vec_loop_lens *loop_lens)
2870 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2871 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2873 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2875 /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
2876 ivtmp_8 = _31 * 16 (step in bytes);
2877 .MASK_LEN_SCATTER_STORE (vectp_a.9_7, ... );
2878 vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
2879 tree loop_len
2880 = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
2881 tree tmp
2882 = fold_build2 (MULT_EXPR, sizetype,
2883 fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
2884 loop_len);
2885 *dataref_bump = force_gimple_operand_gsi (gsi, tmp, true, NULL_TREE, true,
2886 GSI_SAME_STMT);
2888 else
2890 tree bump
2891 = size_binop (MULT_EXPR,
2892 fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
2893 size_int (TYPE_VECTOR_SUBPARTS (vectype)));
2894 *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
2897 /* The offset given in GS_INFO can have pointer type, so use the element
2898 type of the vector instead. */
2899 tree offset_type = TREE_TYPE (gs_info->offset_vectype);
2901 /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. */
2902 tree step = size_binop (EXACT_DIV_EXPR, unshare_expr (DR_STEP (dr)),
2903 ssize_int (gs_info->scale));
2904 step = fold_convert (offset_type, step);
2906 /* Create {0, X, X*2, X*3, ...}. */
2907 tree offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
2908 build_zero_cst (offset_type), step);
2909 *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
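/* Worked example (values invented for illustration): with DR_STEP == 8,
   a 4-element vector and SCALE == 4, the bump added after each copy is
   8 * 4 == 32 bytes (or 8 * .SELECT_VL when SELECT_VL-based partial
   vectors are in use) and X == 8 / 4 == 2, giving the invariant offset
   vector { 0, 2, 4, 6 }.  */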
2912 /* Prepare the pointer IVs which need to be updated by a variable amount.
2913    That variable amount is the outcome of .SELECT_VL.  In this case, each
2914    iteration may process a flexible number of elements, as long as that
2915    number is <= VF elements.
2917    Return the data reference increment according to SELECT_VL.
2918 If new statements are needed, insert them before GSI. */
2920 static tree
2921 vect_get_loop_variant_data_ptr_increment (
2922 vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
2923 vec_loop_lens *loop_lens, dr_vec_info *dr_info,
2924 vect_memory_access_type memory_access_type)
2926 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
2927 tree step = vect_dr_behavior (vinfo, dr_info)->step;
2929 /* gather/scatter never reach here. */
2930 gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
2932   /* When the SELECT_VL pattern is in use, we adjust the memory
2933      address dynamically by the .SELECT_VL result.
2935      The result of .SELECT_VL is the number of elements to
2936      be processed in each iteration.  So the memory address
2937      adjustment operation should be:
2939        addr = addr + .SELECT_VL (ARG..) * step;  */
2941 tree loop_len
2942 = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
2943 tree len_type = TREE_TYPE (loop_len);
2944   /* Since the outcome of .SELECT_VL is a number of elements, we must
2945      scale it to a byte size so that it can be used to adjust the
2946      variable-amount address IVs.  */
2947 tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
2948 wide_int_to_tree (len_type, wi::to_widest (step)));
2949 tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
2950 gassign *assign = gimple_build_assign (bump, tmp);
2951 gsi_insert_before (gsi, assign, GSI_SAME_STMT);
2952 return bump;
2955 /* Return the amount that should be added to a vector pointer to move
2956 to the next or previous copy of AGGR_TYPE. DR_INFO is the data reference
2957 being vectorized and MEMORY_ACCESS_TYPE describes the type of
2958 vectorization. */
2960 static tree
2961 vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
2962 dr_vec_info *dr_info, tree aggr_type,
2963 vect_memory_access_type memory_access_type,
2964 vec_loop_lens *loop_lens = nullptr)
2966 if (memory_access_type == VMAT_INVARIANT)
2967 return size_zero_node;
2969 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
2970 if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2971 return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
2972 loop_lens, dr_info,
2973 memory_access_type);
2975 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
2976 tree step = vect_dr_behavior (vinfo, dr_info)->step;
2977 if (tree_int_cst_sgn (step) == -1)
2978 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
2979 return iv_step;
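/* For example, when AGGR_TYPE is V4SI the increment is its
   TYPE_SIZE_UNIT, i.e. 16 bytes, negated to -16 when the DR's step is
   negative (e.g. VMAT_CONTIGUOUS_REVERSE).  */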
2982 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}. */
2984 static bool
2985 vectorizable_bswap (vec_info *vinfo,
2986 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
2987 gimple **vec_stmt, slp_tree slp_node,
2988 slp_tree *slp_op,
2989 tree vectype_in, stmt_vector_for_cost *cost_vec)
2991 tree op, vectype;
2992 gcall *stmt = as_a <gcall *> (stmt_info->stmt);
2993 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2994 unsigned ncopies;
2996 op = gimple_call_arg (stmt, 0);
2997 vectype = STMT_VINFO_VECTYPE (stmt_info);
2998 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3000 /* Multiple types in SLP are handled by creating the appropriate number of
3001 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3002 case of SLP. */
3003 if (slp_node)
3004 ncopies = 1;
3005 else
3006 ncopies = vect_get_num_copies (loop_vinfo, vectype);
3008 gcc_assert (ncopies >= 1);
3010 if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype))
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "mismatched vector sizes %T and %T\n",
3015 vectype_in, vectype);
3016 return false;
3019 tree char_vectype = get_same_sized_vectype (char_type_node, vectype_in);
3020 if (! char_vectype)
3021 return false;
3023 poly_uint64 num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);
3024 unsigned word_bytes;
3025 if (!constant_multiple_p (num_bytes, nunits, &word_bytes))
3026 return false;
3028 /* The encoding uses one stepped pattern for each byte in the word. */
3029 vec_perm_builder elts (num_bytes, word_bytes, 3);
3030 for (unsigned i = 0; i < 3; ++i)
3031 for (unsigned j = 0; j < word_bytes; ++j)
3032 elts.quick_push ((i + 1) * word_bytes - j - 1);
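  /* E.g. for a 32-bit bswap on a 16-byte vector, WORD_BYTES is 4 and the
     three encoded rows are { 3, 2, 1, 0 }, { 7, 6, 5, 4 } and
     { 11, 10, 9, 8 }; the stepped encoding extends this to
     { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. a
     byte reversal within each word.  */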
3034 vec_perm_indices indices (elts, 1, num_bytes);
3035 machine_mode vmode = TYPE_MODE (char_vectype);
3036 if (!can_vec_perm_const_p (vmode, vmode, indices))
3037 return false;
3039 if (! vec_stmt)
3041 if (slp_node
3042 && !vect_maybe_update_slp_op_vectype (slp_op[0], vectype_in))
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 "incompatible vector types for invariants\n");
3047 return false;
3050 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3051 DUMP_VECT_SCOPE ("vectorizable_bswap");
3052 record_stmt_cost (cost_vec,
3053 1, vector_stmt, stmt_info, 0, vect_prologue);
3054 record_stmt_cost (cost_vec,
3055 slp_node
3056 ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies,
3057 vec_perm, stmt_info, 0, vect_body);
3058 return true;
3061 tree bswap_vconst = vec_perm_indices_to_tree (char_vectype, indices);
3063 /* Transform. */
3064 vec<tree> vec_oprnds = vNULL;
3065 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
3066 op, &vec_oprnds);
3067   /* Arguments are ready.  Create the new vector stmt.  */
3068 unsigned i;
3069 tree vop;
3070 FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
3072 gimple *new_stmt;
3073 tree tem = make_ssa_name (char_vectype);
3074 new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
3075 char_vectype, vop));
3076 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3077 tree tem2 = make_ssa_name (char_vectype);
3078 new_stmt = gimple_build_assign (tem2, VEC_PERM_EXPR,
3079 tem, tem, bswap_vconst);
3080 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3081 tem = make_ssa_name (vectype);
3082 new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
3083 vectype, tem2));
3084 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3085 if (slp_node)
3086 slp_node->push_vec_def (new_stmt);
3087 else
3088 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
3091 if (!slp_node)
3092 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
3094 vec_oprnds.release ();
3095 return true;
3098 /* Return true if vector types VECTYPE_IN and VECTYPE_OUT have
3099 integer elements and if we can narrow VECTYPE_IN to VECTYPE_OUT
3100 in a single step. On success, store the binary pack code in
3101 *CONVERT_CODE. */
3103 static bool
3104 simple_integer_narrowing (tree vectype_out, tree vectype_in,
3105 code_helper *convert_code)
3107 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype_out))
3108 || !INTEGRAL_TYPE_P (TREE_TYPE (vectype_in)))
3109 return false;
3111 code_helper code;
3112 int multi_step_cvt = 0;
3113 auto_vec <tree, 8> interm_types;
3114 if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in,
3115 &code, &multi_step_cvt, &interm_types)
3116 || multi_step_cvt)
3117 return false;
3119 *convert_code = code;
3120 return true;
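/* For example, narrowing V4SI results to a V8HI output can be done in a
   single step when the target provides the corresponding pack operation
   (typically VEC_PACK_TRUNC_EXPR); in that case *CONVERT_CODE is the
   pack code and the NARROW path in vectorizable_call below combines two
   such half-width results into each output vector.  */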
3123 /* Function vectorizable_call.
3125 Check if STMT_INFO performs a function call that can be vectorized.
3126 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
3127 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3128 Return true if STMT_INFO is vectorizable in this way. */
3130 static bool
3131 vectorizable_call (vec_info *vinfo,
3132 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
3133 gimple **vec_stmt, slp_tree slp_node,
3134 stmt_vector_for_cost *cost_vec)
3136 gcall *stmt;
3137 tree vec_dest;
3138 tree scalar_dest;
3139 tree op;
3140 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3141 tree vectype_out, vectype_in;
3142 poly_uint64 nunits_in;
3143 poly_uint64 nunits_out;
3144 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3145 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3146 tree fndecl, new_temp, rhs_type;
3147 enum vect_def_type dt[4]
3148 = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
3149 vect_unknown_def_type };
3150 tree vectypes[ARRAY_SIZE (dt)] = {};
3151 slp_tree slp_op[ARRAY_SIZE (dt)] = {};
3152 int ndts = ARRAY_SIZE (dt);
3153 int ncopies, j;
3154 auto_vec<tree, 8> vargs;
3155 enum { NARROW, NONE, WIDEN } modifier;
3156 size_t i, nargs;
3157 tree lhs;
3158 tree clz_ctz_arg1 = NULL_TREE;
3160 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
3161 return false;
3163 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
3164 && ! vec_stmt)
3165 return false;
3167 /* Is STMT_INFO a vectorizable call? */
3168 stmt = dyn_cast <gcall *> (stmt_info->stmt);
3169 if (!stmt)
3170 return false;
3172 if (gimple_call_internal_p (stmt)
3173 && (internal_load_fn_p (gimple_call_internal_fn (stmt))
3174 || internal_store_fn_p (gimple_call_internal_fn (stmt))))
3175 /* Handled by vectorizable_load and vectorizable_store. */
3176 return false;
3178 if (gimple_call_lhs (stmt) == NULL_TREE
3179 || TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3180 return false;
3182 gcc_checking_assert (!stmt_can_throw_internal (cfun, stmt));
3184 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3186 /* Process function arguments. */
3187 rhs_type = NULL_TREE;
3188 vectype_in = NULL_TREE;
3189 nargs = gimple_call_num_args (stmt);
3191   /* Bail out if the function has more than four arguments; we do not have
3192      interesting builtin functions to vectorize with more than two arguments
3193      except for fma.  Calls with no arguments are not handled either.  */
3194 if (nargs == 0 || nargs > 4)
3195 return false;
3197 /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
3198 combined_fn cfn = gimple_call_combined_fn (stmt);
3199 if (cfn == CFN_GOMP_SIMD_LANE)
3201 nargs = 0;
3202 rhs_type = unsigned_type_node;
3204   /* Similarly, pretend IFN_CLZ and IFN_CTZ only have one argument; the second
3205      argument just says whether the operation is well-defined at zero and what
3206      value should be returned for it.  */
3207 if ((cfn == CFN_CLZ || cfn == CFN_CTZ) && nargs == 2)
3209 nargs = 1;
3210 clz_ctz_arg1 = gimple_call_arg (stmt, 1);
3213 int mask_opno = -1;
3214 if (internal_fn_p (cfn))
3215 mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
3217 for (i = 0; i < nargs; i++)
3219 if ((int) i == mask_opno)
3221 if (!vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_opno,
3222 &op, &slp_op[i], &dt[i], &vectypes[i]))
3223 return false;
3224 continue;
3227 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
3228 i, &op, &slp_op[i], &dt[i], &vectypes[i]))
3230 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3232 "use not simple.\n");
3233 return false;
3236 /* We can only handle calls with arguments of the same type. */
3237 if (rhs_type
3238 && !types_compatible_p (rhs_type, TREE_TYPE (op)))
3240 if (dump_enabled_p ())
3241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3242 "argument types differ.\n");
3243 return false;
3245 if (!rhs_type)
3246 rhs_type = TREE_TYPE (op);
3248 if (!vectype_in)
3249 vectype_in = vectypes[i];
3250 else if (vectypes[i]
3251 && !types_compatible_p (vectypes[i], vectype_in))
3253 if (dump_enabled_p ())
3254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3255 "argument vector types differ.\n");
3256 return false;
3259 /* If all arguments are external or constant defs, infer the vector type
3260 from the scalar type. */
3261 if (!vectype_in)
3262 vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
3263 if (vec_stmt)
3264 gcc_assert (vectype_in);
3265 if (!vectype_in)
3267 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3269 "no vectype for scalar type %T\n", rhs_type);
3271 return false;
3274 if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
3275 != VECTOR_BOOLEAN_TYPE_P (vectype_in))
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3279 "mixed mask and nonmask vector types\n");
3280 return false;
3283 if (vect_emulated_vector_p (vectype_in) || vect_emulated_vector_p (vectype_out))
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3287 "use emulated vector type for call\n");
3288 return false;
3291 /* FORNOW */
3292 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3293 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3294 if (known_eq (nunits_in * 2, nunits_out))
3295 modifier = NARROW;
3296 else if (known_eq (nunits_out, nunits_in))
3297 modifier = NONE;
3298 else if (known_eq (nunits_out * 2, nunits_in))
3299 modifier = WIDEN;
3300 else
3301 return false;
3303 /* We only handle functions that do not read or clobber memory. */
3304 if (gimple_vuse (stmt))
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "function reads from or writes to memory.\n");
3309 return false;
3312 /* For now, we only vectorize functions if a target specific builtin
3313 is available. TODO -- in some cases, it might be profitable to
3314 insert the calls for pieces of the vector, in order to be able
3315 to vectorize other operations in the loop. */
3316 fndecl = NULL_TREE;
3317 internal_fn ifn = IFN_LAST;
3318 tree callee = gimple_call_fndecl (stmt);
3320 /* First try using an internal function. */
3321 code_helper convert_code = MAX_TREE_CODES;
3322 if (cfn != CFN_LAST
3323 && (modifier == NONE
3324 || (modifier == NARROW
3325 && simple_integer_narrowing (vectype_out, vectype_in,
3326 &convert_code))))
3327 ifn = vectorizable_internal_function (cfn, callee, vectype_out,
3328 vectype_in);
3330 /* If that fails, try asking for a target-specific built-in function. */
3331 if (ifn == IFN_LAST)
3333 if (cfn != CFN_LAST)
3334 fndecl = targetm.vectorize.builtin_vectorized_function
3335 (cfn, vectype_out, vectype_in);
3336 else if (callee && fndecl_built_in_p (callee, BUILT_IN_MD))
3337 fndecl = targetm.vectorize.builtin_md_vectorized_function
3338 (callee, vectype_out, vectype_in);
3341 if (ifn == IFN_LAST && !fndecl)
3343 if (cfn == CFN_GOMP_SIMD_LANE
3344 && !slp_node
3345 && loop_vinfo
3346 && LOOP_VINFO_LOOP (loop_vinfo)->simduid
3347 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
3348 && LOOP_VINFO_LOOP (loop_vinfo)->simduid
3349 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
3351 /* We can handle IFN_GOMP_SIMD_LANE by returning a
3352 { 0, 1, 2, ... vf - 1 } vector. */
3353 gcc_assert (nargs == 0);
3355 else if (modifier == NONE
3356 && (gimple_call_builtin_p (stmt, BUILT_IN_BSWAP16)
3357 || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP32)
3358 || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP64)
3359 || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP128)))
3360 return vectorizable_bswap (vinfo, stmt_info, gsi, vec_stmt, slp_node,
3361 slp_op, vectype_in, cost_vec);
3362 else
3364 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3366 "function is not vectorizable.\n");
3367 return false;
3371 if (slp_node)
3372 ncopies = 1;
3373 else if (modifier == NARROW && ifn == IFN_LAST)
3374 ncopies = vect_get_num_copies (loop_vinfo, vectype_out);
3375 else
3376 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
3378 /* Sanity check: make sure that at least one copy of the vectorized stmt
3379 needs to be generated. */
3380 gcc_assert (ncopies >= 1);
3382 int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
3383 internal_fn cond_fn = get_conditional_internal_fn (ifn);
3384 internal_fn cond_len_fn = get_len_internal_fn (ifn);
3385 int len_opno = internal_fn_len_index (cond_len_fn);
3386 vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
3387 vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
3388 if (!vec_stmt) /* transformation not required. */
3390 if (slp_node)
3391 for (i = 0; i < nargs; ++i)
3392 if (!vect_maybe_update_slp_op_vectype (slp_op[i],
3393 vectypes[i]
3394 ? vectypes[i] : vectype_in))
3396 if (dump_enabled_p ())
3397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3398 "incompatible vector types for invariants\n");
3399 return false;
3401 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3402 DUMP_VECT_SCOPE ("vectorizable_call");
3403 vect_model_simple_cost (vinfo, stmt_info,
3404 ncopies, dt, ndts, slp_node, cost_vec);
3405 if (ifn != IFN_LAST && modifier == NARROW && !slp_node)
3406 record_stmt_cost (cost_vec, ncopies / 2,
3407 vec_promote_demote, stmt_info, 0, vect_body);
3409 if (loop_vinfo
3410 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3411 && (reduc_idx >= 0 || mask_opno >= 0))
3413 if (reduc_idx >= 0
3414 && (cond_fn == IFN_LAST
3415 || !direct_internal_fn_supported_p (cond_fn, vectype_out,
3416 OPTIMIZE_FOR_SPEED))
3417 && (cond_len_fn == IFN_LAST
3418 || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3419 OPTIMIZE_FOR_SPEED)))
3421 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3423 "can't use a fully-masked loop because no"
3424 " conditional operation is available.\n");
3425 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3427 else
3429 unsigned int nvectors
3430 = (slp_node
3431 ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3432 : ncopies);
3433 tree scalar_mask = NULL_TREE;
3434 if (mask_opno >= 0)
3435 scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
3436 if (cond_len_fn != IFN_LAST
3437 && direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3438 OPTIMIZE_FOR_SPEED))
3439 vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out,
3441 else
3442 vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
3443 scalar_mask);
3446 return true;
3449 /* Transform. */
3451 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
3454 /* Handle def. */
3455 scalar_dest = gimple_call_lhs (stmt);
3456 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3458 bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
3459 bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
3460 unsigned int vect_nargs = nargs;
3461 if (len_loop_p)
3463 if (len_opno >= 0)
3465 ifn = cond_len_fn;
3466 /* COND_* -> COND_LEN_* takes 2 extra arguments: LEN, BIAS. */
3467 vect_nargs += 2;
3469 else if (reduc_idx >= 0)
3470 gcc_unreachable ();
3472 else if (masked_loop_p && reduc_idx >= 0)
3474 ifn = cond_fn;
3475 vect_nargs += 2;
3477 if (clz_ctz_arg1)
3478 ++vect_nargs;
3480 if (modifier == NONE || ifn != IFN_LAST)
3482 tree prev_res = NULL_TREE;
3483 vargs.safe_grow (vect_nargs, true);
3484 auto_vec<vec<tree> > vec_defs (nargs);
3485 for (j = 0; j < ncopies; ++j)
3487 /* Build argument list for the vectorized call. */
3488 if (slp_node)
3490 vec<tree> vec_oprnds0;
3492 vect_get_slp_defs (vinfo, slp_node, &vec_defs);
3493 vec_oprnds0 = vec_defs[0];
3495 /* Arguments are ready. Create the new vector stmt. */
3496 FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0)
3498 int varg = 0;
3499 if (masked_loop_p && reduc_idx >= 0)
3501 unsigned int vec_num = vec_oprnds0.length ();
3502 /* Always true for SLP. */
3503 gcc_assert (ncopies == 1);
3504 vargs[varg++] = vect_get_loop_mask (loop_vinfo,
3505 gsi, masks, vec_num,
3506 vectype_out, i);
3508 size_t k;
3509 for (k = 0; k < nargs; k++)
3511 vec<tree> vec_oprndsk = vec_defs[k];
3512 vargs[varg++] = vec_oprndsk[i];
3514 if (masked_loop_p && reduc_idx >= 0)
3515 vargs[varg++] = vargs[reduc_idx + 1];
3516 if (clz_ctz_arg1)
3517 vargs[varg++] = clz_ctz_arg1;
3519 gimple *new_stmt;
3520 if (modifier == NARROW)
3522 /* We don't define any narrowing conditional functions
3523 at present. */
3524 gcc_assert (mask_opno < 0);
3525 tree half_res = make_ssa_name (vectype_in);
3526 gcall *call
3527 = gimple_build_call_internal_vec (ifn, vargs);
3528 gimple_call_set_lhs (call, half_res);
3529 gimple_call_set_nothrow (call, true);
3530 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3531 if ((i & 1) == 0)
3533 prev_res = half_res;
3534 continue;
3536 new_temp = make_ssa_name (vec_dest);
3537 new_stmt = vect_gimple_build (new_temp, convert_code,
3538 prev_res, half_res);
3539 vect_finish_stmt_generation (vinfo, stmt_info,
3540 new_stmt, gsi);
3542 else
3544 if (len_opno >= 0 && len_loop_p)
3546 unsigned int vec_num = vec_oprnds0.length ();
3547 /* Always true for SLP. */
3548 gcc_assert (ncopies == 1);
3549 tree len
3550 = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num,
3551 vectype_out, i, 1);
3552 signed char biasval
3553 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
3554 tree bias = build_int_cst (intQI_type_node, biasval);
3555 vargs[len_opno] = len;
3556 vargs[len_opno + 1] = bias;
3558 else if (mask_opno >= 0 && masked_loop_p)
3560 unsigned int vec_num = vec_oprnds0.length ();
3561 /* Always true for SLP. */
3562 gcc_assert (ncopies == 1);
3563 tree mask = vect_get_loop_mask (loop_vinfo,
3564 gsi, masks, vec_num,
3565 vectype_out, i);
3566 vargs[mask_opno] = prepare_vec_mask
3567 (loop_vinfo, TREE_TYPE (mask), mask,
3568 vargs[mask_opno], gsi);
3571 gcall *call;
3572 if (ifn != IFN_LAST)
3573 call = gimple_build_call_internal_vec (ifn, vargs);
3574 else
3575 call = gimple_build_call_vec (fndecl, vargs);
3576 new_temp = make_ssa_name (vec_dest, call);
3577 gimple_call_set_lhs (call, new_temp);
3578 gimple_call_set_nothrow (call, true);
3579 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3580 new_stmt = call;
3582 slp_node->push_vec_def (new_stmt);
3584 continue;
3587 int varg = 0;
3588 if (masked_loop_p && reduc_idx >= 0)
3589 vargs[varg++] = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies,
3590 vectype_out, j);
3591 for (i = 0; i < nargs; i++)
3593 op = gimple_call_arg (stmt, i);
3594 if (j == 0)
3596 vec_defs.quick_push (vNULL);
3597 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
3598 op, &vec_defs[i],
3599 vectypes[i]);
3601 vargs[varg++] = vec_defs[i][j];
3603 if (masked_loop_p && reduc_idx >= 0)
3604 vargs[varg++] = vargs[reduc_idx + 1];
3605 if (clz_ctz_arg1)
3606 vargs[varg++] = clz_ctz_arg1;
3608 if (len_opno >= 0 && len_loop_p)
3610 tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
3611 vectype_out, j, 1);
3612 signed char biasval
3613 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
3614 tree bias = build_int_cst (intQI_type_node, biasval);
3615 vargs[len_opno] = len;
3616 vargs[len_opno + 1] = bias;
3618 else if (mask_opno >= 0 && masked_loop_p)
3620 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies,
3621 vectype_out, j);
3622 vargs[mask_opno]
3623 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
3624 vargs[mask_opno], gsi);
3627 gimple *new_stmt;
3628 if (cfn == CFN_GOMP_SIMD_LANE)
3630 tree cst = build_index_vector (vectype_out, j * nunits_out, 1);
3631 tree new_var
3632 = vect_get_new_ssa_name (vectype_out, vect_simple_var, "cst_");
3633 gimple *init_stmt = gimple_build_assign (new_var, cst);
3634 vect_init_vector_1 (vinfo, stmt_info, init_stmt, NULL);
3635 new_temp = make_ssa_name (vec_dest);
3636 new_stmt = gimple_build_assign (new_temp, new_var);
3637 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3639 else if (modifier == NARROW)
3641 /* We don't define any narrowing conditional functions at
3642 present. */
3643 gcc_assert (mask_opno < 0);
3644 tree half_res = make_ssa_name (vectype_in);
3645 gcall *call = gimple_build_call_internal_vec (ifn, vargs);
3646 gimple_call_set_lhs (call, half_res);
3647 gimple_call_set_nothrow (call, true);
3648 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3649 if ((j & 1) == 0)
3651 prev_res = half_res;
3652 continue;
3654 new_temp = make_ssa_name (vec_dest);
3655 new_stmt = vect_gimple_build (new_temp, convert_code, prev_res,
3656 half_res);
3657 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3659 else
3661 gcall *call;
3662 if (ifn != IFN_LAST)
3663 call = gimple_build_call_internal_vec (ifn, vargs);
3664 else
3665 call = gimple_build_call_vec (fndecl, vargs);
3666 new_temp = make_ssa_name (vec_dest, call);
3667 gimple_call_set_lhs (call, new_temp);
3668 gimple_call_set_nothrow (call, true);
3669 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3670 new_stmt = call;
3673 if (j == (modifier == NARROW ? 1 : 0))
3674 *vec_stmt = new_stmt;
3675 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
3677 for (i = 0; i < nargs; i++)
3679 vec<tree> vec_oprndsi = vec_defs[i];
3680 vec_oprndsi.release ();
3683 else if (modifier == NARROW)
3685 auto_vec<vec<tree> > vec_defs (nargs);
3686 /* We don't define any narrowing conditional functions at present. */
3687 gcc_assert (mask_opno < 0);
3688 for (j = 0; j < ncopies; ++j)
3690 /* Build argument list for the vectorized call. */
3691 if (j == 0)
3692 vargs.create (nargs * 2);
3693 else
3694 vargs.truncate (0);
3696 if (slp_node)
3698 vec<tree> vec_oprnds0;
3700 vect_get_slp_defs (vinfo, slp_node, &vec_defs);
3701 vec_oprnds0 = vec_defs[0];
3703 /* Arguments are ready. Create the new vector stmt. */
3704 for (i = 0; vec_oprnds0.iterate (i, &vec_oprnd0); i += 2)
3706 size_t k;
3707 vargs.truncate (0);
3708 for (k = 0; k < nargs; k++)
3710 vec<tree> vec_oprndsk = vec_defs[k];
3711 vargs.quick_push (vec_oprndsk[i]);
3712 vargs.quick_push (vec_oprndsk[i + 1]);
3714 gcall *call;
3715 if (ifn != IFN_LAST)
3716 call = gimple_build_call_internal_vec (ifn, vargs);
3717 else
3718 call = gimple_build_call_vec (fndecl, vargs);
3719 new_temp = make_ssa_name (vec_dest, call);
3720 gimple_call_set_lhs (call, new_temp);
3721 gimple_call_set_nothrow (call, true);
3722 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3723 slp_node->push_vec_def (call);
3725 continue;
3728 for (i = 0; i < nargs; i++)
3730 op = gimple_call_arg (stmt, i);
3731 if (j == 0)
3733 vec_defs.quick_push (vNULL);
3734 vect_get_vec_defs_for_operand (vinfo, stmt_info, 2 * ncopies,
3735 op, &vec_defs[i], vectypes[i]);
3737 vec_oprnd0 = vec_defs[i][2*j];
3738 vec_oprnd1 = vec_defs[i][2*j+1];
3740 vargs.quick_push (vec_oprnd0);
3741 vargs.quick_push (vec_oprnd1);
3744 gcall *new_stmt = gimple_build_call_vec (fndecl, vargs);
3745 new_temp = make_ssa_name (vec_dest, new_stmt);
3746 gimple_call_set_lhs (new_stmt, new_temp);
3747 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3749 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
3752 if (!slp_node)
3753 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
3755 for (i = 0; i < nargs; i++)
3757 vec<tree> vec_oprndsi = vec_defs[i];
3758 vec_oprndsi.release ();
3761 else
3762 /* No current target implements this case. */
3763 return false;
3765 vargs.release ();
3767 /* The call in STMT might prevent it from being removed in dce.
3768 We however cannot remove it here, due to the way the ssa name
3769 it defines is mapped to the new definition. So just replace
3770 the rhs of the statement with something harmless. */
3772 if (slp_node)
3773 return true;
3775 stmt_info = vect_orig_stmt (stmt_info);
3776 lhs = gimple_get_lhs (stmt_info->stmt);
3778 gassign *new_stmt
3779 = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3780 vinfo->replace_stmt (gsi, stmt_info, new_stmt);
3782 return true;
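/* Illustrative sketch, not part of GCC: a scalar model of the COND_LEN_*
   lowering performed above, where a vectorized call receives the loop mask
   plus two trailing operands, LEN and BIAS.  Lanes whose index is below
   LEN + BIAS and whose mask bit is set perform the operation; the others
   keep a fallback value.  The fixed width, the choice of fma and all names
   below are assumptions made purely for illustration.  */
#include <array>
#include <cmath>
#include <cstddef>

template <std::size_t N>
std::array<double, N>
cond_len_fma_model (const std::array<bool, N> &mask,
                    const std::array<double, N> &a,
                    const std::array<double, N> &b,
                    const std::array<double, N> &c,
                    const std::array<double, N> &fallback,
                    unsigned long len, int bias)
{
  std::array<double, N> out{};
  long active = (long) len + bias;    /* BIAS is typically 0 or -1.  */
  for (std::size_t i = 0; i < N; ++i)
    out[i] = ((long) i < active && mask[i]
              ? std::fma (a[i], b[i], c[i])
              : fallback[i]);
  return out;
}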
3786 struct simd_call_arg_info
3788 tree vectype;
3789 tree op;
3790 HOST_WIDE_INT linear_step;
3791 enum vect_def_type dt;
3792 unsigned int align;
3793 bool simd_lane_linear;
3796 /* Helper function of vectorizable_simd_clone_call. If OP, an SSA_NAME,
3797 is linear within simd lane (but not within whole loop), note it in
3798 *ARGINFO. */
3800 static void
3801 vect_simd_lane_linear (tree op, class loop *loop,
3802 struct simd_call_arg_info *arginfo)
3804 gimple *def_stmt = SSA_NAME_DEF_STMT (op);
3806 if (!is_gimple_assign (def_stmt)
3807 || gimple_assign_rhs_code (def_stmt) != POINTER_PLUS_EXPR
3808 || !is_gimple_min_invariant (gimple_assign_rhs1 (def_stmt)))
3809 return;
3811 tree base = gimple_assign_rhs1 (def_stmt);
3812 HOST_WIDE_INT linear_step = 0;
3813 tree v = gimple_assign_rhs2 (def_stmt);
3814 while (TREE_CODE (v) == SSA_NAME)
3816 tree t;
3817 def_stmt = SSA_NAME_DEF_STMT (v);
3818 if (is_gimple_assign (def_stmt))
3819 switch (gimple_assign_rhs_code (def_stmt))
3821 case PLUS_EXPR:
3822 t = gimple_assign_rhs2 (def_stmt);
3823 if (linear_step || TREE_CODE (t) != INTEGER_CST)
3824 return;
3825 base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base), base, t);
3826 v = gimple_assign_rhs1 (def_stmt);
3827 continue;
3828 case MULT_EXPR:
3829 t = gimple_assign_rhs2 (def_stmt);
3830 if (linear_step || !tree_fits_shwi_p (t) || integer_zerop (t))
3831 return;
3832 linear_step = tree_to_shwi (t);
3833 v = gimple_assign_rhs1 (def_stmt);
3834 continue;
3835 CASE_CONVERT:
3836 t = gimple_assign_rhs1 (def_stmt);
3837 if (TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE
3838 || (TYPE_PRECISION (TREE_TYPE (v))
3839 < TYPE_PRECISION (TREE_TYPE (t))))
3840 return;
3841 if (!linear_step)
3842 linear_step = 1;
3843 v = t;
3844 continue;
3845 default:
3846 return;
3848 else if (gimple_call_internal_p (def_stmt, IFN_GOMP_SIMD_LANE)
3849 && loop->simduid
3850 && TREE_CODE (gimple_call_arg (def_stmt, 0)) == SSA_NAME
3851 && (SSA_NAME_VAR (gimple_call_arg (def_stmt, 0))
3852 == loop->simduid))
3854 if (!linear_step)
3855 linear_step = 1;
3856 arginfo->linear_step = linear_step;
3857 arginfo->op = base;
3858 arginfo->simd_lane_linear = true;
3859 return;
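/* Illustrative sketch, not part of GCC: the shape of the SSA chain that the
   helper above recognizes.  Starting from a POINTER_PLUS_EXPR it walks
   PLUS, MULT and conversions until it reaches the simd-lane counter
   (IFN_GOMP_SIMD_LANE), recording the invariant base and the constant
   step.  The names and the step value are assumptions made purely for
   illustration.  */
static char *
simd_lane_linear_model (char *base, long simd_lane)
{
  const long step = 16;              /* constant factor -> linear_step */
  long offset = simd_lane * step;    /* the MULT_EXPR the walker accepts */
  return base + offset;              /* the POINTER_PLUS_EXPR it starts from */
}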
3864 /* Function vectorizable_simd_clone_call.
3866 Check if STMT_INFO performs a function call that can be vectorized
3867 by calling a simd clone of the function.
3868 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
3869 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3870 Return true if STMT_INFO is vectorizable in this way. */
3872 static bool
3873 vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
3874 gimple_stmt_iterator *gsi,
3875 gimple **vec_stmt, slp_tree slp_node,
3876 stmt_vector_for_cost *)
3878 tree vec_dest;
3879 tree scalar_dest;
3880 tree op, type;
3881 tree vec_oprnd0 = NULL_TREE;
3882 tree vectype;
3883 poly_uint64 nunits;
3884 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3885 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3886 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
3887 tree fndecl, new_temp;
3888 int ncopies, j;
3889 auto_vec<simd_call_arg_info> arginfo;
3890 vec<tree> vargs = vNULL;
3891 size_t i, nargs;
3892 tree lhs, rtype, ratype;
3893 vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
3894 int masked_call_offset = 0;
3896 /* Is STMT a vectorizable call? */
3897 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
3898 if (!stmt)
3899 return false;
3901 fndecl = gimple_call_fndecl (stmt);
3902 if (fndecl == NULL_TREE
3903 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
3905 fndecl = gimple_call_arg (stmt, 0);
3906 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
3907 fndecl = TREE_OPERAND (fndecl, 0);
3908 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
3909 masked_call_offset = 1;
3911 if (fndecl == NULL_TREE)
3912 return false;
3914 struct cgraph_node *node = cgraph_node::get (fndecl);
3915 if (node == NULL || node->simd_clones == NULL)
3916 return false;
3918 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
3919 return false;
3921 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
3922 && ! vec_stmt)
3923 return false;
3925 if (gimple_call_lhs (stmt)
3926 && TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3927 return false;
3929 gcc_checking_assert (!stmt_can_throw_internal (cfun, stmt));
3931 vectype = STMT_VINFO_VECTYPE (stmt_info);
3933 if (loop_vinfo && nested_in_vect_loop_p (loop, stmt_info))
3934 return false;
3936 /* Process function arguments. */
3937 nargs = gimple_call_num_args (stmt) - masked_call_offset;
3939 /* Bail out if the function has zero arguments. */
3940 if (nargs == 0)
3941 return false;
3943 vec<tree>& simd_clone_info = (slp_node ? SLP_TREE_SIMD_CLONE_INFO (slp_node)
3944 : STMT_VINFO_SIMD_CLONE_INFO (stmt_info));
3945 arginfo.reserve (nargs, true);
3946 auto_vec<slp_tree> slp_op;
3947 slp_op.safe_grow_cleared (nargs);
3949 for (i = 0; i < nargs; i++)
3951 simd_call_arg_info thisarginfo;
3952 affine_iv iv;
3954 thisarginfo.linear_step = 0;
3955 thisarginfo.align = 0;
3956 thisarginfo.op = NULL_TREE;
3957 thisarginfo.simd_lane_linear = false;
3959 int op_no = i + masked_call_offset;
3960 if (slp_node)
3961 op_no = vect_slp_child_index_for_operand (stmt, op_no, false);
3962 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
3963 op_no, &op, &slp_op[i],
3964 &thisarginfo.dt, &thisarginfo.vectype)
3965 || thisarginfo.dt == vect_uninitialized_def)
3967 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3969 "use not simple.\n");
3970 return false;
3973 if (thisarginfo.dt == vect_constant_def
3974 || thisarginfo.dt == vect_external_def)
3976 /* With SLP we determine the vector type of constants/externals
3977 at analysis time, handling conflicts via
3978 vect_maybe_update_slp_op_vectype. At transform time
3979 we have a vector type recorded for SLP. */
3980 gcc_assert (!vec_stmt
3981 || !slp_node
3982 || thisarginfo.vectype != NULL_TREE);
3983 if (!vec_stmt)
3984 thisarginfo.vectype = get_vectype_for_scalar_type (vinfo,
3985 TREE_TYPE (op),
3986 slp_node);
3988 else
3989 gcc_assert (thisarginfo.vectype != NULL_TREE);
3991 /* For linear arguments, the analyze phase should have saved
3992 the base and step in {STMT_VINFO,SLP_TREE}_SIMD_CLONE_INFO. */
3993 if (i * 3 + 4 <= simd_clone_info.length ()
3994 && simd_clone_info[i * 3 + 2])
3996 gcc_assert (vec_stmt);
3997 thisarginfo.linear_step = tree_to_shwi (simd_clone_info[i * 3 + 2]);
3998 thisarginfo.op = simd_clone_info[i * 3 + 1];
3999 thisarginfo.simd_lane_linear
4000 = (simd_clone_info[i * 3 + 3] == boolean_true_node);
4001 /* If the loop has been peeled for alignment, we need to adjust it. */
4002 tree n1 = LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
4003 tree n2 = LOOP_VINFO_NITERS (loop_vinfo);
4004 if (n1 != n2 && !thisarginfo.simd_lane_linear)
4006 tree bias = fold_build2 (MINUS_EXPR, TREE_TYPE (n1), n1, n2);
4007 tree step = simd_clone_info[i * 3 + 2];
4008 tree opt = TREE_TYPE (thisarginfo.op);
4009 bias = fold_convert (TREE_TYPE (step), bias);
4010 bias = fold_build2 (MULT_EXPR, TREE_TYPE (step), bias, step);
4011 thisarginfo.op
4012 = fold_build2 (POINTER_TYPE_P (opt)
4013 ? POINTER_PLUS_EXPR : PLUS_EXPR, opt,
4014 thisarginfo.op, bias);
4017 else if (!vec_stmt
4018 && thisarginfo.dt != vect_constant_def
4019 && thisarginfo.dt != vect_external_def
4020 && loop_vinfo
4021 && TREE_CODE (op) == SSA_NAME
4022 && simple_iv (loop, loop_containing_stmt (stmt), op,
4023 &iv, false)
4024 && tree_fits_shwi_p (iv.step))
4026 thisarginfo.linear_step = tree_to_shwi (iv.step);
4027 thisarginfo.op = iv.base;
4029 else if ((thisarginfo.dt == vect_constant_def
4030 || thisarginfo.dt == vect_external_def)
4031 && POINTER_TYPE_P (TREE_TYPE (op)))
4032 thisarginfo.align = get_pointer_alignment (op) / BITS_PER_UNIT;
4033 /* Addresses of array elements indexed by GOMP_SIMD_LANE are
4034 linear too. */
4035 if (POINTER_TYPE_P (TREE_TYPE (op))
4036 && !thisarginfo.linear_step
4037 && !vec_stmt
4038 && thisarginfo.dt != vect_constant_def
4039 && thisarginfo.dt != vect_external_def
4040 && loop_vinfo
4041 && TREE_CODE (op) == SSA_NAME)
4042 vect_simd_lane_linear (op, loop, &thisarginfo);
4044 arginfo.quick_push (thisarginfo);
4047 poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
4048 unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 1;
4049 unsigned int badness = 0;
4050 struct cgraph_node *bestn = NULL;
4051 if (simd_clone_info.exists ())
4052 bestn = cgraph_node::get (simd_clone_info[0]);
4053 else
4054 for (struct cgraph_node *n = node->simd_clones; n != NULL;
4055 n = n->simdclone->next_clone)
4057 unsigned int this_badness = 0;
4058 unsigned int num_calls;
4059 /* The number of arguments in the call and the number of parameters in
4060 the simdclone should match. However, when the simdclone is
4061 'inbranch', it could have one more parameter than nargs when using
4062 an inbranch simdclone to call a non-inbranch call, either in a
4063 non-masked loop using an all-true constant mask, or inside a masked
4064 loop using its mask. */
4065 size_t simd_nargs = n->simdclone->nargs;
4066 if (!masked_call_offset && n->simdclone->inbranch)
4067 simd_nargs--;
4068 if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
4069 &num_calls)
4070 || (!n->simdclone->inbranch && (masked_call_offset > 0))
4071 || (nargs != simd_nargs))
4072 continue;
4073 if (num_calls != 1)
4074 this_badness += floor_log2 (num_calls) * 4096;
4075 if (n->simdclone->inbranch)
4076 this_badness += 8192;
4077 int target_badness = targetm.simd_clone.usable (n);
4078 if (target_badness < 0)
4079 continue;
4080 this_badness += target_badness * 512;
4081 for (i = 0; i < nargs; i++)
4083 switch (n->simdclone->args[i].arg_type)
4085 case SIMD_CLONE_ARG_TYPE_VECTOR:
4086 if (!useless_type_conversion_p
4087 (n->simdclone->args[i].orig_type,
4088 TREE_TYPE (gimple_call_arg (stmt,
4089 i + masked_call_offset))))
4090 i = -1;
4091 else if (arginfo[i].dt == vect_constant_def
4092 || arginfo[i].dt == vect_external_def
4093 || arginfo[i].linear_step)
4094 this_badness += 64;
4095 break;
4096 case SIMD_CLONE_ARG_TYPE_UNIFORM:
4097 if (arginfo[i].dt != vect_constant_def
4098 && arginfo[i].dt != vect_external_def)
4099 i = -1;
4100 break;
4101 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4102 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4103 if (arginfo[i].dt == vect_constant_def
4104 || arginfo[i].dt == vect_external_def
4105 || (arginfo[i].linear_step
4106 != n->simdclone->args[i].linear_step))
4107 i = -1;
4108 break;
4109 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
4110 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
4111 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
4112 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_VARIABLE_STEP:
4113 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
4114 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
4115 /* FORNOW */
4116 i = -1;
4117 break;
4118 case SIMD_CLONE_ARG_TYPE_MASK:
4119 /* While we can create a traditional data vector from
4120 an incoming integer mode mask, we have no good way to
4121 force-generate an integer mode mask from a traditional
4122 boolean vector input. */
4123 if (SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4124 && !SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
4125 i = -1;
4126 else if (!SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4127 && SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
4128 this_badness += 2048;
4129 break;
4131 if (i == (size_t) -1)
4132 break;
4133 if (n->simdclone->args[i].alignment > arginfo[i].align)
4135 i = -1;
4136 break;
4138 if (arginfo[i].align)
4139 this_badness += (exact_log2 (arginfo[i].align)
4140 - exact_log2 (n->simdclone->args[i].alignment));
4142 if (i == (size_t) -1)
4143 continue;
4144 if (masked_call_offset == 0
4145 && n->simdclone->inbranch
4146 && n->simdclone->nargs > nargs)
4148 gcc_assert (n->simdclone->args[n->simdclone->nargs - 1].arg_type ==
4149 SIMD_CLONE_ARG_TYPE_MASK);
4150 /* Penalize using a masked SIMD clone for a call that is not in a
4151 branch in a non-masked loop, as we'd have to construct an all-true mask. */
4152 if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4153 this_badness += 64;
4155 if (bestn == NULL || this_badness < badness)
4157 bestn = n;
4158 badness = this_badness;
4162 if (bestn == NULL)
4163 return false;
4165 unsigned int num_mask_args = 0;
4166 if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4167 for (i = 0; i < nargs; i++)
4168 if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
4169 num_mask_args++;
4171 for (i = 0; i < nargs; i++)
4173 if ((arginfo[i].dt == vect_constant_def
4174 || arginfo[i].dt == vect_external_def)
4175 && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
4177 tree arg_type = TREE_TYPE (gimple_call_arg (stmt,
4178 i + masked_call_offset));
4179 arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
4180 slp_node);
4181 if (arginfo[i].vectype == NULL
4182 || !constant_multiple_p (bestn->simdclone->simdlen,
4183 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4184 return false;
4187 if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR
4188 && VECTOR_BOOLEAN_TYPE_P (bestn->simdclone->args[i].vector_type))
4190 if (dump_enabled_p ())
4191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4192 "vector mask arguments are not supported.\n");
4193 return false;
4196 if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
4198 tree clone_arg_vectype = bestn->simdclone->args[i].vector_type;
4199 if (bestn->simdclone->mask_mode == VOIDmode)
4201 if (maybe_ne (TYPE_VECTOR_SUBPARTS (clone_arg_vectype),
4202 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4204 /* FORNOW we only have partial support for vector-type masks
4205 that can't hold all of simdlen. */
4206 if (dump_enabled_p ())
4207 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4208 vect_location,
4209 "in-branch vector clones are not yet"
4210 " supported for mismatched vector sizes.\n");
4211 return false;
4213 if (!expand_vec_cond_expr_p (clone_arg_vectype,
4214 arginfo[i].vectype, ERROR_MARK))
4216 if (dump_enabled_p ())
4217 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4218 vect_location,
4219 "cannot compute mask argument for"
4220 " in-branch vector clones.\n");
4221 return false;
4224 else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4226 if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
4227 || maybe_ne (exact_div (bestn->simdclone->simdlen,
4228 num_mask_args),
4229 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4231 /* FORNOW we only have partial support for integer-type masks
4232 that represent the same number of lanes as the
4233 vectorized mask inputs. */
4234 if (dump_enabled_p ())
4235 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4236 vect_location,
4237 "in-branch vector clones are not yet "
4238 "supported for mismatched vector sizes.\n");
4239 return false;
4242 else
4244 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4246 vect_location,
4247 "in-branch vector clones not supported"
4248 " on this target.\n");
4249 return false;
4254 fndecl = bestn->decl;
4255 nunits = bestn->simdclone->simdlen;
4256 if (slp_node)
4257 ncopies = vector_unroll_factor (vf * group_size, nunits);
4258 else
4259 ncopies = vector_unroll_factor (vf, nunits);
4261 /* If the function isn't const, only allow it in simd loops where the user
4262 has asserted that at least nunits consecutive iterations can be
4263 performed using SIMD instructions. */
4264 if ((loop == NULL || maybe_lt ((unsigned) loop->safelen, nunits))
4265 && gimple_vuse (stmt))
4266 return false;
4268 /* Sanity check: make sure that at least one copy of the vectorized stmt
4269 needs to be generated. */
4270 gcc_assert (ncopies >= 1);
4272 if (!vec_stmt) /* transformation not required. */
4274 if (slp_node)
4275 for (unsigned i = 0; i < nargs; ++i)
4276 if (!vect_maybe_update_slp_op_vectype (slp_op[i], arginfo[i].vectype))
4278 if (dump_enabled_p ())
4279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4280 "incompatible vector types for invariants\n");
4281 return false;
4283 /* When the original call is pure or const but the SIMD ABI dictates
4284 an aggregate return we will have to use a virtual definition and
4285 in a loop eventually even need to add a virtual PHI. That's
4286 not straightforward, so allow fixing this up via renaming. */
4287 if (gimple_call_lhs (stmt)
4288 && !gimple_vdef (stmt)
4289 && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
4290 vinfo->any_known_not_updated_vssa = true;
4291 /* ??? For SLP code-gen we end up inserting after the last
4292 vector argument def rather than at the original call position
4293 so automagic virtual operand updating doesn't work. */
4294 if (gimple_vuse (stmt) && slp_node)
4295 vinfo->any_known_not_updated_vssa = true;
4296 simd_clone_info.safe_push (bestn->decl);
4297 for (i = 0; i < bestn->simdclone->nargs; i++)
4299 switch (bestn->simdclone->args[i].arg_type)
4301 default:
4302 continue;
4303 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4304 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4306 simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
4307 simd_clone_info.safe_push (arginfo[i].op);
4308 tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
4309 ? size_type_node : TREE_TYPE (arginfo[i].op);
4310 tree ls = build_int_cst (lst, arginfo[i].linear_step);
4311 simd_clone_info.safe_push (ls);
4312 tree sll = arginfo[i].simd_lane_linear
4313 ? boolean_true_node : boolean_false_node;
4314 simd_clone_info.safe_push (sll);
4316 break;
4317 case SIMD_CLONE_ARG_TYPE_MASK:
4318 if (loop_vinfo
4319 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
4320 vect_record_loop_mask (loop_vinfo,
4321 &LOOP_VINFO_MASKS (loop_vinfo),
4322 ncopies, vectype, op);
4324 break;
4328 if (!bestn->simdclone->inbranch && loop_vinfo)
4330 if (dump_enabled_p ()
4331 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
4332 dump_printf_loc (MSG_NOTE, vect_location,
4333 "can't use a fully-masked loop because a"
4334 " non-masked simd clone was selected.\n");
4335 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
4338 STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
4339 DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
4340 /* vect_model_simple_cost (vinfo, stmt_info, ncopies,
4341 dt, slp_node, cost_vec); */
4342 return true;
4345 /* Transform. */
4347 if (dump_enabled_p ())
4348 dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
4350 /* Handle def. */
4351 scalar_dest = gimple_call_lhs (stmt);
4352 vec_dest = NULL_TREE;
4353 rtype = NULL_TREE;
4354 ratype = NULL_TREE;
4355 if (scalar_dest)
4357 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4358 rtype = TREE_TYPE (TREE_TYPE (fndecl));
4359 if (TREE_CODE (rtype) == ARRAY_TYPE)
4361 ratype = rtype;
4362 rtype = TREE_TYPE (ratype);
4366 auto_vec<vec<tree> > vec_oprnds;
4367 auto_vec<unsigned> vec_oprnds_i;
4368 vec_oprnds_i.safe_grow_cleared (nargs, true);
4369 if (slp_node)
4371 vec_oprnds.reserve_exact (nargs);
4372 vect_get_slp_defs (vinfo, slp_node, &vec_oprnds);
4374 else
4375 vec_oprnds.safe_grow_cleared (nargs, true);
4376 for (j = 0; j < ncopies; ++j)
4378 poly_uint64 callee_nelements;
4379 poly_uint64 caller_nelements;
4380 /* Build argument list for the vectorized call. */
4381 if (j == 0)
4382 vargs.create (nargs);
4383 else
4384 vargs.truncate (0);
4386 for (i = 0; i < nargs; i++)
4388 unsigned int k, l, m, o;
4389 tree atype;
4390 op = gimple_call_arg (stmt, i + masked_call_offset);
4391 switch (bestn->simdclone->args[i].arg_type)
4393 case SIMD_CLONE_ARG_TYPE_VECTOR:
4394 atype = bestn->simdclone->args[i].vector_type;
4395 caller_nelements = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
4396 callee_nelements = TYPE_VECTOR_SUBPARTS (atype);
4397 o = vector_unroll_factor (nunits, callee_nelements);
4398 for (m = j * o; m < (j + 1) * o; m++)
4400 if (known_lt (callee_nelements, caller_nelements))
4402 poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (atype));
4403 if (!constant_multiple_p (caller_nelements,
4404 callee_nelements, &k))
4405 gcc_unreachable ();
4407 gcc_assert ((k & (k - 1)) == 0);
4408 if (m == 0)
4410 if (!slp_node)
4411 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4412 ncopies * o / k, op,
4413 &vec_oprnds[i]);
4414 vec_oprnds_i[i] = 0;
4415 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4417 else
4419 vec_oprnd0 = arginfo[i].op;
4420 if ((m & (k - 1)) == 0)
4421 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4423 arginfo[i].op = vec_oprnd0;
4424 vec_oprnd0
4425 = build3 (BIT_FIELD_REF, atype, vec_oprnd0,
4426 bitsize_int (prec),
4427 bitsize_int ((m & (k - 1)) * prec));
4428 gassign *new_stmt
4429 = gimple_build_assign (make_ssa_name (atype),
4430 vec_oprnd0);
4431 vect_finish_stmt_generation (vinfo, stmt_info,
4432 new_stmt, gsi);
4433 vargs.safe_push (gimple_assign_lhs (new_stmt));
4435 else
4437 if (!constant_multiple_p (callee_nelements,
4438 caller_nelements, &k))
4439 gcc_unreachable ();
4440 gcc_assert ((k & (k - 1)) == 0);
4441 vec<constructor_elt, va_gc> *ctor_elts;
4442 if (k != 1)
4443 vec_alloc (ctor_elts, k);
4444 else
4445 ctor_elts = NULL;
4446 for (l = 0; l < k; l++)
4448 if (m == 0 && l == 0)
4450 if (!slp_node)
4451 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4452 k * o * ncopies,
4454 &vec_oprnds[i]);
4455 vec_oprnds_i[i] = 0;
4456 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4458 else
4459 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4460 arginfo[i].op = vec_oprnd0;
4461 if (k == 1)
4462 break;
4463 CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE,
4464 vec_oprnd0);
4466 if (k == 1)
4467 if (!useless_type_conversion_p (TREE_TYPE (vec_oprnd0),
4468 atype))
4470 vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, atype,
4471 vec_oprnd0);
4472 gassign *new_stmt
4473 = gimple_build_assign (make_ssa_name (atype),
4474 vec_oprnd0);
4475 vect_finish_stmt_generation (vinfo, stmt_info,
4476 new_stmt, gsi);
4477 vargs.safe_push (gimple_get_lhs (new_stmt));
4479 else
4480 vargs.safe_push (vec_oprnd0);
4481 else
4483 vec_oprnd0 = build_constructor (atype, ctor_elts);
4484 gassign *new_stmt
4485 = gimple_build_assign (make_ssa_name (atype),
4486 vec_oprnd0);
4487 vect_finish_stmt_generation (vinfo, stmt_info,
4488 new_stmt, gsi);
4489 vargs.safe_push (gimple_assign_lhs (new_stmt));
4493 break;
4494 case SIMD_CLONE_ARG_TYPE_MASK:
4495 if (bestn->simdclone->mask_mode == VOIDmode)
4497 atype = bestn->simdclone->args[i].vector_type;
4498 tree elt_type = TREE_TYPE (atype);
4499 tree one = fold_convert (elt_type, integer_one_node);
4500 tree zero = fold_convert (elt_type, integer_zero_node);
4501 callee_nelements = TYPE_VECTOR_SUBPARTS (atype);
4502 caller_nelements = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
4503 o = vector_unroll_factor (nunits, callee_nelements);
4504 for (m = j * o; m < (j + 1) * o; m++)
4506 if (maybe_lt (callee_nelements, caller_nelements))
4508 /* The mask type has fewer elements than simdlen. */
4510 /* FORNOW */
4511 gcc_unreachable ();
4513 else if (known_eq (callee_nelements, caller_nelements))
4515 /* The SIMD clone function has the same number of
4516 elements as the current function. */
4517 if (m == 0)
4519 if (!slp_node)
4520 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4521 o * ncopies,
4523 &vec_oprnds[i]);
4524 vec_oprnds_i[i] = 0;
4526 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4527 if (loop_vinfo
4528 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4530 vec_loop_masks *loop_masks
4531 = &LOOP_VINFO_MASKS (loop_vinfo);
4532 tree loop_mask
4533 = vect_get_loop_mask (loop_vinfo, gsi,
4534 loop_masks, ncopies,
4535 vectype, j);
4536 vec_oprnd0
4537 = prepare_vec_mask (loop_vinfo,
4538 TREE_TYPE (loop_mask),
4539 loop_mask, vec_oprnd0,
4540 gsi);
4541 loop_vinfo->vec_cond_masked_set.add ({ vec_oprnd0,
4542 loop_mask });
4545 vec_oprnd0
4546 = build3 (VEC_COND_EXPR, atype, vec_oprnd0,
4547 build_vector_from_val (atype, one),
4548 build_vector_from_val (atype, zero));
4549 gassign *new_stmt
4550 = gimple_build_assign (make_ssa_name (atype),
4551 vec_oprnd0);
4552 vect_finish_stmt_generation (vinfo, stmt_info,
4553 new_stmt, gsi);
4554 vargs.safe_push (gimple_assign_lhs (new_stmt));
4556 else
4558 /* The mask type has more elements than simdlen. */
4560 /* FORNOW */
4561 gcc_unreachable ();
4565 else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4567 atype = bestn->simdclone->args[i].vector_type;
4568 /* Guess the number of lanes represented by atype. */
4569 poly_uint64 atype_subparts
4570 = exact_div (bestn->simdclone->simdlen,
4571 num_mask_args);
4572 o = vector_unroll_factor (nunits, atype_subparts);
4573 for (m = j * o; m < (j + 1) * o; m++)
4575 if (m == 0)
4577 if (!slp_node)
4578 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4579 o * ncopies,
4581 &vec_oprnds[i]);
4582 vec_oprnds_i[i] = 0;
4584 if (maybe_lt (atype_subparts,
4585 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4587 /* The mask argument has fewer elements than the
4588 input vector. */
4589 /* FORNOW */
4590 gcc_unreachable ();
4592 else if (known_eq (atype_subparts,
4593 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4595 /* The vector mask argument matches the input
4596 in the number of lanes, but not necessarily
4597 in the mode. */
4598 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4599 tree st = lang_hooks.types.type_for_mode
4600 (TYPE_MODE (TREE_TYPE (vec_oprnd0)), 1);
4601 vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, st,
4602 vec_oprnd0);
4603 gassign *new_stmt
4604 = gimple_build_assign (make_ssa_name (st),
4605 vec_oprnd0);
4606 vect_finish_stmt_generation (vinfo, stmt_info,
4607 new_stmt, gsi);
4608 if (!types_compatible_p (atype, st))
4610 new_stmt
4611 = gimple_build_assign (make_ssa_name (atype),
4612 NOP_EXPR,
4613 gimple_assign_lhs
4614 (new_stmt));
4615 vect_finish_stmt_generation (vinfo, stmt_info,
4616 new_stmt, gsi);
4618 vargs.safe_push (gimple_assign_lhs (new_stmt));
4620 else
4622 /* The mask argument has more elements than the
4623 input vector. */
4624 /* FORNOW */
4625 gcc_unreachable ();
4629 else
4630 gcc_unreachable ();
4631 break;
4632 case SIMD_CLONE_ARG_TYPE_UNIFORM:
4633 vargs.safe_push (op);
4634 break;
4635 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4636 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4637 if (j == 0)
4639 gimple_seq stmts;
4640 arginfo[i].op
4641 = force_gimple_operand (unshare_expr (arginfo[i].op),
4642 &stmts, true, NULL_TREE);
4643 if (stmts != NULL)
4645 basic_block new_bb;
4646 edge pe = loop_preheader_edge (loop);
4647 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4648 gcc_assert (!new_bb);
4650 if (arginfo[i].simd_lane_linear)
4652 vargs.safe_push (arginfo[i].op);
4653 break;
4655 tree phi_res = copy_ssa_name (op);
4656 gphi *new_phi = create_phi_node (phi_res, loop->header);
4657 add_phi_arg (new_phi, arginfo[i].op,
4658 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4659 enum tree_code code
4660 = POINTER_TYPE_P (TREE_TYPE (op))
4661 ? POINTER_PLUS_EXPR : PLUS_EXPR;
4662 tree type = POINTER_TYPE_P (TREE_TYPE (op))
4663 ? sizetype : TREE_TYPE (op);
4664 poly_widest_int cst
4665 = wi::mul (bestn->simdclone->args[i].linear_step,
4666 ncopies * nunits);
4667 tree tcst = wide_int_to_tree (type, cst);
4668 tree phi_arg = copy_ssa_name (op);
4669 gassign *new_stmt
4670 = gimple_build_assign (phi_arg, code, phi_res, tcst);
4671 gimple_stmt_iterator si = gsi_after_labels (loop->header);
4672 gsi_insert_after (&si, new_stmt, GSI_NEW_STMT);
4673 add_phi_arg (new_phi, phi_arg, loop_latch_edge (loop),
4674 UNKNOWN_LOCATION);
4675 arginfo[i].op = phi_res;
4676 vargs.safe_push (phi_res);
4678 else
4680 enum tree_code code
4681 = POINTER_TYPE_P (TREE_TYPE (op))
4682 ? POINTER_PLUS_EXPR : PLUS_EXPR;
4683 tree type = POINTER_TYPE_P (TREE_TYPE (op))
4684 ? sizetype : TREE_TYPE (op);
4685 poly_widest_int cst
4686 = wi::mul (bestn->simdclone->args[i].linear_step,
4687 j * nunits);
4688 tree tcst = wide_int_to_tree (type, cst);
4689 new_temp = make_ssa_name (TREE_TYPE (op));
4690 gassign *new_stmt
4691 = gimple_build_assign (new_temp, code,
4692 arginfo[i].op, tcst);
4693 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4694 vargs.safe_push (new_temp);
4696 break;
4697 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
4698 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
4699 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
4700 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_VARIABLE_STEP:
4701 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
4702 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
4703 default:
4704 gcc_unreachable ();
4708 if (masked_call_offset == 0
4709 && bestn->simdclone->inbranch
4710 && bestn->simdclone->nargs > nargs)
4712 unsigned long m, o;
4713 size_t mask_i = bestn->simdclone->nargs - 1;
4714 tree mask;
4715 gcc_assert (bestn->simdclone->args[mask_i].arg_type ==
4716 SIMD_CLONE_ARG_TYPE_MASK);
4718 tree masktype = bestn->simdclone->args[mask_i].vector_type;
4719 callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
4720 o = vector_unroll_factor (nunits, callee_nelements);
4721 for (m = j * o; m < (j + 1) * o; m++)
4723 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4725 vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
4726 mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
4727 ncopies, vectype, j);
4729 else
4730 mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
4732 gassign *new_stmt;
4733 if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4735 /* This means we are dealing with integer mask modes.
4736 First convert to an integer type with the same size as
4737 the current vector type. */
4738 unsigned HOST_WIDE_INT intermediate_size
4739 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (mask)));
4740 tree mid_int_type =
4741 build_nonstandard_integer_type (intermediate_size, 1);
4742 mask = build1 (VIEW_CONVERT_EXPR, mid_int_type, mask);
4743 new_stmt
4744 = gimple_build_assign (make_ssa_name (mid_int_type),
4745 mask);
4746 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4747 /* Then zero-extend to the mask mode. */
4748 mask = fold_build1 (NOP_EXPR, masktype,
4749 gimple_get_lhs (new_stmt));
4751 else if (bestn->simdclone->mask_mode == VOIDmode)
4753 tree one = fold_convert (TREE_TYPE (masktype),
4754 integer_one_node);
4755 tree zero = fold_convert (TREE_TYPE (masktype),
4756 integer_zero_node);
4757 mask = build3 (VEC_COND_EXPR, masktype, mask,
4758 build_vector_from_val (masktype, one),
4759 build_vector_from_val (masktype, zero));
4761 else
4762 gcc_unreachable ();
4764 new_stmt = gimple_build_assign (make_ssa_name (masktype), mask);
4765 vect_finish_stmt_generation (vinfo, stmt_info,
4766 new_stmt, gsi);
4767 mask = gimple_assign_lhs (new_stmt);
4768 vargs.safe_push (mask);
4772 gcall *new_call = gimple_build_call_vec (fndecl, vargs);
4773 if (vec_dest)
4775 gcc_assert (ratype
4776 || known_eq (TYPE_VECTOR_SUBPARTS (rtype), nunits));
4777 if (ratype)
4778 new_temp = create_tmp_var (ratype);
4779 else if (useless_type_conversion_p (vectype, rtype))
4780 new_temp = make_ssa_name (vec_dest, new_call);
4781 else
4782 new_temp = make_ssa_name (rtype, new_call);
4783 gimple_call_set_lhs (new_call, new_temp);
4785 vect_finish_stmt_generation (vinfo, stmt_info, new_call, gsi);
4786 gimple *new_stmt = new_call;
4788 if (vec_dest)
4790 if (!multiple_p (TYPE_VECTOR_SUBPARTS (vectype), nunits))
4792 unsigned int k, l;
4793 poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (vectype));
4794 poly_uint64 bytes = GET_MODE_SIZE (TYPE_MODE (vectype));
4795 k = vector_unroll_factor (nunits,
4796 TYPE_VECTOR_SUBPARTS (vectype));
4797 gcc_assert ((k & (k - 1)) == 0);
4798 for (l = 0; l < k; l++)
4800 tree t;
4801 if (ratype)
4803 t = build_fold_addr_expr (new_temp);
4804 t = build2 (MEM_REF, vectype, t,
4805 build_int_cst (TREE_TYPE (t), l * bytes));
4807 else
4808 t = build3 (BIT_FIELD_REF, vectype, new_temp,
4809 bitsize_int (prec), bitsize_int (l * prec));
4810 new_stmt = gimple_build_assign (make_ssa_name (vectype), t);
4811 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4813 if (j == 0 && l == 0)
4814 *vec_stmt = new_stmt;
4815 if (slp_node)
4816 SLP_TREE_VEC_DEFS (slp_node)
4817 .quick_push (gimple_assign_lhs (new_stmt));
4818 else
4819 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
4822 if (ratype)
4823 vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
4824 continue;
4826 else if (!multiple_p (nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4828 unsigned int k;
4829 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
4830 TYPE_VECTOR_SUBPARTS (rtype), &k))
4831 gcc_unreachable ();
4832 gcc_assert ((k & (k - 1)) == 0);
4833 if ((j & (k - 1)) == 0)
4834 vec_alloc (ret_ctor_elts, k);
4835 if (ratype)
4837 unsigned int m, o;
4838 o = vector_unroll_factor (nunits,
4839 TYPE_VECTOR_SUBPARTS (rtype));
4840 for (m = 0; m < o; m++)
4842 tree tem = build4 (ARRAY_REF, rtype, new_temp,
4843 size_int (m), NULL_TREE, NULL_TREE);
4844 new_stmt = gimple_build_assign (make_ssa_name (rtype),
4845 tem);
4846 vect_finish_stmt_generation (vinfo, stmt_info,
4847 new_stmt, gsi);
4848 CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE,
4849 gimple_assign_lhs (new_stmt));
4851 vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
4853 else
4854 CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE, new_temp);
4855 if ((j & (k - 1)) != k - 1)
4856 continue;
4857 vec_oprnd0 = build_constructor (vectype, ret_ctor_elts);
4858 new_stmt
4859 = gimple_build_assign (make_ssa_name (vec_dest), vec_oprnd0);
4860 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4862 if ((unsigned) j == k - 1)
4863 *vec_stmt = new_stmt;
4864 if (slp_node)
4865 SLP_TREE_VEC_DEFS (slp_node)
4866 .quick_push (gimple_assign_lhs (new_stmt));
4867 else
4868 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
4869 continue;
4871 else if (ratype)
4873 tree t = build_fold_addr_expr (new_temp);
4874 t = build2 (MEM_REF, vectype, t,
4875 build_int_cst (TREE_TYPE (t), 0));
4876 new_stmt = gimple_build_assign (make_ssa_name (vec_dest), t);
4877 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4878 vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
4880 else if (!useless_type_conversion_p (vectype, rtype))
4882 vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, vectype, new_temp);
4883 new_stmt
4884 = gimple_build_assign (make_ssa_name (vec_dest), vec_oprnd0);
4885 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4889 if (j == 0)
4890 *vec_stmt = new_stmt;
4891 if (slp_node)
4892 SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
4893 else
4894 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
4897 for (i = 0; i < nargs; ++i)
4899 vec<tree> oprndsi = vec_oprnds[i];
4900 oprndsi.release ();
4902 vargs.release ();
4904 /* Mark the clone as no longer being a candidate for GC. */
4905 bestn->gc_candidate = false;
4907 /* The call in STMT might prevent it from being removed in dce.
4908 We however cannot remove it here, due to the way the ssa name
4909 it defines is mapped to the new definition. So just replace
4910 the rhs of the statement with something harmless. */
4912 if (slp_node)
4913 return true;
4915 gimple *new_stmt;
4916 if (scalar_dest)
4918 type = TREE_TYPE (scalar_dest);
4919 lhs = gimple_call_lhs (vect_orig_stmt (stmt_info)->stmt);
4920 new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
4922 else
4923 new_stmt = gimple_build_nop ();
4924 vinfo->replace_stmt (gsi, vect_orig_stmt (stmt_info), new_stmt);
4925 unlink_stmt_vdef (stmt);
4927 return true;
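/* Illustrative sketch, not part of GCC: the overall shape of the code that
   vectorizable_simd_clone_call emits.  "f_simd4" stands in for a selected
   simd clone (e.g. a _ZGVbN4v_* symbol) with simdlen 4; the std::array
   "vectors", the width and all names are assumptions made purely for
   illustration.  An inbranch clone would additionally receive a mask
   argument built from the loop mask or from an all-ones constant.  */
#include <array>

static float
f (float x)                          /* the original scalar callee */
{
  return x * x + 1.0f;
}

static std::array<float, 4>
f_simd4 (std::array<float, 4> v)     /* stand-in for the simd clone */
{
  for (float &e : v)
    e = f (e);
  return v;
}

static void
simd_clone_call_model (float *out, const float *in, int n)
{
  /* Assume n is a multiple of the clone's simdlen.  Each vector-loop
     iteration gathers one vector argument and issues one clone call.  */
  for (int i = 0; i < n; i += 4)
    {
      std::array<float, 4> arg;
      for (int l = 0; l < 4; ++l)
        arg[l] = in[i + l];
      std::array<float, 4> res = f_simd4 (arg);
      for (int l = 0; l < 4; ++l)
        out[i + l] = res[l];
    }
}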
4931 /* Function vect_gen_widened_results_half
4933 Create a vector stmt whose code, number of arguments, and result
4934 variable are CH, OP_TYPE, and VEC_DEST, and whose arguments are
4935 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at GSI.
4936 If CH names an internal function rather than a tree code, a call to
4937 that internal function is generated instead of an assignment.
4938 STMT_INFO is the original scalar stmt that we are vectorizing. */
4940 static gimple *
4941 vect_gen_widened_results_half (vec_info *vinfo, code_helper ch,
4942 tree vec_oprnd0, tree vec_oprnd1, int op_type,
4943 tree vec_dest, gimple_stmt_iterator *gsi,
4944 stmt_vec_info stmt_info)
4946 gimple *new_stmt;
4947 tree new_temp;
4949 /* Generate half of the widened result: */
4950 if (op_type != binary_op)
4951 vec_oprnd1 = NULL;
4952 new_stmt = vect_gimple_build (vec_dest, ch, vec_oprnd0, vec_oprnd1);
4953 new_temp = make_ssa_name (vec_dest, new_stmt);
4954 gimple_set_lhs (new_stmt, new_temp);
4955 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4957 return new_stmt;
4961 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4962 For multi-step conversions store the resulting vectors and call the function
4963 recursively. When NARROW_SRC_P is true, there is still a conversion after
4964 narrowing, so don't store the vectors in the SLP_NODE or in the vector info
4965 of the scalar statement (or in the STMT_VINFO_RELATED_STMT chain). */
4967 static void
4968 vect_create_vectorized_demotion_stmts (vec_info *vinfo, vec<tree> *vec_oprnds,
4969 int multi_step_cvt,
4970 stmt_vec_info stmt_info,
4971 vec<tree> &vec_dsts,
4972 gimple_stmt_iterator *gsi,
4973 slp_tree slp_node, code_helper code,
4974 bool narrow_src_p)
4976 unsigned int i;
4977 tree vop0, vop1, new_tmp, vec_dest;
4979 vec_dest = vec_dsts.pop ();
4981 for (i = 0; i < vec_oprnds->length (); i += 2)
4983 /* Create demotion operation. */
4984 vop0 = (*vec_oprnds)[i];
4985 vop1 = (*vec_oprnds)[i + 1];
4986 gimple *new_stmt = vect_gimple_build (vec_dest, code, vop0, vop1);
4987 new_tmp = make_ssa_name (vec_dest, new_stmt);
4988 gimple_set_lhs (new_stmt, new_tmp);
4989 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4990 if (multi_step_cvt || narrow_src_p)
4991 /* Store the resulting vector for the next recursive call,
4992 or return the resulting vector_tmp for NARROW FLOAT_EXPR. */
4993 (*vec_oprnds)[i/2] = new_tmp;
4994 else
4996 /* This is the last step of the conversion sequence. Store the
4997 vectors in SLP_NODE or in vector info of the scalar statement
4998 (or in STMT_VINFO_RELATED_STMT chain). */
4999 if (slp_node)
5000 slp_node->push_vec_def (new_stmt);
5001 else
5002 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5006 /* For multi-step demotion operations we first generate demotion operations
5007 from the source type to the intermediate types, and then combine the
5008 results (stored in VEC_OPRNDS) in demotion operation to the destination
5009 type. */
5010 if (multi_step_cvt)
5012 /* At each level of recursion we have half of the operands we had at the
5013 previous level. */
5014 vec_oprnds->truncate ((i+1)/2);
5015 vect_create_vectorized_demotion_stmts (vinfo, vec_oprnds,
5016 multi_step_cvt - 1,
5017 stmt_info, vec_dsts, gsi,
5018 slp_node, VEC_PACK_TRUNC_EXPR,
5019 narrow_src_p);
5022 vec_dsts.quick_push (vec_dest);
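/* Illustrative sketch, not part of GCC: one demotion step above in scalar
   terms.  Two vectors of a wider element type are combined into a single
   vector of a narrower type (the VEC_PACK_TRUNC_EXPR shape); for a
   multi-step demotion the outputs are fed back through the same routine.
   The fixed widths, the lane order (which is really target-dependent) and
   the names are assumptions made purely for illustration.  */
#include <array>
#include <cstdint>

static std::array<int16_t, 8>
pack_trunc_model (const std::array<int32_t, 4> &lo,
                  const std::array<int32_t, 4> &hi)
{
  std::array<int16_t, 8> out;
  for (int i = 0; i < 4; ++i)
    {
      out[i] = (int16_t) lo[i];        /* truncate lanes of the first input */
      out[i + 4] = (int16_t) hi[i];    /* truncate lanes of the second input */
    }
  return out;
}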
5026 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
5027 and VEC_OPRNDS1, for a binary operation associated with scalar statement
5028 STMT_INFO. For multi-step conversions store the resulting vectors and
5029 call the function recursively. */
5031 static void
5032 vect_create_vectorized_promotion_stmts (vec_info *vinfo,
5033 vec<tree> *vec_oprnds0,
5034 vec<tree> *vec_oprnds1,
5035 stmt_vec_info stmt_info, tree vec_dest,
5036 gimple_stmt_iterator *gsi,
5037 code_helper ch1,
5038 code_helper ch2, int op_type)
5040 int i;
5041 tree vop0, vop1, new_tmp1, new_tmp2;
5042 gimple *new_stmt1, *new_stmt2;
5043 vec<tree> vec_tmp = vNULL;
5045 vec_tmp.create (vec_oprnds0->length () * 2);
5046 FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
5048 if (op_type == binary_op)
5049 vop1 = (*vec_oprnds1)[i];
5050 else
5051 vop1 = NULL_TREE;
5053 /* Generate the two halves of promotion operation. */
5054 new_stmt1 = vect_gen_widened_results_half (vinfo, ch1, vop0, vop1,
5055 op_type, vec_dest, gsi,
5056 stmt_info);
5057 new_stmt2 = vect_gen_widened_results_half (vinfo, ch2, vop0, vop1,
5058 op_type, vec_dest, gsi,
5059 stmt_info);
5060 if (is_gimple_call (new_stmt1))
5062 new_tmp1 = gimple_call_lhs (new_stmt1);
5063 new_tmp2 = gimple_call_lhs (new_stmt2);
5065 else
5067 new_tmp1 = gimple_assign_lhs (new_stmt1);
5068 new_tmp2 = gimple_assign_lhs (new_stmt2);
5071 /* Store the results for the next step. */
5072 vec_tmp.quick_push (new_tmp1);
5073 vec_tmp.quick_push (new_tmp2);
5076 vec_oprnds0->release ();
5077 *vec_oprnds0 = vec_tmp;
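/* Illustrative sketch, not part of GCC: one promotion step above in scalar
   terms.  A widening binary operation on narrow inputs is split into a
   "lo" and a "hi" half (the CH1/CH2 pair built via
   vect_gen_widened_results_half), each producing a full vector of the
   wider type from half of the input lanes.  The widths, the lane split
   (really target-dependent) and the names are assumptions made purely for
   illustration.  */
#include <array>
#include <cstdint>
#include <utility>

static std::pair<std::array<int32_t, 4>, std::array<int32_t, 4>>
widen_mult_model (const std::array<int16_t, 8> &a,
                  const std::array<int16_t, 8> &b)
{
  std::array<int32_t, 4> lo, hi;
  for (int i = 0; i < 4; ++i)
    {
      lo[i] = (int32_t) a[i] * b[i];            /* "lo half" result (CH1) */
      hi[i] = (int32_t) a[i + 4] * b[i + 4];    /* "hi half" result (CH2) */
    }
  return { lo, hi };
}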
5080 /* Create vectorized promotion stmts for widening stmts using only half the
5081 potential vector size for input. */
5082 static void
5083 vect_create_half_widening_stmts (vec_info *vinfo,
5084 vec<tree> *vec_oprnds0,
5085 vec<tree> *vec_oprnds1,
5086 stmt_vec_info stmt_info, tree vec_dest,
5087 gimple_stmt_iterator *gsi,
5088 code_helper code1,
5089 int op_type)
5091 int i;
5092 tree vop0, vop1;
5093 gimple *new_stmt1;
5094 gimple *new_stmt2;
5095 gimple *new_stmt3;
5096 vec<tree> vec_tmp = vNULL;
5098 vec_tmp.create (vec_oprnds0->length ());
5099 FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
5101 tree new_tmp1, new_tmp2, new_tmp3, out_type;
5103 gcc_assert (op_type == binary_op);
5104 vop1 = (*vec_oprnds1)[i];
5106 /* Widen the first vector input. */
5107 out_type = TREE_TYPE (vec_dest);
5108 new_tmp1 = make_ssa_name (out_type);
5109 new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
5110 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
5111 if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
5113 /* Widen the second vector input. */
5114 new_tmp2 = make_ssa_name (out_type);
5115 new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
5116 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
5117 /* Perform the operation with both vector inputs widened. */
5118 new_stmt3 = vect_gimple_build (vec_dest, code1, new_tmp1, new_tmp2);
5120 else
5122 /* Perform the operation with the single vector input widened. */
5123 new_stmt3 = vect_gimple_build (vec_dest, code1, new_tmp1, vop1);
5126 new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
5127 gimple_assign_set_lhs (new_stmt3, new_tmp3);
5128 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
5130 /* Store the results for the next step. */
5131 vec_tmp.quick_push (new_tmp3);
5134 vec_oprnds0->release ();
5135 *vec_oprnds0 = vec_tmp;
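/* Illustrative sketch, not part of GCC: the half-widening scheme above in
   scalar terms.  Instead of a widening operation that yields two result
   vectors, each input is first converted to the wider type (the NOP_EXPR
   stmts above) and the operation is then carried out at the wider type,
   producing a single result vector.  The widths, the use of addition and
   the names are assumptions made purely for illustration.  */
#include <array>
#include <cstdint>

static std::array<int32_t, 4>
half_widening_add_model (const std::array<int16_t, 4> &a,
                         const std::array<int16_t, 4> &b)
{
  std::array<int32_t, 4> out;
  for (int i = 0; i < 4; ++i)
    out[i] = (int32_t) a[i] + (int32_t) b[i];
  return out;
}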
5139 /* Check if STMT_INFO performs a conversion operation that can be vectorized.
5140 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5141 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5142 Return true if STMT_INFO is vectorizable in this way. */
5144 static bool
5145 vectorizable_conversion (vec_info *vinfo,
5146 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5147 gimple **vec_stmt, slp_tree slp_node,
5148 stmt_vector_for_cost *cost_vec)
5150 tree vec_dest, cvt_op = NULL_TREE;
5151 tree scalar_dest;
5152 tree op0, op1 = NULL_TREE;
5153 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5154 tree_code tc1, tc2;
5155 code_helper code, code1, code2;
5156 code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
5157 tree new_temp;
5158 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
5159 int ndts = 2;
5160 poly_uint64 nunits_in;
5161 poly_uint64 nunits_out;
5162 tree vectype_out, vectype_in;
5163 int ncopies, i;
5164 tree lhs_type, rhs_type;
5165 /* For conversions between floating point and integer, there are 2 NARROW
5166 cases. NARROW_SRC is for FLOAT_EXPR, meaning
5167 integer --DEMOTION--> integer --FLOAT_EXPR--> floating point.
5168 This is safe when the range of the source integer can fit into the lower
5169 precision. NARROW_DST is for FIX_TRUNC_EXPR, meaning
5170 floating point --FIX_TRUNC_EXPR--> integer --DEMOTION--> integer.
5171 For other conversions, when there's narrowing, NARROW_DST is used as
5172 the default. */
5173 enum { NARROW_SRC, NARROW_DST, NONE, WIDEN } modifier;
5174 vec<tree> vec_oprnds0 = vNULL;
5175 vec<tree> vec_oprnds1 = vNULL;
5176 tree vop0;
5177 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5178 int multi_step_cvt = 0;
5179 vec<tree> interm_types = vNULL;
5180 tree intermediate_type, cvt_type = NULL_TREE;
5181 int op_type;
5182 unsigned short fltsz;
5184 /* Is STMT a vectorizable conversion? */
5186 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
5187 return false;
5189 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
5190 && ! vec_stmt)
5191 return false;
5193 gimple* stmt = stmt_info->stmt;
5194 if (!(is_gimple_assign (stmt) || is_gimple_call (stmt)))
5195 return false;
5197 if (gimple_get_lhs (stmt) == NULL_TREE
5198 || TREE_CODE (gimple_get_lhs (stmt)) != SSA_NAME)
5199 return false;
5204 if (is_gimple_assign (stmt))
5206 code = gimple_assign_rhs_code (stmt);
5207 op_type = TREE_CODE_LENGTH ((tree_code) code);
5209 else if (gimple_call_internal_p (stmt))
5211 code = gimple_call_internal_fn (stmt);
5212 op_type = gimple_call_num_args (stmt);
5214 else
5215 return false;
5217 bool widen_arith = (code == WIDEN_MULT_EXPR
5218 || code == WIDEN_LSHIFT_EXPR
5219 || widening_fn_p (code));
5221 if (!widen_arith
5222 && !CONVERT_EXPR_CODE_P (code)
5223 && code != FIX_TRUNC_EXPR
5224 && code != FLOAT_EXPR)
5225 return false;
5227 /* Check types of lhs and rhs. */
5228 scalar_dest = gimple_get_lhs (stmt);
5229 lhs_type = TREE_TYPE (scalar_dest);
5230 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5232 /* Check the operands of the operation. */
5233 slp_tree slp_op0, slp_op1 = NULL;
5234 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
5235 0, &op0, &slp_op0, &dt[0], &vectype_in))
5237 if (dump_enabled_p ())
5238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5239 "use not simple.\n");
5240 return false;
5243 rhs_type = TREE_TYPE (op0);
5244 if ((code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
5245 && !((INTEGRAL_TYPE_P (lhs_type)
5246 && INTEGRAL_TYPE_P (rhs_type))
5247 || (SCALAR_FLOAT_TYPE_P (lhs_type)
5248 && SCALAR_FLOAT_TYPE_P (rhs_type))))
5249 return false;
5251 if (!VECTOR_BOOLEAN_TYPE_P (vectype_out)
5252 && ((INTEGRAL_TYPE_P (lhs_type)
5253 && !type_has_mode_precision_p (lhs_type))
5254 || (INTEGRAL_TYPE_P (rhs_type)
5255 && !type_has_mode_precision_p (rhs_type))))
5257 if (dump_enabled_p ())
5258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5259 "type conversion to/from bit-precision unsupported."
5260 "\n");
5261 return false;
5264 if (op_type == binary_op)
5266 gcc_assert (code == WIDEN_MULT_EXPR
5267 || code == WIDEN_LSHIFT_EXPR
5268 || widening_fn_p (code));
5270 op1 = is_gimple_assign (stmt) ? gimple_assign_rhs2 (stmt) :
5271 gimple_call_arg (stmt, 0);
5272 tree vectype1_in;
5273 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1,
5274 &op1, &slp_op1, &dt[1], &vectype1_in))
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5278 "use not simple.\n");
5279 return false;
5281 /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
5282 OP1. */
5283 if (!vectype_in)
5284 vectype_in = vectype1_in;
5287 /* If op0 is an external or constant def, infer the vector type
5288 from the scalar type. */
5289 if (!vectype_in)
5290 vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
5291 if (vec_stmt)
5292 gcc_assert (vectype_in);
5293 if (!vectype_in)
5295 if (dump_enabled_p ())
5296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5297 "no vectype for scalar type %T\n", rhs_type);
5299 return false;
5302 if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
5303 && !VECTOR_BOOLEAN_TYPE_P (vectype_in))
5305 if (dump_enabled_p ())
5306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5307 "can't convert between boolean and non "
5308 "boolean vectors %T\n", rhs_type);
5310 return false;
5313 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
5314 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
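/* Classify the conversion by comparing lane counts: equal lane counts mean
   a same-width conversion (or a widening arithmetic pattern such as
   WIDEN_MULT_EXPR), more output lanes than input lanes mean the destination
   elements are narrower, and fewer output lanes mean the conversion
   widens.  */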
5315 if (known_eq (nunits_out, nunits_in))
5316 if (widen_arith)
5317 modifier = WIDEN;
5318 else
5319 modifier = NONE;
5320 else if (multiple_p (nunits_out, nunits_in))
5321 modifier = NARROW_DST;
5322 else
5324 gcc_checking_assert (multiple_p (nunits_in, nunits_out));
5325 modifier = WIDEN;
5328 /* Multiple types in SLP are handled by creating the appropriate number of
5329 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5330 case of SLP. */
5331 if (slp_node)
5332 ncopies = 1;
5333 else if (modifier == NARROW_DST)
5334 ncopies = vect_get_num_copies (loop_vinfo, vectype_out);
5335 else
5336 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5338 /* Sanity check: make sure that at least one copy of the vectorized stmt
5339 needs to be generated. */
5340 gcc_assert (ncopies >= 1);
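/* For instance (illustrative numbers, not tied to any particular target):
   with a vectorization factor of 8 and four-element vectors, the non-SLP
   path sets NCOPIES to 2, i.e. two vector statements cover the eight
   scalar iterations.  */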
5342 bool found_mode = false;
5343 scalar_mode lhs_mode = SCALAR_TYPE_MODE (lhs_type);
5344 scalar_mode rhs_mode = SCALAR_TYPE_MODE (rhs_type);
5345 opt_scalar_mode rhs_mode_iter;
5347 /* Supportable by target? */
5348 switch (modifier)
5350 case NONE:
5351 if (code != FIX_TRUNC_EXPR
5352 && code != FLOAT_EXPR
5353 && !CONVERT_EXPR_CODE_P (code))
5354 return false;
5355 gcc_assert (code.is_tree_code ());
5356 if (supportable_convert_operation ((tree_code) code, vectype_out,
5357 vectype_in, &tc1))
5359 code1 = tc1;
5360 break;
5363 /* For conversions between float and integer types try whether
5364 we can use intermediate signed integer types to support the
5365 conversion. */
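/* For example (hypothetical target capabilities): a double -> unsigned char
   conversion with no direct vector instruction can be split into
   FIX_TRUNC_EXPR to a same-lane-count vector of 32-bit integers followed
   by an integer truncation, and the reverse unsigned char -> double
   direction into an integer promotion followed by FLOAT_EXPR.  */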
5366 if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
5367 && (code == FLOAT_EXPR ||
5368 (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
5370 bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
5371 bool float_expr_p = code == FLOAT_EXPR;
5372 unsigned short target_size;
5373 scalar_mode intermediate_mode;
5374 if (demotion)
5376 intermediate_mode = lhs_mode;
5377 target_size = GET_MODE_SIZE (rhs_mode);
5379 else
5381 target_size = GET_MODE_SIZE (lhs_mode);
5382 if (!int_mode_for_size
5383 (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
5384 goto unsupported;
5386 code1 = float_expr_p ? code : NOP_EXPR;
5387 codecvt1 = float_expr_p ? NOP_EXPR : code;
5388 opt_scalar_mode mode_iter;
5389 FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
5391 intermediate_mode = mode_iter.require ();
5393 if (GET_MODE_SIZE (intermediate_mode) > target_size)
5394 break;
5396 scalar_mode cvt_mode;
5397 if (!int_mode_for_size
5398 (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
5399 break;
5401 cvt_type = build_nonstandard_integer_type
5402 (GET_MODE_BITSIZE (cvt_mode), 0);
5404 /* Check if the intermediate type can hold OP0's range.
5405 When converting from float to integer this is not necessary
5406 because values that do not fit the (smaller) target type are
5407 unspecified anyway. */
5408 if (demotion && float_expr_p)
5410 wide_int op_min_value, op_max_value;
5411 if (!vect_get_range_info (op0, &op_min_value, &op_max_value))
5412 break;
5414 if (cvt_type == NULL_TREE
5415 || (wi::min_precision (op_max_value, SIGNED)
5416 > TYPE_PRECISION (cvt_type))
5417 || (wi::min_precision (op_min_value, SIGNED)
5418 > TYPE_PRECISION (cvt_type)))
5419 continue;
5422 cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type, slp_node);
5423 /* This should only happen for SLP, as long as the loop vectorizer
5424 only supports same-sized vectors. */
5425 if (cvt_type == NULL_TREE
5426 || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
5427 || !supportable_convert_operation ((tree_code) code1,
5428 vectype_out,
5429 cvt_type, &tc1)
5430 || !supportable_convert_operation ((tree_code) codecvt1,
5431 cvt_type,
5432 vectype_in, &tc2))
5433 continue;
5435 found_mode = true;
5436 break;
5439 if (found_mode)
5441 multi_step_cvt++;
5442 interm_types.safe_push (cvt_type);
5443 cvt_type = NULL_TREE;
5444 code1 = tc1;
5445 codecvt1 = tc2;
5446 break;
5449 /* FALLTHRU */
5450 unsupported:
5451 if (dump_enabled_p ())
5452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5453 "conversion not supported by target.\n");
5454 return false;
5456 case WIDEN:
5457 if (known_eq (nunits_in, nunits_out))
5459 if (!(code.is_tree_code ()
5460 && supportable_half_widening_operation ((tree_code) code,
5461 vectype_out, vectype_in,
5462 &tc1)))
5463 goto unsupported;
5464 code1 = tc1;
5465 gcc_assert (!(multi_step_cvt && op_type == binary_op));
5466 break;
5468 if (supportable_widening_operation (vinfo, code, stmt_info,
5469 vectype_out, vectype_in, &code1,
5470 &code2, &multi_step_cvt,
5471 &interm_types))
5473 /* Binary widening operation can only be supported directly by the
5474 architecture. */
5475 gcc_assert (!(multi_step_cvt && op_type == binary_op));
5476 break;
5479 if (code != FLOAT_EXPR
5480 || GET_MODE_SIZE (lhs_mode) <= GET_MODE_SIZE (rhs_mode))
5481 goto unsupported;
5483 fltsz = GET_MODE_SIZE (lhs_mode);
5484 FOR_EACH_2XWIDER_MODE (rhs_mode_iter, rhs_mode)
5486 rhs_mode = rhs_mode_iter.require ();
5487 if (GET_MODE_SIZE (rhs_mode) > fltsz)
5488 break;
5490 cvt_type
5491 = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
5492 cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
5493 if (cvt_type == NULL_TREE)
5494 goto unsupported;
5496 if (GET_MODE_SIZE (rhs_mode) == fltsz)
5498 tc1 = ERROR_MARK;
5499 gcc_assert (code.is_tree_code ());
5500 if (!supportable_convert_operation ((tree_code) code, vectype_out,
5501 cvt_type, &tc1))
5502 goto unsupported;
5503 codecvt1 = tc1;
5505 else if (!supportable_widening_operation (vinfo, code,
5506 stmt_info, vectype_out,
5507 cvt_type, &codecvt1,
5508 &codecvt2, &multi_step_cvt,
5509 &interm_types))
5510 continue;
5511 else
5512 gcc_assert (multi_step_cvt == 0);
5514 if (supportable_widening_operation (vinfo, NOP_EXPR, stmt_info,
5515 cvt_type,
5516 vectype_in, &code1,
5517 &code2, &multi_step_cvt,
5518 &interm_types))
5520 found_mode = true;
5521 break;
5525 if (!found_mode)
5526 goto unsupported;
5528 if (GET_MODE_SIZE (rhs_mode) == fltsz)
5529 codecvt2 = ERROR_MARK;
5530 else
5532 multi_step_cvt++;
5533 interm_types.safe_push (cvt_type);
5534 cvt_type = NULL_TREE;
5536 break;
5538 case NARROW_DST:
5539 gcc_assert (op_type == unary_op);
5540 if (supportable_narrowing_operation (code, vectype_out, vectype_in,
5541 &code1, &multi_step_cvt,
5542 &interm_types))
5543 break;
5545 if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
5546 goto unsupported;
5548 if (code == FIX_TRUNC_EXPR)
5550 cvt_type
5551 = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
5552 cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
5553 if (cvt_type == NULL_TREE)
5554 goto unsupported;
5555 if (supportable_convert_operation ((tree_code) code, cvt_type, vectype_in,
5556 &tc1))
5557 codecvt1 = tc1;
5558 else
5559 goto unsupported;
5560 if (supportable_narrowing_operation (NOP_EXPR, vectype_out, cvt_type,
5561 &code1, &multi_step_cvt,
5562 &interm_types))
5563 break;
5565 /* If op0 can be represented by a lower-precision integer,
5566 truncate it to cvt_type and then do the FLOAT_EXPR. */
5567 else if (code == FLOAT_EXPR)
5569 wide_int op_min_value, op_max_value;
5570 if (!vect_get_range_info (op0, &op_min_value, &op_max_value))
5571 goto unsupported;
5573 cvt_type
5574 = build_nonstandard_integer_type (GET_MODE_BITSIZE (lhs_mode), 0);
5575 if (cvt_type == NULL_TREE
5576 || (wi::min_precision (op_max_value, SIGNED)
5577 > TYPE_PRECISION (cvt_type))
5578 || (wi::min_precision (op_min_value, SIGNED)
5579 > TYPE_PRECISION (cvt_type)))
5580 goto unsupported;
5582 cvt_type = get_same_sized_vectype (cvt_type, vectype_out);
5583 if (cvt_type == NULL_TREE)
5584 goto unsupported;
5585 if (!supportable_narrowing_operation (NOP_EXPR, cvt_type, vectype_in,
5586 &code1, &multi_step_cvt,
5587 &interm_types))
5588 goto unsupported;
5589 if (supportable_convert_operation ((tree_code) code, vectype_out,
5590 cvt_type, &tc1))
5592 codecvt1 = tc1;
5593 modifier = NARROW_SRC;
5594 break;
5598 goto unsupported;
5600 default:
5601 gcc_unreachable ();
5604 if (!vec_stmt) /* transformation not required. */
5606 if (slp_node
5607 && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype_in)
5608 || !vect_maybe_update_slp_op_vectype (slp_op1, vectype_in)))
5610 if (dump_enabled_p ())
5611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5612 "incompatible vector types for invariants\n");
5613 return false;
5615 DUMP_VECT_SCOPE ("vectorizable_conversion");
5616 if (modifier == NONE)
5618 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
5619 vect_model_simple_cost (vinfo, stmt_info,
5620 ncopies * (1 + multi_step_cvt),
5621 dt, ndts, slp_node, cost_vec);
5623 else if (modifier == NARROW_SRC || modifier == NARROW_DST)
5625 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
5626 /* The final packing step produces one vector result per copy. */
5627 unsigned int nvectors
5628 = (slp_node ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies);
5629 vect_model_promotion_demotion_cost (stmt_info, dt, nvectors,
5630 multi_step_cvt, cost_vec,
5631 widen_arith);
5633 else
5635 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
5636 /* The initial unpacking step produces two vector results
5637 per copy. MULTI_STEP_CVT is 0 for a single conversion,
5638 so >> MULTI_STEP_CVT divides by 2^(number of steps - 1). */
5639 unsigned int nvectors
5640 = (slp_node
5641 ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) >> multi_step_cvt
5642 : ncopies * 2);
5643 vect_model_promotion_demotion_cost (stmt_info, dt, nvectors,
5644 multi_step_cvt, cost_vec,
5645 widen_arith);
5647 interm_types.release ();
5648 return true;
5651 /* Transform. */
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_NOTE, vect_location,
5654 "transform conversion. ncopies = %d.\n", ncopies);
5656 if (op_type == binary_op)
5658 if (CONSTANT_CLASS_P (op0))
5659 op0 = fold_convert (TREE_TYPE (op1), op0);
5660 else if (CONSTANT_CLASS_P (op1))
5661 op1 = fold_convert (TREE_TYPE (op0), op1);
5664 /* In case of multi-step conversion, we first generate conversion operations
5665 to the intermediate types, and then from those types to the final one.
5666 We create vector destinations for the intermediate types (TYPES) received
5667 from supportable_*_operation, and store them in the correct order
5668 for later use in vect_create_vectorized_*_stmts (). */
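/* In other words, VEC_DSTS[0] is the destination for the last conversion
   step (the final vectype, or CVT_TYPE when a separate float conversion
   step follows), while higher indices correspond to steps closer to the
   input type; the code below consumes the vector from the highest index
   down.  */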
5669 auto_vec<tree> vec_dsts (multi_step_cvt + 1);
5670 bool widen_or_narrow_float_p
5671 = cvt_type && (modifier == WIDEN || modifier == NARROW_SRC);
5672 vec_dest = vect_create_destination_var (scalar_dest,
5673 widen_or_narrow_float_p
5674 ? cvt_type : vectype_out);
5675 vec_dsts.quick_push (vec_dest);
5677 if (multi_step_cvt)
5679 for (i = interm_types.length () - 1;
5680 interm_types.iterate (i, &intermediate_type); i--)
5682 vec_dest = vect_create_destination_var (scalar_dest,
5683 intermediate_type);
5684 vec_dsts.quick_push (vec_dest);
5688 if (cvt_type)
5689 vec_dest = vect_create_destination_var (scalar_dest,
5690 widen_or_narrow_float_p
5691 ? vectype_out : cvt_type);
5693 int ninputs = 1;
5694 if (!slp_node)
5696 if (modifier == WIDEN)
5698 else if (modifier == NARROW_SRC || modifier == NARROW_DST)
5700 if (multi_step_cvt)
5701 ninputs = vect_pow2 (multi_step_cvt);
5702 ninputs *= 2;
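/* I.e. each narrowing step halves the number of vectors, so a chain of
   MULTI_STEP_CVT + 1 narrowing steps consumes 2^(MULTI_STEP_CVT + 1)
   input vectors for every vector result.  */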
5706 switch (modifier)
5708 case NONE:
5709 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
5710 op0, vectype_in, &vec_oprnds0);
5711 /* vec_dest is the intermediate-type operand when multi_step_cvt. */
5712 if (multi_step_cvt)
5714 cvt_op = vec_dest;
5715 vec_dest = vec_dsts[0];
5718 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5720 /* Arguments are ready, create the new vector stmt. */
5721 gimple* new_stmt;
5722 if (multi_step_cvt)
5724 gcc_assert (multi_step_cvt == 1);
5725 new_stmt = vect_gimple_build (cvt_op, codecvt1, vop0);
5726 new_temp = make_ssa_name (cvt_op, new_stmt);
5727 gimple_assign_set_lhs (new_stmt, new_temp);
5728 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5729 vop0 = new_temp;
5731 new_stmt = vect_gimple_build (vec_dest, code1, vop0);
5732 new_temp = make_ssa_name (vec_dest, new_stmt);
5733 gimple_set_lhs (new_stmt, new_temp);
5734 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5736 if (slp_node)
5737 slp_node->push_vec_def (new_stmt);
5738 else
5739 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5741 break;
5743 case WIDEN:
5744 /* In case the vectorization factor (VF) is bigger than the number
5745 of elements that we can fit in a vectype (nunits), we have to
5746 generate more than one vector stmt - i.e - we need to "unroll"
5747 the vector stmt by a factor VF/nunits. */
5748 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies * ninputs,
5749 op0, vectype_in, &vec_oprnds0,
5750 code == WIDEN_LSHIFT_EXPR ? NULL_TREE : op1,
5751 vectype_in, &vec_oprnds1);
5752 if (code == WIDEN_LSHIFT_EXPR)
5754 int oprnds_size = vec_oprnds0.length ();
5755 vec_oprnds1.create (oprnds_size);
5756 for (i = 0; i < oprnds_size; ++i)
5757 vec_oprnds1.quick_push (op1);
5759 /* Arguments are ready. Create the new vector stmts. */
5760 for (i = multi_step_cvt; i >= 0; i--)
5762 tree this_dest = vec_dsts[i];
5763 code_helper c1 = code1, c2 = code2;
5764 if (i == 0 && codecvt2 != ERROR_MARK)
5766 c1 = codecvt1;
5767 c2 = codecvt2;
5769 if (known_eq (nunits_out, nunits_in))
5770 vect_create_half_widening_stmts (vinfo, &vec_oprnds0, &vec_oprnds1,
5771 stmt_info, this_dest, gsi, c1,
5772 op_type);
5773 else
5774 vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
5775 &vec_oprnds1, stmt_info,
5776 this_dest, gsi,
5777 c1, c2, op_type);
5780 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5782 gimple *new_stmt;
5783 if (cvt_type)
5785 new_temp = make_ssa_name (vec_dest);
5786 new_stmt = vect_gimple_build (new_temp, codecvt1, vop0);
5787 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5789 else
5790 new_stmt = SSA_NAME_DEF_STMT (vop0);
5792 if (slp_node)
5793 slp_node->push_vec_def (new_stmt);
5794 else
5795 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5797 break;
5799 case NARROW_SRC:
5800 case NARROW_DST:
5801 /* In case the vectorization factor (VF) is bigger than the number
5802 of elements that we can fit in a vectype (nunits), we have to
5803 generate more than one vector stmt - i.e - we need to "unroll"
5804 the vector stmt by a factor VF/nunits. */
5805 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies * ninputs,
5806 op0, vectype_in, &vec_oprnds0);
5807 /* Arguments are ready. Create the new vector stmts. */
5808 if (cvt_type && modifier == NARROW_DST)
5809 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5811 new_temp = make_ssa_name (vec_dest);
5812 gimple *new_stmt = vect_gimple_build (new_temp, codecvt1, vop0);
5813 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5814 vec_oprnds0[i] = new_temp;
5817 vect_create_vectorized_demotion_stmts (vinfo, &vec_oprnds0,
5818 multi_step_cvt,
5819 stmt_info, vec_dsts, gsi,
5820 slp_node, code1,
5821 modifier == NARROW_SRC);
5822 /* After demoting op0 to cvt_type, convert it to dest. */
5823 if (cvt_type && code == FLOAT_EXPR)
5825 for (unsigned int i = 0; i != vec_oprnds0.length() / 2; i++)
5827 /* Arguments are ready, create the new vector stmt. */
5828 gcc_assert (TREE_CODE_LENGTH ((tree_code) codecvt1) == unary_op);
5829 gimple *new_stmt
5830 = vect_gimple_build (vec_dest, codecvt1, vec_oprnds0[i]);
5831 new_temp = make_ssa_name (vec_dest, new_stmt);
5832 gimple_set_lhs (new_stmt, new_temp);
5833 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5835 /* This is the last step of the conversion sequence. Store the
5836 vectors in SLP_NODE or in the vector info of the scalar statement
5837 (or in the STMT_VINFO_RELATED_STMT chain). */
5838 if (slp_node)
5839 slp_node->push_vec_def (new_stmt);
5840 else
5841 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5844 break;
5846 if (!slp_node)
5847 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
5849 vec_oprnds0.release ();
5850 vec_oprnds1.release ();
5851 interm_types.release ();
5853 return true;
5856 /* Return true if we can assume from the scalar form of STMT_INFO that
5857 neither the scalar nor the vector forms will generate code. STMT_INFO
5858 is known not to involve a data reference. */
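/* For example, a NOP conversion between int and unsigned int, or a
   VIEW_CONVERT_EXPR between same-sized types, needs no instructions in
   either the scalar or the vector form.  */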
5860 bool
5861 vect_nop_conversion_p (stmt_vec_info stmt_info)
5863 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
5864 if (!stmt)
5865 return false;
5867 tree lhs = gimple_assign_lhs (stmt);
5868 tree_code code = gimple_assign_rhs_code (stmt);
5869 tree rhs = gimple_assign_rhs1 (stmt);
5871 if (code == SSA_NAME || code == VIEW_CONVERT_EXPR)
5872 return true;
5874 if (CONVERT_EXPR_CODE_P (code))
5875 return tree_nop_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs));
5877 return false;
5880 /* Function vectorizable_assignment.
5882 Check if STMT_INFO performs an assignment (copy) that can be vectorized.
5883 If VEC_STMT is also passed, vectorize the STMT_INFO: create a vectorized
5884 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5885 Return true if STMT_INFO is vectorizable in this way. */
5887 static bool
5888 vectorizable_assignment (vec_info *vinfo,
5889 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5890 gimple **vec_stmt, slp_tree slp_node,
5891 stmt_vector_for_cost *cost_vec)
5893 tree vec_dest;
5894 tree scalar_dest;
5895 tree op;
5896 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5897 tree new_temp;
5898 enum vect_def_type dt[1] = {vect_unknown_def_type};
5899 int ndts = 1;
5900 int ncopies;
5901 int i;
5902 vec<tree> vec_oprnds = vNULL;
5903 tree vop;
5904 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5905 enum tree_code code;
5906 tree vectype_in;
5908 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
5909 return false;
5911 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
5912 && ! vec_stmt)
5913 return false;
5915 /* Is vectorizable assignment? */
5916 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
5917 if (!stmt)
5918 return false;
5920 scalar_dest = gimple_assign_lhs (stmt);
5921 if (TREE_CODE (scalar_dest) != SSA_NAME)
5922 return false;
5924 if (STMT_VINFO_DATA_REF (stmt_info))
5925 return false;
5927 code = gimple_assign_rhs_code (stmt);
5928 if (!(gimple_assign_single_p (stmt)
5929 || code == PAREN_EXPR
5930 || CONVERT_EXPR_CODE_P (code)))
5931 return false;
5933 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5934 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5936 /* Multiple types in SLP are handled by creating the appropriate number of
5937 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5938 case of SLP. */
5939 if (slp_node)
5940 ncopies = 1;
5941 else
5942 ncopies = vect_get_num_copies (loop_vinfo, vectype);
5944 gcc_assert (ncopies >= 1);
5946 slp_tree slp_op;
5947 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &op, &slp_op,
5948 &dt[0], &vectype_in))
5950 if (dump_enabled_p ())
5951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5952 "use not simple.\n");
5953 return false;
5955 if (!vectype_in)
5956 vectype_in = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), slp_node);
5958 /* We can handle VIEW_CONVERT conversions that do not change the number
5959 of elements or the vector size, and other conversions when the component
5960 mode stays the same. */
5961 if (!vectype_in
5962 || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits)
5963 || (code == VIEW_CONVERT_EXPR
5964 && maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
5965 GET_MODE_SIZE (TYPE_MODE (vectype_in))))
5966 || (CONVERT_EXPR_CODE_P (code)
5967 && (TYPE_MODE (TREE_TYPE (vectype))
5968 != TYPE_MODE (TREE_TYPE (vectype_in)))))
5969 return false;
5971 if (VECTOR_BOOLEAN_TYPE_P (vectype) != VECTOR_BOOLEAN_TYPE_P (vectype_in))
5973 if (dump_enabled_p ())
5974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5975 "can't convert between boolean and non "
5976 "boolean vectors %T\n", TREE_TYPE (op));
5978 return false;
5981 /* We do not handle bit-precision changes. */
5982 if ((CONVERT_EXPR_CODE_P (code)
5983 || code == VIEW_CONVERT_EXPR)
5984 && ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
5985 && !type_has_mode_precision_p (TREE_TYPE (scalar_dest)))
5986 || (INTEGRAL_TYPE_P (TREE_TYPE (op))
5987 && !type_has_mode_precision_p (TREE_TYPE (op))))
5988 /* But a conversion that does not change the bit-pattern is ok. */
5989 && !(INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
5990 && INTEGRAL_TYPE_P (TREE_TYPE (op))
5991 && (((TYPE_PRECISION (TREE_TYPE (scalar_dest))
5992 > TYPE_PRECISION (TREE_TYPE (op)))
5993 && TYPE_UNSIGNED (TREE_TYPE (op)))
5994 || (TYPE_PRECISION (TREE_TYPE (scalar_dest))
5995 == TYPE_PRECISION (TREE_TYPE (op))))))
5997 if (dump_enabled_p ())
5998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5999 "type conversion to/from bit-precision "
6000 "unsupported.\n");
6001 return false;
6004 if (!vec_stmt) /* transformation not required. */
6006 if (slp_node
6007 && !vect_maybe_update_slp_op_vectype (slp_op, vectype_in))
6009 if (dump_enabled_p ())
6010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6011 "incompatible vector types for invariants\n");
6012 return false;
6014 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
6015 DUMP_VECT_SCOPE ("vectorizable_assignment");
6016 if (!vect_nop_conversion_p (stmt_info))
6017 vect_model_simple_cost (vinfo, stmt_info, ncopies, dt, ndts, slp_node,
6018 cost_vec);
6019 return true;
6022 /* Transform. */
6023 if (dump_enabled_p ())
6024 dump_printf_loc (MSG_NOTE, vect_location, "transform assignment.\n");
6026 /* Handle def. */
6027 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6029 /* Handle use. */
6030 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, op, &vec_oprnds);
6032 /* Arguments are ready. Create the new vector stmt. */
6033 FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
6035 if (CONVERT_EXPR_CODE_P (code)
6036 || code == VIEW_CONVERT_EXPR)
6037 vop = build1 (VIEW_CONVERT_EXPR, vectype, vop);
6038 gassign *new_stmt = gimple_build_assign (vec_dest, vop);
6039 new_temp = make_ssa_name (vec_dest, new_stmt);
6040 gimple_assign_set_lhs (new_stmt, new_temp);
6041 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6042 if (slp_node)
6043 slp_node->push_vec_def (new_stmt);
6044 else
6045 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6047 if (!slp_node)
6048 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6050 vec_oprnds.release ();
6051 return true;
6055 /* Return TRUE if CODE (a shift operation) is supported for SCALAR_TYPE
6056 either as shift by a scalar or by a vector. */
6058 bool
6059 vect_supportable_shift (vec_info *vinfo, enum tree_code code, tree scalar_type)
6062 machine_mode vec_mode;
6063 optab optab;
6064 int icode;
6065 tree vectype;
6067 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
6068 if (!vectype)
6069 return false;
6071 optab = optab_for_tree_code (code, vectype, optab_scalar);
6072 if (!optab
6073 || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
6075 optab = optab_for_tree_code (code, vectype, optab_vector);
6076 if (!optab
6077 || (optab_handler (optab, TYPE_MODE (vectype))
6078 == CODE_FOR_nothing))
6079 return false;
6082 vec_mode = TYPE_MODE (vectype);
6083 icode = (int) optab_handler (optab, vec_mode);
6084 if (icode == CODE_FOR_nothing)
6085 return false;
6087 return true;
6091 /* Function vectorizable_shift.
6093 Check if STMT_INFO performs a shift operation that can be vectorized.
6094 If VEC_STMT is also passed, vectorize the STMT_INFO: create a vectorized
6095 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6096 Return true if STMT_INFO is vectorizable in this way. */
6098 static bool
6099 vectorizable_shift (vec_info *vinfo,
6100 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6101 gimple **vec_stmt, slp_tree slp_node,
6102 stmt_vector_for_cost *cost_vec)
6104 tree vec_dest;
6105 tree scalar_dest;
6106 tree op0, op1 = NULL;
6107 tree vec_oprnd1 = NULL_TREE;
6108 tree vectype;
6109 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6110 enum tree_code code;
6111 machine_mode vec_mode;
6112 tree new_temp;
6113 optab optab;
6114 int icode;
6115 machine_mode optab_op2_mode;
6116 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
6117 int ndts = 2;
6118 poly_uint64 nunits_in;
6119 poly_uint64 nunits_out;
6120 tree vectype_out;
6121 tree op1_vectype;
6122 int ncopies;
6123 int i;
6124 vec<tree> vec_oprnds0 = vNULL;
6125 vec<tree> vec_oprnds1 = vNULL;
6126 tree vop0, vop1;
6127 unsigned int k;
6128 bool scalar_shift_arg = true;
6129 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
6130 bool incompatible_op1_vectype_p = false;
6132 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
6133 return false;
6135 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6136 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle
6137 && ! vec_stmt)
6138 return false;
6140 /* Is STMT a vectorizable binary/unary operation? */
6141 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
6142 if (!stmt)
6143 return false;
6145 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6146 return false;
6148 code = gimple_assign_rhs_code (stmt);
6150 if (!(code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
6151 || code == RROTATE_EXPR))
6152 return false;
6154 scalar_dest = gimple_assign_lhs (stmt);
6155 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6156 if (!type_has_mode_precision_p (TREE_TYPE (scalar_dest)))
6158 if (dump_enabled_p ())
6159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6160 "bit-precision shifts not supported.\n");
6161 return false;
6164 slp_tree slp_op0;
6165 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6166 0, &op0, &slp_op0, &dt[0], &vectype))
6168 if (dump_enabled_p ())
6169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6170 "use not simple.\n");
6171 return false;
6173 /* If op0 is an external or constant def, infer the vector type
6174 from the scalar type. */
6175 if (!vectype)
6176 vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0), slp_node);
6177 if (vec_stmt)
6178 gcc_assert (vectype);
6179 if (!vectype)
6181 if (dump_enabled_p ())
6182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6183 "no vectype for scalar type\n");
6184 return false;
6187 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6188 nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
6189 if (maybe_ne (nunits_out, nunits_in))
6190 return false;
6192 stmt_vec_info op1_def_stmt_info;
6193 slp_tree slp_op1;
6194 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1, &op1, &slp_op1,
6195 &dt[1], &op1_vectype, &op1_def_stmt_info))
6197 if (dump_enabled_p ())
6198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6199 "use not simple.\n");
6200 return false;
6203 /* Multiple types in SLP are handled by creating the appropriate number of
6204 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6205 case of SLP. */
6206 if (slp_node)
6207 ncopies = 1;
6208 else
6209 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6211 gcc_assert (ncopies >= 1);
6213 /* Determine whether the shift amount is a vector, or scalar. If the
6214 shift/rotate amount is a vector, use the vector/vector shift optabs. */
6216 if ((dt[1] == vect_internal_def
6217 || dt[1] == vect_induction_def
6218 || dt[1] == vect_nested_cycle)
6219 && !slp_node)
6220 scalar_shift_arg = false;
6221 else if (dt[1] == vect_constant_def
6222 || dt[1] == vect_external_def
6223 || dt[1] == vect_internal_def)
6225 /* In SLP, we need to check whether the shift count is the same in all
6226 scalar stmts; in loops, if it is a constant or invariant, it is
6227 always a scalar shift. */
6228 if (slp_node)
6230 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6231 stmt_vec_info slpstmt_info;
6233 FOR_EACH_VEC_ELT (stmts, k, slpstmt_info)
6235 gassign *slpstmt = as_a <gassign *> (slpstmt_info->stmt);
6236 if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
6237 scalar_shift_arg = false;
6240 /* For internal SLP defs we have to make sure we see scalar stmts
6241 for all vector elements.
6242 ??? For different vectors we could resort to a different
6243 scalar shift operand but code-generation below simply always
6244 takes the first. */
6245 if (dt[1] == vect_internal_def
6246 && maybe_ne (nunits_out * SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
6247 stmts.length ()))
6248 scalar_shift_arg = false;
6251 /* If the shift amount is computed by a pattern stmt we cannot
6252 use the scalar amount directly thus give up and use a vector
6253 shift. */
6254 if (op1_def_stmt_info && is_pattern_stmt_p (op1_def_stmt_info))
6255 scalar_shift_arg = false;
6257 else
6259 if (dump_enabled_p ())
6260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261 "operand mode requires invariant argument.\n");
6262 return false;
6265 /* Vector shifted by vector. */
6266 bool was_scalar_shift_arg = scalar_shift_arg;
6267 if (!scalar_shift_arg)
6269 optab = optab_for_tree_code (code, vectype, optab_vector);
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_NOTE, vect_location,
6272 "vector/vector shift/rotate found.\n");
6274 if (!op1_vectype)
6275 op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1),
6276 slp_op1);
6277 incompatible_op1_vectype_p
6278 = (op1_vectype == NULL_TREE
6279 || maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype),
6280 TYPE_VECTOR_SUBPARTS (vectype))
6281 || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype));
6282 if (incompatible_op1_vectype_p
6283 && (!slp_node
6284 || SLP_TREE_DEF_TYPE (slp_op1) != vect_constant_def
6285 || slp_op1->refcnt != 1))
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "unusable type for last operand in"
6290 " vector/vector shift/rotate.\n");
6291 return false;
6294 /* See if the machine has a vector shifted by scalar insn and if not
6295 then see if it has a vector shifted by vector insn. */
6296 else
6298 optab = optab_for_tree_code (code, vectype, optab_scalar);
6299 if (optab
6300 && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
6302 if (dump_enabled_p ())
6303 dump_printf_loc (MSG_NOTE, vect_location,
6304 "vector/scalar shift/rotate found.\n");
6306 else
6308 optab = optab_for_tree_code (code, vectype, optab_vector);
6309 if (optab
6310 && (optab_handler (optab, TYPE_MODE (vectype))
6311 != CODE_FOR_nothing))
6313 scalar_shift_arg = false;
6315 if (dump_enabled_p ())
6316 dump_printf_loc (MSG_NOTE, vect_location,
6317 "vector/vector shift/rotate found.\n");
6319 if (!op1_vectype)
6320 op1_vectype = get_vectype_for_scalar_type (vinfo,
6321 TREE_TYPE (op1),
6322 slp_op1);
6324 /* Unlike the other binary operators, shifts/rotates have
6325 the rhs being int, instead of the same type as the lhs,
6326 so make sure the scalar is the right type if we are
6327 dealing with vectors of long long/long/short/char. */
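/* E.g. (illustrative): for "v[i] = x[i] << n" with long long elements but
   N declared as int, the scalar shift amount must be converted to the
   element type before it can be broadcast into the vector/vector shift
   operand.  */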
6328 incompatible_op1_vectype_p
6329 = (!op1_vectype
6330 || !tree_nop_conversion_p (TREE_TYPE (vectype),
6331 TREE_TYPE (op1)));
6332 if (incompatible_op1_vectype_p
6333 && dt[1] == vect_internal_def)
6335 if (dump_enabled_p ())
6336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6337 "unusable type for last operand in"
6338 " vector/vector shift/rotate.\n");
6339 return false;
6345 /* Supportable by target? */
6346 if (!optab)
6348 if (dump_enabled_p ())
6349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6350 "no optab.\n");
6351 return false;
6353 vec_mode = TYPE_MODE (vectype);
6354 icode = (int) optab_handler (optab, vec_mode);
6355 if (icode == CODE_FOR_nothing)
6357 if (dump_enabled_p ())
6358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6359 "op not supported by target.\n");
6360 return false;
6362 /* vector lowering cannot optimize vector shifts using word arithmetic. */
6363 if (vect_emulated_vector_p (vectype))
6364 return false;
6366 if (!vec_stmt) /* transformation not required. */
6368 if (slp_node
6369 && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
6370 || ((!scalar_shift_arg || dt[1] == vect_internal_def)
6371 && (!incompatible_op1_vectype_p
6372 || dt[1] == vect_constant_def)
6373 && !vect_maybe_update_slp_op_vectype
6374 (slp_op1,
6375 incompatible_op1_vectype_p ? vectype : op1_vectype))))
6377 if (dump_enabled_p ())
6378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6379 "incompatible vector types for invariants\n");
6380 return false;
6382 /* Now adjust the constant shift amount in place. */
6383 if (slp_node
6384 && incompatible_op1_vectype_p
6385 && dt[1] == vect_constant_def)
6387 for (unsigned i = 0;
6388 i < SLP_TREE_SCALAR_OPS (slp_op1).length (); ++i)
6390 SLP_TREE_SCALAR_OPS (slp_op1)[i]
6391 = fold_convert (TREE_TYPE (vectype),
6392 SLP_TREE_SCALAR_OPS (slp_op1)[i]);
6393 gcc_assert ((TREE_CODE (SLP_TREE_SCALAR_OPS (slp_op1)[i])
6394 == INTEGER_CST));
6397 STMT_VINFO_TYPE (stmt_info) = shift_vec_info_type;
6398 DUMP_VECT_SCOPE ("vectorizable_shift");
6399 vect_model_simple_cost (vinfo, stmt_info, ncopies, dt,
6400 scalar_shift_arg ? 1 : ndts, slp_node, cost_vec);
6401 return true;
6404 /* Transform. */
6406 if (dump_enabled_p ())
6407 dump_printf_loc (MSG_NOTE, vect_location,
6408 "transform binary/unary operation.\n");
6410 if (incompatible_op1_vectype_p && !slp_node)
6412 gcc_assert (!scalar_shift_arg && was_scalar_shift_arg);
6413 op1 = fold_convert (TREE_TYPE (vectype), op1);
6414 if (dt[1] != vect_constant_def)
6415 op1 = vect_init_vector (vinfo, stmt_info, op1,
6416 TREE_TYPE (vectype), NULL);
6419 /* Handle def. */
6420 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6422 if (scalar_shift_arg && dt[1] != vect_internal_def)
6424 /* Vector shl and shr insn patterns can be defined with scalar
6425 operand 2 (shift operand). In this case, use constant or loop
6426 invariant op1 directly, without extending it to vector mode
6427 first. */
6428 optab_op2_mode = insn_data[icode].operand[2].mode;
6429 if (!VECTOR_MODE_P (optab_op2_mode))
6431 if (dump_enabled_p ())
6432 dump_printf_loc (MSG_NOTE, vect_location,
6433 "operand 1 using scalar mode.\n");
6434 vec_oprnd1 = op1;
6435 vec_oprnds1.create (slp_node ? slp_node->vec_stmts_size : ncopies);
6436 vec_oprnds1.quick_push (vec_oprnd1);
6437 /* Store vec_oprnd1 for every vector stmt to be created.
6438 We check during the analysis that all the shift arguments
6439 are the same.
6440 TODO: Allow different constants for different vector
6441 stmts generated for an SLP instance. */
6442 for (k = 0;
6443 k < (slp_node ? slp_node->vec_stmts_size - 1 : ncopies - 1); k++)
6444 vec_oprnds1.quick_push (vec_oprnd1);
6447 else if (!scalar_shift_arg && slp_node && incompatible_op1_vectype_p)
6449 if (was_scalar_shift_arg)
6451 /* If the argument was the same in all lanes create
6452 the correctly typed vector shift amount directly. */
6453 op1 = fold_convert (TREE_TYPE (vectype), op1);
6454 op1 = vect_init_vector (vinfo, stmt_info, op1, TREE_TYPE (vectype),
6455 !loop_vinfo ? gsi : NULL);
6456 vec_oprnd1 = vect_init_vector (vinfo, stmt_info, op1, vectype,
6457 !loop_vinfo ? gsi : NULL);
6458 vec_oprnds1.create (slp_node->vec_stmts_size);
6459 for (k = 0; k < slp_node->vec_stmts_size; k++)
6460 vec_oprnds1.quick_push (vec_oprnd1);
6462 else if (dt[1] == vect_constant_def)
6463 /* The constant shift amount has been adjusted in place. */
6465 else
6466 gcc_assert (TYPE_MODE (op1_vectype) == TYPE_MODE (vectype));
6469 /* vec_oprnd1 is available if operand 1 should be of a scalar type
6470 (a special case for certain kinds of vector shifts); otherwise,
6471 operand 1 should be of a vector type (the usual case). */
6472 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
6473 op0, &vec_oprnds0,
6474 vec_oprnd1 ? NULL_TREE : op1, &vec_oprnds1);
6476 /* Arguments are ready. Create the new vector stmt. */
6477 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
6479 /* For internal defs where we need to use a scalar shift arg,
6480 extract the first lane. */
6481 if (scalar_shift_arg && dt[1] == vect_internal_def)
6483 vop1 = vec_oprnds1[0];
6484 new_temp = make_ssa_name (TREE_TYPE (TREE_TYPE (vop1)));
6485 gassign *new_stmt
6486 = gimple_build_assign (new_temp,
6487 build3 (BIT_FIELD_REF, TREE_TYPE (new_temp),
6488 vop1,
6489 TYPE_SIZE (TREE_TYPE (new_temp)),
6490 bitsize_zero_node));
6491 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6492 vop1 = new_temp;
6494 else
6495 vop1 = vec_oprnds1[i];
6496 gassign *new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1);
6497 new_temp = make_ssa_name (vec_dest, new_stmt);
6498 gimple_assign_set_lhs (new_stmt, new_temp);
6499 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6500 if (slp_node)
6501 slp_node->push_vec_def (new_stmt);
6502 else
6503 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6506 if (!slp_node)
6507 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6509 vec_oprnds0.release ();
6510 vec_oprnds1.release ();
6512 return true;
6515 /* Function vectorizable_operation.
6517 Check if STMT_INFO performs a binary, unary or ternary operation that can
6518 be vectorized.
6519 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6520 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6521 Return true if STMT_INFO is vectorizable in this way. */
6523 static bool
6524 vectorizable_operation (vec_info *vinfo,
6525 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6526 gimple **vec_stmt, slp_tree slp_node,
6527 stmt_vector_for_cost *cost_vec)
6529 tree vec_dest;
6530 tree scalar_dest;
6531 tree op0, op1 = NULL_TREE, op2 = NULL_TREE;
6532 tree vectype;
6533 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6534 enum tree_code code, orig_code;
6535 machine_mode vec_mode;
6536 tree new_temp;
6537 int op_type;
6538 optab optab;
6539 bool target_support_p;
6540 enum vect_def_type dt[3]
6541 = {vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type};
6542 int ndts = 3;
6543 poly_uint64 nunits_in;
6544 poly_uint64 nunits_out;
6545 tree vectype_out;
6546 int ncopies, vec_num;
6547 int i;
6548 vec<tree> vec_oprnds0 = vNULL;
6549 vec<tree> vec_oprnds1 = vNULL;
6550 vec<tree> vec_oprnds2 = vNULL;
6551 tree vop0, vop1, vop2;
6552 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
6554 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
6555 return false;
6557 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6558 && ! vec_stmt)
6559 return false;
6561 /* Is STMT a vectorizable binary/unary operation? */
6562 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
6563 if (!stmt)
6564 return false;
6566 /* Loads and stores are handled in vectorizable_{load,store}. */
6567 if (STMT_VINFO_DATA_REF (stmt_info))
6568 return false;
6570 orig_code = code = gimple_assign_rhs_code (stmt);
6572 /* Shifts are handled in vectorizable_shift. */
6573 if (code == LSHIFT_EXPR
6574 || code == RSHIFT_EXPR
6575 || code == LROTATE_EXPR
6576 || code == RROTATE_EXPR)
6577 return false;
6579 /* Comparisons are handled in vectorizable_comparison. */
6580 if (TREE_CODE_CLASS (code) == tcc_comparison)
6581 return false;
6583 /* Conditions are handled in vectorizable_condition. */
6584 if (code == COND_EXPR)
6585 return false;
6587 /* For pointer addition and subtraction, we should use the normal
6588 plus and minus for the vector operation. */
6589 if (code == POINTER_PLUS_EXPR)
6590 code = PLUS_EXPR;
6591 if (code == POINTER_DIFF_EXPR)
6592 code = MINUS_EXPR;
6594 /* Support only unary or binary operations. */
6595 op_type = TREE_CODE_LENGTH (code);
6596 if (op_type != unary_op && op_type != binary_op && op_type != ternary_op)
6598 if (dump_enabled_p ())
6599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6600 "num. args = %d (not unary/binary/ternary op).\n",
6601 op_type);
6602 return false;
6605 scalar_dest = gimple_assign_lhs (stmt);
6606 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6608 /* Most operations cannot handle bit-precision types without extra
6609 truncations. */
6610 bool mask_op_p = VECTOR_BOOLEAN_TYPE_P (vectype_out);
6611 if (!mask_op_p
6612 && !type_has_mode_precision_p (TREE_TYPE (scalar_dest))
6613 /* Exceptions are the bitwise binary operations. */
6614 && code != BIT_IOR_EXPR
6615 && code != BIT_XOR_EXPR
6616 && code != BIT_AND_EXPR)
6618 if (dump_enabled_p ())
6619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6620 "bit-precision arithmetic not supported.\n");
6621 return false;
6624 slp_tree slp_op0;
6625 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6626 0, &op0, &slp_op0, &dt[0], &vectype))
6628 if (dump_enabled_p ())
6629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6630 "use not simple.\n");
6631 return false;
6633 bool is_invariant = (dt[0] == vect_external_def
6634 || dt[0] == vect_constant_def);
6635 /* If op0 is an external or constant def, infer the vector type
6636 from the scalar type. */
6637 if (!vectype)
6639 /* For a boolean type we cannot determine the vectype from an
6640 invariant value (we don't know whether it is a vector
6641 of booleans or a vector of integers). We use the output
6642 vectype because operations on booleans don't change the
6643 type. */
6644 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op0)))
6646 if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (scalar_dest)))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6650 "not supported operation on bool value.\n");
6651 return false;
6653 vectype = vectype_out;
6655 else
6656 vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0),
6657 slp_node);
6659 if (vec_stmt)
6660 gcc_assert (vectype);
6661 if (!vectype)
6663 if (dump_enabled_p ())
6664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6665 "no vectype for scalar type %T\n",
6666 TREE_TYPE (op0));
6668 return false;
6671 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6672 nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
6673 if (maybe_ne (nunits_out, nunits_in)
6674 || !tree_nop_conversion_p (TREE_TYPE (vectype_out), TREE_TYPE (vectype)))
6675 return false;
6677 tree vectype2 = NULL_TREE, vectype3 = NULL_TREE;
6678 slp_tree slp_op1 = NULL, slp_op2 = NULL;
6679 if (op_type == binary_op || op_type == ternary_op)
6681 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6682 1, &op1, &slp_op1, &dt[1], &vectype2))
6684 if (dump_enabled_p ())
6685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6686 "use not simple.\n");
6687 return false;
6689 is_invariant &= (dt[1] == vect_external_def
6690 || dt[1] == vect_constant_def);
6691 if (vectype2
6692 && (maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype2))
6693 || !tree_nop_conversion_p (TREE_TYPE (vectype_out),
6694 TREE_TYPE (vectype2))))
6695 return false;
6697 if (op_type == ternary_op)
6699 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6700 2, &op2, &slp_op2, &dt[2], &vectype3))
6702 if (dump_enabled_p ())
6703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6704 "use not simple.\n");
6705 return false;
6707 is_invariant &= (dt[2] == vect_external_def
6708 || dt[2] == vect_constant_def);
6709 if (vectype3
6710 && (maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype3))
6711 || !tree_nop_conversion_p (TREE_TYPE (vectype_out),
6712 TREE_TYPE (vectype3))))
6713 return false;
6716 /* Multiple types in SLP are handled by creating the appropriate number of
6717 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6718 case of SLP. */
6719 if (slp_node)
6721 ncopies = 1;
6722 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6724 else
6726 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6727 vec_num = 1;
6730 gcc_assert (ncopies >= 1);
6732 /* Reject attempts to combine mask types with nonmask types, e.g. if
6733 we have an AND between a (nonmask) boolean loaded from memory and
6734 a (mask) boolean result of a comparison.
6736 TODO: We could easily fix these cases up using pattern statements. */
6737 if (VECTOR_BOOLEAN_TYPE_P (vectype) != mask_op_p
6738 || (vectype2 && VECTOR_BOOLEAN_TYPE_P (vectype2) != mask_op_p)
6739 || (vectype3 && VECTOR_BOOLEAN_TYPE_P (vectype3) != mask_op_p))
6741 if (dump_enabled_p ())
6742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6743 "mixed mask and nonmask vector types\n");
6744 return false;
6747 /* Supportable by target? */
6749 vec_mode = TYPE_MODE (vectype);
6750 if (code == MULT_HIGHPART_EXPR)
6751 target_support_p = can_mult_highpart_p (vec_mode, TYPE_UNSIGNED (vectype));
6752 else
6754 optab = optab_for_tree_code (code, vectype, optab_default);
6755 if (!optab)
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 "no optab.\n");
6760 return false;
6762 target_support_p = (optab_handler (optab, vec_mode) != CODE_FOR_nothing
6763 || optab_libfunc (optab, vec_mode));
6766 bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
6767 if (!target_support_p || using_emulated_vectors_p)
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6771 "op not supported by target.\n");
6772 /* When vec_mode is not a vector mode and we have verified that the ops
6773 we do not have to lower (like AND) are natively supported, let
6774 those through even when the mode isn't word_mode. For
6775 ops we do have to lower, the lowering code assumes we are
6776 dealing with word_mode. */
6777 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))
6778 || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
6779 || !target_support_p)
6780 && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD))
6781 /* Check only during analysis. */
6782 || (!vec_stmt && !vect_can_vectorize_without_simd_p (code)))
6784 if (dump_enabled_p ())
6785 dump_printf (MSG_NOTE, "using word mode not possible.\n");
6786 return false;
6788 if (dump_enabled_p ())
6789 dump_printf_loc (MSG_NOTE, vect_location,
6790 "proceeding using word mode.\n");
6791 using_emulated_vectors_p = true;
6794 int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
6795 vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
6796 vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
6797 internal_fn cond_fn = get_conditional_internal_fn (code);
6798 internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
6800 /* If operating on inactive elements could generate spurious traps,
6801 we need to restrict the operation to active lanes. Note that this
6802 specifically doesn't apply to unhoisted invariants, since they
6803 operate on the same value for every lane.
6805 Similarly, if this operation is part of a reduction, a fully-masked
6806 loop should only change the active lanes of the reduction chain,
6807 keeping the inactive lanes as-is. */
6808 bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
6809 || reduc_idx >= 0);
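/* For example, an integral division in a fully-masked loop must be
   restricted to the active lanes, since executing it in an inactive lane
   could divide by zero; a loop-invariant operation, in contrast, computes
   the same value in every lane, so masking it cannot change whether it
   traps.  */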
6811 if (!vec_stmt) /* transformation not required. */
6813 if (loop_vinfo
6814 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
6815 && mask_out_inactive)
6817 if (cond_len_fn != IFN_LAST
6818 && direct_internal_fn_supported_p (cond_len_fn, vectype,
6819 OPTIMIZE_FOR_SPEED))
6820 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
6822 else if (cond_fn != IFN_LAST
6823 && direct_internal_fn_supported_p (cond_fn, vectype,
6824 OPTIMIZE_FOR_SPEED))
6825 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6826 vectype, NULL);
6827 else
6829 if (dump_enabled_p ())
6830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6831 "can't use a fully-masked loop because no"
6832 " conditional operation is available.\n");
6833 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6837 /* Put types on constant and invariant SLP children. */
6838 if (slp_node
6839 && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
6840 || !vect_maybe_update_slp_op_vectype (slp_op1, vectype)
6841 || !vect_maybe_update_slp_op_vectype (slp_op2, vectype)))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "incompatible vector types for invariants\n");
6846 return false;
6849 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
6850 DUMP_VECT_SCOPE ("vectorizable_operation");
6851 vect_model_simple_cost (vinfo, stmt_info,
6852 ncopies, dt, ndts, slp_node, cost_vec);
6853 if (using_emulated_vectors_p)
6855 /* The above vect_model_simple_cost call handles constants
6856 in the prologue and (mis-)costs one of the stmts as a
6857 vector stmt. See below for the actual lowering that will
6858 be applied. */
6859 unsigned n
6860 = slp_node ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies;
6861 switch (code)
6863 case PLUS_EXPR:
6864 n *= 5;
6865 break;
6866 case MINUS_EXPR:
6867 n *= 6;
6868 break;
6869 case NEGATE_EXPR:
6870 n *= 4;
6871 break;
6872 default:
6873 /* Bit operations do not have extra cost and are accounted
6874 as vector stmt by vect_model_simple_cost. */
6875 n = 0;
6876 break;
6878 if (n != 0)
6880 /* We also need to materialize two large constants. */
6881 record_stmt_cost (cost_vec, 2, scalar_stmt, stmt_info,
6882 0, vect_prologue);
6883 record_stmt_cost (cost_vec, n, scalar_stmt, stmt_info,
6884 0, vect_body);
6887 return true;
6890 /* Transform. */
6892 if (dump_enabled_p ())
6893 dump_printf_loc (MSG_NOTE, vect_location,
6894 "transform binary/unary operation.\n");
6896 bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6897 bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
6899 /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
6900 vectors with unsigned elements, but the result is signed. So, we
6901 need to compute the MINUS_EXPR into a vectype temporary and
6902 VIEW_CONVERT_EXPR it into the final vectype_out result. */
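/* The VIEW_CONVERT_EXPR only reinterprets the bits of the unsigned
   MINUS_EXPR result as the signed element type of VECTYPE_OUT; it does
   not change any values.  */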
6903 tree vec_cvt_dest = NULL_TREE;
6904 if (orig_code == POINTER_DIFF_EXPR)
6906 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6907 vec_cvt_dest = vect_create_destination_var (scalar_dest, vectype_out);
6909 /* Handle def. */
6910 else
6911 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6913 /* In case the vectorization factor (VF) is bigger than the number
6914 of elements that we can fit in a vectype (nunits), we have to generate
6915 more than one vector stmt - i.e - we need to "unroll" the
6916 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6917 from one copy of the vector stmt to the next, in the field
6918 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6919 stages to find the correct vector defs to be used when vectorizing
6920 stmts that use the defs of the current stmt. The example below
6921 illustrates the vectorization process when VF=16 and nunits=4 (i.e.,
6922 we need to create 4 vectorized stmts):
6924 before vectorization:
6925 RELATED_STMT VEC_STMT
6926 S1: x = memref - -
6927 S2: z = x + 1 - -
6929 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
6930 there):
6931 RELATED_STMT VEC_STMT
6932 VS1_0: vx0 = memref0 VS1_1 -
6933 VS1_1: vx1 = memref1 VS1_2 -
6934 VS1_2: vx2 = memref2 VS1_3 -
6935 VS1_3: vx3 = memref3 - -
6936 S1: x = load - VS1_0
6937 S2: z = x + 1 - -
6939 step2: vectorize stmt S2 (done here):
6940 To vectorize stmt S2 we first need to find the relevant vector
6941 def for the first operand 'x'. This is, as usual, obtained from
6942 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
6943 that defines 'x' (S1). This way we find the stmt VS1_0, and the
6944 relevant vector def 'vx0'. Having found 'vx0' we can generate
6945 the vector stmt VS2_0, and as usual, record it in the
6946 STMT_VINFO_VEC_STMT of stmt S2.
6947 When creating the second copy (VS2_1), we obtain the relevant vector
6948 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
6949 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
6950 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
6951 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
6952 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
6953 chain of stmts and pointers:
6954 RELATED_STMT VEC_STMT
6955 VS1_0: vx0 = memref0 VS1_1 -
6956 VS1_1: vx1 = memref1 VS1_2 -
6957 VS1_2: vx2 = memref2 VS1_3 -
6958 VS1_3: vx3 = memref3 - -
6959 S1: x = load - VS1_0
6960 VS2_0: vz0 = vx0 + v1 VS2_1 -
6961 VS2_1: vz1 = vx1 + v1 VS2_2 -
6962 VS2_2: vz2 = vx2 + v1 VS2_3 -
6963 VS2_3: vz3 = vx3 + v1 - -
6964 S2: z = x + 1 - VS2_0 */
6966 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
6967 op0, &vec_oprnds0, op1, &vec_oprnds1, op2, &vec_oprnds2);
6968 /* Arguments are ready. Create the new vector stmt. */
6969 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
6971 gimple *new_stmt = NULL;
6972 vop1 = ((op_type == binary_op || op_type == ternary_op)
6973 ? vec_oprnds1[i] : NULL_TREE);
6974 vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE);
6975 if (using_emulated_vectors_p
6976 && (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR))
6978 /* Lower the operation. This follows vector lowering. */
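/* The whole vector is reinterpreted as a single word_mode integer.
   LOW_BITS covers every bit of an element except its most significant
   bit, and HIGH_BITS covers only that bit; the operands' element sign
   bits are adjusted with these masks so that carries and borrows cannot
   cross element boundaries while the payload is combined in one
   word-sized operation, and the correct sign bits are then patched back
   in with an XOR against SIGNS.  */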
6979 unsigned int width = vector_element_bits (vectype);
6980 tree inner_type = TREE_TYPE (vectype);
6981 tree word_type
6982 = build_nonstandard_integer_type (GET_MODE_BITSIZE (word_mode), 1);
6983 HOST_WIDE_INT max = GET_MODE_MASK (TYPE_MODE (inner_type));
6984 tree low_bits = build_replicated_int_cst (word_type, width, max >> 1);
6985 tree high_bits
6986 = build_replicated_int_cst (word_type, width, max & ~(max >> 1));
6987 tree wvop0 = make_ssa_name (word_type);
6988 new_stmt = gimple_build_assign (wvop0, VIEW_CONVERT_EXPR,
6989 build1 (VIEW_CONVERT_EXPR,
6990 word_type, vop0));
6991 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6992 tree result_low, signs;
6993 if (code == PLUS_EXPR || code == MINUS_EXPR)
6995 tree wvop1 = make_ssa_name (word_type);
6996 new_stmt = gimple_build_assign (wvop1, VIEW_CONVERT_EXPR,
6997 build1 (VIEW_CONVERT_EXPR,
6998 word_type, vop1));
6999 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7000 signs = make_ssa_name (word_type);
7001 new_stmt = gimple_build_assign (signs,
7002 BIT_XOR_EXPR, wvop0, wvop1);
7003 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7004 tree b_low = make_ssa_name (word_type);
7005 new_stmt = gimple_build_assign (b_low,
7006 BIT_AND_EXPR, wvop1, low_bits);
7007 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7008 tree a_low = make_ssa_name (word_type);
7009 if (code == PLUS_EXPR)
7010 new_stmt = gimple_build_assign (a_low,
7011 BIT_AND_EXPR, wvop0, low_bits);
7012 else
7013 new_stmt = gimple_build_assign (a_low,
7014 BIT_IOR_EXPR, wvop0, high_bits);
7015 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7016 if (code == MINUS_EXPR)
7018 new_stmt = gimple_build_assign (NULL_TREE,
7019 BIT_NOT_EXPR, signs);
7020 signs = make_ssa_name (word_type);
7021 gimple_assign_set_lhs (new_stmt, signs);
7022 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7024 new_stmt = gimple_build_assign (NULL_TREE,
7025 BIT_AND_EXPR, signs, high_bits);
7026 signs = make_ssa_name (word_type);
7027 gimple_assign_set_lhs (new_stmt, signs);
7028 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7029 result_low = make_ssa_name (word_type);
7030 new_stmt = gimple_build_assign (result_low, code, a_low, b_low);
7031 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7033 else
7035 tree a_low = make_ssa_name (word_type);
7036 new_stmt = gimple_build_assign (a_low,
7037 BIT_AND_EXPR, wvop0, low_bits);
7038 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7039 signs = make_ssa_name (word_type);
7040 new_stmt = gimple_build_assign (signs, BIT_NOT_EXPR, wvop0);
7041 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7042 new_stmt = gimple_build_assign (NULL_TREE,
7043 BIT_AND_EXPR, signs, high_bits);
7044 signs = make_ssa_name (word_type);
7045 gimple_assign_set_lhs (new_stmt, signs);
7046 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7047 result_low = make_ssa_name (word_type);
7048 new_stmt = gimple_build_assign (result_low,
7049 MINUS_EXPR, high_bits, a_low);
7050 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7052 new_stmt = gimple_build_assign (NULL_TREE, BIT_XOR_EXPR, result_low,
7053 signs);
7054 result_low = make_ssa_name (word_type);
7055 gimple_assign_set_lhs (new_stmt, result_low);
7056 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7057 new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR,
7058 build1 (VIEW_CONVERT_EXPR,
7059 vectype, result_low));
7060 new_temp = make_ssa_name (vectype);
7061 gimple_assign_set_lhs (new_stmt, new_temp);
7062 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7064 else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
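/* Illustrative sketch of the conditional internal-fn call emitted below
   (semantics paraphrased, not quoted from the internal-fn documentation):
   for a masked PLUS_EXPR the generated call is roughly

     vz = .COND_ADD (loop_mask, vx, vy, else_value);

   which computes, lane by lane,

     vz[i] = loop_mask[i] ? vx[i] + vy[i] : else_value[i];

   so inactive lanes are taken from ELSE_VALUE (the reduction chain input
   when REDUC_IDX >= 0).  The length-based variant (.COND_LEN_ADD) carries
   the extra LEN and BIAS operands and additionally treats lanes at or
   beyond LEN + BIAS as inactive.  */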
7066 tree mask;
7067 if (masked_loop_p)
7068 mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7069 vec_num * ncopies, vectype, i);
7070 else
7071 /* Dummy mask. */
7072 mask = build_minus_one_cst (truth_type_for (vectype));
7073 auto_vec<tree> vops (6);
7074 vops.quick_push (mask);
7075 vops.quick_push (vop0);
7076 if (vop1)
7077 vops.quick_push (vop1);
7078 if (vop2)
7079 vops.quick_push (vop2);
7080 if (reduc_idx >= 0)
7082 /* Perform the operation on active elements only and take
7083 inactive elements from the reduction chain input. */
7084 gcc_assert (!vop2);
7085 vops.quick_push (reduc_idx == 1 ? vop1 : vop0);
7087 else
7089 auto else_value = targetm.preferred_else_value
7090 (cond_fn, vectype, vops.length () - 1, &vops[1]);
7091 vops.quick_push (else_value);
7093 if (len_loop_p)
7095 tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
7096 vec_num * ncopies, vectype, i, 1);
7097 signed char biasval
7098 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7099 tree bias = build_int_cst (intQI_type_node, biasval);
7100 vops.quick_push (len);
7101 vops.quick_push (bias);
7103 gcall *call
7104 = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
7105 : cond_len_fn,
7106 vops);
7107 new_temp = make_ssa_name (vec_dest, call);
7108 gimple_call_set_lhs (call, new_temp);
7109 gimple_call_set_nothrow (call, true);
7110 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
7111 new_stmt = call;
7113 else
7115 tree mask = NULL_TREE;
7116 /* When combining two masks, check whether either of them is elsewhere
7117 combined with a loop mask; if so, we can mark the resulting combined
7118 mask as not needing to be combined with a loop mask again. */
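/* Hypothetical example of the case handled below (an illustration, not
   taken from the testsuite): when vectorizing

     if (a[i] < b[i] && c[i] < d[i])

   in a fully masked loop, and the scalar condition "a[i] < b[i]" is
   already known (via scalar_cond_masked_set) to be ANDed with a loop
   mask at some other use, we instead emit

     tmp      = (va < vb) & loop_mask;
     combined = tmp & (vc < vd);

   and record COMBINED in vec_cond_masked_set so that its users do not
   AND it with the loop mask a second time.  */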
7119 if (masked_loop_p
7120 && code == BIT_AND_EXPR
7121 && VECTOR_BOOLEAN_TYPE_P (vectype))
7123 if (loop_vinfo->scalar_cond_masked_set.contains ({ op0,
7124 ncopies}))
7126 mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7127 vec_num * ncopies, vectype, i);
7129 vop0 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
7130 vop0, gsi);
7133 if (loop_vinfo->scalar_cond_masked_set.contains ({ op1,
7134 ncopies }))
7136 mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7137 vec_num * ncopies, vectype, i);
7139 vop1 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
7140 vop1, gsi);
7144 new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
7145 new_temp = make_ssa_name (vec_dest, new_stmt);
7146 gimple_assign_set_lhs (new_stmt, new_temp);
7147 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7148 if (using_emulated_vectors_p)
7149 suppress_warning (new_stmt, OPT_Wvector_operation_performance);
7151 /* Enter the combined value into the vector cond hash so we don't
7152 AND it with a loop mask again. */
7153 if (mask)
7154 loop_vinfo->vec_cond_masked_set.add ({ new_temp, mask });
7157 if (vec_cvt_dest)
7159 new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
7160 new_stmt = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR,
7161 new_temp);
7162 new_temp = make_ssa_name (vec_cvt_dest, new_stmt);
7163 gimple_assign_set_lhs (new_stmt, new_temp);
7164 vect_finish_stmt_generation (vinfo, stmt_info,
7165 new_stmt, gsi);
7168 if (slp_node)
7169 slp_node->push_vec_def (new_stmt);
7170 else
7171 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7174 if (!slp_node)
7175 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7177 vec_oprnds0.release ();
7178 vec_oprnds1.release ();
7179 vec_oprnds2.release ();
7181 return true;
7184 /* A helper function to ensure data reference DR_INFO's base alignment. */
7186 static void
7187 ensure_base_align (dr_vec_info *dr_info)
7189 /* Alignment is analyzed only for the first element of a DR group;
7190 use that element to determine the base alignment we need to enforce. */
7191 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
7192 dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
7194 gcc_assert (dr_info->misalignment != DR_MISALIGNMENT_UNINITIALIZED);
7196 if (dr_info->base_misaligned)
7198 tree base_decl = dr_info->base_decl;
7200 // We should only be able to increase the alignment of a base object if
7201 // we know what its new alignment should be at compile time.
7202 unsigned HOST_WIDE_INT align_base_to =
7203 DR_TARGET_ALIGNMENT (dr_info).to_constant () * BITS_PER_UNIT;
7205 if (decl_in_symtab_p (base_decl))
7206 symtab_node::get (base_decl)->increase_alignment (align_base_to);
7207 else if (DECL_ALIGN (base_decl) < align_base_to)
7209 SET_DECL_ALIGN (base_decl, align_base_to);
7210 DECL_USER_ALIGN (base_decl) = 1;
7212 dr_info->base_misaligned = false;
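/* Illustrative effect (the numbers are hypothetical): if the target wants
   vector accesses aligned to 32 bytes and the base object is, say,

     static double a[256];

   with a smaller default alignment, the code above raises DECL_ALIGN to
   256 bits, much as if the declaration had carried
   __attribute__ ((aligned (32))), and sets DECL_USER_ALIGN so that later
   passes do not reduce the alignment again.  */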
7217 /* Function get_group_alias_ptr_type.
7219 Return the alias type for the group starting at FIRST_STMT_INFO. */
7221 static tree
7222 get_group_alias_ptr_type (stmt_vec_info first_stmt_info)
7224 struct data_reference *first_dr, *next_dr;
7226 first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
7227 stmt_vec_info next_stmt_info = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
7228 while (next_stmt_info)
7230 next_dr = STMT_VINFO_DATA_REF (next_stmt_info);
7231 if (get_alias_set (DR_REF (first_dr))
7232 != get_alias_set (DR_REF (next_dr)))
7234 if (dump_enabled_p ())
7235 dump_printf_loc (MSG_NOTE, vect_location,
7236 "conflicting alias set types.\n");
7237 return ptr_type_node;
7239 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7241 return reference_alias_ptr_type (DR_REF (first_dr));
7245 /* Function scan_operand_equal_p.
7247 Helper function for check_scan_store. Compare two references
7248 with .GOMP_SIMD_LANE bases. */
7250 static bool
7251 scan_operand_equal_p (tree ref1, tree ref2)
7253 tree ref[2] = { ref1, ref2 };
7254 poly_int64 bitsize[2], bitpos[2];
7255 tree offset[2], base[2];
7256 for (int i = 0; i < 2; ++i)
7258 machine_mode mode;
7259 int unsignedp, reversep, volatilep = 0;
7260 base[i] = get_inner_reference (ref[i], &bitsize[i], &bitpos[i],
7261 &offset[i], &mode, &unsignedp,
7262 &reversep, &volatilep);
7263 if (reversep || volatilep || maybe_ne (bitpos[i], 0))
7264 return false;
7265 if (TREE_CODE (base[i]) == MEM_REF
7266 && offset[i] == NULL_TREE
7267 && TREE_CODE (TREE_OPERAND (base[i], 0)) == SSA_NAME)
7269 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base[i], 0));
7270 if (is_gimple_assign (def_stmt)
7271 && gimple_assign_rhs_code (def_stmt) == POINTER_PLUS_EXPR
7272 && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == ADDR_EXPR
7273 && TREE_CODE (gimple_assign_rhs2 (def_stmt)) == SSA_NAME)
7275 if (maybe_ne (mem_ref_offset (base[i]), 0))
7276 return false;
7277 base[i] = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
7278 offset[i] = gimple_assign_rhs2 (def_stmt);
7283 if (!operand_equal_p (base[0], base[1], 0))
7284 return false;
7285 if (maybe_ne (bitsize[0], bitsize[1]))
7286 return false;
7287 if (offset[0] != offset[1])
7289 if (!offset[0] || !offset[1])
7290 return false;
7291 if (!operand_equal_p (offset[0], offset[1], 0))
7293 tree step[2];
7294 for (int i = 0; i < 2; ++i)
7296 step[i] = integer_one_node;
7297 if (TREE_CODE (offset[i]) == SSA_NAME)
7299 gimple *def_stmt = SSA_NAME_DEF_STMT (offset[i]);
7300 if (is_gimple_assign (def_stmt)
7301 && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
7302 && (TREE_CODE (gimple_assign_rhs2 (def_stmt))
7303 == INTEGER_CST))
7305 step[i] = gimple_assign_rhs2 (def_stmt);
7306 offset[i] = gimple_assign_rhs1 (def_stmt);
7309 else if (TREE_CODE (offset[i]) == MULT_EXPR)
7311 step[i] = TREE_OPERAND (offset[i], 1);
7312 offset[i] = TREE_OPERAND (offset[i], 0);
7314 tree rhs1 = NULL_TREE;
7315 if (TREE_CODE (offset[i]) == SSA_NAME)
7317 gimple *def_stmt = SSA_NAME_DEF_STMT (offset[i]);
7318 if (gimple_assign_cast_p (def_stmt))
7319 rhs1 = gimple_assign_rhs1 (def_stmt);
7321 else if (CONVERT_EXPR_P (offset[i]))
7322 rhs1 = TREE_OPERAND (offset[i], 0);
7323 if (rhs1
7324 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
7325 && INTEGRAL_TYPE_P (TREE_TYPE (offset[i]))
7326 && (TYPE_PRECISION (TREE_TYPE (offset[i]))
7327 >= TYPE_PRECISION (TREE_TYPE (rhs1))))
7328 offset[i] = rhs1;
7330 if (!operand_equal_p (offset[0], offset[1], 0)
7331 || !operand_equal_p (step[0], step[1], 0))
7332 return false;
7335 return true;
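/* Illustrative example of the canonicalization above (sketched GIMPLE,
   not lifted from a real dump): one reference may be an ARRAY_REF
   D.2042[_25] whose offset decomposes to the MULT_EXPR _25 * 4, while
   the other goes through a pointer defined as

     _29 = &D.2042 p+ _27;   where _27 = _25 * 4;

   In both cases the base is folded back to D.2042 and the offset to _25
   with step 4, widening conversions of the index are stripped, and so
   the two references compare equal.  */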
7339 enum scan_store_kind {
7340 /* Normal permutation. */
7341 scan_store_kind_perm,
7343 /* Whole vector left shift permutation with zero init. */
7344 scan_store_kind_lshift_zero,
7346 /* Whole vector left shift permutation and VEC_COND_EXPR. */
7347 scan_store_kind_lshift_cond
7350 /* Function scan_store_can_perm_p.
7352 Verify if we can perform the needed permutations or whole vector shifts.
7353 Return -1 on failure, otherwise the exact log2 of vectype's nunits.
7354 USE_WHOLE_VECTOR, if non-NULL, records which enum scan_store_kind
7355 operation to perform at each step. */
7357 static int
7358 scan_store_can_perm_p (tree vectype, tree init,
7359 vec<enum scan_store_kind> *use_whole_vector = NULL)
7361 enum machine_mode vec_mode = TYPE_MODE (vectype);
7362 unsigned HOST_WIDE_INT nunits;
7363 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
7364 return -1;
7365 int units_log2 = exact_log2 (nunits);
7366 if (units_log2 <= 0)
7367 return -1;
7369 int i;
7370 enum scan_store_kind whole_vector_shift_kind = scan_store_kind_perm;
7371 for (i = 0; i <= units_log2; ++i)
7373 unsigned HOST_WIDE_INT j, k;
7374 enum scan_store_kind kind = scan_store_kind_perm;
7375 vec_perm_builder sel (nunits, nunits, 1);
7376 sel.quick_grow (nunits);
7377 if (i == units_log2)
7379 for (j = 0; j < nunits; ++j)
7380 sel[j] = nunits - 1;
7382 else
7384 for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
7385 sel[j] = j;
7386 for (k = 0; j < nunits; ++j, ++k)
7387 sel[j] = nunits + k;
7389 vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
7390 if (!can_vec_perm_const_p (vec_mode, vec_mode, indices))
7392 if (i == units_log2)
7393 return -1;
7395 if (whole_vector_shift_kind == scan_store_kind_perm)
7397 if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing)
7398 return -1;
7399 whole_vector_shift_kind = scan_store_kind_lshift_zero;
7400 /* Whole vector shifts shift in zeros, so if init is all zero
7401 constant, there is no need to do anything further. */
7402 if ((TREE_CODE (init) != INTEGER_CST
7403 && TREE_CODE (init) != REAL_CST)
7404 || !initializer_zerop (init))
7406 tree masktype = truth_type_for (vectype);
7407 if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST))
7408 return -1;
7409 whole_vector_shift_kind = scan_store_kind_lshift_cond;
7412 kind = whole_vector_shift_kind;
7414 if (use_whole_vector)
7416 if (kind != scan_store_kind_perm && use_whole_vector->is_empty ())
7417 use_whole_vector->safe_grow_cleared (i, true);
7418 if (kind != scan_store_kind_perm || !use_whole_vector->is_empty ())
7419 use_whole_vector->safe_push (kind);
7423 return units_log2;
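/* Worked example of the selectors built above, for nunits == 4
   (units_log2 == 2); in the transform (vectorizable_scan_store) the first
   permutation input is the vectorized initializer and the second the
   running vector V:

     i == 0:  sel = { 0, 4, 5, 6 }   -> { init[0], v[0], v[1], v[2] }
     i == 1:  sel = { 0, 1, 4, 5 }   -> { init[0], init[1], v[0], v[1] }
     i == 2:  sel = { 3, 3, 3, 3 }   -> broadcast of the last lane

   i.e. each of the first UNITS_LOG2 steps shifts V up by 1 << i lanes,
   filling the low lanes from the initializer, and the final step
   broadcasts the accumulated total.  */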
7427 /* Function check_scan_store.
7429 Check magic stores for #pragma omp scan {in,ex}clusive reductions. */
7431 static bool
7432 check_scan_store (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
7433 enum vect_def_type rhs_dt, bool slp, tree mask,
7434 vect_memory_access_type memory_access_type)
7436 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7437 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
7438 tree ref_type;
7440 gcc_assert (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1);
7441 if (slp
7442 || mask
7443 || memory_access_type != VMAT_CONTIGUOUS
7444 || TREE_CODE (DR_BASE_ADDRESS (dr_info->dr)) != ADDR_EXPR
7445 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0))
7446 || loop_vinfo == NULL
7447 || LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7448 || STMT_VINFO_GROUPED_ACCESS (stmt_info)
7449 || !integer_zerop (get_dr_vinfo_offset (vinfo, dr_info))
7450 || !integer_zerop (DR_INIT (dr_info->dr))
7451 || !(ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr)))
7452 || !alias_sets_conflict_p (get_alias_set (vectype),
7453 get_alias_set (TREE_TYPE (ref_type))))
7455 if (dump_enabled_p ())
7456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7457 "unsupported OpenMP scan store.\n");
7458 return false;
7461 /* We need to pattern match code built by OpenMP lowering and simplified
7462 by following optimizations into something we can handle.
7463 #pragma omp simd reduction(inscan,+:r)
7464 for (...)
7466 r += something ();
7467 #pragma omp scan inclusive (r)
7468 use (r);
7470 shall have body with:
7471 // Initialization for input phase, store the reduction initializer:
7472 _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
7473 _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
7474 D.2042[_21] = 0;
7475 // Actual input phase:
7477 r.0_5 = D.2042[_20];
7478 _6 = _4 + r.0_5;
7479 D.2042[_20] = _6;
7480 // Initialization for scan phase:
7481 _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 2);
7482 _26 = D.2043[_25];
7483 _27 = D.2042[_25];
7484 _28 = _26 + _27;
7485 D.2043[_25] = _28;
7486 D.2042[_25] = _28;
7487 // Actual scan phase:
7489 r.1_8 = D.2042[_20];
7491 The "omp simd array" variable D.2042 holds the privatized copy used
7492 inside of the loop and D.2043 is another one that holds copies of
7493 the current original list item. The separate GOMP_SIMD_LANE ifn
7494 kinds are there in order to allow optimizing the initializer store
7495 and combiner sequence, e.g. if it is originally some C++ish user
7496 defined reduction, but allow the vectorizer to pattern recognize it
7497 and turn into the appropriate vectorized scan.
7499 For exclusive scan, this is slightly different:
7500 #pragma omp simd reduction(inscan,+:r)
7501 for (...)
7503 use (r);
7504 #pragma omp scan exclusive (r)
7505 r += something ();
7507 shall have body with:
7508 // Initialization for input phase, store the reduction initializer:
7509 _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
7510 _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
7511 D.2042[_21] = 0;
7512 // Actual input phase:
7514 r.0_5 = D.2042[_20];
7515 _6 = _4 + r.0_5;
7516 D.2042[_20] = _6;
7517 // Initialization for scan phase:
7518 _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3);
7519 _26 = D.2043[_25];
7520 D.2044[_25] = _26;
7521 _27 = D.2042[_25];
7522 _28 = _26 + _27;
7523 D.2043[_25] = _28;
7524 // Actual scan phase:
7526 r.1_8 = D.2044[_20];
7527 ... */
7529 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2)
7531 /* Match the D.2042[_21] = 0; store above. Just require that
7532 it is a constant or external definition store. */
7533 if (rhs_dt != vect_constant_def && rhs_dt != vect_external_def)
7535 fail_init:
7536 if (dump_enabled_p ())
7537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7538 "unsupported OpenMP scan initializer store.\n");
7539 return false;
7542 if (! loop_vinfo->scan_map)
7543 loop_vinfo->scan_map = new hash_map<tree, tree>;
7544 tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7545 tree &cached = loop_vinfo->scan_map->get_or_insert (var);
7546 if (cached)
7547 goto fail_init;
7548 cached = gimple_assign_rhs1 (STMT_VINFO_STMT (stmt_info));
7550 /* These stores can be vectorized normally. */
7551 return true;
7554 if (rhs_dt != vect_internal_def)
7556 fail:
7557 if (dump_enabled_p ())
7558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7559 "unsupported OpenMP scan combiner pattern.\n");
7560 return false;
7563 gimple *stmt = STMT_VINFO_STMT (stmt_info);
7564 tree rhs = gimple_assign_rhs1 (stmt);
7565 if (TREE_CODE (rhs) != SSA_NAME)
7566 goto fail;
7568 gimple *other_store_stmt = NULL;
7569 tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7570 bool inscan_var_store
7571 = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
7573 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7575 if (!inscan_var_store)
7577 use_operand_p use_p;
7578 imm_use_iterator iter;
7579 FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7581 gimple *use_stmt = USE_STMT (use_p);
7582 if (use_stmt == stmt || is_gimple_debug (use_stmt))
7583 continue;
7584 if (gimple_bb (use_stmt) != gimple_bb (stmt)
7585 || !is_gimple_assign (use_stmt)
7586 || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS
7587 || other_store_stmt
7588 || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME)
7589 goto fail;
7590 other_store_stmt = use_stmt;
7592 if (other_store_stmt == NULL)
7593 goto fail;
7594 rhs = gimple_assign_lhs (other_store_stmt);
7595 if (!single_imm_use (rhs, &use_p, &other_store_stmt))
7596 goto fail;
7599 else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
7601 use_operand_p use_p;
7602 imm_use_iterator iter;
7603 FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7605 gimple *use_stmt = USE_STMT (use_p);
7606 if (use_stmt == stmt || is_gimple_debug (use_stmt))
7607 continue;
7608 if (other_store_stmt)
7609 goto fail;
7610 other_store_stmt = use_stmt;
7613 else
7614 goto fail;
7616 gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
7617 if (gimple_bb (def_stmt) != gimple_bb (stmt)
7618 || !is_gimple_assign (def_stmt)
7619 || gimple_assign_rhs_class (def_stmt) != GIMPLE_BINARY_RHS)
7620 goto fail;
7622 enum tree_code code = gimple_assign_rhs_code (def_stmt);
7623 /* For pointer addition, we should use the normal plus for the vector
7624 operation. */
7625 switch (code)
7627 case POINTER_PLUS_EXPR:
7628 code = PLUS_EXPR;
7629 break;
7630 case MULT_HIGHPART_EXPR:
7631 goto fail;
7632 default:
7633 break;
7635 if (TREE_CODE_LENGTH (code) != binary_op || !commutative_tree_code (code))
7636 goto fail;
7638 tree rhs1 = gimple_assign_rhs1 (def_stmt);
7639 tree rhs2 = gimple_assign_rhs2 (def_stmt);
7640 if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME)
7641 goto fail;
7643 gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
7644 gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
7645 if (gimple_bb (load1_stmt) != gimple_bb (stmt)
7646 || !gimple_assign_load_p (load1_stmt)
7647 || gimple_bb (load2_stmt) != gimple_bb (stmt)
7648 || !gimple_assign_load_p (load2_stmt))
7649 goto fail;
7651 stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
7652 stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
7653 if (load1_stmt_info == NULL
7654 || load2_stmt_info == NULL
7655 || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info)
7656 != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))
7657 || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info)
7658 != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
7659 goto fail;
7661 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store)
7663 dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
7664 if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR
7665 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0)))
7666 goto fail;
7667 tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
7668 tree lrhs;
7669 if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7670 lrhs = rhs1;
7671 else
7672 lrhs = rhs2;
7673 use_operand_p use_p;
7674 imm_use_iterator iter;
7675 FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs)
7677 gimple *use_stmt = USE_STMT (use_p);
7678 if (use_stmt == def_stmt || is_gimple_debug (use_stmt))
7679 continue;
7680 if (other_store_stmt)
7681 goto fail;
7682 other_store_stmt = use_stmt;
7686 if (other_store_stmt == NULL)
7687 goto fail;
7688 if (gimple_bb (other_store_stmt) != gimple_bb (stmt)
7689 || !gimple_store_p (other_store_stmt))
7690 goto fail;
7692 stmt_vec_info other_store_stmt_info
7693 = loop_vinfo->lookup_stmt (other_store_stmt);
7694 if (other_store_stmt_info == NULL
7695 || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info)
7696 != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
7697 goto fail;
7699 gimple *stmt1 = stmt;
7700 gimple *stmt2 = other_store_stmt;
7701 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7702 std::swap (stmt1, stmt2);
7703 if (scan_operand_equal_p (gimple_assign_lhs (stmt1),
7704 gimple_assign_rhs1 (load2_stmt)))
7706 std::swap (rhs1, rhs2);
7707 std::swap (load1_stmt, load2_stmt);
7708 std::swap (load1_stmt_info, load2_stmt_info);
7710 if (!scan_operand_equal_p (gimple_assign_lhs (stmt1),
7711 gimple_assign_rhs1 (load1_stmt)))
7712 goto fail;
7714 tree var3 = NULL_TREE;
7715 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3
7716 && !scan_operand_equal_p (gimple_assign_lhs (stmt2),
7717 gimple_assign_rhs1 (load2_stmt)))
7718 goto fail;
7719 else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7721 dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
7722 if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR
7723 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0)))
7724 goto fail;
7725 var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
7726 if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3))
7727 || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3))
7728 || lookup_attribute ("omp simd inscan exclusive",
7729 DECL_ATTRIBUTES (var3)))
7730 goto fail;
7733 dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info);
7734 if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR
7735 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0)))
7736 goto fail;
7738 tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7739 tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0);
7740 if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var1))
7741 || !lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var2))
7742 || (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7743 == (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var2))))
7744 goto fail;
7746 if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7747 std::swap (var1, var2);
7749 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7751 if (!lookup_attribute ("omp simd inscan exclusive",
7752 DECL_ATTRIBUTES (var1)))
7753 goto fail;
7754 var1 = var3;
7757 if (loop_vinfo->scan_map == NULL)
7758 goto fail;
7759 tree *init = loop_vinfo->scan_map->get (var1);
7760 if (init == NULL)
7761 goto fail;
7763 /* The IL is as expected, now check if we can actually vectorize it.
7764 Inclusive scan:
7765 _26 = D.2043[_25];
7766 _27 = D.2042[_25];
7767 _28 = _26 + _27;
7768 D.2043[_25] = _28;
7769 D.2042[_25] = _28;
7770 should be vectorized as (where _40 is the vectorized rhs
7771 from the D.2042[_21] = 0; store):
7772 _30 = MEM <vector(8) int> [(int *)&D.2043];
7773 _31 = MEM <vector(8) int> [(int *)&D.2042];
7774 _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7775 _33 = _31 + _32;
7776 // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] };
7777 _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
7778 _35 = _33 + _34;
7779 // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7780 // _31[1]+.._31[4], ... _31[4]+.._31[7] };
7781 _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
7782 _37 = _35 + _36;
7783 // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7784 // _31[0]+.._31[4], ... _31[0]+.._31[7] };
7785 _38 = _30 + _37;
7786 _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
7787 MEM <vector(8) int> [(int *)&D.2043] = _39;
7788 MEM <vector(8) int> [(int *)&D.2042] = _38;
7789 Exclusive scan:
7790 _26 = D.2043[_25];
7791 D.2044[_25] = _26;
7792 _27 = D.2042[_25];
7793 _28 = _26 + _27;
7794 D.2043[_25] = _28;
7795 should be vectorized as (where _40 is the vectorized rhs
7796 from the D.2042[_21] = 0; store):
7797 _30 = MEM <vector(8) int> [(int *)&D.2043];
7798 _31 = MEM <vector(8) int> [(int *)&D.2042];
7799 _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7800 _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7801 _34 = _32 + _33;
7802 // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3],
7803 // _31[3]+_31[4], ... _31[5]+.._31[6] };
7804 _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
7805 _36 = _34 + _35;
7806 // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7807 // _31[1]+.._31[4], ... _31[3]+.._31[6] };
7808 _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
7809 _38 = _36 + _37;
7810 // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7811 // _31[0]+.._31[4], ... _31[0]+.._31[6] };
7812 _39 = _30 + _38;
7813 _50 = _31 + _39;
7814 _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
7815 MEM <vector(8) int> [(int *)&D.2044] = _39;
7816 MEM <vector(8) int> [(int *)&D.2042] = _51; */
7817 enum machine_mode vec_mode = TYPE_MODE (vectype);
7818 optab optab = optab_for_tree_code (code, vectype, optab_default);
7819 if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7820 goto fail;
7822 int units_log2 = scan_store_can_perm_p (vectype, *init);
7823 if (units_log2 == -1)
7824 goto fail;
7826 return true;
7830 /* Function vectorizable_scan_store.
7832 Helper of vectorizable_store, with the same arguments as vectorizable_store.
7833 Handle only the transformation; the checking is done in check_scan_store. */
7835 static bool
7836 vectorizable_scan_store (vec_info *vinfo,
7837 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7838 gimple **vec_stmt, int ncopies)
7840 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7841 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
7842 tree ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
7843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7845 if (dump_enabled_p ())
7846 dump_printf_loc (MSG_NOTE, vect_location,
7847 "transform scan store. ncopies = %d\n", ncopies);
7849 gimple *stmt = STMT_VINFO_STMT (stmt_info);
7850 tree rhs = gimple_assign_rhs1 (stmt);
7851 gcc_assert (TREE_CODE (rhs) == SSA_NAME);
7853 tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7854 bool inscan_var_store
7855 = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
7857 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7859 use_operand_p use_p;
7860 imm_use_iterator iter;
7861 FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7863 gimple *use_stmt = USE_STMT (use_p);
7864 if (use_stmt == stmt || is_gimple_debug (use_stmt))
7865 continue;
7866 rhs = gimple_assign_lhs (use_stmt);
7867 break;
7871 gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
7872 enum tree_code code = gimple_assign_rhs_code (def_stmt);
7873 if (code == POINTER_PLUS_EXPR)
7874 code = PLUS_EXPR;
7875 gcc_assert (TREE_CODE_LENGTH (code) == binary_op
7876 && commutative_tree_code (code));
7877 tree rhs1 = gimple_assign_rhs1 (def_stmt);
7878 tree rhs2 = gimple_assign_rhs2 (def_stmt);
7879 gcc_assert (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == SSA_NAME);
7880 gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
7881 gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
7882 stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
7883 stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
7884 dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
7885 dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
7886 tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
7887 tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
7889 if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7891 std::swap (rhs1, rhs2);
7892 std::swap (var1, var2);
7893 std::swap (load1_dr_info, load2_dr_info);
7896 tree *init = loop_vinfo->scan_map->get (var1);
7897 gcc_assert (init);
7899 unsigned HOST_WIDE_INT nunits;
7900 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
7901 gcc_unreachable ();
7902 auto_vec<enum scan_store_kind, 16> use_whole_vector;
7903 int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector);
7904 gcc_assert (units_log2 > 0);
7905 auto_vec<tree, 16> perms;
7906 perms.quick_grow (units_log2 + 1);
7907 tree zero_vec = NULL_TREE, masktype = NULL_TREE;
7908 for (int i = 0; i <= units_log2; ++i)
7910 unsigned HOST_WIDE_INT j, k;
7911 vec_perm_builder sel (nunits, nunits, 1);
7912 sel.quick_grow (nunits);
7913 if (i == units_log2)
7914 for (j = 0; j < nunits; ++j)
7915 sel[j] = nunits - 1;
7916 else
7918 for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
7919 sel[j] = j;
7920 for (k = 0; j < nunits; ++j, ++k)
7921 sel[j] = nunits + k;
7923 vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
7924 if (!use_whole_vector.is_empty ()
7925 && use_whole_vector[i] != scan_store_kind_perm)
7927 if (zero_vec == NULL_TREE)
7928 zero_vec = build_zero_cst (vectype);
7929 if (masktype == NULL_TREE
7930 && use_whole_vector[i] == scan_store_kind_lshift_cond)
7931 masktype = truth_type_for (vectype);
7932 perms[i] = vect_gen_perm_mask_any (vectype, indices);
7934 else
7935 perms[i] = vect_gen_perm_mask_checked (vectype, indices);
7938 tree vec_oprnd1 = NULL_TREE;
7939 tree vec_oprnd2 = NULL_TREE;
7940 tree vec_oprnd3 = NULL_TREE;
7941 tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
7942 tree dataref_offset = build_int_cst (ref_type, 0);
7943 tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
7944 vectype, VMAT_CONTIGUOUS);
7945 tree ldataref_ptr = NULL_TREE;
7946 tree orig = NULL_TREE;
7947 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7948 ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr);
7949 auto_vec<tree> vec_oprnds1;
7950 auto_vec<tree> vec_oprnds2;
7951 auto_vec<tree> vec_oprnds3;
7952 vect_get_vec_defs (vinfo, stmt_info, NULL, ncopies,
7953 *init, &vec_oprnds1,
7954 ldataref_ptr == NULL ? rhs1 : NULL, &vec_oprnds2,
7955 rhs2, &vec_oprnds3);
7956 for (int j = 0; j < ncopies; j++)
7958 vec_oprnd1 = vec_oprnds1[j];
7959 if (ldataref_ptr == NULL)
7960 vec_oprnd2 = vec_oprnds2[j];
7961 vec_oprnd3 = vec_oprnds3[j];
7962 if (j == 0)
7963 orig = vec_oprnd3;
7964 else if (!inscan_var_store)
7965 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
7967 if (ldataref_ptr)
7969 vec_oprnd2 = make_ssa_name (vectype);
7970 tree data_ref = fold_build2 (MEM_REF, vectype,
7971 unshare_expr (ldataref_ptr),
7972 dataref_offset);
7973 vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr));
7974 gimple *g = gimple_build_assign (vec_oprnd2, data_ref);
7975 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
7976 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
7977 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7980 tree v = vec_oprnd2;
7981 for (int i = 0; i < units_log2; ++i)
7983 tree new_temp = make_ssa_name (vectype);
7984 gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR,
7985 (zero_vec
7986 && (use_whole_vector[i]
7987 != scan_store_kind_perm))
7988 ? zero_vec : vec_oprnd1, v,
7989 perms[i]);
7990 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
7991 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
7992 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7994 if (zero_vec && use_whole_vector[i] == scan_store_kind_lshift_cond)
7996 /* Whole vector shift shifted in zero bits, but if *init
7997 is not initializer_zerop, we need to replace those elements
7998 with elements from vec_oprnd1. */
7999 tree_vector_builder vb (masktype, nunits, 1);
8000 for (unsigned HOST_WIDE_INT k = 0; k < nunits; ++k)
8001 vb.quick_push (k < (HOST_WIDE_INT_1U << i)
8002 ? boolean_false_node : boolean_true_node);
8004 tree new_temp2 = make_ssa_name (vectype);
8005 g = gimple_build_assign (new_temp2, VEC_COND_EXPR, vb.build (),
8006 new_temp, vec_oprnd1);
8007 vect_finish_stmt_generation (vinfo, stmt_info,
8008 g, gsi);
8009 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8010 new_temp = new_temp2;
8013 /* For exclusive scan, perform the perms[i] permutation once
8014 more. */
8015 if (i == 0
8016 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4
8017 && v == vec_oprnd2)
8019 v = new_temp;
8020 --i;
8021 continue;
8024 tree new_temp2 = make_ssa_name (vectype);
8025 g = gimple_build_assign (new_temp2, code, v, new_temp);
8026 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8027 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8029 v = new_temp2;
8032 tree new_temp = make_ssa_name (vectype);
8033 gimple *g = gimple_build_assign (new_temp, code, orig, v);
8034 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8035 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8037 tree last_perm_arg = new_temp;
8038 /* For exclusive scan, new_temp computed above is the exclusive scan
8039 prefix sum. Turn it into inclusive prefix sum for the broadcast
8040 of the last element into orig. */
8041 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
8043 last_perm_arg = make_ssa_name (vectype);
8044 g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2);
8045 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8046 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8049 orig = make_ssa_name (vectype);
8050 g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg,
8051 last_perm_arg, perms[units_log2]);
8052 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8053 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8055 if (!inscan_var_store)
8057 tree data_ref = fold_build2 (MEM_REF, vectype,
8058 unshare_expr (dataref_ptr),
8059 dataref_offset);
8060 vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
8061 g = gimple_build_assign (data_ref, new_temp);
8062 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8063 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8067 if (inscan_var_store)
8068 for (int j = 0; j < ncopies; j++)
8070 if (j != 0)
8071 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
8073 tree data_ref = fold_build2 (MEM_REF, vectype,
8074 unshare_expr (dataref_ptr),
8075 dataref_offset);
8076 vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
8077 gimple *g = gimple_build_assign (data_ref, orig);
8078 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8079 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8081 return true;
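/* Scalar model of the transform above (an illustration only, assuming a
   zero reduction initializer as in the IL examples in check_scan_store):
   for one vector of N lanes the emitted code performs a Hillis-Steele
   style inclusive prefix sum in log2(N) doubling steps,

     void inclusive_scan_model (int *v, int n)   // n is a power of two
     {
       for (int step = 1; step < n; step *= 2)
         for (int lane = n - 1; lane >= step; --lane)
           v[lane] += v[lane - step];
     }

   followed by one more permutation that broadcasts lane N-1 (the running
   total) so it can be combined with the carried-in value ORIG for the
   next vector iteration.  */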
8085 /* Function vectorizable_store.
8087 Check if STMT_INFO defines a non scalar data-ref (array/pointer/structure)
8088 that can be vectorized.
8089 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
8090 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
8091 Return true if STMT_INFO is vectorizable in this way. */
8093 static bool
8094 vectorizable_store (vec_info *vinfo,
8095 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8096 gimple **vec_stmt, slp_tree slp_node,
8097 stmt_vector_for_cost *cost_vec)
8099 tree data_ref;
8100 tree vec_oprnd = NULL_TREE;
8101 tree elem_type;
8102 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8103 class loop *loop = NULL;
8104 machine_mode vec_mode;
8105 tree dummy;
8106 enum vect_def_type rhs_dt = vect_unknown_def_type;
8107 enum vect_def_type mask_dt = vect_unknown_def_type;
8108 tree dataref_ptr = NULL_TREE;
8109 tree dataref_offset = NULL_TREE;
8110 gimple *ptr_incr = NULL;
8111 int ncopies;
8112 int j;
8113 stmt_vec_info first_stmt_info;
8114 bool grouped_store;
8115 unsigned int group_size, i;
8116 bool slp = (slp_node != NULL);
8117 unsigned int vec_num;
8118 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
8119 tree aggr_type;
8120 gather_scatter_info gs_info;
8121 poly_uint64 vf;
8122 vec_load_store_type vls_type;
8123 tree ref_type;
8125 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
8126 return false;
8128 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8129 && ! vec_stmt)
8130 return false;
8132 /* Is vectorizable store? */
8134 tree mask = NULL_TREE, mask_vectype = NULL_TREE;
8135 slp_tree mask_node = NULL;
8136 if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
8138 tree scalar_dest = gimple_assign_lhs (assign);
8139 if (TREE_CODE (scalar_dest) == VIEW_CONVERT_EXPR
8140 && is_pattern_stmt_p (stmt_info))
8141 scalar_dest = TREE_OPERAND (scalar_dest, 0);
8142 if (TREE_CODE (scalar_dest) != ARRAY_REF
8143 && TREE_CODE (scalar_dest) != BIT_FIELD_REF
8144 && TREE_CODE (scalar_dest) != INDIRECT_REF
8145 && TREE_CODE (scalar_dest) != COMPONENT_REF
8146 && TREE_CODE (scalar_dest) != IMAGPART_EXPR
8147 && TREE_CODE (scalar_dest) != REALPART_EXPR
8148 && TREE_CODE (scalar_dest) != MEM_REF)
8149 return false;
8151 else
8153 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
8154 if (!call || !gimple_call_internal_p (call))
8155 return false;
8157 internal_fn ifn = gimple_call_internal_fn (call);
8158 if (!internal_store_fn_p (ifn))
8159 return false;
8161 int mask_index = internal_fn_mask_index (ifn);
8162 if (mask_index >= 0 && slp_node)
8163 mask_index = vect_slp_child_index_for_operand
8164 (call, mask_index, STMT_VINFO_GATHER_SCATTER_P (stmt_info));
8165 if (mask_index >= 0
8166 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
8167 &mask, &mask_node, &mask_dt,
8168 &mask_vectype))
8169 return false;
8172 /* Cannot have hybrid store SLP -- that would mean storing to the
8173 same location twice. */
8174 gcc_assert (slp == PURE_SLP_STMT (stmt_info));
8176 tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
8177 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8179 if (loop_vinfo)
8181 loop = LOOP_VINFO_LOOP (loop_vinfo);
8182 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8184 else
8185 vf = 1;
8187 /* Multiple types in SLP are handled by creating the appropriate number of
8188 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
8189 case of SLP. */
8190 if (slp)
8191 ncopies = 1;
8192 else
8193 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8195 gcc_assert (ncopies >= 1);
8197 /* FORNOW. This restriction should be relaxed. */
8198 if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
8200 if (dump_enabled_p ())
8201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8202 "multiple types in nested loop.\n");
8203 return false;
8206 tree op;
8207 slp_tree op_node;
8208 if (!vect_check_store_rhs (vinfo, stmt_info, slp_node,
8209 &op, &op_node, &rhs_dt, &rhs_vectype, &vls_type))
8210 return false;
8212 elem_type = TREE_TYPE (vectype);
8213 vec_mode = TYPE_MODE (vectype);
8215 if (!STMT_VINFO_DATA_REF (stmt_info))
8216 return false;
8218 vect_memory_access_type memory_access_type;
8219 enum dr_alignment_support alignment_support_scheme;
8220 int misalignment;
8221 poly_int64 poffset;
8222 internal_fn lanes_ifn;
8223 if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
8224 ncopies, &memory_access_type, &poffset,
8225 &alignment_support_scheme, &misalignment, &gs_info,
8226 &lanes_ifn))
8227 return false;
8229 if (mask)
8231 if (memory_access_type == VMAT_CONTIGUOUS)
8233 if (!VECTOR_MODE_P (vec_mode)
8234 || !can_vec_mask_load_store_p (vec_mode,
8235 TYPE_MODE (mask_vectype), false))
8236 return false;
8238 else if (memory_access_type != VMAT_LOAD_STORE_LANES
8239 && (memory_access_type != VMAT_GATHER_SCATTER
8240 || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype))))
8242 if (dump_enabled_p ())
8243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8244 "unsupported access type for masked store.\n");
8245 return false;
8247 else if (memory_access_type == VMAT_GATHER_SCATTER
8248 && gs_info.ifn == IFN_LAST
8249 && !gs_info.decl)
8251 if (dump_enabled_p ())
8252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8253 "unsupported masked emulated scatter.\n");
8254 return false;
8257 else
8259 /* FORNOW. In some cases can vectorize even if data-type not supported
8260 (e.g. - array initialization with 0). */
8261 if (optab_handler (mov_optab, vec_mode) == CODE_FOR_nothing)
8262 return false;
8265 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
8266 grouped_store = (STMT_VINFO_GROUPED_ACCESS (stmt_info)
8267 && memory_access_type != VMAT_GATHER_SCATTER
8268 && (slp || memory_access_type != VMAT_CONTIGUOUS));
8269 if (grouped_store)
8271 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8272 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
8273 group_size = DR_GROUP_SIZE (first_stmt_info);
8275 else
8277 first_stmt_info = stmt_info;
8278 first_dr_info = dr_info;
8279 group_size = vec_num = 1;
8282 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1 && !vec_stmt)
8284 if (!check_scan_store (vinfo, stmt_info, vectype, rhs_dt, slp, mask,
8285 memory_access_type))
8286 return false;
8289 bool costing_p = !vec_stmt;
8290 if (costing_p) /* transformation not required. */
8292 STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
8294 if (loop_vinfo
8295 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8296 check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
8297 vls_type, group_size,
8298 memory_access_type, &gs_info,
8299 mask);
8301 if (slp_node
8302 && (!vect_maybe_update_slp_op_vectype (op_node, vectype)
8303 || (mask
8304 && !vect_maybe_update_slp_op_vectype (mask_node,
8305 mask_vectype))))
8307 if (dump_enabled_p ())
8308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8309 "incompatible vector types for invariants\n");
8310 return false;
8313 if (dump_enabled_p ()
8314 && memory_access_type != VMAT_ELEMENTWISE
8315 && memory_access_type != VMAT_GATHER_SCATTER
8316 && alignment_support_scheme != dr_aligned)
8317 dump_printf_loc (MSG_NOTE, vect_location,
8318 "Vectorizing an unaligned access.\n");
8320 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
8322 /* As function vect_transform_stmt shows, for interleaving stores
8323 the whole chain is vectorized only when the last store in the chain
8324 is reached; the other stores in the group are skipped. So we
8325 would like to cost only the last one here, but it is not trivial
8326 to get hold of it, and since costing the first one is equivalent,
8327 use the first one instead. */
8328 if (grouped_store
8329 && !slp
8330 && first_stmt_info != stmt_info)
8331 return true;
8333 gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
8335 /* Transform. */
8337 ensure_base_align (dr_info);
8339 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
8341 gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
8342 gcc_assert (!slp);
8343 if (costing_p)
8345 unsigned int inside_cost = 0, prologue_cost = 0;
8346 if (vls_type == VLS_STORE_INVARIANT)
8347 prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
8348 stmt_info, 0, vect_prologue);
8349 vect_get_store_cost (vinfo, stmt_info, ncopies,
8350 alignment_support_scheme, misalignment,
8351 &inside_cost, cost_vec);
8353 if (dump_enabled_p ())
8354 dump_printf_loc (MSG_NOTE, vect_location,
8355 "vect_model_store_cost: inside_cost = %d, "
8356 "prologue_cost = %d .\n",
8357 inside_cost, prologue_cost);
8359 return true;
8361 return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, ncopies);
8364 if (grouped_store)
8366 /* FORNOW */
8367 gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt_info));
8369 if (slp)
8371 grouped_store = false;
8372 /* VEC_NUM is the number of vect stmts to be created for this
8373 group. */
8374 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8375 first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
8376 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_stmt_info)
8377 == first_stmt_info);
8378 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
8379 op = vect_get_store_rhs (first_stmt_info);
8381 else
8382 /* VEC_NUM is the number of vect stmts to be created for this
8383 group. */
8384 vec_num = group_size;
8386 ref_type = get_group_alias_ptr_type (first_stmt_info);
8388 else
8389 ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr));
8391 if (!costing_p && dump_enabled_p ())
8392 dump_printf_loc (MSG_NOTE, vect_location, "transform store. ncopies = %d\n",
8393 ncopies);
8395 /* Check whether we need to update the prologue cost for an invariant,
8396 and update it accordingly if so. If this is not an
8397 interleaving store, we can simply check vls_type; but if it
8398 is an interleaving store, we need to check the def_type
8399 of the stored value, since the current vls_type is just
8400 for first_stmt_info. */
8401 auto update_prologue_cost = [&](unsigned *prologue_cost, tree store_rhs)
8403 gcc_assert (costing_p);
8404 if (slp)
8405 return;
8406 if (grouped_store)
8408 gcc_assert (store_rhs);
8409 enum vect_def_type cdt;
8410 gcc_assert (vect_is_simple_use (store_rhs, vinfo, &cdt));
8411 if (cdt != vect_constant_def && cdt != vect_external_def)
8412 return;
8414 else if (vls_type != VLS_STORE_INVARIANT)
8415 return;
8416 *prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
8417 0, vect_prologue);
8420 if (memory_access_type == VMAT_ELEMENTWISE
8421 || memory_access_type == VMAT_STRIDED_SLP)
8423 unsigned inside_cost = 0, prologue_cost = 0;
8424 gimple_stmt_iterator incr_gsi;
8425 bool insert_after;
8426 gimple *incr;
8427 tree offvar;
8428 tree ivstep;
8429 tree running_off;
8430 tree stride_base, stride_step, alias_off;
8431 tree vec_oprnd = NULL_TREE;
8432 tree dr_offset;
8433 unsigned int g;
8434 /* Checked by get_load_store_type. */
8435 unsigned int const_nunits = nunits.to_constant ();
8437 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8438 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
8440 dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
8441 stride_base
8442 = fold_build_pointer_plus
8443 (DR_BASE_ADDRESS (first_dr_info->dr),
8444 size_binop (PLUS_EXPR,
8445 convert_to_ptrofftype (dr_offset),
8446 convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
8447 stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
8449 /* For a store with loop-invariant (but other than power-of-2)
8450 stride (i.e. not a grouped access) like so:
8452 for (i = 0; i < n; i += stride)
8453 array[i] = ...;
8455 we generate a new induction variable and new stores from
8456 the components of the (vectorized) rhs:
8458 for (j = 0; ; j += VF*stride)
8459 vectemp = ...;
8460 tmp1 = vectemp[0];
8461 array[j] = tmp1;
8462 tmp2 = vectemp[1];
8463 array[j + stride] = tmp2;
8467 unsigned nstores = const_nunits;
8468 unsigned lnel = 1;
8469 tree ltype = elem_type;
8470 tree lvectype = vectype;
8471 if (slp)
8473 if (group_size < const_nunits
8474 && const_nunits % group_size == 0)
8476 nstores = const_nunits / group_size;
8477 lnel = group_size;
8478 ltype = build_vector_type (elem_type, group_size);
8479 lvectype = vectype;
8481 /* First check whether the vec_extract optab cannot extract
8482 the vector elements directly. */
8483 scalar_mode elmode = SCALAR_TYPE_MODE (elem_type);
8484 machine_mode vmode;
8485 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
8486 || !related_vector_mode (TYPE_MODE (vectype), elmode,
8487 group_size).exists (&vmode)
8488 || (convert_optab_handler (vec_extract_optab,
8489 TYPE_MODE (vectype), vmode)
8490 == CODE_FOR_nothing))
8492 /* Try to avoid emitting an extract of vector elements
8493 by performing the extracts using an integer type of the
8494 same size, extracting from a vector of those and then
8495 re-interpreting it as the original vector type if
8496 supported. */
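/* Hypothetical instance of the punning described above: with
   VECTYPE == vector(4) float and GROUP_SIZE == 2, LTYPE becomes a 64-bit
   integer type and LVECTYPE a vector(2) of those, so the vector to be
   stored is view-converted once and each group is written with a single
   64-bit store, e.g.

     w = VIEW_CONVERT_EXPR <vector(2) uint64> (v);
     MEM [ptr]          = w[0];   // float lanes 0 and 1
     MEM [ptr + stride] = w[1];   // float lanes 2 and 3

   i.e. one vec_extract and one scalar-sized store per group instead of
   two element extracts and two stores.  */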
8497 unsigned lsize
8498 = group_size * GET_MODE_BITSIZE (elmode);
8499 unsigned int lnunits = const_nunits / group_size;
8500 /* If we can't construct such a vector fall back to
8501 element extracts from the original vector type and
8502 element size stores. */
8503 if (int_mode_for_size (lsize, 0).exists (&elmode)
8504 && VECTOR_MODE_P (TYPE_MODE (vectype))
8505 && related_vector_mode (TYPE_MODE (vectype), elmode,
8506 lnunits).exists (&vmode)
8507 && (convert_optab_handler (vec_extract_optab,
8508 vmode, elmode)
8509 != CODE_FOR_nothing))
8511 nstores = lnunits;
8512 lnel = group_size;
8513 ltype = build_nonstandard_integer_type (lsize, 1);
8514 lvectype = build_vector_type (ltype, nstores);
8516 /* Else fall back to vector extraction anyway.
8517 Fewer stores are more important than avoiding spilling
8518 of the vector we extract from. Compared to the
8519 construction case in vectorizable_load no store-forwarding
8520 issue exists here for reasonable archs. */
8523 else if (group_size >= const_nunits
8524 && group_size % const_nunits == 0)
8526 int mis_align = dr_misalignment (first_dr_info, vectype);
8527 dr_alignment_support dr_align
8528 = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
8529 mis_align);
8530 if (dr_align == dr_aligned
8531 || dr_align == dr_unaligned_supported)
8533 nstores = 1;
8534 lnel = const_nunits;
8535 ltype = vectype;
8536 lvectype = vectype;
8537 alignment_support_scheme = dr_align;
8538 misalignment = mis_align;
8541 ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
8542 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8545 if (!costing_p)
8547 ivstep = stride_step;
8548 ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
8549 build_int_cst (TREE_TYPE (ivstep), vf));
8551 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
8553 stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
8554 ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
8555 create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi,
8556 insert_after, &offvar, NULL);
8557 incr = gsi_stmt (incr_gsi);
8559 stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
8562 alias_off = build_int_cst (ref_type, 0);
8563 stmt_vec_info next_stmt_info = first_stmt_info;
8564 auto_vec<tree> vec_oprnds;
8565 /* When costing adjacent vector stores, we'd like to cost them once
8566 with their total number instead of costing each one individually. */
8567 unsigned int n_adjacent_stores = 0;
8568 for (g = 0; g < group_size; g++)
8570 running_off = offvar;
8571 if (!costing_p)
8573 if (g)
8575 tree size = TYPE_SIZE_UNIT (ltype);
8576 tree pos
8577 = fold_build2 (MULT_EXPR, sizetype, size_int (g), size);
8578 tree newoff = copy_ssa_name (running_off, NULL);
8579 incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
8580 running_off, pos);
8581 vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
8582 running_off = newoff;
8585 if (!slp)
8586 op = vect_get_store_rhs (next_stmt_info);
8587 if (!costing_p)
8588 vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op,
8589 &vec_oprnds);
8590 else
8591 update_prologue_cost (&prologue_cost, op);
8592 unsigned int group_el = 0;
8593 unsigned HOST_WIDE_INT
8594 elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
8595 for (j = 0; j < ncopies; j++)
8597 if (!costing_p)
8599 vec_oprnd = vec_oprnds[j];
8600 /* Pun the vector to extract from if necessary. */
8601 if (lvectype != vectype)
8603 tree tem = make_ssa_name (lvectype);
8604 tree cvt
8605 = build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd);
8606 gimple *pun = gimple_build_assign (tem, cvt);
8607 vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
8608 vec_oprnd = tem;
8611 for (i = 0; i < nstores; i++)
8613 if (costing_p)
8615 /* We only need vector extraction when there is more
8616 than one store. */
8617 if (nstores > 1)
8618 inside_cost
8619 += record_stmt_cost (cost_vec, 1, vec_to_scalar,
8620 stmt_info, 0, vect_body);
8621 /* Treat a single-lane vector type store as a scalar
8622 store to avoid ICEs like PR 110776. */
8623 if (VECTOR_TYPE_P (ltype)
8624 && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
8625 n_adjacent_stores++;
8626 else
8627 inside_cost
8628 += record_stmt_cost (cost_vec, 1, scalar_store,
8629 stmt_info, 0, vect_body);
8630 continue;
8632 tree newref, newoff;
8633 gimple *incr, *assign;
8634 tree size = TYPE_SIZE (ltype);
8635 /* Extract the i'th component. */
8636 tree pos = fold_build2 (MULT_EXPR, bitsizetype,
8637 bitsize_int (i), size);
8638 tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
8639 size, pos);
8641 elem = force_gimple_operand_gsi (gsi, elem, true,
8642 NULL_TREE, true,
8643 GSI_SAME_STMT);
8645 tree this_off = build_int_cst (TREE_TYPE (alias_off),
8646 group_el * elsz);
8647 newref = build2 (MEM_REF, ltype,
8648 running_off, this_off);
8649 vect_copy_ref_info (newref, DR_REF (first_dr_info->dr));
8651 /* And store it to *running_off. */
8652 assign = gimple_build_assign (newref, elem);
8653 vect_finish_stmt_generation (vinfo, stmt_info, assign, gsi);
8655 group_el += lnel;
8656 if (! slp
8657 || group_el == group_size)
8659 newoff = copy_ssa_name (running_off, NULL);
8660 incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
8661 running_off, stride_step);
8662 vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
8664 running_off = newoff;
8665 group_el = 0;
8667 if (g == group_size - 1
8668 && !slp)
8670 if (j == 0 && i == 0)
8671 *vec_stmt = assign;
8672 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (assign);
8676 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
8677 vec_oprnds.truncate(0);
8678 if (slp)
8679 break;
8682 if (costing_p)
8684 if (n_adjacent_stores > 0)
8685 vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
8686 alignment_support_scheme, misalignment,
8687 &inside_cost, cost_vec);
8688 if (dump_enabled_p ())
8689 dump_printf_loc (MSG_NOTE, vect_location,
8690 "vect_model_store_cost: inside_cost = %d, "
8691 "prologue_cost = %d .\n",
8692 inside_cost, prologue_cost);
8695 return true;
8698 gcc_assert (alignment_support_scheme);
8699 vec_loop_masks *loop_masks
8700 = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8701 ? &LOOP_VINFO_MASKS (loop_vinfo)
8702 : NULL);
8703 vec_loop_lens *loop_lens
8704 = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
8705 ? &LOOP_VINFO_LENS (loop_vinfo)
8706 : NULL);
8708 /* Both vect_transform_stmt and vect_analyze_stmt reach this point, but
8709 they differ: we cannot have both lens and masks enabled during the
8710 transform, although both may be set during analysis.
8711 We should not use the length-based approach if fully masked. */
8712 if (cost_vec == NULL)
8713 /* The cost_vec is NULL during the transform. */
8714 gcc_assert ((!loop_lens || !loop_masks));
8716 /* Targets with store-lane instructions must not require explicit
8717 realignment. vect_supportable_dr_alignment always returns either
8718 dr_aligned or dr_unaligned_supported for masked operations. */
8719 gcc_assert ((memory_access_type != VMAT_LOAD_STORE_LANES
8720 && !mask
8721 && !loop_masks)
8722 || alignment_support_scheme == dr_aligned
8723 || alignment_support_scheme == dr_unaligned_supported);
8725 tree offset = NULL_TREE;
8726 if (!known_eq (poffset, 0))
8727 offset = size_int (poffset);
8729 tree bump;
8730 tree vec_offset = NULL_TREE;
8731 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
8733 aggr_type = NULL_TREE;
8734 bump = NULL_TREE;
8736 else if (memory_access_type == VMAT_GATHER_SCATTER)
8738 aggr_type = elem_type;
8739 if (!costing_p)
8740 vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
8741 &bump, &vec_offset, loop_lens);
8743 else
8745 if (memory_access_type == VMAT_LOAD_STORE_LANES)
8746 aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
8747 else
8748 aggr_type = vectype;
8749 bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
8750 memory_access_type, loop_lens);
8753 if (mask && !costing_p)
8754 LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
8756 /* In case the vectorization factor (VF) is bigger than the number
8757 of elements that we can fit in a vectype (nunits), we have to generate
8758 more than one vector stmt, i.e. we need to "unroll" the
8759 vector stmt by a factor of VF/nunits. */
8761 /* In case of interleaving (non-unit grouped access):
8763 S1: &base + 2 = x2
8764 S2: &base = x0
8765 S3: &base + 1 = x1
8766 S4: &base + 3 = x3
8768 We create vectorized stores starting from base address (the access of the
8769 first stmt in the chain (S2 in the above example), when the last store stmt
8770 of the chain (S4) is reached:
8772 VS1: &base = vx2
8773 VS2: &base + vec_size*1 = vx0
8774 VS3: &base + vec_size*2 = vx1
8775 VS4: &base + vec_size*3 = vx3
8777 Then permutation statements are generated:
8779 VS5: vx5 = VEC_PERM_EXPR < vx0, vx3, {0, 8, 1, 9, 2, 10, 3, 11} >
8780 VS6: vx6 = VEC_PERM_EXPR < vx0, vx3, {4, 12, 5, 13, 6, 14, 7, 15} >
8783 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
8784 (the order of the data-refs in the output of vect_permute_store_chain
8785 corresponds to the order of scalar stmts in the interleaving chain - see
8786 the documentation of vect_permute_store_chain()).
8788 In case of both multiple types and interleaving, above vector stores and
8789 permutation stmts are created for every copy. The result vector stmts are
8790 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
8791 STMT_VINFO_RELATED_STMT for the next copies.
8794 auto_vec<tree> dr_chain (group_size);
8795 auto_vec<tree> vec_masks;
8796 tree vec_mask = NULL;
8797 auto_delete_vec<auto_vec<tree>> gvec_oprnds (group_size);
8798 for (i = 0; i < group_size; i++)
8799 gvec_oprnds.quick_push (new auto_vec<tree> ());
8801 if (memory_access_type == VMAT_LOAD_STORE_LANES)
8803 gcc_assert (!slp && grouped_store);
8804 unsigned inside_cost = 0, prologue_cost = 0;
8805 /* When costing some adjacent vector stores, we'd like to cost them
8806 once with their total number instead of costing each one by one. */
8807 unsigned int n_adjacent_stores = 0;
8808 for (j = 0; j < ncopies; j++)
8810 gimple *new_stmt;
8811 if (j == 0)
8813 /* For interleaved stores we collect vectorized defs for all
8814 the stores in the group in DR_CHAIN. DR_CHAIN is then used
8815 as an input to vect_permute_store_chain(). */
8816 stmt_vec_info next_stmt_info = first_stmt_info;
8817 for (i = 0; i < group_size; i++)
8819 /* Since gaps are not supported for interleaved stores,
8820 DR_GROUP_SIZE is the exact number of stmts in the
8821 chain. Therefore, NEXT_STMT_INFO can't be NULL_TREE. */
8822 op = vect_get_store_rhs (next_stmt_info);
8823 if (costing_p)
8824 update_prologue_cost (&prologue_cost, op);
8825 else
8827 vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
8828 ncopies, op,
8829 gvec_oprnds[i]);
8830 vec_oprnd = (*gvec_oprnds[i])[0];
8831 dr_chain.quick_push (vec_oprnd);
8833 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
8836 if (!costing_p)
8838 if (mask)
8840 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
8841 mask, &vec_masks,
8842 mask_vectype);
8843 vec_mask = vec_masks[0];
8846 /* We should have caught mismatched types earlier. */
8847 gcc_assert (
8848 useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
8849 dataref_ptr
8850 = vect_create_data_ref_ptr (vinfo, first_stmt_info,
8851 aggr_type, NULL, offset, &dummy,
8852 gsi, &ptr_incr, false, bump);
8855 else if (!costing_p)
8857 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
8858 /* DR_CHAIN is then used as an input to
8859 vect_permute_store_chain(). */
8860 for (i = 0; i < group_size; i++)
8862 vec_oprnd = (*gvec_oprnds[i])[j];
8863 dr_chain[i] = vec_oprnd;
8865 if (mask)
8866 vec_mask = vec_masks[j];
8867 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
8868 stmt_info, bump);
8871 if (costing_p)
8873 n_adjacent_stores += vec_num;
8874 continue;
8877 /* Get an array into which we can store the individual vectors. */
8878 tree vec_array = create_vector_array (vectype, vec_num);
8880 /* Invalidate the current contents of VEC_ARRAY. This should
8881 become an RTL clobber too, which prevents the vector registers
8882 from being upward-exposed. */
8883 vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
8885 /* Store the individual vectors into the array. */
8886 for (i = 0; i < vec_num; i++)
8888 vec_oprnd = dr_chain[i];
8889 write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
8893 tree final_mask = NULL;
8894 tree final_len = NULL;
8895 tree bias = NULL;
8896 if (loop_masks)
8897 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
8898 ncopies, vectype, j);
8899 if (vec_mask)
8900 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
8901 vec_mask, gsi);
8903 if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
8905 if (loop_lens)
8906 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
8907 ncopies, vectype, j, 1);
8908 else
8909 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
8910 signed char biasval
8911 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
8912 bias = build_int_cst (intQI_type_node, biasval);
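/* The partial load/store bias supplied by the target is either 0 or -1
   (see the len_load/len_store optab documentation); it is passed through
   to the internal function so that expansion can adjust the length
   accordingly.  */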
8913 if (!final_mask)
8915 mask_vectype = truth_type_for (vectype);
8916 final_mask = build_minus_one_cst (mask_vectype);
8920 gcall *call;
8921 if (final_len && final_mask)
8923 /* Emit:
8924 MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
8925 LEN, BIAS, VEC_ARRAY). */
8926 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
8927 tree alias_ptr = build_int_cst (ref_type, align);
8928 call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
8929 dataref_ptr, alias_ptr,
8930 final_mask, final_len, bias,
8931 vec_array);
8933 else if (final_mask)
8935 /* Emit:
8936 MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
8937 VEC_ARRAY). */
8938 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
8939 tree alias_ptr = build_int_cst (ref_type, align);
8940 call = gimple_build_call_internal (IFN_MASK_STORE_LANES, 4,
8941 dataref_ptr, alias_ptr,
8942 final_mask, vec_array);
8944 else
8946 /* Emit:
8947 MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). */
8948 data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
8949 call = gimple_build_call_internal (IFN_STORE_LANES, 1, vec_array);
8950 gimple_call_set_lhs (call, data_ref);
8952 gimple_call_set_nothrow (call, true);
8953 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
8954 new_stmt = call;
8956 /* Record that VEC_ARRAY is now dead. */
8957 vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
8958 if (j == 0)
8959 *vec_stmt = new_stmt;
8960 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8963 if (costing_p)
8965 if (n_adjacent_stores > 0)
8966 vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
8967 alignment_support_scheme, misalignment,
8968 &inside_cost, cost_vec);
8969 if (dump_enabled_p ())
8970 dump_printf_loc (MSG_NOTE, vect_location,
8971 "vect_model_store_cost: inside_cost = %d, "
8972 "prologue_cost = %d .\n",
8973 inside_cost, prologue_cost);
8976 return true;
8979 if (memory_access_type == VMAT_GATHER_SCATTER)
8981 gcc_assert (!grouped_store);
8982 auto_vec<tree> vec_offsets;
8983 unsigned int inside_cost = 0, prologue_cost = 0;
8984 for (j = 0; j < ncopies; j++)
8986 gimple *new_stmt;
8987 if (j == 0)
8989 if (costing_p && vls_type == VLS_STORE_INVARIANT)
8990 prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
8991 stmt_info, 0, vect_prologue);
8992 else if (!costing_p)
8994 /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
8995 DR_CHAIN is of size 1. */
8996 gcc_assert (group_size == 1);
8997 if (slp_node)
8998 vect_get_slp_defs (op_node, gvec_oprnds[0]);
8999 else
9000 vect_get_vec_defs_for_operand (vinfo, first_stmt_info,
9001 ncopies, op, gvec_oprnds[0]);
9002 if (mask)
9004 if (slp_node)
9005 vect_get_slp_defs (mask_node, &vec_masks);
9006 else
9007 vect_get_vec_defs_for_operand (vinfo, stmt_info,
9008 ncopies,
9009 mask, &vec_masks,
9010 mask_vectype);
9013 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
9014 vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
9015 slp_node, &gs_info,
9016 &dataref_ptr, &vec_offsets);
9017 else
9018 dataref_ptr
9019 = vect_create_data_ref_ptr (vinfo, first_stmt_info,
9020 aggr_type, NULL, offset,
9021 &dummy, gsi, &ptr_incr, false,
9022 bump);
9025 else if (!costing_p)
9027 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
9028 if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
9029 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
9030 gsi, stmt_info, bump);
9033 new_stmt = NULL;
9034 for (i = 0; i < vec_num; ++i)
9036 if (!costing_p)
9038 vec_oprnd = (*gvec_oprnds[0])[vec_num * j + i];
9039 if (mask)
9040 vec_mask = vec_masks[vec_num * j + i];
9041 /* We should have caught mismatched types earlier. */
9042 gcc_assert (useless_type_conversion_p (vectype,
9043 TREE_TYPE (vec_oprnd)));
9045 unsigned HOST_WIDE_INT align;
9046 tree final_mask = NULL_TREE;
9047 tree final_len = NULL_TREE;
9048 tree bias = NULL_TREE;
9049 if (!costing_p)
9051 if (loop_masks)
9052 final_mask = vect_get_loop_mask (loop_vinfo, gsi,
9053 loop_masks, ncopies,
9054 vectype, j);
9055 if (vec_mask)
9056 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
9057 final_mask, vec_mask, gsi);
9060 if (gs_info.ifn != IFN_LAST)
9062 if (costing_p)
9064 unsigned int cnunits = vect_nunits_for_cost (vectype);
9065 inside_cost
9066 += record_stmt_cost (cost_vec, cnunits, scalar_store,
9067 stmt_info, 0, vect_body);
9068 continue;
9071 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
9072 vec_offset = vec_offsets[vec_num * j + i];
9073 tree scale = size_int (gs_info.scale);
9075 if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
9077 if (loop_lens)
9078 final_len = vect_get_loop_len (loop_vinfo, gsi,
9079 loop_lens, ncopies,
9080 vectype, j, 1);
9081 else
9082 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
9083 signed char biasval
9084 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9085 bias = build_int_cst (intQI_type_node, biasval);
9086 if (!final_mask)
9088 mask_vectype = truth_type_for (vectype);
9089 final_mask = build_minus_one_cst (mask_vectype);
9093 gcall *call;
9094 if (final_len && final_mask)
9095 call = gimple_build_call_internal
9096 (IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
9097 vec_offset, scale, vec_oprnd, final_mask,
9098 final_len, bias);
9099 else if (final_mask)
9100 call = gimple_build_call_internal
9101 (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
9102 vec_offset, scale, vec_oprnd, final_mask);
9103 else
9104 call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
9105 dataref_ptr, vec_offset,
9106 scale, vec_oprnd);
9107 gimple_call_set_nothrow (call, true);
9108 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9109 new_stmt = call;
9111 else if (gs_info.decl)
9113 /* The builtin decls path for scatter is legacy, x86 only. */
9114 gcc_assert (nunits.is_constant ()
9115 && (!final_mask
9116 || SCALAR_INT_MODE_P
9117 (TYPE_MODE (TREE_TYPE (final_mask)))));
9118 if (costing_p)
9120 unsigned int cnunits = vect_nunits_for_cost (vectype);
9121 inside_cost
9122 += record_stmt_cost (cost_vec, cnunits, scalar_store,
9123 stmt_info, 0, vect_body);
9124 continue;
9126 poly_uint64 offset_nunits
9127 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
9128 if (known_eq (nunits, offset_nunits))
9130 new_stmt = vect_build_one_scatter_store_call
9131 (vinfo, stmt_info, gsi, &gs_info,
9132 dataref_ptr, vec_offsets[vec_num * j + i],
9133 vec_oprnd, final_mask);
9134 vect_finish_stmt_generation (vinfo, stmt_info,
9135 new_stmt, gsi);
9137 else if (known_eq (nunits, offset_nunits * 2))
9139 /* We have an offset vector with half the number of
9140 lanes but the builtins will store full vectype
9141 data from the lower lanes. */
9142 new_stmt = vect_build_one_scatter_store_call
9143 (vinfo, stmt_info, gsi, &gs_info,
9144 dataref_ptr,
9145 vec_offsets[2 * vec_num * j + 2 * i],
9146 vec_oprnd, final_mask);
9147 vect_finish_stmt_generation (vinfo, stmt_info,
9148 new_stmt, gsi);
9149 int count = nunits.to_constant ();
9150 vec_perm_builder sel (count, count, 1);
9151 sel.quick_grow (count);
9152 for (int i = 0; i < count; ++i)
9153 sel[i] = i | (count / 2);
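/* E.g. for count == 4 this builds the selector { 2, 3, 2, 3 }; the
   VEC_PERM_EXPR below moves the high half of VEC_OPRND into the low
   lanes so the second builtin call can store those elements.  */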
9154 vec_perm_indices indices (sel, 2, count);
9155 tree perm_mask
9156 = vect_gen_perm_mask_checked (vectype, indices);
9157 new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
9158 vec_oprnd, vec_oprnd,
9159 perm_mask);
9160 vec_oprnd = make_ssa_name (vectype);
9161 gimple_set_lhs (new_stmt, vec_oprnd);
9162 vect_finish_stmt_generation (vinfo, stmt_info,
9163 new_stmt, gsi);
9164 if (final_mask)
9166 new_stmt = gimple_build_assign (NULL_TREE,
9167 VEC_UNPACK_HI_EXPR,
9168 final_mask);
9169 final_mask = make_ssa_name
9170 (truth_type_for (gs_info.offset_vectype));
9171 gimple_set_lhs (new_stmt, final_mask);
9172 vect_finish_stmt_generation (vinfo, stmt_info,
9173 new_stmt, gsi);
9175 new_stmt = vect_build_one_scatter_store_call
9176 (vinfo, stmt_info, gsi, &gs_info,
9177 dataref_ptr,
9178 vec_offsets[2 * vec_num * j + 2 * i + 1],
9179 vec_oprnd, final_mask);
9180 vect_finish_stmt_generation (vinfo, stmt_info,
9181 new_stmt, gsi);
9183 else if (known_eq (nunits * 2, offset_nunits))
9185 /* We have an offset vector with double the number of
9186 lanes. Select the low/high part accordingly. */
9187 vec_offset = vec_offsets[(vec_num * j + i) / 2];
9188 if ((vec_num * j + i) & 1)
9190 int count = offset_nunits.to_constant ();
9191 vec_perm_builder sel (count, count, 1);
9192 sel.quick_grow (count);
9193 for (int i = 0; i < count; ++i)
9194 sel[i] = i | (count / 2);
9195 vec_perm_indices indices (sel, 2, count);
9196 tree perm_mask = vect_gen_perm_mask_checked
9197 (TREE_TYPE (vec_offset), indices);
9198 new_stmt = gimple_build_assign (NULL_TREE,
9199 VEC_PERM_EXPR,
9200 vec_offset,
9201 vec_offset,
9202 perm_mask);
9203 vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
9204 gimple_set_lhs (new_stmt, vec_offset);
9205 vect_finish_stmt_generation (vinfo, stmt_info,
9206 new_stmt, gsi);
9208 new_stmt = vect_build_one_scatter_store_call
9209 (vinfo, stmt_info, gsi, &gs_info,
9210 dataref_ptr, vec_offset,
9211 vec_oprnd, final_mask);
9212 vect_finish_stmt_generation (vinfo, stmt_info,
9213 new_stmt, gsi);
9215 else
9216 gcc_unreachable ();
9218 else
9220 /* Emulated scatter. */
9221 gcc_assert (!final_mask);
9222 if (costing_p)
9224 unsigned int cnunits = vect_nunits_for_cost (vectype);
9225 /* For emulated scatter N offset vector element extracts
9226 (we assume the scalar scaling and ptr + offset add are
9227 consumed by the store). */
9228 inside_cost
9229 += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
9230 stmt_info, 0, vect_body);
9231 /* N scalar stores plus extracting the elements. */
9232 inside_cost
9233 += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
9234 stmt_info, 0, vect_body);
9235 inside_cost
9236 += record_stmt_cost (cost_vec, cnunits, scalar_store,
9237 stmt_info, 0, vect_body);
9238 continue;
9241 unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
9242 unsigned HOST_WIDE_INT const_offset_nunits
9243 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
9244 vec<constructor_elt, va_gc> *ctor_elts;
9245 vec_alloc (ctor_elts, const_nunits);
9246 gimple_seq stmts = NULL;
9247 tree elt_type = TREE_TYPE (vectype);
9248 unsigned HOST_WIDE_INT elt_size
9249 = tree_to_uhwi (TYPE_SIZE (elt_type));
9250 /* We support offset vectors with more elements
9251 than the data vector for now. */
9252 unsigned HOST_WIDE_INT factor
9253 = const_offset_nunits / const_nunits;
9254 vec_offset = vec_offsets[(vec_num * j + i) / factor];
9255 unsigned elt_offset
9256 = ((vec_num * j + i) % factor) * const_nunits;
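/* E.g. with a 4-lane data vector and an 8-lane offset vector FACTOR is 2:
   two consecutive data vectors share one offset vector, using its low and
   high halves (ELT_OFFSET 0 and 4) respectively.  */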
9257 tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
9258 tree scale = size_int (gs_info.scale);
9259 align = get_object_alignment (DR_REF (first_dr_info->dr));
9260 tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
9261 for (unsigned k = 0; k < const_nunits; ++k)
9263 /* Compute the offsetted pointer. */
9264 tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
9265 bitsize_int (k + elt_offset));
9266 tree idx
9267 = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
9268 vec_offset, TYPE_SIZE (idx_type), boff);
9269 idx = gimple_convert (&stmts, sizetype, idx);
9270 idx = gimple_build (&stmts, MULT_EXPR, sizetype,
9271 idx, scale);
9272 tree ptr
9273 = gimple_build (&stmts, PLUS_EXPR,
9274 TREE_TYPE (dataref_ptr),
9275 dataref_ptr, idx);
9276 ptr = gimple_convert (&stmts, ptr_type_node, ptr);
9277 /* Extract the element to be stored. */
9278 tree elt
9279 = gimple_build (&stmts, BIT_FIELD_REF,
9280 TREE_TYPE (vectype),
9281 vec_oprnd, TYPE_SIZE (elt_type),
9282 bitsize_int (k * elt_size));
9283 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
9284 stmts = NULL;
9285 tree ref
9286 = build2 (MEM_REF, ltype, ptr,
9287 build_int_cst (ref_type, 0));
9288 new_stmt = gimple_build_assign (ref, elt);
9289 vect_finish_stmt_generation (vinfo, stmt_info,
9290 new_stmt, gsi);
9292 if (slp)
9293 slp_node->push_vec_def (new_stmt);
9296 if (!slp && !costing_p)
9297 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9300 if (!slp && !costing_p)
9301 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9303 if (costing_p && dump_enabled_p ())
9304 dump_printf_loc (MSG_NOTE, vect_location,
9305 "vect_model_store_cost: inside_cost = %d, "
9306 "prologue_cost = %d .\n",
9307 inside_cost, prologue_cost);
9309 return true;
9312 gcc_assert (memory_access_type == VMAT_CONTIGUOUS
9313 || memory_access_type == VMAT_CONTIGUOUS_DOWN
9314 || memory_access_type == VMAT_CONTIGUOUS_PERMUTE
9315 || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
9317 unsigned inside_cost = 0, prologue_cost = 0;
9318 /* When costing some adjacent vector stores, we'd like to cost them
9319 once with their total number instead of costing each one by one. */
9320 unsigned int n_adjacent_stores = 0;
9321 auto_vec<tree> result_chain (group_size);
9322 auto_vec<tree, 1> vec_oprnds;
9323 for (j = 0; j < ncopies; j++)
9325 gimple *new_stmt;
9326 if (j == 0)
9328 if (slp && !costing_p)
9330 /* Get vectorized arguments for SLP_NODE. */
9331 vect_get_vec_defs (vinfo, stmt_info, slp_node, 1, op,
9332 &vec_oprnds, mask, &vec_masks);
9333 vec_oprnd = vec_oprnds[0];
9334 if (mask)
9335 vec_mask = vec_masks[0];
9337 else
9339 /* For interleaved stores we collect vectorized defs for all the
9340 stores in the group in DR_CHAIN. DR_CHAIN is then used as an
9341 input to vect_permute_store_chain().
9343 If the store is not grouped, DR_GROUP_SIZE is 1, and DR_CHAIN
9344 is of size 1. */
9345 stmt_vec_info next_stmt_info = first_stmt_info;
9346 for (i = 0; i < group_size; i++)
9348 /* Since gaps are not supported for interleaved stores,
9349 DR_GROUP_SIZE is the exact number of stmts in the chain.
9350 Therefore, NEXT_STMT_INFO can't be NULL_TREE. In case
9351 that there is no interleaving, DR_GROUP_SIZE is 1,
9352 and only one iteration of the loop will be executed. */
9353 op = vect_get_store_rhs (next_stmt_info);
9354 if (costing_p)
9355 update_prologue_cost (&prologue_cost, op);
9356 else
9358 vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
9359 ncopies, op,
9360 gvec_oprnds[i]);
9361 vec_oprnd = (*gvec_oprnds[i])[0];
9362 dr_chain.quick_push (vec_oprnd);
9364 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
9366 if (mask && !costing_p)
9368 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
9369 mask, &vec_masks,
9370 mask_vectype);
9371 vec_mask = vec_masks[0];
9375 /* We should have caught mismatched types earlier. */
9376 gcc_assert (costing_p
9377 || useless_type_conversion_p (vectype,
9378 TREE_TYPE (vec_oprnd)));
9379 bool simd_lane_access_p
9380 = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
9381 if (!costing_p
9382 && simd_lane_access_p
9383 && !loop_masks
9384 && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
9385 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
9386 && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info))
9387 && integer_zerop (DR_INIT (first_dr_info->dr))
9388 && alias_sets_conflict_p (get_alias_set (aggr_type),
9389 get_alias_set (TREE_TYPE (ref_type))))
9391 dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
9392 dataref_offset = build_int_cst (ref_type, 0);
9394 else if (!costing_p)
9395 dataref_ptr
9396 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
9397 simd_lane_access_p ? loop : NULL,
9398 offset, &dummy, gsi, &ptr_incr,
9399 simd_lane_access_p, bump);
9401 else if (!costing_p)
9403 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
9404 /* DR_CHAIN is then used as an input to vect_permute_store_chain().
9405 If the store is not grouped, DR_GROUP_SIZE is 1, and DR_CHAIN is
9406 of size 1. */
9407 for (i = 0; i < group_size; i++)
9409 vec_oprnd = (*gvec_oprnds[i])[j];
9410 dr_chain[i] = vec_oprnd;
9412 if (mask)
9413 vec_mask = vec_masks[j];
9414 if (dataref_offset)
9415 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
9416 else
9417 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
9418 stmt_info, bump);
9421 new_stmt = NULL;
9422 if (grouped_store)
9424 /* Permute. */
9425 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
9426 if (costing_p)
9428 int group_size = DR_GROUP_SIZE (first_stmt_info);
9429 int nstmts = ceil_log2 (group_size) * group_size;
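/* vect_permute_store_chain interleaves the GROUP_SIZE vectors in
   ceil_log2 (GROUP_SIZE) steps, each step emitting GROUP_SIZE
   VEC_PERM_EXPRs, which is what the estimate above accounts for.  */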
9430 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
9431 stmt_info, 0, vect_body);
9432 if (dump_enabled_p ())
9433 dump_printf_loc (MSG_NOTE, vect_location,
9434 "vect_model_store_cost: "
9435 "strided group_size = %d .\n",
9436 group_size);
9438 else
9439 vect_permute_store_chain (vinfo, dr_chain, group_size, stmt_info,
9440 gsi, &result_chain);
9443 stmt_vec_info next_stmt_info = first_stmt_info;
9444 for (i = 0; i < vec_num; i++)
9446 if (!costing_p)
9448 if (slp)
9449 vec_oprnd = vec_oprnds[i];
9450 else if (grouped_store)
9451 /* For grouped stores vectorized defs are interleaved in
9452 vect_permute_store_chain(). */
9453 vec_oprnd = result_chain[i];
9456 if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
9458 if (costing_p)
9459 inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
9460 stmt_info, 0, vect_body);
9461 else
9463 tree perm_mask = perm_mask_for_reverse (vectype);
9464 tree perm_dest = vect_create_destination_var (
9465 vect_get_store_rhs (stmt_info), vectype);
9466 tree new_temp = make_ssa_name (perm_dest);
9468 /* Generate the permute statement. */
9469 gimple *perm_stmt
9470 = gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
9471 vec_oprnd, perm_mask);
9472 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
9473 gsi);
9475 perm_stmt = SSA_NAME_DEF_STMT (new_temp);
9476 vec_oprnd = new_temp;
9480 if (costing_p)
9482 n_adjacent_stores++;
9484 if (!slp)
9486 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
9487 if (!next_stmt_info)
9488 break;
9491 continue;
9494 tree final_mask = NULL_TREE;
9495 tree final_len = NULL_TREE;
9496 tree bias = NULL_TREE;
9497 if (loop_masks)
9498 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
9499 vec_num * ncopies, vectype,
9500 vec_num * j + i);
9501 if (slp && vec_mask)
9502 vec_mask = vec_masks[i];
9503 if (vec_mask)
9504 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
9505 vec_mask, gsi);
9507 if (i > 0)
9508 /* Bump the vector pointer. */
9509 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
9510 stmt_info, bump);
9512 unsigned misalign;
9513 unsigned HOST_WIDE_INT align;
9514 align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
9515 if (alignment_support_scheme == dr_aligned)
9516 misalign = 0;
9517 else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
9519 align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
9520 misalign = 0;
9522 else
9523 misalign = misalignment;
9524 if (dataref_offset == NULL_TREE
9525 && TREE_CODE (dataref_ptr) == SSA_NAME)
9526 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
9527 misalign);
9528 align = least_bit_hwi (misalign | align);
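/* This keeps the largest power-of-two alignment actually guaranteed
   for the access, e.g. a target alignment of 16 with a known
   misalignment of 4 yields 4.  */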
9530 /* Compute the IFN to use when LOOP_LENS or FINAL_MASK is valid. */
9531 machine_mode vmode = TYPE_MODE (vectype);
9532 machine_mode new_vmode = vmode;
9533 internal_fn partial_ifn = IFN_LAST;
9534 if (loop_lens)
9536 opt_machine_mode new_ovmode
9537 = get_len_load_store_mode (vmode, false, &partial_ifn);
9538 new_vmode = new_ovmode.require ();
9539 unsigned factor
9540 = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
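/* If the target only provides the store-with-length in a byte-wise
   (VnQI) mode, the value is view-converted below and the length is
   measured in bytes, hence the loop length is scaled by the element
   size.  */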
9541 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
9542 vec_num * ncopies, vectype,
9543 vec_num * j + i, factor);
9545 else if (final_mask)
9547 if (!can_vec_mask_load_store_p (
9548 vmode, TYPE_MODE (TREE_TYPE (final_mask)), false,
9549 &partial_ifn))
9550 gcc_unreachable ();
9553 if (partial_ifn == IFN_MASK_LEN_STORE)
9555 if (!final_len)
9557 /* Pass VF value to 'len' argument of
9558 MASK_LEN_STORE if LOOP_LENS is invalid. */
9559 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
9561 if (!final_mask)
9563 /* Pass all ones value to 'mask' argument of
9564 MASK_LEN_STORE if final_mask is invalid. */
9565 mask_vectype = truth_type_for (vectype);
9566 final_mask = build_minus_one_cst (mask_vectype);
9569 if (final_len)
9571 signed char biasval
9572 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9574 bias = build_int_cst (intQI_type_node, biasval);
9577 /* Arguments are ready. Create the new vector stmt. */
9578 if (final_len)
9580 gcall *call;
9581 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
9582 /* Need conversion if it's wrapped with VnQI. */
9583 if (vmode != new_vmode)
9585 tree new_vtype
9586 = build_vector_type_for_mode (unsigned_intQI_type_node,
9587 new_vmode);
9588 tree var = vect_get_new_ssa_name (new_vtype, vect_simple_var);
9589 vec_oprnd = build1 (VIEW_CONVERT_EXPR, new_vtype, vec_oprnd);
9590 gassign *new_stmt
9591 = gimple_build_assign (var, VIEW_CONVERT_EXPR, vec_oprnd);
9592 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9593 vec_oprnd = var;
9596 if (partial_ifn == IFN_MASK_LEN_STORE)
9597 call = gimple_build_call_internal (IFN_MASK_LEN_STORE, 6,
9598 dataref_ptr, ptr, final_mask,
9599 final_len, bias, vec_oprnd);
9600 else
9601 call = gimple_build_call_internal (IFN_LEN_STORE, 5,
9602 dataref_ptr, ptr, final_len,
9603 bias, vec_oprnd);
9604 gimple_call_set_nothrow (call, true);
9605 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9606 new_stmt = call;
9608 else if (final_mask)
9610 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
9611 gcall *call
9612 = gimple_build_call_internal (IFN_MASK_STORE, 4, dataref_ptr,
9613 ptr, final_mask, vec_oprnd);
9614 gimple_call_set_nothrow (call, true);
9615 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9616 new_stmt = call;
9618 else
9620 data_ref
9621 = fold_build2 (MEM_REF, vectype, dataref_ptr,
9622 dataref_offset ? dataref_offset
9623 : build_int_cst (ref_type, 0));
9624 if (alignment_support_scheme == dr_aligned)
9626 else
9627 TREE_TYPE (data_ref)
9628 = build_aligned_type (TREE_TYPE (data_ref),
9629 align * BITS_PER_UNIT);
9630 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
9631 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
9632 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9635 if (slp)
9636 continue;
9638 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
9639 if (!next_stmt_info)
9640 break;
9642 if (!slp && !costing_p)
9644 if (j == 0)
9645 *vec_stmt = new_stmt;
9646 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9650 if (costing_p)
9652 if (n_adjacent_stores > 0)
9653 vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
9654 alignment_support_scheme, misalignment,
9655 &inside_cost, cost_vec);
9657 /* When vectorizing a store into the function result, assign
9658 a penalty if the function returns in a multi-register location.
9659 In this case we assume we'll end up having to spill the
9660 vector result and do piecewise loads as a conservative estimate. */
9661 tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
9662 if (base
9663 && (TREE_CODE (base) == RESULT_DECL
9664 || (DECL_P (base) && cfun_returns (base)))
9665 && !aggregate_value_p (base, cfun->decl))
9667 rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
9668 /* ??? Handle PARALLEL in some way. */
9669 if (REG_P (reg))
9671 int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
9672 /* Assume that a single reg-reg move is possible and cheap,
9673 do not account for vector to gp register move cost. */
9674 if (nregs > 1)
9676 /* Spill. */
9677 prologue_cost
9678 += record_stmt_cost (cost_vec, ncopies, vector_store,
9679 stmt_info, 0, vect_epilogue);
9680 /* Loads. */
9681 prologue_cost
9682 += record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
9683 stmt_info, 0, vect_epilogue);
9687 if (dump_enabled_p ())
9688 dump_printf_loc (MSG_NOTE, vect_location,
9689 "vect_model_store_cost: inside_cost = %d, "
9690 "prologue_cost = %d .\n",
9691 inside_cost, prologue_cost);
9694 return true;
9697 /* Given a vector type VECTYPE, turns permutation SEL into the equivalent
9698 VECTOR_CST mask. No checks are made that the target platform supports the
9699 mask, so callers may wish to test can_vec_perm_const_p separately, or use
9700 vect_gen_perm_mask_checked. */
9702 tree
9703 vect_gen_perm_mask_any (tree vectype, const vec_perm_indices &sel)
9705 tree mask_type;
9707 poly_uint64 nunits = sel.length ();
9708 gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype)));
9710 mask_type = build_vector_type (ssizetype, nunits);
9711 return vec_perm_indices_to_tree (mask_type, sel);
9714 /* Checked version of vect_gen_perm_mask_any. Asserts can_vec_perm_const_p,
9715 i.e. that the target supports the pattern _for arbitrary input vectors_. */
9717 tree
9718 vect_gen_perm_mask_checked (tree vectype, const vec_perm_indices &sel)
9720 machine_mode vmode = TYPE_MODE (vectype);
9721 gcc_assert (can_vec_perm_const_p (vmode, vmode, sel));
9722 return vect_gen_perm_mask_any (vectype, sel);
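/* A typical caller (as in the scatter-store path above) first builds the
   selector with vec_perm_builder; a minimal sketch for moving the high
   half of a COUNT-lane vector into the low lanes:

     vec_perm_builder sel (count, count, 1);
     sel.quick_grow (count);
     for (int i = 0; i < count; ++i)
       sel[i] = i | (count / 2);
     vec_perm_indices indices (sel, 2, count);
     tree perm_mask = vect_gen_perm_mask_checked (vectype, indices);  */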
9725 /* Given vector variables X and Y that were generated for the scalar
9726 STMT_INFO, generate instructions to permute the vector elements of X and Y
9727 using permutation mask MASK_VEC, insert them at *GSI and return the
9728 permuted vector variable. */
9730 static tree
9731 permute_vec_elements (vec_info *vinfo,
9732 tree x, tree y, tree mask_vec, stmt_vec_info stmt_info,
9733 gimple_stmt_iterator *gsi)
9735 tree vectype = TREE_TYPE (x);
9736 tree perm_dest, data_ref;
9737 gimple *perm_stmt;
9739 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
9740 if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME)
9741 perm_dest = vect_create_destination_var (scalar_dest, vectype);
9742 else
9743 perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
9744 data_ref = make_ssa_name (perm_dest);
9746 /* Generate the permute statement. */
9747 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, x, y, mask_vec);
9748 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
9750 return data_ref;
9753 /* Hoist the definitions of all SSA uses on STMT_INFO out of the loop LOOP,
9754 inserting them on the loop's preheader edge. Returns true if we
9755 were successful in doing so (and thus STMT_INFO can then be moved),
9756 otherwise returns false. HOIST_P indicates whether we actually want to
9757 hoist the definitions; it is false when we are only costing. */
9759 static bool
9760 hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop, bool hoist_p)
9762 ssa_op_iter i;
9763 tree op;
9764 bool any = false;
9766 FOR_EACH_SSA_TREE_OPERAND (op, stmt_info->stmt, i, SSA_OP_USE)
9768 gimple *def_stmt = SSA_NAME_DEF_STMT (op);
9769 if (!gimple_nop_p (def_stmt)
9770 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
9772 /* Make sure we don't need to recurse. While we could do
9773 so in simple cases, when there are more complex use webs
9774 we don't have an easy way to preserve stmt order to fulfil
9775 dependencies within them. */
9776 tree op2;
9777 ssa_op_iter i2;
9778 if (gimple_code (def_stmt) == GIMPLE_PHI)
9779 return false;
9780 FOR_EACH_SSA_TREE_OPERAND (op2, def_stmt, i2, SSA_OP_USE)
9782 gimple *def_stmt2 = SSA_NAME_DEF_STMT (op2);
9783 if (!gimple_nop_p (def_stmt2)
9784 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt2)))
9785 return false;
9787 any = true;
9791 if (!any)
9792 return true;
9794 if (!hoist_p)
9795 return true;
9797 FOR_EACH_SSA_TREE_OPERAND (op, stmt_info->stmt, i, SSA_OP_USE)
9799 gimple *def_stmt = SSA_NAME_DEF_STMT (op);
9800 if (!gimple_nop_p (def_stmt)
9801 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
9803 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
9804 gsi_remove (&gsi, false);
9805 gsi_insert_on_edge_immediate (loop_preheader_edge (loop), def_stmt);
9809 return true;
9812 /* vectorizable_load.
9814 Check if STMT_INFO reads a non-scalar data-ref (array/pointer/structure)
9815 that can be vectorized.
9816 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
9817 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
9818 Return true if STMT_INFO is vectorizable in this way. */
9820 static bool
9821 vectorizable_load (vec_info *vinfo,
9822 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
9823 gimple **vec_stmt, slp_tree slp_node,
9824 stmt_vector_for_cost *cost_vec)
9826 tree scalar_dest;
9827 tree vec_dest = NULL;
9828 tree data_ref = NULL;
9829 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9830 class loop *loop = NULL;
9831 class loop *containing_loop = gimple_bb (stmt_info->stmt)->loop_father;
9832 bool nested_in_vect_loop = false;
9833 tree elem_type;
9834 /* Avoid false positive uninitialized warning, see PR110652. */
9835 tree new_temp = NULL_TREE;
9836 machine_mode mode;
9837 tree dummy;
9838 tree dataref_ptr = NULL_TREE;
9839 tree dataref_offset = NULL_TREE;
9840 gimple *ptr_incr = NULL;
9841 int ncopies;
9842 int i, j;
9843 unsigned int group_size;
9844 poly_uint64 group_gap_adj;
9845 tree msq = NULL_TREE, lsq;
9846 tree realignment_token = NULL_TREE;
9847 gphi *phi = NULL;
9848 vec<tree> dr_chain = vNULL;
9849 bool grouped_load = false;
9850 stmt_vec_info first_stmt_info;
9851 stmt_vec_info first_stmt_info_for_drptr = NULL;
9852 bool compute_in_loop = false;
9853 class loop *at_loop;
9854 int vec_num;
9855 bool slp = (slp_node != NULL);
9856 bool slp_perm = false;
9857 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
9858 poly_uint64 vf;
9859 tree aggr_type;
9860 gather_scatter_info gs_info;
9861 tree ref_type;
9862 enum vect_def_type mask_dt = vect_unknown_def_type;
9864 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
9865 return false;
9867 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9868 && ! vec_stmt)
9869 return false;
9871 if (!STMT_VINFO_DATA_REF (stmt_info))
9872 return false;
9874 tree mask = NULL_TREE, mask_vectype = NULL_TREE;
9875 int mask_index = -1;
9876 slp_tree slp_op = NULL;
9877 if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
9879 scalar_dest = gimple_assign_lhs (assign);
9880 if (TREE_CODE (scalar_dest) != SSA_NAME)
9881 return false;
9883 tree_code code = gimple_assign_rhs_code (assign);
9884 if (code != ARRAY_REF
9885 && code != BIT_FIELD_REF
9886 && code != INDIRECT_REF
9887 && code != COMPONENT_REF
9888 && code != IMAGPART_EXPR
9889 && code != REALPART_EXPR
9890 && code != MEM_REF
9891 && TREE_CODE_CLASS (code) != tcc_declaration)
9892 return false;
9894 else
9896 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
9897 if (!call || !gimple_call_internal_p (call))
9898 return false;
9900 internal_fn ifn = gimple_call_internal_fn (call);
9901 if (!internal_load_fn_p (ifn))
9902 return false;
9904 scalar_dest = gimple_call_lhs (call);
9905 if (!scalar_dest)
9906 return false;
9908 mask_index = internal_fn_mask_index (ifn);
9909 if (mask_index >= 0 && slp_node)
9910 mask_index = vect_slp_child_index_for_operand
9911 (call, mask_index, STMT_VINFO_GATHER_SCATTER_P (stmt_info));
9912 if (mask_index >= 0
9913 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
9914 &mask, &slp_op, &mask_dt, &mask_vectype))
9915 return false;
9918 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9919 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9921 if (loop_vinfo)
9923 loop = LOOP_VINFO_LOOP (loop_vinfo);
9924 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
9925 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9927 else
9928 vf = 1;
9930 /* Multiple types in SLP are handled by creating the appropriate number of
9931 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
9932 case of SLP. */
9933 if (slp)
9934 ncopies = 1;
9935 else
9936 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9938 gcc_assert (ncopies >= 1);
9940 /* FORNOW. This restriction should be relaxed. */
9941 if (nested_in_vect_loop && ncopies > 1)
9943 if (dump_enabled_p ())
9944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9945 "multiple types in nested loop.\n");
9946 return false;
9949 /* Invalidate assumptions made by dependence analysis when vectorization
9950 on the unrolled body effectively re-orders stmts. */
9951 if (ncopies > 1
9952 && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
9953 && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9954 STMT_VINFO_MIN_NEG_DIST (stmt_info)))
9956 if (dump_enabled_p ())
9957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9958 "cannot perform implicit CSE when unrolling "
9959 "with negative dependence distance\n");
9960 return false;
9963 elem_type = TREE_TYPE (vectype);
9964 mode = TYPE_MODE (vectype);
9966 /* FORNOW. In some cases we can vectorize even if the data type is not
9967 supported (e.g. data copies). */
9968 if (optab_handler (mov_optab, mode) == CODE_FOR_nothing)
9970 if (dump_enabled_p ())
9971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9972 "Aligned load, but unsupported type.\n");
9973 return false;
9976 /* Check if the load is a part of an interleaving chain. */
9977 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
9979 grouped_load = true;
9980 /* FORNOW */
9981 gcc_assert (!nested_in_vect_loop);
9982 gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
9984 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9985 group_size = DR_GROUP_SIZE (first_stmt_info);
9987 /* Refuse non-SLP vectorization of SLP-only groups. */
9988 if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))
9990 if (dump_enabled_p ())
9991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9992 "cannot vectorize load in non-SLP mode.\n");
9993 return false;
9996 /* Invalidate assumptions made by dependence analysis when vectorization
9997 on the unrolled body effectively re-orders stmts. */
9998 if (!PURE_SLP_STMT (stmt_info)
9999 && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
10000 && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10001 STMT_VINFO_MIN_NEG_DIST (stmt_info)))
10003 if (dump_enabled_p ())
10004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10005 "cannot perform implicit CSE when performing "
10006 "group loads with negative dependence distance\n");
10007 return false;
10010 else
10011 group_size = 1;
10013 if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
10015 slp_perm = true;
10017 if (!loop_vinfo)
10019 /* In BB vectorization we may not actually use a loaded vector
10020 accessing elements in excess of DR_GROUP_SIZE. */
10021 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
10022 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
10023 unsigned HOST_WIDE_INT nunits;
10024 unsigned j, k, maxk = 0;
10025 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
10026 if (k > maxk)
10027 maxk = k;
10028 tree vectype = SLP_TREE_VECTYPE (slp_node);
10029 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
10030 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
10032 if (dump_enabled_p ())
10033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10034 "BB vectorization with gaps at the end of "
10035 "a load is not supported\n");
10036 return false;
10040 auto_vec<tree> tem;
10041 unsigned n_perms;
10042 if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
10043 true, &n_perms))
10045 if (dump_enabled_p ())
10046 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10047 vect_location,
10048 "unsupported load permutation\n");
10049 return false;
10053 vect_memory_access_type memory_access_type;
10054 enum dr_alignment_support alignment_support_scheme;
10055 int misalignment;
10056 poly_int64 poffset;
10057 internal_fn lanes_ifn;
10058 if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, VLS_LOAD,
10059 ncopies, &memory_access_type, &poffset,
10060 &alignment_support_scheme, &misalignment, &gs_info,
10061 &lanes_ifn))
10062 return false;
10064 if (mask)
10066 if (memory_access_type == VMAT_CONTIGUOUS)
10068 machine_mode vec_mode = TYPE_MODE (vectype);
10069 if (!VECTOR_MODE_P (vec_mode)
10070 || !can_vec_mask_load_store_p (vec_mode,
10071 TYPE_MODE (mask_vectype), true))
10072 return false;
10074 else if (memory_access_type != VMAT_LOAD_STORE_LANES
10075 && memory_access_type != VMAT_GATHER_SCATTER)
10077 if (dump_enabled_p ())
10078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10079 "unsupported access type for masked load.\n");
10080 return false;
10082 else if (memory_access_type == VMAT_GATHER_SCATTER
10083 && gs_info.ifn == IFN_LAST
10084 && !gs_info.decl)
10086 if (dump_enabled_p ())
10087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10088 "unsupported masked emulated gather.\n");
10089 return false;
10091 else if (memory_access_type == VMAT_ELEMENTWISE
10092 || memory_access_type == VMAT_STRIDED_SLP)
10094 if (dump_enabled_p ())
10095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10096 "unsupported masked strided access.\n");
10097 return false;
10101 bool costing_p = !vec_stmt;
10103 if (costing_p) /* transformation not required. */
10105 if (slp_node
10106 && mask
10107 && !vect_maybe_update_slp_op_vectype (slp_op,
10108 mask_vectype))
10110 if (dump_enabled_p ())
10111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10112 "incompatible vector types for invariants\n");
10113 return false;
10116 if (!slp)
10117 STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
10119 if (loop_vinfo
10120 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10121 check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
10122 VLS_LOAD, group_size,
10123 memory_access_type, &gs_info,
10124 mask);
10126 if (dump_enabled_p ()
10127 && memory_access_type != VMAT_ELEMENTWISE
10128 && memory_access_type != VMAT_GATHER_SCATTER
10129 && alignment_support_scheme != dr_aligned)
10130 dump_printf_loc (MSG_NOTE, vect_location,
10131 "Vectorizing an unaligned access.\n");
10133 if (memory_access_type == VMAT_LOAD_STORE_LANES)
10134 vinfo->any_known_not_updated_vssa = true;
10136 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
10139 if (!slp)
10140 gcc_assert (memory_access_type
10141 == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
10143 if (dump_enabled_p () && !costing_p)
10144 dump_printf_loc (MSG_NOTE, vect_location,
10145 "transform load. ncopies = %d\n", ncopies);
10147 /* Transform. */
10149 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
10150 ensure_base_align (dr_info);
10152 if (memory_access_type == VMAT_INVARIANT)
10154 gcc_assert (!grouped_load && !mask && !bb_vinfo);
10155 /* If we have versioned for aliasing or the loop doesn't
10156 have any data dependencies that would preclude this,
10157 then we are sure this is a loop invariant load and
10158 thus we can insert it on the preheader edge.
10159 TODO: hoist_defs_of_uses should ideally be computed
10160 once at analysis time, remembered and used at
10161 transform time. */
10162 bool hoist_p = (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo)
10163 && !nested_in_vect_loop
10164 && hoist_defs_of_uses (stmt_info, loop, !costing_p));
10165 if (costing_p)
10167 enum vect_cost_model_location cost_loc
10168 = hoist_p ? vect_prologue : vect_body;
10169 unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
10170 stmt_info, 0, cost_loc);
10171 cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info, 0,
10172 cost_loc);
10173 unsigned int prologue_cost = hoist_p ? cost : 0;
10174 unsigned int inside_cost = hoist_p ? 0 : cost;
10175 if (dump_enabled_p ())
10176 dump_printf_loc (MSG_NOTE, vect_location,
10177 "vect_model_load_cost: inside_cost = %d, "
10178 "prologue_cost = %d .\n",
10179 inside_cost, prologue_cost);
10180 return true;
10182 if (hoist_p)
10184 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
10185 if (dump_enabled_p ())
10186 dump_printf_loc (MSG_NOTE, vect_location,
10187 "hoisting out of the vectorized loop: %G",
10188 (gimple *) stmt);
10189 scalar_dest = copy_ssa_name (scalar_dest);
10190 tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
10191 edge pe = loop_preheader_edge (loop);
10192 gphi *vphi = get_virtual_phi (loop->header);
10193 tree vuse;
10194 if (vphi)
10195 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
10196 else
10197 vuse = gimple_vuse (gsi_stmt (*gsi));
10198 gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
10199 gimple_set_vuse (new_stmt, vuse);
10200 gsi_insert_on_edge_immediate (pe, new_stmt);
10202 /* These copies are all equivalent. */
10203 if (hoist_p)
10204 new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10205 vectype, NULL);
10206 else
10208 gimple_stmt_iterator gsi2 = *gsi;
10209 gsi_next (&gsi2);
10210 new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10211 vectype, &gsi2);
10213 gimple *new_stmt = SSA_NAME_DEF_STMT (new_temp);
10214 if (slp)
10215 for (j = 0; j < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++j)
10216 slp_node->push_vec_def (new_stmt);
10217 else
10219 for (j = 0; j < ncopies; ++j)
10220 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10221 *vec_stmt = new_stmt;
10223 return true;
10226 if (memory_access_type == VMAT_ELEMENTWISE
10227 || memory_access_type == VMAT_STRIDED_SLP)
10229 gimple_stmt_iterator incr_gsi;
10230 bool insert_after;
10231 tree offvar;
10232 tree ivstep;
10233 tree running_off;
10234 vec<constructor_elt, va_gc> *v = NULL;
10235 tree stride_base, stride_step, alias_off;
10236 /* Checked by get_load_store_type. */
10237 unsigned int const_nunits = nunits.to_constant ();
10238 unsigned HOST_WIDE_INT cst_offset = 0;
10239 tree dr_offset;
10240 unsigned int inside_cost = 0;
10242 gcc_assert (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
10243 gcc_assert (!nested_in_vect_loop);
10245 if (grouped_load)
10247 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10248 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
10250 else
10252 first_stmt_info = stmt_info;
10253 first_dr_info = dr_info;
10256 if (slp && grouped_load)
10258 group_size = DR_GROUP_SIZE (first_stmt_info);
10259 ref_type = get_group_alias_ptr_type (first_stmt_info);
10261 else
10263 if (grouped_load)
10264 cst_offset
10265 = (tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))
10266 * vect_get_place_in_interleaving_chain (stmt_info,
10267 first_stmt_info));
10268 group_size = 1;
10269 ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
10272 if (!costing_p)
10274 dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
10275 stride_base = fold_build_pointer_plus (
10276 DR_BASE_ADDRESS (first_dr_info->dr),
10277 size_binop (PLUS_EXPR, convert_to_ptrofftype (dr_offset),
10278 convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
10279 stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
10281 /* For a load with loop-invariant (but other than power-of-2)
10282 stride (i.e. not a grouped access) like so:
10284 for (i = 0; i < n; i += stride)
10285 ... = array[i];
10287 we generate a new induction variable and new accesses to
10288 form a new vector (or vectors, depending on ncopies):
10290 for (j = 0; ; j += VF*stride)
10291 tmp1 = array[j];
10292 tmp2 = array[j + stride];
10294 vectemp = {tmp1, tmp2, ...}
10297 ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step), stride_step,
10298 build_int_cst (TREE_TYPE (stride_step), vf));
10300 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
10302 stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
10303 ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
10304 create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
10305 loop, &incr_gsi, insert_after,
10306 &offvar, NULL);
10308 stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
10311 running_off = offvar;
10312 alias_off = build_int_cst (ref_type, 0);
10313 int nloads = const_nunits;
10314 int lnel = 1;
10315 tree ltype = TREE_TYPE (vectype);
10316 tree lvectype = vectype;
10317 auto_vec<tree> dr_chain;
10318 if (memory_access_type == VMAT_STRIDED_SLP)
10320 if (group_size < const_nunits)
10322 /* First check if vec_init optab supports construction from vector
10323 elts directly. Otherwise avoid emitting a constructor of
10324 vector elements by performing the loads using an integer type
10325 of the same size, constructing a vector of those and then
10326 re-interpreting it as the original vector type. This avoids a
10327 huge runtime penalty due to the general inability to perform
10328 store forwarding from smaller stores to a larger load. */
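/* For example, with a V16QI vectype and GROUP_SIZE 4 each group is
   loaded as one 32-bit piece (a V4QI vector or an SImode integer,
   whichever the target can recompose), four pieces are collected into
   one vector and the result is view-converted back to V16QI.  */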
10329 tree ptype;
10330 tree vtype
10331 = vector_vector_composition_type (vectype,
10332 const_nunits / group_size,
10333 &ptype);
10334 if (vtype != NULL_TREE)
10336 nloads = const_nunits / group_size;
10337 lnel = group_size;
10338 lvectype = vtype;
10339 ltype = ptype;
10342 else
10344 nloads = 1;
10345 lnel = const_nunits;
10346 ltype = vectype;
10348 ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
10350 /* Load vector(1) scalar_type if the vectype has just one element. */
10351 else if (nloads == 1)
10352 ltype = vectype;
10354 if (slp)
10356 /* For SLP permutation support we need to load the whole group,
10357 not only the number of vector stmts the permutation result
10358 fits in. */
10359 if (slp_perm)
10361 /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
10362 variable VF. */
10363 unsigned int const_vf = vf.to_constant ();
10364 ncopies = CEIL (group_size * const_vf, const_nunits);
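/* E.g. a group of 3 lanes with VF 8 and 4-lane vectors needs
   CEIL (3 * 8, 4) == 6 vectors to cover the whole group.  */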
10365 dr_chain.create (ncopies);
10367 else
10368 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10370 unsigned int group_el = 0;
10371 unsigned HOST_WIDE_INT
10372 elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
10373 unsigned int n_groups = 0;
10374 /* When costing some adjacent vector loads, we'd like to cost them
10375 once with their total number instead of costing each one by one. */
10376 unsigned int n_adjacent_loads = 0;
10377 for (j = 0; j < ncopies; j++)
10379 if (nloads > 1 && !costing_p)
10380 vec_alloc (v, nloads);
10381 gimple *new_stmt = NULL;
10382 for (i = 0; i < nloads; i++)
10384 if (costing_p)
10386 /* For VMAT_ELEMENTWISE, just cost it as scalar_load to
10387 avoid ICE, see PR110776. */
10388 if (VECTOR_TYPE_P (ltype)
10389 && memory_access_type != VMAT_ELEMENTWISE)
10390 n_adjacent_loads++;
10391 else
10392 inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
10393 stmt_info, 0, vect_body);
10394 continue;
10396 tree this_off = build_int_cst (TREE_TYPE (alias_off),
10397 group_el * elsz + cst_offset);
10398 tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
10399 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
10400 new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
10401 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
10402 if (nloads > 1)
10403 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
10404 gimple_assign_lhs (new_stmt));
10406 group_el += lnel;
10407 if (! slp
10408 || group_el == group_size)
10410 n_groups++;
10411 /* When doing SLP make sure to not load elements from
10412 the next vector iteration, those will not be accessed
10413 so just use the last element again. See PR107451. */
10414 if (!slp || known_lt (n_groups, vf))
10416 tree newoff = copy_ssa_name (running_off);
10417 gimple *incr
10418 = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
10419 running_off, stride_step);
10420 vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
10421 running_off = newoff;
10423 group_el = 0;
10427 if (nloads > 1)
10429 if (costing_p)
10430 inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
10431 stmt_info, 0, vect_body);
10432 else
10434 tree vec_inv = build_constructor (lvectype, v);
10435 new_temp = vect_init_vector (vinfo, stmt_info, vec_inv,
10436 lvectype, gsi);
10437 new_stmt = SSA_NAME_DEF_STMT (new_temp);
10438 if (lvectype != vectype)
10440 new_stmt
10441 = gimple_build_assign (make_ssa_name (vectype),
10442 VIEW_CONVERT_EXPR,
10443 build1 (VIEW_CONVERT_EXPR,
10444 vectype, new_temp));
10445 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
10446 gsi);
10451 if (!costing_p)
10453 if (slp)
10455 if (slp_perm)
10456 dr_chain.quick_push (gimple_assign_lhs (new_stmt));
10457 else
10458 slp_node->push_vec_def (new_stmt);
10460 else
10462 if (j == 0)
10463 *vec_stmt = new_stmt;
10464 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10468 if (slp_perm)
10470 unsigned n_perms;
10471 if (costing_p)
10473 unsigned n_loads;
10474 vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
10475 true, &n_perms, &n_loads);
10476 inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
10477 first_stmt_info, 0, vect_body);
10479 else
10480 vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
10481 false, &n_perms);
10484 if (costing_p)
10486 if (n_adjacent_loads > 0)
10487 vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
10488 alignment_support_scheme, misalignment, false,
10489 &inside_cost, nullptr, cost_vec, cost_vec,
10490 true);
10491 if (dump_enabled_p ())
10492 dump_printf_loc (MSG_NOTE, vect_location,
10493 "vect_model_load_cost: inside_cost = %u, "
10494 "prologue_cost = 0 .\n",
10495 inside_cost);
10498 return true;
10501 if (memory_access_type == VMAT_GATHER_SCATTER
10502 || (!slp && memory_access_type == VMAT_CONTIGUOUS))
10503 grouped_load = false;
10505 if (grouped_load
10506 || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()))
10508 if (grouped_load)
10510 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10511 group_size = DR_GROUP_SIZE (first_stmt_info);
10513 else
10515 first_stmt_info = stmt_info;
10516 group_size = 1;
10518 /* For SLP vectorization we directly vectorize a subchain
10519 without permutation. */
10520 if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
10521 first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
10522 /* For BB vectorization always use the first stmt to base
10523 the data ref pointer on. */
10524 if (bb_vinfo)
10525 first_stmt_info_for_drptr
10526 = vect_find_first_scalar_stmt_in_slp (slp_node);
10528 /* Check if the chain of loads is already vectorized. */
10529 if (STMT_VINFO_VEC_STMTS (first_stmt_info).exists ()
10530 /* For SLP we would need to copy over SLP_TREE_VEC_DEFS.
10531 ??? But we can only do so if there is exactly one
10532 as we have no way to get at the rest. Leave the CSE
10533 opportunity alone.
10534 ??? With the group load eventually participating
10535 in multiple different permutations (having multiple
10536 slp nodes which refer to the same group) the CSE
10537 is even wrong code. See PR56270. */
10538 && !slp)
10540 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10541 return true;
10543 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
10544 group_gap_adj = 0;
10546 /* VEC_NUM is the number of vect stmts to be created for this group. */
10547 if (slp)
10549 grouped_load = false;
10550 /* If an SLP permutation is from N elements to N elements,
10551 and if one vector holds a whole number of N, we can load
10552 the inputs to the permutation in the same way as an
10553 unpermuted sequence. In other cases we need to load the
10554 whole group, not only the number of vector stmts the
10555 permutation result fits in. */
10556 unsigned scalar_lanes = SLP_TREE_LANES (slp_node);
10557 if (slp_perm
10558 && (group_size != scalar_lanes
10559 || !multiple_p (nunits, group_size)))
10561 /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
10562 variable VF; see vect_transform_slp_perm_load. */
10563 unsigned int const_vf = vf.to_constant ();
10564 unsigned int const_nunits = nunits.to_constant ();
10565 vec_num = CEIL (group_size * const_vf, const_nunits);
10566 group_gap_adj = vf * group_size - nunits * vec_num;
10568 else
10570 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10571 group_gap_adj
10572 = group_size - scalar_lanes;
10575 else
10576 vec_num = group_size;
10578 ref_type = get_group_alias_ptr_type (first_stmt_info);
10580 else
10582 first_stmt_info = stmt_info;
10583 first_dr_info = dr_info;
10584 group_size = vec_num = 1;
10585 group_gap_adj = 0;
10586 ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr));
10587 if (slp)
10588 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10591 gcc_assert (alignment_support_scheme);
10592 vec_loop_masks *loop_masks
10593 = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10594 ? &LOOP_VINFO_MASKS (loop_vinfo)
10595 : NULL);
10596 vec_loop_lens *loop_lens
10597 = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
10598 ? &LOOP_VINFO_LENS (loop_vinfo)
10599 : NULL);
10601 /* Both vect_transform_stmt and vect_analyze_stmt reach here, but there
10602 is a difference: we cannot enable both lens and masks during the
10603 transform, whereas during analysis both are allowed.
10604 Do not go with the length-based approach if fully masked. */
10605 if (cost_vec == NULL)
10606 /* The cost_vec is NULL during transform. */
10607 gcc_assert ((!loop_lens || !loop_masks));
10609 /* Targets with load-lanes instructions must not require explicit
10610 realignment. vect_supportable_dr_alignment always returns either
10611 dr_aligned or dr_unaligned_supported for masked operations. */
10612 gcc_assert ((memory_access_type != VMAT_LOAD_STORE_LANES
10613 && !mask
10614 && !loop_masks)
10615 || alignment_support_scheme == dr_aligned
10616 || alignment_support_scheme == dr_unaligned_supported);
10618 /* In case the vectorization factor (VF) is bigger than the number
10619 of elements that we can fit in a vectype (nunits), we have to generate
10620 more than one vector stmt, i.e., we need to "unroll" the
10621 vector stmt by a factor VF/nunits. In doing so, we record a pointer
10622 from one copy of the vector stmt to the next, in the field
10623 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
10624 stages to find the correct vector defs to be used when vectorizing
10625 stmts that use the defs of the current stmt. The example below
10626 illustrates the vectorization process when VF=16 and nunits=4 (i.e., we
10627 need to create 4 vectorized stmts):
10629 before vectorization:
10630 RELATED_STMT VEC_STMT
10631 S1: x = memref - -
10632 S2: z = x + 1 - -
10634 step 1: vectorize stmt S1:
10635 We first create the vector stmt VS1_0, and, as usual, record a
10636 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
10637 Next, we create the vector stmt VS1_1, and record a pointer to
10638 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
10639 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
10640 stmts and pointers:
10641 RELATED_STMT VEC_STMT
10642 VS1_0: vx0 = memref0 VS1_1 -
10643 VS1_1: vx1 = memref1 VS1_2 -
10644 VS1_2: vx2 = memref2 VS1_3 -
10645 VS1_3: vx3 = memref3 - -
10646 S1: x = load - VS1_0
10647 S2: z = x + 1 - -
10650 /* In case of interleaving (non-unit grouped access):
10652 S1: x2 = &base + 2
10653 S2: x0 = &base
10654 S3: x1 = &base + 1
10655 S4: x3 = &base + 3
10657 Vectorized loads are created in the order of memory accesses
10658 starting from the access of the first stmt of the chain:
10660 VS1: vx0 = &base
10661 VS2: vx1 = &base + vec_size*1
10662 VS3: vx2 = &base + vec_size*2
10663 VS4: vx3 = &base + vec_size*3
10665 Then permutation statements are generated:
10667 VS5: vx5 = VEC_PERM_EXPR < vx0, vx1, { 0, 2, ..., i*2 } >
10668 VS6: vx6 = VEC_PERM_EXPR < vx0, vx1, { 1, 3, ..., i*2+1 } >
10671 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
10672 (the order of the data-refs in the output of vect_permute_load_chain
10673 corresponds to the order of scalar stmts in the interleaving chain - see
10674 the documentation of vect_permute_load_chain()).
10675 The generation of permutation stmts and recording them in
10676 STMT_VINFO_VEC_STMT is done in vect_transform_grouped_load().
10678 In case of both multiple types and interleaving, the vector loads and
10679 permutation stmts above are created for every copy. The result vector
10680 stmts are put in STMT_VINFO_VEC_STMT for the first copy and in the
10681 corresponding STMT_VINFO_RELATED_STMT for the next copies. */
10683 /* If the data reference is aligned (dr_aligned) or potentially unaligned
10684 on a target that supports unaligned accesses (dr_unaligned_supported)
10685 we generate the following code:
10686 p = initial_addr;
10687 indx = 0;
10688 loop {
10689 p = p + indx * vectype_size;
10690 vec_dest = *(p);
10691 indx = indx + 1;
10694 Otherwise, the data reference is potentially unaligned on a target that
10695 does not support unaligned accesses (dr_explicit_realign_optimized) -
10696 then generate the following code, in which the data in each iteration is
10697 obtained by two vector loads, one from the previous iteration, and one
10698 from the current iteration:
10699 p1 = initial_addr;
10700 msq_init = *(floor(p1))
10701 p2 = initial_addr + VS - 1;
10702 realignment_token = call target_builtin;
10703 indx = 0;
10704 loop {
10705 p2 = p2 + indx * vectype_size
10706 lsq = *(floor(p2))
10707 vec_dest = realign_load (msq, lsq, realignment_token)
10708 indx = indx + 1;
10709 msq = lsq;
10710 } */
10712 /* If the misalignment remains the same throughout the execution of the
10713 loop, we can create the init_addr and permutation mask at the loop
10714 preheader. Otherwise, it needs to be created inside the loop.
10715 This can only occur when vectorizing memory accesses in the inner-loop
10716 nested within an outer-loop that is being vectorized. */
10718 if (nested_in_vect_loop
10719 && !multiple_p (DR_STEP_ALIGNMENT (dr_info->dr),
10720 GET_MODE_SIZE (TYPE_MODE (vectype))))
10722 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
10723 compute_in_loop = true;
10726 bool diff_first_stmt_info
10727 = first_stmt_info_for_drptr && first_stmt_info != first_stmt_info_for_drptr;
10729 tree offset = NULL_TREE;
10730 if ((alignment_support_scheme == dr_explicit_realign_optimized
10731 || alignment_support_scheme == dr_explicit_realign)
10732 && !compute_in_loop)
10734 /* If we have different first_stmt_info, we can't set up realignment
10735 here, since we can't guarantee first_stmt_info DR has been
10736 initialized yet, use first_stmt_info_for_drptr DR by bumping the
10737 distance from first_stmt_info DR instead as below. */
10738 if (!costing_p)
10740 if (!diff_first_stmt_info)
10741 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
10742 &realignment_token,
10743 alignment_support_scheme, NULL_TREE,
10744 &at_loop);
10745 if (alignment_support_scheme == dr_explicit_realign_optimized)
10747 phi = as_a<gphi *> (SSA_NAME_DEF_STMT (msq));
10748 offset = size_binop (MINUS_EXPR, TYPE_SIZE_UNIT (vectype),
10749 size_one_node);
10750 gcc_assert (!first_stmt_info_for_drptr);
10754 else
10755 at_loop = loop;
10757 if (!known_eq (poffset, 0))
10758 offset = (offset
10759 ? size_binop (PLUS_EXPR, offset, size_int (poffset))
10760 : size_int (poffset));
10762 tree bump;
10763 tree vec_offset = NULL_TREE;
10764 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10766 aggr_type = NULL_TREE;
10767 bump = NULL_TREE;
10769 else if (memory_access_type == VMAT_GATHER_SCATTER)
10771 aggr_type = elem_type;
10772 if (!costing_p)
10773 vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
10774 &bump, &vec_offset, loop_lens);
10776 else
10778 if (memory_access_type == VMAT_LOAD_STORE_LANES)
10779 aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
10780 else
10781 aggr_type = vectype;
10782 bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
10783 memory_access_type, loop_lens);
10786 auto_vec<tree> vec_offsets;
10787 auto_vec<tree> vec_masks;
10788 if (mask && !costing_p)
10790 if (slp_node)
10791 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index],
10792 &vec_masks);
10793 else
10794 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
10795 &vec_masks, mask_vectype);
10798 tree vec_mask = NULL_TREE;
10799 if (memory_access_type == VMAT_LOAD_STORE_LANES)
10801 gcc_assert (alignment_support_scheme == dr_aligned
10802 || alignment_support_scheme == dr_unaligned_supported);
10803 gcc_assert (grouped_load && !slp);
10805 unsigned int inside_cost = 0, prologue_cost = 0;
10806 /* When costing some adjacent vector loads, we'd like to cost them
10807 once with their total number instead of costing each one by one. */
10808 unsigned int n_adjacent_loads = 0;
10809 for (j = 0; j < ncopies; j++)
10811 if (costing_p)
10813 /* An IFN_LOAD_LANES will load all its vector results,
10814 regardless of which ones we actually need. Account
10815 for the cost of unused results. */
10816 if (first_stmt_info == stmt_info)
10818 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
10819 stmt_vec_info next_stmt_info = first_stmt_info;
10822 gaps -= 1;
10823 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
10825 while (next_stmt_info);
10826 if (gaps)
10828 if (dump_enabled_p ())
10829 dump_printf_loc (MSG_NOTE, vect_location,
10830 "vect_model_load_cost: %d "
10831 "unused vectors.\n",
10832 gaps);
10833 vect_get_load_cost (vinfo, stmt_info, gaps,
10834 alignment_support_scheme,
10835 misalignment, false, &inside_cost,
10836 &prologue_cost, cost_vec, cost_vec,
10837 true);
10840 n_adjacent_loads++;
10841 continue;
10844 /* 1. Create the vector or array pointer update chain. */
10845 if (j == 0)
10846 dataref_ptr
10847 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
10848 at_loop, offset, &dummy, gsi,
10849 &ptr_incr, false, bump);
10850 else
10852 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10853 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
10854 stmt_info, bump);
10856 if (mask)
10857 vec_mask = vec_masks[j];
10859 tree vec_array = create_vector_array (vectype, vec_num);
10861 tree final_mask = NULL_TREE;
10862 tree final_len = NULL_TREE;
10863 tree bias = NULL_TREE;
10864 if (loop_masks)
10865 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
10866 ncopies, vectype, j);
10867 if (vec_mask)
10868 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
10869 vec_mask, gsi);
10871 if (lanes_ifn == IFN_MASK_LEN_LOAD_LANES)
10873 if (loop_lens)
10874 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
10875 ncopies, vectype, j, 1);
10876 else
10877 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
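/* The partial load/store bias is 0 on most targets; it is -1 for
targets (e.g. s390) whose load/store-with-length instructions expect
the highest byte index rather than the number of bytes. */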
10878 signed char biasval
10879 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10880 bias = build_int_cst (intQI_type_node, biasval);
10881 if (!final_mask)
10883 mask_vectype = truth_type_for (vectype);
10884 final_mask = build_minus_one_cst (mask_vectype);
10888 gcall *call;
10889 if (final_len && final_mask)
10891 /* Emit:
10892 VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
10893 VEC_MASK, LEN, BIAS). */
10894 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
10895 tree alias_ptr = build_int_cst (ref_type, align);
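/* The ALIAS_PTR operand describes the access: its pointer type carries
the alias information while its constant value records the known
alignment in bits. */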
10896 call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
10897 dataref_ptr, alias_ptr,
10898 final_mask, final_len, bias);
10900 else if (final_mask)
10902 /* Emit:
10903 VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
10904 VEC_MASK). */
10905 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
10906 tree alias_ptr = build_int_cst (ref_type, align);
10907 call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
10908 dataref_ptr, alias_ptr,
10909 final_mask);
10911 else
10913 /* Emit:
10914 VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
10915 data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
10916 call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
10918 gimple_call_set_lhs (call, vec_array);
10919 gimple_call_set_nothrow (call, true);
10920 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
10922 dr_chain.create (vec_num);
10923 /* Extract each vector into an SSA_NAME. */
10924 for (i = 0; i < vec_num; i++)
10926 new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
10927 vec_array, i);
10928 dr_chain.quick_push (new_temp);
10931 /* Record the mapping between SSA_NAMEs and statements. */
10932 vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
10934 /* Record that VEC_ARRAY is now dead. */
10935 vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
10937 dr_chain.release ();
10939 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10942 if (costing_p)
10944 if (n_adjacent_loads > 0)
10945 vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
10946 alignment_support_scheme, misalignment, false,
10947 &inside_cost, &prologue_cost, cost_vec,
10948 cost_vec, true);
10949 if (dump_enabled_p ())
10950 dump_printf_loc (MSG_NOTE, vect_location,
10951 "vect_model_load_cost: inside_cost = %u, "
10952 "prologue_cost = %u .\n",
10953 inside_cost, prologue_cost);
10956 return true;
10959 if (memory_access_type == VMAT_GATHER_SCATTER)
10961 gcc_assert (alignment_support_scheme == dr_aligned
10962 || alignment_support_scheme == dr_unaligned_supported);
10963 gcc_assert (!grouped_load && !slp_perm);
10965 unsigned int inside_cost = 0, prologue_cost = 0;
10966 for (j = 0; j < ncopies; j++)
10968 /* 1. Create the vector or array pointer update chain. */
10969 if (j == 0 && !costing_p)
10971 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10972 vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
10973 slp_node, &gs_info, &dataref_ptr,
10974 &vec_offsets);
10975 else
10976 dataref_ptr
10977 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
10978 at_loop, offset, &dummy, gsi,
10979 &ptr_incr, false, bump);
10981 else if (!costing_p)
10983 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10984 if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10985 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
10986 gsi, stmt_info, bump);
10989 gimple *new_stmt = NULL;
10990 for (i = 0; i < vec_num; i++)
10992 tree final_mask = NULL_TREE;
10993 tree final_len = NULL_TREE;
10994 tree bias = NULL_TREE;
10995 if (!costing_p)
10997 if (mask)
10998 vec_mask = vec_masks[vec_num * j + i];
10999 if (loop_masks)
11000 final_mask
11001 = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
11002 vec_num * ncopies, vectype,
11003 vec_num * j + i);
11004 if (vec_mask)
11005 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
11006 final_mask, vec_mask, gsi);
11008 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
11009 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
11010 gsi, stmt_info, bump);
11013 /* 2. Create the vector-load in the loop. */
11014 unsigned HOST_WIDE_INT align;
11015 if (gs_info.ifn != IFN_LAST)
11017 if (costing_p)
11019 unsigned int cnunits = vect_nunits_for_cost (vectype);
11020 inside_cost
11021 = record_stmt_cost (cost_vec, cnunits, scalar_load,
11022 stmt_info, 0, vect_body);
11023 continue;
11025 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
11026 vec_offset = vec_offsets[vec_num * j + i];
11027 tree zero = build_zero_cst (vectype);
11028 tree scale = size_int (gs_info.scale);
11030 if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
11032 if (loop_lens)
11033 final_len
11034 = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
11035 vec_num * ncopies, vectype,
11036 vec_num * j + i, 1);
11037 else
11038 final_len
11039 = build_int_cst (sizetype,
11040 TYPE_VECTOR_SUBPARTS (vectype));
11041 signed char biasval
11042 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
11043 bias = build_int_cst (intQI_type_node, biasval);
11044 if (!final_mask)
11046 mask_vectype = truth_type_for (vectype);
11047 final_mask = build_minus_one_cst (mask_vectype);
11051 gcall *call;
11052 if (final_len && final_mask)
11053 call
11054 = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
11055 dataref_ptr, vec_offset,
11056 scale, zero, final_mask,
11057 final_len, bias);
11058 else if (final_mask)
11059 call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
11060 dataref_ptr, vec_offset,
11061 scale, zero, final_mask);
11062 else
11063 call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
11064 dataref_ptr, vec_offset,
11065 scale, zero);
11066 gimple_call_set_nothrow (call, true);
11067 new_stmt = call;
11068 data_ref = NULL_TREE;
11070 else if (gs_info.decl)
11072 /* The builtin decls path for gather is legacy, x86 only. */
11073 gcc_assert (!final_len && nunits.is_constant ());
11074 if (costing_p)
11076 unsigned int cnunits = vect_nunits_for_cost (vectype);
11077 inside_cost
11078 = record_stmt_cost (cost_vec, cnunits, scalar_load,
11079 stmt_info, 0, vect_body);
11080 continue;
11082 poly_uint64 offset_nunits
11083 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
11084 if (known_eq (nunits, offset_nunits))
11086 new_stmt = vect_build_one_gather_load_call
11087 (vinfo, stmt_info, gsi, &gs_info,
11088 dataref_ptr, vec_offsets[vec_num * j + i],
11089 final_mask);
11090 data_ref = NULL_TREE;
11092 else if (known_eq (nunits, offset_nunits * 2))
11094 /* We have an offset vector with half the number of
11095 lanes, but the builtins will produce full-vectype
11096 data with just the lower lanes filled. */
11097 new_stmt = vect_build_one_gather_load_call
11098 (vinfo, stmt_info, gsi, &gs_info,
11099 dataref_ptr, vec_offsets[2 * vec_num * j + 2 * i],
11100 final_mask);
11101 tree low = make_ssa_name (vectype);
11102 gimple_set_lhs (new_stmt, low);
11103 vect_finish_stmt_generation (vinfo, stmt_info,
11104 new_stmt, gsi);
11106 /* Now put the upper half of FINAL_MASK into its lower half. */
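/* For instance, with COUNT == 8 the selector built below is
{ 4, 5, 6, 7, 4, 5, 6, 7 }, duplicating the upper half of FINAL_MASK
into the lower lanes that the second builtin call consumes. */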
11107 if (final_mask
11108 && !SCALAR_INT_MODE_P
11109 (TYPE_MODE (TREE_TYPE (final_mask))))
11111 int count = nunits.to_constant ();
11112 vec_perm_builder sel (count, count, 1);
11113 sel.quick_grow (count);
11114 for (int i = 0; i < count; ++i)
11115 sel[i] = i | (count / 2);
11116 vec_perm_indices indices (sel, 2, count);
11117 tree perm_mask = vect_gen_perm_mask_checked
11118 (TREE_TYPE (final_mask), indices);
11119 new_stmt = gimple_build_assign (NULL_TREE,
11120 VEC_PERM_EXPR,
11121 final_mask,
11122 final_mask,
11123 perm_mask);
11124 final_mask = make_ssa_name (TREE_TYPE (final_mask));
11125 gimple_set_lhs (new_stmt, final_mask);
11126 vect_finish_stmt_generation (vinfo, stmt_info,
11127 new_stmt, gsi);
11129 else if (final_mask)
11131 new_stmt = gimple_build_assign (NULL_TREE,
11132 VEC_UNPACK_HI_EXPR,
11133 final_mask);
11134 final_mask = make_ssa_name
11135 (truth_type_for (gs_info.offset_vectype));
11136 gimple_set_lhs (new_stmt, final_mask);
11137 vect_finish_stmt_generation (vinfo, stmt_info,
11138 new_stmt, gsi);
11141 new_stmt = vect_build_one_gather_load_call
11142 (vinfo, stmt_info, gsi, &gs_info,
11143 dataref_ptr,
11144 vec_offsets[2 * vec_num * j + 2 * i + 1],
11145 final_mask);
11146 tree high = make_ssa_name (vectype);
11147 gimple_set_lhs (new_stmt, high);
11148 vect_finish_stmt_generation (vinfo, stmt_info,
11149 new_stmt, gsi);
11151 /* Compose LOW and HIGH into the full result vector. */
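/* E.g. with COUNT == 8 the selector is { 0, 1, 2, 3, 8, 9, 10, 11 },
concatenating the filled low halves of LOW and HIGH. */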
11152 int count = nunits.to_constant ();
11153 vec_perm_builder sel (count, count, 1);
11154 sel.quick_grow (count);
11155 for (int i = 0; i < count; ++i)
11156 sel[i] = i < count / 2 ? i : i + count / 2;
11157 vec_perm_indices indices (sel, 2, count);
11158 tree perm_mask
11159 = vect_gen_perm_mask_checked (vectype, indices);
11160 new_stmt = gimple_build_assign (NULL_TREE,
11161 VEC_PERM_EXPR,
11162 low, high, perm_mask);
11163 data_ref = NULL_TREE;
11165 else if (known_eq (nunits * 2, offset_nunits))
11167 /* We have an offset vector with double the number of
11168 lanes. Select the low or high part accordingly. */
11169 vec_offset = vec_offsets[(vec_num * j + i) / 2];
11170 if ((vec_num * j + i) & 1)
11172 int count = offset_nunits.to_constant ();
11173 vec_perm_builder sel (count, count, 1);
11174 sel.quick_grow (count);
11175 for (int i = 0; i < count; ++i)
11176 sel[i] = i | (count / 2);
11177 vec_perm_indices indices (sel, 2, count);
11178 tree perm_mask = vect_gen_perm_mask_checked
11179 (TREE_TYPE (vec_offset), indices);
11180 new_stmt = gimple_build_assign (NULL_TREE,
11181 VEC_PERM_EXPR,
11182 vec_offset,
11183 vec_offset,
11184 perm_mask);
11185 vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
11186 gimple_set_lhs (new_stmt, vec_offset);
11187 vect_finish_stmt_generation (vinfo, stmt_info,
11188 new_stmt, gsi);
11190 new_stmt = vect_build_one_gather_load_call
11191 (vinfo, stmt_info, gsi, &gs_info,
11192 dataref_ptr, vec_offset, final_mask);
11193 data_ref = NULL_TREE;
11195 else
11196 gcc_unreachable ();
11198 else
11200 /* Emulated gather-scatter. */
11201 gcc_assert (!final_mask);
11202 unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
11203 if (costing_p)
11205 /* For emulated gathers we cost N offset vector element
11206 extracts (the scalar scaling and offset add are assumed to be consumed by the load). */
11207 inside_cost = record_stmt_cost (cost_vec, const_nunits,
11208 vec_to_scalar, stmt_info,
11209 0, vect_body);
11210 /* N scalar loads plus gathering them into a
11211 vector. */
11212 inside_cost
11213 = record_stmt_cost (cost_vec, const_nunits, scalar_load,
11214 stmt_info, 0, vect_body);
11215 inside_cost
11216 = record_stmt_cost (cost_vec, 1, vec_construct,
11217 stmt_info, 0, vect_body);
11218 continue;
11220 unsigned HOST_WIDE_INT const_offset_nunits
11221 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
11222 .to_constant ();
11223 vec<constructor_elt, va_gc> *ctor_elts;
11224 vec_alloc (ctor_elts, const_nunits);
11225 gimple_seq stmts = NULL;
11226 /* We support offset vectors with more elements
11227 than the data vector for now. */
11228 unsigned HOST_WIDE_INT factor
11229 = const_offset_nunits / const_nunits;
11230 vec_offset = vec_offsets[(vec_num * j + i) / factor];
11231 unsigned elt_offset
11232 = ((vec_num * j + i) % factor) * const_nunits;
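/* E.g. an eight-lane offset vector with a four-lane data vector gives
FACTOR == 2: two consecutive data vectors consume the low and the
high half of the same offset vector. */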
11233 tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
11234 tree scale = size_int (gs_info.scale);
11235 align = get_object_alignment (DR_REF (first_dr_info->dr));
11236 tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
11237 for (unsigned k = 0; k < const_nunits; ++k)
11239 tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
11240 bitsize_int (k + elt_offset));
11241 tree idx
11242 = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
11243 vec_offset, TYPE_SIZE (idx_type), boff);
11244 idx = gimple_convert (&stmts, sizetype, idx);
11245 idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
11246 scale);
11247 tree ptr = gimple_build (&stmts, PLUS_EXPR,
11248 TREE_TYPE (dataref_ptr),
11249 dataref_ptr, idx);
11250 ptr = gimple_convert (&stmts, ptr_type_node, ptr);
11251 tree elt = make_ssa_name (TREE_TYPE (vectype));
11252 tree ref = build2 (MEM_REF, ltype, ptr,
11253 build_int_cst (ref_type, 0));
11254 new_stmt = gimple_build_assign (elt, ref);
11255 gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
11256 gimple_seq_add_stmt (&stmts, new_stmt);
11257 CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
11259 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
11260 new_stmt = gimple_build_assign (
11261 NULL_TREE, build_constructor (vectype, ctor_elts));
11262 data_ref = NULL_TREE;
11265 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11266 /* DATA_REF is null if we've already built the statement. */
11267 if (data_ref)
11269 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11270 new_stmt = gimple_build_assign (vec_dest, data_ref);
11272 new_temp = make_ssa_name (vec_dest, new_stmt);
11273 gimple_set_lhs (new_stmt, new_temp);
11274 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11276 /* Store vector loads in the corresponding SLP_NODE. */
11277 if (slp)
11278 slp_node->push_vec_def (new_stmt);
11281 if (!slp && !costing_p)
11282 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
11285 if (!slp && !costing_p)
11286 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11288 if (costing_p && dump_enabled_p ())
11289 dump_printf_loc (MSG_NOTE, vect_location,
11290 "vect_model_load_cost: inside_cost = %u, "
11291 "prologue_cost = %u .\n",
11292 inside_cost, prologue_cost);
11293 return true;
11296 poly_uint64 group_elt = 0;
11297 unsigned int inside_cost = 0, prologue_cost = 0;
11298 /* When costing some adjacent vector loads, we'd like to cost them
11299 once with their total number instead of costing each one by one. */
11300 unsigned int n_adjacent_loads = 0;
11301 for (j = 0; j < ncopies; j++)
11303 /* 1. Create the vector or array pointer update chain. */
11304 if (j == 0 && !costing_p)
11306 bool simd_lane_access_p
11307 = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
11308 if (simd_lane_access_p
11309 && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
11310 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
11311 && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info))
11312 && integer_zerop (DR_INIT (first_dr_info->dr))
11313 && alias_sets_conflict_p (get_alias_set (aggr_type),
11314 get_alias_set (TREE_TYPE (ref_type)))
11315 && (alignment_support_scheme == dr_aligned
11316 || alignment_support_scheme == dr_unaligned_supported))
11318 dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
11319 dataref_offset = build_int_cst (ref_type, 0);
11321 else if (diff_first_stmt_info)
11323 dataref_ptr
11324 = vect_create_data_ref_ptr (vinfo, first_stmt_info_for_drptr,
11325 aggr_type, at_loop, offset, &dummy,
11326 gsi, &ptr_incr, simd_lane_access_p,
11327 bump);
11328 /* Adjust the pointer by the difference to first_stmt. */
11329 data_reference_p ptrdr
11330 = STMT_VINFO_DATA_REF (first_stmt_info_for_drptr);
11331 tree diff
11332 = fold_convert (sizetype,
11333 size_binop (MINUS_EXPR,
11334 DR_INIT (first_dr_info->dr),
11335 DR_INIT (ptrdr)));
11336 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11337 stmt_info, diff);
11338 if (alignment_support_scheme == dr_explicit_realign)
11340 msq = vect_setup_realignment (vinfo,
11341 first_stmt_info_for_drptr, gsi,
11342 &realignment_token,
11343 alignment_support_scheme,
11344 dataref_ptr, &at_loop);
11345 gcc_assert (!compute_in_loop);
11348 else
11349 dataref_ptr
11350 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
11351 at_loop,
11352 offset, &dummy, gsi, &ptr_incr,
11353 simd_lane_access_p, bump);
11355 else if (!costing_p)
11357 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
11358 if (dataref_offset)
11359 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
11360 bump);
11361 else
11362 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11363 stmt_info, bump);
11366 if (grouped_load || slp_perm)
11367 dr_chain.create (vec_num);
11369 gimple *new_stmt = NULL;
11370 for (i = 0; i < vec_num; i++)
11372 tree final_mask = NULL_TREE;
11373 tree final_len = NULL_TREE;
11374 tree bias = NULL_TREE;
11375 if (!costing_p)
11377 if (mask)
11378 vec_mask = vec_masks[vec_num * j + i];
11379 if (loop_masks)
11380 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
11381 vec_num * ncopies, vectype,
11382 vec_num * j + i);
11383 if (vec_mask)
11384 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
11385 final_mask, vec_mask, gsi);
11387 if (i > 0)
11388 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
11389 gsi, stmt_info, bump);
11392 /* 2. Create the vector-load in the loop. */
11393 switch (alignment_support_scheme)
11395 case dr_aligned:
11396 case dr_unaligned_supported:
11398 if (costing_p)
11399 break;
11401 unsigned int misalign;
11402 unsigned HOST_WIDE_INT align;
11403 align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
11404 if (alignment_support_scheme == dr_aligned)
11405 misalign = 0;
11406 else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
11408 align
11409 = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
11410 misalign = 0;
11412 else
11413 misalign = misalignment;
11414 if (dataref_offset == NULL_TREE
11415 && TREE_CODE (dataref_ptr) == SSA_NAME)
11416 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
11417 misalign);
11418 align = least_bit_hwi (misalign | align);
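/* E.g. a 16-byte target alignment with a known misalignment of 4 bytes
yields a guaranteed alignment of 4 bytes; with MISALIGN == 0 the full
target alignment is kept. */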
11420 /* Compute which partial-access IFN to use when LOOP_LENS or FINAL_MASK is valid. */
11421 machine_mode vmode = TYPE_MODE (vectype);
11422 machine_mode new_vmode = vmode;
11423 internal_fn partial_ifn = IFN_LAST;
11424 if (loop_lens)
11426 opt_machine_mode new_ovmode
11427 = get_len_load_store_mode (vmode, true, &partial_ifn);
11428 new_vmode = new_ovmode.require ();
11429 unsigned factor
11430 = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
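/* If the target only provides length-based loads for byte vectors
(VnQI), the length operand is measured in bytes, so the
element-counting loop length is scaled by the element size.
Otherwise FACTOR stays 1. */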
11431 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
11432 vec_num * ncopies, vectype,
11433 vec_num * j + i, factor);
11435 else if (final_mask)
11437 if (!can_vec_mask_load_store_p (
11438 vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
11439 &partial_ifn))
11440 gcc_unreachable ();
11443 if (partial_ifn == IFN_MASK_LEN_LOAD)
11445 if (!final_len)
11447 /* Pass VF value to 'len' argument of
11448 MASK_LEN_LOAD if LOOP_LENS is invalid. */
11449 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
11451 if (!final_mask)
11453 /* Pass all ones value to 'mask' argument of
11454 MASK_LEN_LOAD if final_mask is invalid. */
11455 mask_vectype = truth_type_for (vectype);
11456 final_mask = build_minus_one_cst (mask_vectype);
11459 if (final_len)
11461 signed char biasval
11462 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
11464 bias = build_int_cst (intQI_type_node, biasval);
11467 if (final_len)
11469 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
11470 gcall *call;
11471 if (partial_ifn == IFN_MASK_LEN_LOAD)
11472 call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
11473 dataref_ptr, ptr,
11474 final_mask, final_len,
11475 bias);
11476 else
11477 call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
11478 dataref_ptr, ptr,
11479 final_len, bias);
11480 gimple_call_set_nothrow (call, true);
11481 new_stmt = call;
11482 data_ref = NULL_TREE;
11484 /* Need conversion if it's wrapped with VnQI. */
11485 if (vmode != new_vmode)
11487 tree new_vtype = build_vector_type_for_mode (
11488 unsigned_intQI_type_node, new_vmode);
11489 tree var
11490 = vect_get_new_ssa_name (new_vtype, vect_simple_var);
11491 gimple_set_lhs (call, var);
11492 vect_finish_stmt_generation (vinfo, stmt_info, call,
11493 gsi);
11494 tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
11495 new_stmt = gimple_build_assign (vec_dest,
11496 VIEW_CONVERT_EXPR, op);
11499 else if (final_mask)
11501 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
11502 gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
11503 dataref_ptr, ptr,
11504 final_mask);
11505 gimple_call_set_nothrow (call, true);
11506 new_stmt = call;
11507 data_ref = NULL_TREE;
11509 else
11511 tree ltype = vectype;
11512 tree new_vtype = NULL_TREE;
11513 unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
11514 unsigned int vect_align
11515 = vect_known_alignment_in_bytes (first_dr_info, vectype);
11516 unsigned int scalar_dr_size
11517 = vect_get_scalar_dr_size (first_dr_info);
11518 /* If there's no peeling for gaps but we have a gap
11519 with slp loads then load the lower half of the
11520 vector only. See get_group_load_store_type for
11521 when we apply this optimization. */
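/* For instance, GROUP_SIZE == NUNITS == 4 with a trailing gap of 2
loads just a two-element half-vector, provided the gap is large
enough relative to the known alignment (the last condition below). */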
11522 if (slp
11523 && loop_vinfo
11524 && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
11525 && known_eq (nunits, (group_size - gap) * 2)
11526 && known_eq (nunits, group_size)
11527 && gap >= (vect_align / scalar_dr_size))
11529 tree half_vtype;
11530 new_vtype
11531 = vector_vector_composition_type (vectype, 2,
11532 &half_vtype);
11533 if (new_vtype != NULL_TREE)
11534 ltype = half_vtype;
11536 /* Try to use a single smaller load when we are about
11537 to load excess elements compared to the unrolled
11538 scalar loop.
11539 ??? This should cover the above case as well. */
11540 else if (known_gt ((vec_num * j + i + 1) * nunits,
11541 (group_size * vf - gap)))
11543 if (known_ge ((vec_num * j + i + 1) * nunits
11544 - (group_size * vf - gap), nunits))
11545 /* DR will be unused. */
11546 ltype = NULL_TREE;
11547 else if (known_ge (vect_align,
11548 tree_to_poly_uint64
11549 (TYPE_SIZE_UNIT (vectype))))
11550 /* Aligned access to excess elements is OK if
11551 at least one element is accessed in the
11552 scalar loop. */
11554 else
11556 auto remain
11557 = ((group_size * vf - gap)
11558 - (vec_num * j + i) * nunits);
11559 /* remain should now be > 0 and < nunits. */
11560 unsigned num;
11561 if (constant_multiple_p (nunits, remain, &num))
11563 tree ptype;
11564 new_vtype
11565 = vector_vector_composition_type (vectype,
11566 num,
11567 &ptype);
11568 if (new_vtype)
11569 ltype = ptype;
11571 /* Else use multiple loads or a masked load? */
11574 tree offset
11575 = (dataref_offset ? dataref_offset
11576 : build_int_cst (ref_type, 0));
11577 if (!ltype)
11579 else if (ltype != vectype
11580 && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11582 poly_uint64 gap_offset
11583 = (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype))
11584 - tree_to_poly_uint64 (TYPE_SIZE_UNIT (ltype)));
11585 tree gapcst = build_int_cstu (ref_type, gap_offset);
11586 offset = size_binop (PLUS_EXPR, offset, gapcst);
11588 if (ltype)
11590 data_ref
11591 = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
11592 if (alignment_support_scheme == dr_aligned)
11594 else
11595 TREE_TYPE (data_ref)
11596 = build_aligned_type (TREE_TYPE (data_ref),
11597 align * BITS_PER_UNIT);
11599 if (!ltype)
11600 data_ref = build_constructor (vectype, NULL);
11601 else if (ltype != vectype)
11603 vect_copy_ref_info (data_ref,
11604 DR_REF (first_dr_info->dr));
11605 tree tem = make_ssa_name (ltype);
11606 new_stmt = gimple_build_assign (tem, data_ref);
11607 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
11608 gsi);
11609 data_ref = NULL;
11610 vec<constructor_elt, va_gc> *v;
11611 /* We've computed 'num' above, either statically as two
11612 or via constant_multiple_p. */
11613 unsigned num
11614 = (exact_div (tree_to_poly_uint64
11615 (TYPE_SIZE_UNIT (vectype)),
11616 tree_to_poly_uint64
11617 (TYPE_SIZE_UNIT (ltype)))
11618 .to_constant ());
11619 vec_alloc (v, num);
11620 if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11622 while (--num)
11623 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
11624 build_zero_cst (ltype));
11625 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
11627 else
11629 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
11630 while (--num)
11631 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
11632 build_zero_cst (ltype));
11634 gcc_assert (new_vtype != NULL_TREE);
11635 if (new_vtype == vectype)
11636 new_stmt = gimple_build_assign (
11637 vec_dest, build_constructor (vectype, v));
11638 else
11640 tree new_vname = make_ssa_name (new_vtype);
11641 new_stmt = gimple_build_assign (
11642 new_vname, build_constructor (new_vtype, v));
11643 vect_finish_stmt_generation (vinfo, stmt_info,
11644 new_stmt, gsi);
11645 new_stmt = gimple_build_assign (
11646 vec_dest,
11647 build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
11651 break;
11653 case dr_explicit_realign:
11655 if (costing_p)
11656 break;
11657 tree ptr, bump;
11659 tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
11661 if (compute_in_loop)
11662 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
11663 &realignment_token,
11664 dr_explicit_realign,
11665 dataref_ptr, NULL);
11667 if (TREE_CODE (dataref_ptr) == SSA_NAME)
11668 ptr = copy_ssa_name (dataref_ptr);
11669 else
11670 ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
11671 // For explicit realign the target alignment should be
11672 // known at compile time.
11673 unsigned HOST_WIDE_INT align
11674 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
11675 new_stmt = gimple_build_assign (
11676 ptr, BIT_AND_EXPR, dataref_ptr,
11677 build_int_cst (TREE_TYPE (dataref_ptr),
11678 -(HOST_WIDE_INT) align));
11679 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11680 data_ref
11681 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
11682 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11684 new_stmt = gimple_build_assign (vec_dest, data_ref);
11685 new_temp = make_ssa_name (vec_dest, new_stmt);
11686 gimple_assign_set_lhs (new_stmt, new_temp);
11687 gimple_move_vops (new_stmt, stmt_info->stmt);
11688 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11689 msq = new_temp;
11691 bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
11692 bump = size_binop (MINUS_EXPR, bump, size_one_node);
11693 ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
11694 bump);
11695 new_stmt = gimple_build_assign (
11696 NULL_TREE, BIT_AND_EXPR, ptr,
11697 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
11698 if (TREE_CODE (ptr) == SSA_NAME)
11699 ptr = copy_ssa_name (ptr, new_stmt);
11700 else
11701 ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
11702 gimple_assign_set_lhs (new_stmt, ptr);
11703 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11704 data_ref
11705 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
11706 break;
11708 case dr_explicit_realign_optimized:
11710 if (costing_p)
11711 break;
11712 if (TREE_CODE (dataref_ptr) == SSA_NAME)
11713 new_temp = copy_ssa_name (dataref_ptr);
11714 else
11715 new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
11716 // We should only be doing this if we know the target
11717 // alignment at compile time.
11718 unsigned HOST_WIDE_INT align
11719 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
11720 new_stmt = gimple_build_assign (
11721 new_temp, BIT_AND_EXPR, dataref_ptr,
11722 build_int_cst (TREE_TYPE (dataref_ptr),
11723 -(HOST_WIDE_INT) align));
11724 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11725 data_ref = build2 (MEM_REF, vectype, new_temp,
11726 build_int_cst (ref_type, 0));
11727 break;
11729 default:
11730 gcc_unreachable ();
11733 /* One common place to cost the above vect load for different
11734 alignment support schemes. */
11735 if (costing_p)
11737 /* For VMAT_CONTIGUOUS_PERMUTE with a grouped load we
11738 only need to take care of the first stmt, whose
11739 stmt_info is first_stmt_info; iterating vec_num times
11740 on it covers the cost of the remaining stmts, which is
11741 consistent with the transform. The prologue cost for
11742 realign only needs to be counted once for the whole group. */
11743 bool first_stmt_info_p = first_stmt_info == stmt_info;
11744 bool add_realign_cost = first_stmt_info_p && i == 0;
11745 if (memory_access_type == VMAT_CONTIGUOUS
11746 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
11747 || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
11748 && (!grouped_load || first_stmt_info_p)))
11750 /* Leave realign cases alone to keep them simple. */
11751 if (alignment_support_scheme == dr_explicit_realign_optimized
11752 || alignment_support_scheme == dr_explicit_realign)
11753 vect_get_load_cost (vinfo, stmt_info, 1,
11754 alignment_support_scheme, misalignment,
11755 add_realign_cost, &inside_cost,
11756 &prologue_cost, cost_vec, cost_vec,
11757 true);
11758 else
11759 n_adjacent_loads++;
11762 else
11764 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11765 /* DATA_REF is null if we've already built the statement. */
11766 if (data_ref)
11768 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11769 new_stmt = gimple_build_assign (vec_dest, data_ref);
11771 new_temp = make_ssa_name (vec_dest, new_stmt);
11772 gimple_set_lhs (new_stmt, new_temp);
11773 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11776 /* 3. Handle explicit realignment if necessary/supported.
11777 Create in loop:
11778 vec_dest = realign_load (msq, lsq, realignment_token) */
11779 if (!costing_p
11780 && (alignment_support_scheme == dr_explicit_realign_optimized
11781 || alignment_support_scheme == dr_explicit_realign))
11783 lsq = gimple_assign_lhs (new_stmt);
11784 if (!realignment_token)
11785 realignment_token = dataref_ptr;
11786 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11787 new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
11788 lsq, realignment_token);
11789 new_temp = make_ssa_name (vec_dest, new_stmt);
11790 gimple_assign_set_lhs (new_stmt, new_temp);
11791 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11793 if (alignment_support_scheme == dr_explicit_realign_optimized)
11795 gcc_assert (phi);
11796 if (i == vec_num - 1 && j == ncopies - 1)
11797 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
11798 UNKNOWN_LOCATION);
11799 msq = lsq;
11803 if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11805 if (costing_p)
11806 inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
11807 stmt_info, 0, vect_body);
11808 else
11810 tree perm_mask = perm_mask_for_reverse (vectype);
11811 new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
11812 perm_mask, stmt_info, gsi);
11813 new_stmt = SSA_NAME_DEF_STMT (new_temp);
11817 /* Collect vector loads and later create their permutation in
11818 vect_transform_grouped_load (). */
11819 if (!costing_p && (grouped_load || slp_perm))
11820 dr_chain.quick_push (new_temp);
11822 /* Store vector loads in the corresponding SLP_NODE. */
11823 if (!costing_p && slp && !slp_perm)
11824 slp_node->push_vec_def (new_stmt);
11826 /* With an SLP permutation we load the gaps as well; without
11827 one we need to skip the gaps after we manage to fully load
11828 all elements. group_gap_adj is DR_GROUP_SIZE here. */
11829 group_elt += nunits;
11830 if (!costing_p
11831 && maybe_ne (group_gap_adj, 0U)
11832 && !slp_perm
11833 && known_eq (group_elt, group_size - group_gap_adj))
11835 poly_wide_int bump_val
11836 = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
11837 if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
11838 == -1)
11839 bump_val = -bump_val;
11840 tree bump = wide_int_to_tree (sizetype, bump_val);
11841 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11842 stmt_info, bump);
11843 group_elt = 0;
11846 /* Bump the vector pointer to account for a gap or for excess
11847 elements loaded for a permuted SLP load. */
11848 if (!costing_p
11849 && maybe_ne (group_gap_adj, 0U)
11850 && slp_perm)
11852 poly_wide_int bump_val
11853 = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
11854 if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
11855 bump_val = -bump_val;
11856 tree bump = wide_int_to_tree (sizetype, bump_val);
11857 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11858 stmt_info, bump);
11861 if (slp && !slp_perm)
11862 continue;
11864 if (slp_perm)
11866 unsigned n_perms;
11867 /* For SLP we know we've seen all possible uses of dr_chain so
11868 direct vect_transform_slp_perm_load to DCE the unused parts.
11869 ??? This is a hack to prevent compile-time issues as seen
11870 in PR101120 and friends. */
11871 if (costing_p)
11873 vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
11874 true, &n_perms, nullptr);
11875 inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
11876 stmt_info, 0, vect_body);
11878 else
11880 bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
11881 gsi, vf, false, &n_perms,
11882 nullptr, true);
11883 gcc_assert (ok);
11886 else
11888 if (grouped_load)
11890 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
11891 /* We assume that the cost of a single load-lanes instruction
11892 is equivalent to the cost of DR_GROUP_SIZE separate loads.
11893 If a grouped access is instead being provided by a
11894 load-and-permute operation, include the cost of the
11895 permutes. */
11896 if (costing_p && first_stmt_info == stmt_info)
11898 /* Uses even and odd extract operations or shuffle
11899 operations for each needed permute. */
11900 int group_size = DR_GROUP_SIZE (first_stmt_info);
11901 int nstmts = ceil_log2 (group_size) * group_size;
11902 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
11903 stmt_info, 0, vect_body);
11905 if (dump_enabled_p ())
11906 dump_printf_loc (MSG_NOTE, vect_location,
11907 "vect_model_load_cost:"
11908 "strided group_size = %d .\n",
11909 group_size);
11911 else if (!costing_p)
11913 vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
11914 group_size, gsi);
11915 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11918 else if (!costing_p)
11919 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
11921 dr_chain.release ();
11923 if (!slp && !costing_p)
11924 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11926 if (costing_p)
11928 gcc_assert (memory_access_type == VMAT_CONTIGUOUS
11929 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
11930 || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
11931 if (n_adjacent_loads > 0)
11932 vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
11933 alignment_support_scheme, misalignment, false,
11934 &inside_cost, &prologue_cost, cost_vec, cost_vec,
11935 true);
11936 if (dump_enabled_p ())
11937 dump_printf_loc (MSG_NOTE, vect_location,
11938 "vect_model_load_cost: inside_cost = %u, "
11939 "prologue_cost = %u .\n",
11940 inside_cost, prologue_cost);
11943 return true;
11946 /* Function vect_is_simple_cond.
11948 Input:
11949 LOOP - the loop that is being vectorized.
11950 COND - Condition that is checked for simple use.
11952 Output:
11953 *COMP_VECTYPE - the vector type for the comparison.
11954 *DTS - The def types for the arguments of the comparison
11956 Returns whether a COND can be vectorized. Checks whether
11957 condition operands are supportable using vect_is_simple_use. */
11959 static bool
11960 vect_is_simple_cond (tree cond, vec_info *vinfo, stmt_vec_info stmt_info,
11961 slp_tree slp_node, tree *comp_vectype,
11962 enum vect_def_type *dts, tree vectype)
11964 tree lhs, rhs;
11965 tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
11966 slp_tree slp_op;
11968 /* Mask case. */
11969 if (TREE_CODE (cond) == SSA_NAME
11970 && VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (cond)))
11972 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &cond,
11973 &slp_op, &dts[0], comp_vectype)
11974 || !*comp_vectype
11975 || !VECTOR_BOOLEAN_TYPE_P (*comp_vectype))
11976 return false;
11977 return true;
11980 if (!COMPARISON_CLASS_P (cond))
11981 return false;
11983 lhs = TREE_OPERAND (cond, 0);
11984 rhs = TREE_OPERAND (cond, 1);
11986 if (TREE_CODE (lhs) == SSA_NAME)
11988 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0,
11989 &lhs, &slp_op, &dts[0], &vectype1))
11990 return false;
11992 else if (TREE_CODE (lhs) == INTEGER_CST || TREE_CODE (lhs) == REAL_CST
11993 || TREE_CODE (lhs) == FIXED_CST)
11994 dts[0] = vect_constant_def;
11995 else
11996 return false;
11998 if (TREE_CODE (rhs) == SSA_NAME)
12000 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1,
12001 &rhs, &slp_op, &dts[1], &vectype2))
12002 return false;
12004 else if (TREE_CODE (rhs) == INTEGER_CST || TREE_CODE (rhs) == REAL_CST
12005 || TREE_CODE (rhs) == FIXED_CST)
12006 dts[1] = vect_constant_def;
12007 else
12008 return false;
12010 if (vectype1 && vectype2
12011 && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype1),
12012 TYPE_VECTOR_SUBPARTS (vectype2)))
12013 return false;
12015 *comp_vectype = vectype1 ? vectype1 : vectype2;
12016 /* Invariant comparison. */
12017 if (! *comp_vectype)
12019 tree scalar_type = TREE_TYPE (lhs);
12020 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type))
12021 *comp_vectype = truth_type_for (vectype);
12022 else
12024 /* If we can widen the comparison to match vectype do so. */
12025 if (INTEGRAL_TYPE_P (scalar_type)
12026 && !slp_node
12027 && tree_int_cst_lt (TYPE_SIZE (scalar_type),
12028 TYPE_SIZE (TREE_TYPE (vectype))))
12029 scalar_type = build_nonstandard_integer_type
12030 (vector_element_bits (vectype), TYPE_UNSIGNED (scalar_type));
12031 *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
12032 slp_node);
12036 return true;
12039 /* vectorizable_condition.
12041 Check if STMT_INFO is a conditional modify expression that can be vectorized.
12042 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
12043 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
12044 at GSI.
12046 When STMT_INFO is vectorized as a nested cycle, for_reduction is true.
12048 Return true if STMT_INFO is vectorizable in this way. */
12050 static bool
12051 vectorizable_condition (vec_info *vinfo,
12052 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
12053 gimple **vec_stmt,
12054 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12056 tree scalar_dest = NULL_TREE;
12057 tree vec_dest = NULL_TREE;
12058 tree cond_expr, cond_expr0 = NULL_TREE, cond_expr1 = NULL_TREE;
12059 tree then_clause, else_clause;
12060 tree comp_vectype = NULL_TREE;
12061 tree vec_cond_lhs = NULL_TREE, vec_cond_rhs = NULL_TREE;
12062 tree vec_then_clause = NULL_TREE, vec_else_clause = NULL_TREE;
12063 tree vec_compare;
12064 tree new_temp;
12065 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12066 enum vect_def_type dts[4]
12067 = {vect_unknown_def_type, vect_unknown_def_type,
12068 vect_unknown_def_type, vect_unknown_def_type};
12069 int ndts = 4;
12070 int ncopies;
12071 int vec_num;
12072 enum tree_code code, cond_code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
12073 int i;
12074 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12075 vec<tree> vec_oprnds0 = vNULL;
12076 vec<tree> vec_oprnds1 = vNULL;
12077 vec<tree> vec_oprnds2 = vNULL;
12078 vec<tree> vec_oprnds3 = vNULL;
12079 tree vec_cmp_type;
12080 bool masked = false;
12082 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12083 return false;
12085 /* Is vectorizable conditional operation? */
12086 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
12087 if (!stmt)
12088 return false;
12090 code = gimple_assign_rhs_code (stmt);
12091 if (code != COND_EXPR)
12092 return false;
12094 stmt_vec_info reduc_info = NULL;
12095 int reduc_index = -1;
12096 vect_reduction_type reduction_type = TREE_CODE_REDUCTION;
12097 bool for_reduction
12098 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL;
12099 if (for_reduction)
12101 if (slp_node)
12102 return false;
12103 reduc_info = info_for_reduction (vinfo, stmt_info);
12104 reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
12105 reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
12106 gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION
12107 || reduc_index != -1);
12109 else
12111 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
12112 return false;
12115 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
12116 tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12118 if (slp_node)
12120 ncopies = 1;
12121 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
12123 else
12125 ncopies = vect_get_num_copies (loop_vinfo, vectype);
12126 vec_num = 1;
12129 gcc_assert (ncopies >= 1);
12130 if (for_reduction && ncopies > 1)
12131 return false; /* FORNOW */
12133 cond_expr = gimple_assign_rhs1 (stmt);
12135 if (!vect_is_simple_cond (cond_expr, vinfo, stmt_info, slp_node,
12136 &comp_vectype, &dts[0], vectype)
12137 || !comp_vectype)
12138 return false;
12140 unsigned op_adjust = COMPARISON_CLASS_P (cond_expr) ? 1 : 0;
12141 slp_tree then_slp_node, else_slp_node;
12142 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1 + op_adjust,
12143 &then_clause, &then_slp_node, &dts[2], &vectype1))
12144 return false;
12145 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 2 + op_adjust,
12146 &else_clause, &else_slp_node, &dts[3], &vectype2))
12147 return false;
12149 if (vectype1 && !useless_type_conversion_p (vectype, vectype1))
12150 return false;
12152 if (vectype2 && !useless_type_conversion_p (vectype, vectype2))
12153 return false;
12155 masked = !COMPARISON_CLASS_P (cond_expr);
12156 vec_cmp_type = truth_type_for (comp_vectype);
12158 if (vec_cmp_type == NULL_TREE)
12159 return false;
12161 cond_code = TREE_CODE (cond_expr);
12162 if (!masked)
12164 cond_expr0 = TREE_OPERAND (cond_expr, 0);
12165 cond_expr1 = TREE_OPERAND (cond_expr, 1);
12168 /* For conditional reductions, the "then" value needs to be the candidate
12169 value calculated by this iteration while the "else" value needs to be
12170 the result carried over from previous iterations. If the COND_EXPR
12171 is the other way around, we need to swap it. */
12172 bool must_invert_cmp_result = false;
12173 if (reduction_type == EXTRACT_LAST_REDUCTION && reduc_index == 1)
12175 if (masked)
12176 must_invert_cmp_result = true;
12177 else
12179 bool honor_nans = HONOR_NANS (TREE_TYPE (cond_expr0));
12180 tree_code new_code = invert_tree_comparison (cond_code, honor_nans);
12181 if (new_code == ERROR_MARK)
12182 must_invert_cmp_result = true;
12183 else
12185 cond_code = new_code;
12186 /* Make sure we don't accidentally use the old condition. */
12187 cond_expr = NULL_TREE;
12190 std::swap (then_clause, else_clause);
12193 if (!masked && VECTOR_BOOLEAN_TYPE_P (comp_vectype))
12195 /* Boolean values may have another representation in vectors
12196 and therefore we prefer bit operations over comparison for
12197 them (which also works for scalar masks). We store opcodes
12198 to use in bitop1 and bitop2. Statement is vectorized as
12199 BITOP2 (rhs1 BITOP1 rhs2) or rhs1 BITOP2 (BITOP1 rhs2)
12200 depending on bitop1 and bitop2 arity. */
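/* E.g. for mask operands A > B is computed as A & ~B and A >= B as
A | ~B; LT and LE swap the operands first, NE becomes A ^ B and EQ
the inverse of A ^ B (via BIT_NOT or by swapping the then/else). */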
12201 switch (cond_code)
12203 case GT_EXPR:
12204 bitop1 = BIT_NOT_EXPR;
12205 bitop2 = BIT_AND_EXPR;
12206 break;
12207 case GE_EXPR:
12208 bitop1 = BIT_NOT_EXPR;
12209 bitop2 = BIT_IOR_EXPR;
12210 break;
12211 case LT_EXPR:
12212 bitop1 = BIT_NOT_EXPR;
12213 bitop2 = BIT_AND_EXPR;
12214 std::swap (cond_expr0, cond_expr1);
12215 break;
12216 case LE_EXPR:
12217 bitop1 = BIT_NOT_EXPR;
12218 bitop2 = BIT_IOR_EXPR;
12219 std::swap (cond_expr0, cond_expr1);
12220 break;
12221 case NE_EXPR:
12222 bitop1 = BIT_XOR_EXPR;
12223 break;
12224 case EQ_EXPR:
12225 bitop1 = BIT_XOR_EXPR;
12226 bitop2 = BIT_NOT_EXPR;
12227 break;
12228 default:
12229 return false;
12231 cond_code = SSA_NAME;
12234 if (TREE_CODE_CLASS (cond_code) == tcc_comparison
12235 && reduction_type == EXTRACT_LAST_REDUCTION
12236 && !expand_vec_cmp_expr_p (comp_vectype, vec_cmp_type, cond_code))
12238 if (dump_enabled_p ())
12239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12240 "reduction comparison operation not supported.\n");
12241 return false;
12244 if (!vec_stmt)
12246 if (bitop1 != NOP_EXPR)
12248 machine_mode mode = TYPE_MODE (comp_vectype);
12249 optab optab;
12251 optab = optab_for_tree_code (bitop1, comp_vectype, optab_default);
12252 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12253 return false;
12255 if (bitop2 != NOP_EXPR)
12257 optab = optab_for_tree_code (bitop2, comp_vectype,
12258 optab_default);
12259 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12260 return false;
12264 vect_cost_for_stmt kind = vector_stmt;
12265 if (reduction_type == EXTRACT_LAST_REDUCTION)
12266 /* Count one reduction-like operation per vector. */
12267 kind = vec_to_scalar;
12268 else if (!expand_vec_cond_expr_p (vectype, comp_vectype, cond_code)
12269 && (masked
12270 || (!expand_vec_cmp_expr_p (comp_vectype, vec_cmp_type,
12271 cond_code)
12272 || !expand_vec_cond_expr_p (vectype, vec_cmp_type,
12273 ERROR_MARK))))
12274 return false;
12276 if (slp_node
12277 && (!vect_maybe_update_slp_op_vectype
12278 (SLP_TREE_CHILDREN (slp_node)[0], comp_vectype)
12279 || (op_adjust == 1
12280 && !vect_maybe_update_slp_op_vectype
12281 (SLP_TREE_CHILDREN (slp_node)[1], comp_vectype))
12282 || !vect_maybe_update_slp_op_vectype (then_slp_node, vectype)
12283 || !vect_maybe_update_slp_op_vectype (else_slp_node, vectype)))
12285 if (dump_enabled_p ())
12286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12287 "incompatible vector types for invariants\n");
12288 return false;
12291 if (loop_vinfo && for_reduction
12292 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
12294 if (reduction_type == EXTRACT_LAST_REDUCTION)
12296 if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
12297 vectype, OPTIMIZE_FOR_SPEED))
12298 vect_record_loop_len (loop_vinfo,
12299 &LOOP_VINFO_LENS (loop_vinfo),
12300 ncopies * vec_num, vectype, 1);
12301 else
12302 vect_record_loop_mask (loop_vinfo,
12303 &LOOP_VINFO_MASKS (loop_vinfo),
12304 ncopies * vec_num, vectype, NULL);
12306 /* Extra inactive lanes should be safe for vect_nested_cycle. */
12307 else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
12309 if (dump_enabled_p ())
12310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12311 "conditional reduction prevents the use"
12312 " of partial vectors.\n");
12313 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
12317 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
12318 vect_model_simple_cost (vinfo, stmt_info, ncopies, dts, ndts, slp_node,
12319 cost_vec, kind);
12320 return true;
12323 /* Transform. */
12325 /* Handle def. */
12326 scalar_dest = gimple_assign_lhs (stmt);
12327 if (reduction_type != EXTRACT_LAST_REDUCTION)
12328 vec_dest = vect_create_destination_var (scalar_dest, vectype);
12330 bool swap_cond_operands = false;
12332 /* See whether another part of the vectorized code applies a loop
12333 mask to the condition, or to its inverse. */
12335 vec_loop_masks *masks = NULL;
12336 vec_loop_lens *lens = NULL;
12337 if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
12339 if (reduction_type == EXTRACT_LAST_REDUCTION)
12340 lens = &LOOP_VINFO_LENS (loop_vinfo);
12342 else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
12344 if (reduction_type == EXTRACT_LAST_REDUCTION)
12345 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12346 else
12348 scalar_cond_masked_key cond (cond_expr, ncopies);
12349 if (loop_vinfo->scalar_cond_masked_set.contains (cond))
12350 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12351 else
12353 bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
12354 tree_code orig_code = cond.code;
12355 cond.code = invert_tree_comparison (cond.code, honor_nans);
12356 if (!masked && loop_vinfo->scalar_cond_masked_set.contains (cond))
12358 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12359 cond_code = cond.code;
12360 swap_cond_operands = true;
12362 else
12364 /* Try the inverse of the current mask. We check if the
12365 inverse mask is live and if so we generate a negate of
12366 the current mask such that we still honor NaNs. */
12367 cond.inverted_p = true;
12368 cond.code = orig_code;
12369 if (loop_vinfo->scalar_cond_masked_set.contains (cond))
12371 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12372 cond_code = cond.code;
12373 swap_cond_operands = true;
12374 must_invert_cmp_result = true;
12381 /* Handle cond expr. */
12382 if (masked)
12383 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
12384 cond_expr, comp_vectype, &vec_oprnds0,
12385 then_clause, vectype, &vec_oprnds2,
12386 reduction_type != EXTRACT_LAST_REDUCTION
12387 ? else_clause : NULL, vectype, &vec_oprnds3);
12388 else
12389 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
12390 cond_expr0, comp_vectype, &vec_oprnds0,
12391 cond_expr1, comp_vectype, &vec_oprnds1,
12392 then_clause, vectype, &vec_oprnds2,
12393 reduction_type != EXTRACT_LAST_REDUCTION
12394 ? else_clause : NULL, vectype, &vec_oprnds3);
12396 /* Arguments are ready. Create the new vector stmt. */
12397 FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_cond_lhs)
12399 vec_then_clause = vec_oprnds2[i];
12400 if (reduction_type != EXTRACT_LAST_REDUCTION)
12401 vec_else_clause = vec_oprnds3[i];
12403 if (swap_cond_operands)
12404 std::swap (vec_then_clause, vec_else_clause);
12406 if (masked)
12407 vec_compare = vec_cond_lhs;
12408 else
12410 vec_cond_rhs = vec_oprnds1[i];
12411 if (bitop1 == NOP_EXPR)
12413 gimple_seq stmts = NULL;
12414 vec_compare = gimple_build (&stmts, cond_code, vec_cmp_type,
12415 vec_cond_lhs, vec_cond_rhs);
12416 gsi_insert_before (gsi, stmts, GSI_SAME_STMT);
12418 else
12420 new_temp = make_ssa_name (vec_cmp_type);
12421 gassign *new_stmt;
12422 if (bitop1 == BIT_NOT_EXPR)
12423 new_stmt = gimple_build_assign (new_temp, bitop1,
12424 vec_cond_rhs);
12425 else
12426 new_stmt
12427 = gimple_build_assign (new_temp, bitop1, vec_cond_lhs,
12428 vec_cond_rhs);
12429 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12430 if (bitop2 == NOP_EXPR)
12431 vec_compare = new_temp;
12432 else if (bitop2 == BIT_NOT_EXPR
12433 && reduction_type != EXTRACT_LAST_REDUCTION)
12435 /* Instead of doing ~x ? y : z do x ? z : y. */
12436 vec_compare = new_temp;
12437 std::swap (vec_then_clause, vec_else_clause);
12439 else
12441 vec_compare = make_ssa_name (vec_cmp_type);
12442 if (bitop2 == BIT_NOT_EXPR)
12443 new_stmt
12444 = gimple_build_assign (vec_compare, bitop2, new_temp);
12445 else
12446 new_stmt
12447 = gimple_build_assign (vec_compare, bitop2,
12448 vec_cond_lhs, new_temp);
12449 vect_finish_stmt_generation (vinfo, stmt_info,
12450 new_stmt, gsi);
12455 /* If we decided to apply a loop mask to the result of the vector
12456 comparison, AND the comparison with the mask now. Later passes
12457 should then be able to reuse the AND results between multiple
12458 vector statements.
12460 For example:
12461 for (int i = 0; i < 100; ++i)
12462 x[i] = y[i] ? z[i] : 10;
12464 results in following optimized GIMPLE:
12466 mask__35.8_43 = vect__4.7_41 != { 0, ... };
12467 vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
12468 _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
12469 vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
12470 vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
12471 vect_iftmp.11_47, { 10, ... }>;
12473 instead of using masked and unmasked forms of
12474 vec != { 0, ... } (masked in the MASK_LOAD,
12475 unmasked in the VEC_COND_EXPR). */
12477 /* Force vec_compare to be an SSA_NAME rather than a comparison,
12478 in cases where that's necessary. */
12480 tree len = NULL_TREE, bias = NULL_TREE;
12481 if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
12483 if (!is_gimple_val (vec_compare))
12485 tree vec_compare_name = make_ssa_name (vec_cmp_type);
12486 gassign *new_stmt = gimple_build_assign (vec_compare_name,
12487 vec_compare);
12488 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12489 vec_compare = vec_compare_name;
12492 if (must_invert_cmp_result)
12494 tree vec_compare_name = make_ssa_name (vec_cmp_type);
12495 gassign *new_stmt = gimple_build_assign (vec_compare_name,
12496 BIT_NOT_EXPR,
12497 vec_compare);
12498 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12499 vec_compare = vec_compare_name;
12502 if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
12503 vectype, OPTIMIZE_FOR_SPEED))
12505 if (lens)
12507 len = vect_get_loop_len (loop_vinfo, gsi, lens,
12508 vec_num * ncopies, vectype, i, 1);
12509 signed char biasval
12510 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
12511 bias = build_int_cst (intQI_type_node, biasval);
12513 else
12515 len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
12516 bias = build_int_cst (intQI_type_node, 0);
12519 if (masks)
12521 tree loop_mask
12522 = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num * ncopies,
12523 vectype, i);
12524 tree tmp2 = make_ssa_name (vec_cmp_type);
12525 gassign *g
12526 = gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare,
12527 loop_mask);
12528 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
12529 vec_compare = tmp2;
12533 gimple *new_stmt;
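/* For an extract-last reduction the scalar result is produced directly by
   .FOLD_EXTRACT_LAST (ELSE, MASK, VALUES), which yields the VALUES element
   of the last active lane of MASK, or ELSE when no lane is active; the
   LEN_ variant additionally takes a length and bias that limit the lanes
   considered.  */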
12534 if (reduction_type == EXTRACT_LAST_REDUCTION)
12536 gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
12537 tree lhs = gimple_get_lhs (old_stmt);
12538 if (len)
12539 new_stmt = gimple_build_call_internal
12540 (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
12541 vec_then_clause, len, bias);
12542 else
12543 new_stmt = gimple_build_call_internal
12544 (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
12545 vec_then_clause);
12546 gimple_call_set_lhs (new_stmt, lhs);
12547 SSA_NAME_DEF_STMT (lhs) = new_stmt;
12548 if (old_stmt == gsi_stmt (*gsi))
12549 vect_finish_replace_stmt (vinfo, stmt_info, new_stmt);
12550 else
12552 /* In this case we're moving the definition to later in the
12553 block. That doesn't matter because the only uses of the
12554 lhs are in phi statements. */
12555 gimple_stmt_iterator old_gsi = gsi_for_stmt (old_stmt);
12556 gsi_remove (&old_gsi, true);
12557 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12560 else
12562 new_temp = make_ssa_name (vec_dest);
12563 new_stmt = gimple_build_assign (new_temp, VEC_COND_EXPR, vec_compare,
12564 vec_then_clause, vec_else_clause);
12565 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12567 if (slp_node)
12568 slp_node->push_vec_def (new_stmt);
12569 else
12570 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
12573 if (!slp_node)
12574 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
12576 vec_oprnds0.release ();
12577 vec_oprnds1.release ();
12578 vec_oprnds2.release ();
12579 vec_oprnds3.release ();
12581 return true;
12584 /* Helper of vectorizable_comparison.
12586 Check if STMT_INFO is comparison expression CODE that can be vectorized.
12587 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
12588 comparison, put it in VEC_STMT, and insert it at GSI.
12590 Return true if STMT_INFO is vectorizable in this way. */
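/* For example, a scalar comparison such as
       _1 = a_2 < b_3;
   is vectorized as
       mask_4 = vect_a_5 < vect_b_6;
   where mask_4 has a vector boolean (mask) type, so the result can be used
   directly as a VEC_COND_EXPR predicate or as a loop mask.  The SSA names
   here are purely illustrative.  */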
12592 static bool
12593 vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
12594 stmt_vec_info stmt_info, tree_code code,
12595 gimple_stmt_iterator *gsi, gimple **vec_stmt,
12596 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12598 tree lhs, rhs1, rhs2;
12599 tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12600 tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
12601 tree new_temp;
12602 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12603 enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
12604 int ndts = 2;
12605 poly_uint64 nunits;
12606 int ncopies;
12607 enum tree_code bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
12608 int i;
12609 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12610 vec<tree> vec_oprnds0 = vNULL;
12611 vec<tree> vec_oprnds1 = vNULL;
12612 tree mask_type;
12613 tree mask = NULL_TREE;
12615 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12616 return false;
12618 if (!vectype || !VECTOR_BOOLEAN_TYPE_P (vectype))
12619 return false;
12621 mask_type = vectype;
12622 nunits = TYPE_VECTOR_SUBPARTS (vectype);
12624 if (slp_node)
12625 ncopies = 1;
12626 else
12627 ncopies = vect_get_num_copies (loop_vinfo, vectype);
12629 gcc_assert (ncopies >= 1);
12631 if (TREE_CODE_CLASS (code) != tcc_comparison)
12632 return false;
12634 slp_tree slp_rhs1, slp_rhs2;
12635 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
12636 0, &rhs1, &slp_rhs1, &dts[0], &vectype1))
12637 return false;
12639 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
12640 1, &rhs2, &slp_rhs2, &dts[1], &vectype2))
12641 return false;
12643 if (vectype1 && vectype2
12644 && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype1),
12645 TYPE_VECTOR_SUBPARTS (vectype2)))
12646 return false;
12648 vectype = vectype1 ? vectype1 : vectype2;
12650 /* Invariant comparison. */
12651 if (!vectype)
12653 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (rhs1)))
12654 vectype = mask_type;
12655 else
12656 vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1),
12657 slp_node);
12658 if (!vectype || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits))
12659 return false;
12661 else if (maybe_ne (nunits, TYPE_VECTOR_SUBPARTS (vectype)))
12662 return false;
12664 /* Can't compare mask and non-mask types. */
12665 if (vectype1 && vectype2
12666 && (VECTOR_BOOLEAN_TYPE_P (vectype1) ^ VECTOR_BOOLEAN_TYPE_P (vectype2)))
12667 return false;
12669 /* Boolean values may have another representation in vectors
12670 and therefore we prefer bit operations over comparison for
12671 them (which also works for scalar masks). We store opcodes
12672 to use in bitop1 and bitop2. The statement is vectorized as
12673 BITOP2 (rhs1 BITOP1 rhs2) or
12674 rhs1 BITOP2 (BITOP1 rhs2)
12675 depending on bitop1 and bitop2 arity. */
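/* For example, for vector booleans a > b is computed as a & ~b
   (bitop1 = BIT_NOT_EXPR applied to rhs2, bitop2 = BIT_AND_EXPR with rhs1),
   a < b does the same with the operands swapped, a == b becomes ~(a ^ b)
   and a != b is simply a ^ b.  */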
12676 bool swap_p = false;
12677 if (VECTOR_BOOLEAN_TYPE_P (vectype))
12679 if (code == GT_EXPR)
12681 bitop1 = BIT_NOT_EXPR;
12682 bitop2 = BIT_AND_EXPR;
12684 else if (code == GE_EXPR)
12686 bitop1 = BIT_NOT_EXPR;
12687 bitop2 = BIT_IOR_EXPR;
12689 else if (code == LT_EXPR)
12691 bitop1 = BIT_NOT_EXPR;
12692 bitop2 = BIT_AND_EXPR;
12693 swap_p = true;
12695 else if (code == LE_EXPR)
12697 bitop1 = BIT_NOT_EXPR;
12698 bitop2 = BIT_IOR_EXPR;
12699 swap_p = true;
12701 else
12703 bitop1 = BIT_XOR_EXPR;
12704 if (code == EQ_EXPR)
12705 bitop2 = BIT_NOT_EXPR;
12709 if (!vec_stmt)
12711 if (bitop1 == NOP_EXPR)
12713 if (!expand_vec_cmp_expr_p (vectype, mask_type, code))
12714 return false;
12716 else
12718 machine_mode mode = TYPE_MODE (vectype);
12719 optab optab;
12721 optab = optab_for_tree_code (bitop1, vectype, optab_default);
12722 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12723 return false;
12725 if (bitop2 != NOP_EXPR)
12727 optab = optab_for_tree_code (bitop2, vectype, optab_default);
12728 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12729 return false;
12733 /* Put types on constant and invariant SLP children. */
12734 if (slp_node
12735 && (!vect_maybe_update_slp_op_vectype (slp_rhs1, vectype)
12736 || !vect_maybe_update_slp_op_vectype (slp_rhs2, vectype)))
12738 if (dump_enabled_p ())
12739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12740 "incompatible vector types for invariants\n");
12741 return false;
12744 vect_model_simple_cost (vinfo, stmt_info,
12745 ncopies * (1 + (bitop2 != NOP_EXPR)),
12746 dts, ndts, slp_node, cost_vec);
12747 return true;
12750 /* Transform. */
12752 /* Handle def. */
12753 lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info));
12754 if (lhs)
12755 mask = vect_create_destination_var (lhs, mask_type);
12757 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
12758 rhs1, vectype, &vec_oprnds0,
12759 rhs2, vectype, &vec_oprnds1);
12760 if (swap_p)
12761 std::swap (vec_oprnds0, vec_oprnds1);
12763 /* Arguments are ready. Create the new vector stmt. */
12764 FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_rhs1)
12766 gimple *new_stmt;
12767 vec_rhs2 = vec_oprnds1[i];
12769 if (lhs)
12770 new_temp = make_ssa_name (mask);
12771 else
12772 new_temp = make_temp_ssa_name (mask_type, NULL, "cmp");
12773 if (bitop1 == NOP_EXPR)
12775 new_stmt = gimple_build_assign (new_temp, code,
12776 vec_rhs1, vec_rhs2);
12777 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12779 else
12781 if (bitop1 == BIT_NOT_EXPR)
12782 new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
12783 else
12784 new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
12785 vec_rhs2);
12786 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12787 if (bitop2 != NOP_EXPR)
12789 tree res = make_ssa_name (mask);
12790 if (bitop2 == BIT_NOT_EXPR)
12791 new_stmt = gimple_build_assign (res, bitop2, new_temp);
12792 else
12793 new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
12794 new_temp);
12795 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12798 if (slp_node)
12799 slp_node->push_vec_def (new_stmt);
12800 else
12801 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
12804 if (!slp_node)
12805 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
12807 vec_oprnds0.release ();
12808 vec_oprnds1.release ();
12810 return true;
12813 /* vectorizable_comparison.
12815 Check if STMT_INFO is comparison expression that can be vectorized.
12816 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
12817 comparison, put it in VEC_STMT, and insert it at GSI.
12819 Return true if STMT_INFO is vectorizable in this way. */
12821 static bool
12822 vectorizable_comparison (vec_info *vinfo,
12823 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
12824 gimple **vec_stmt,
12825 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12827 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12829 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12830 return false;
12832 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
12833 return false;
12835 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
12836 if (!stmt)
12837 return false;
12839 enum tree_code code = gimple_assign_rhs_code (stmt);
12840 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
12841 if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12842 vec_stmt, slp_node, cost_vec))
12843 return false;
12845 if (!vec_stmt)
12846 STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
12848 return true;
12851 /* Check to see if the current early break given in STMT_INFO is valid for
12852 vectorization. */
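/* For example, in a loop like
       for (i = 0; i < n; i++)
         if (a[i] == 42)
           break;
   the gcond comparing a[i] with 42 is marked vect_condition_def and is
   handled here by vectorizing the comparison and reducing the resulting
   mask to a single scalar test.  The loop is illustrative only.  */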
12854 static bool
12855 vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
12856 gimple_stmt_iterator *gsi, gimple **vec_stmt,
12857 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12859 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12860 if (!loop_vinfo
12861 || !is_a <gcond *> (STMT_VINFO_STMT (stmt_info)))
12862 return false;
12864 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
12865 return false;
12867 if (!STMT_VINFO_RELEVANT_P (stmt_info))
12868 return false;
12870 DUMP_VECT_SCOPE ("vectorizable_early_exit");
12872 auto code = gimple_cond_code (STMT_VINFO_STMT (stmt_info));
12874 tree vectype = NULL_TREE;
12875 slp_tree slp_op0;
12876 tree op0;
12877 enum vect_def_type dt0;
12878 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &op0, &slp_op0, &dt0,
12879 &vectype))
12881 if (dump_enabled_p ())
12882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12883 "use not simple.\n");
12884 return false;
12887 if (!vectype)
12888 return false;
12890 machine_mode mode = TYPE_MODE (vectype);
12891 int ncopies;
12893 if (slp_node)
12894 ncopies = 1;
12895 else
12896 ncopies = vect_get_num_copies (loop_vinfo, vectype);
12898 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
12899 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
12901 /* Now build the new conditional. Pattern gimple_conds get dropped during
12902 codegen so we must replace the original insn. */
12903 gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
12904 gcond *cond_stmt = as_a <gcond *>(orig_stmt);
12905 /* When vectorizing we assume that if the branch edge is taken we're
12906 exiting the loop.  This is not always the case, however, as the compiler
12907 will rewrite conditions to always be a comparison against 0.  To do this
12908 it sometimes flips the edges.  This is fine for scalar code, but for
12909 vector code we then have to flip the test, as we're still assuming that
12910 taking the branch edge means we found the exit condition, i.e. we need
12911 to know whether we are generating a `forall` or an `exists` condition. */
12912 auto new_code = NE_EXPR;
12913 auto reduc_optab = ior_optab;
12914 auto reduc_op = BIT_IOR_EXPR;
12915 tree cst = build_zero_cst (vectype);
12916 edge exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 0);
12917 if (exit_true_edge->flags & EDGE_FALSE_VALUE)
12918 exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 1);
12919 gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
12920 if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
12921 exit_true_edge->dest))
12923 new_code = EQ_EXPR;
12924 reduc_optab = and_optab;
12925 reduc_op = BIT_AND_EXPR;
12926 cst = build_minus_one_cst (vectype);
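/* So in the default (`exists') case the lane masks are combined with
   BIT_IOR_EXPR and the new gcond exits the loop when the result is
   nonzero, i.e. when any lane matched; when the true edge stays inside
   the loop we combine with BIT_AND_EXPR instead and take the true
   (continue) edge only when the result is all-ones, i.e. when every lane
   matched (the `forall' case).  */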
12929 /* Analyze only. */
12930 if (!vec_stmt)
12932 if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
12934 if (dump_enabled_p ())
12935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12936 "can't vectorize early exit because the "
12937 "target doesn't support flag setting vector "
12938 "comparisons.\n");
12939 return false;
12942 if (ncopies > 1
12943 && direct_optab_handler (reduc_optab, mode) == CODE_FOR_nothing)
12945 if (dump_enabled_p ())
12946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12947 "can't vectorize early exit because the "
12948 "target does not support boolean vector %s "
12949 "for type %T.\n",
12950 reduc_optab == ior_optab ? "OR" : "AND",
12951 vectype);
12952 return false;
12955 if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12956 vec_stmt, slp_node, cost_vec))
12957 return false;
12959 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
12961 if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
12962 OPTIMIZE_FOR_SPEED))
12963 return false;
12964 else
12965 vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
12969 return true;
12972 /* Transform. */
12974 tree new_temp = NULL_TREE;
12975 gimple *new_stmt = NULL;
12977 if (dump_enabled_p ())
12978 dump_printf_loc (MSG_NOTE, vect_location, "transform early-exit.\n");
12980 if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12981 vec_stmt, slp_node, cost_vec))
12982 gcc_unreachable ();
12984 gimple *stmt = STMT_VINFO_STMT (stmt_info);
12985 basic_block cond_bb = gimple_bb (stmt);
12986 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
12988 auto_vec<tree> stmts;
12990 if (slp_node)
12991 stmts.safe_splice (SLP_TREE_VEC_DEFS (slp_node));
12992 else
12994 auto vec_stmts = STMT_VINFO_VEC_STMTS (stmt_info);
12995 stmts.reserve_exact (vec_stmts.length ());
12996 for (auto stmt : vec_stmts)
12997 stmts.quick_push (gimple_assign_lhs (stmt));
13000 /* Determine if we need to reduce the final value. */
13001 if (stmts.length () > 1)
13003 /* We build the reductions in a way that maintains as much parallelism as
13004 possible. */
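/* E.g. four vector masks v0..v3 are combined as (v3 | v2) | (v1 | v0)
   (with BIT_AND_EXPR instead for the `forall' case) rather than as a
   linear chain: the loop below pops operand pairs from the back of the
   worklist and pushes each partial result onto the front.  */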
13005 auto_vec<tree> workset (stmts.length ());
13007 /* Mask the statements as we queue them up. Normally we loop over
13008 vec_num, but since we inspect the exact results of vectorization
13009 we don't need to and instead can just use the stmts themselves. */
13010 if (masked_loop_p)
13011 for (unsigned i = 0; i < stmts.length (); i++)
13013 tree stmt_mask
13014 = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype,
13016 stmt_mask
13017 = prepare_vec_mask (loop_vinfo, TREE_TYPE (stmt_mask), stmt_mask,
13018 stmts[i], &cond_gsi);
13019 workset.quick_push (stmt_mask);
13021 else
13022 workset.splice (stmts);
13024 while (workset.length () > 1)
13026 new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
13027 tree arg0 = workset.pop ();
13028 tree arg1 = workset.pop ();
13029 new_stmt = gimple_build_assign (new_temp, reduc_op, arg0, arg1);
13030 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
13031 &cond_gsi);
13032 workset.quick_insert (0, new_temp);
13035 else
13037 new_temp = stmts[0];
13038 if (masked_loop_p)
13040 tree mask
13041 = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
13042 new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
13043 new_temp, &cond_gsi);
13047 gcc_assert (new_temp);
13049 gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
13050 update_stmt (orig_stmt);
13052 if (slp_node)
13053 SLP_TREE_VEC_DEFS (slp_node).truncate (0);
13054 else
13055 STMT_VINFO_VEC_STMTS (stmt_info).truncate (0);
13057 if (!slp_node)
13058 *vec_stmt = orig_stmt;
13060 return true;
13063 /* If SLP_NODE is nonnull, return true if vectorizable_live_operation
13064 can handle all live statements in the node. Otherwise return true
13065 if STMT_INFO is not live or if vectorizable_live_operation can handle it.
13066 VEC_STMT_P is as for vectorizable_live_operation. */
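/* A statement is "live" when its scalar result is used outside the loop
   (or, with early breaks, when it is an induction whose value is needed
   on an early-exit edge); vectorizable_live_operation checks that the
   required lane can be extracted from the vector result.  */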
13068 static bool
13069 can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info stmt_info,
13070 slp_tree slp_node, slp_instance slp_node_instance,
13071 bool vec_stmt_p,
13072 stmt_vector_for_cost *cost_vec)
13074 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
13075 if (slp_node)
13077 stmt_vec_info slp_stmt_info;
13078 unsigned int i;
13079 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
13081 if ((STMT_VINFO_LIVE_P (slp_stmt_info)
13082 || (loop_vinfo
13083 && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
13084 && STMT_VINFO_DEF_TYPE (slp_stmt_info)
13085 == vect_induction_def))
13086 && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
13087 slp_node_instance, i,
13088 vec_stmt_p, cost_vec))
13089 return false;
13092 else if ((STMT_VINFO_LIVE_P (stmt_info)
13093 || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
13094 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
13095 && !vectorizable_live_operation (vinfo, stmt_info,
13096 slp_node, slp_node_instance, -1,
13097 vec_stmt_p, cost_vec))
13098 return false;
13100 return true;
13103 /* Make sure the statement is vectorizable. */
13105 opt_result
13106 vect_analyze_stmt (vec_info *vinfo,
13107 stmt_vec_info stmt_info, bool *need_to_vectorize,
13108 slp_tree node, slp_instance node_instance,
13109 stmt_vector_for_cost *cost_vec)
13111 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
13112 enum vect_relevant relevance = STMT_VINFO_RELEVANT (stmt_info);
13113 bool ok;
13114 gimple_seq pattern_def_seq;
13116 if (dump_enabled_p ())
13117 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
13118 stmt_info->stmt);
13120 if (gimple_has_volatile_ops (stmt_info->stmt))
13121 return opt_result::failure_at (stmt_info->stmt,
13122 "not vectorized:"
13123 " stmt has volatile operands: %G\n",
13124 stmt_info->stmt);
13126 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
13127 && node == NULL
13128 && (pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)))
13130 gimple_stmt_iterator si;
13132 for (si = gsi_start (pattern_def_seq); !gsi_end_p (si); gsi_next (&si))
13134 stmt_vec_info pattern_def_stmt_info
13135 = vinfo->lookup_stmt (gsi_stmt (si));
13136 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
13137 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
13139 /* Analyze def stmt of STMT if it's a pattern stmt. */
13140 if (dump_enabled_p ())
13141 dump_printf_loc (MSG_NOTE, vect_location,
13142 "==> examining pattern def statement: %G",
13143 pattern_def_stmt_info->stmt);
13145 opt_result res
13146 = vect_analyze_stmt (vinfo, pattern_def_stmt_info,
13147 need_to_vectorize, node, node_instance,
13148 cost_vec);
13149 if (!res)
13150 return res;
13155 /* Skip stmts that do not need to be vectorized. In loops this is expected
13156 to include:
13157 - the COND_EXPR which is the loop exit condition
13158 - any LABEL_EXPRs in the loop
13159 - computations that are used only for array indexing or loop control.
13160 In basic blocks we only analyze statements that are a part of some SLP
13161 instance, therefore, all the statements are relevant.
13163 A pattern statement needs to be analyzed instead of the original
13164 statement if the original statement is not relevant. Otherwise, we
13165 analyze both statements. In basic blocks we are called from some SLP
13166 instance traversal; there we don't analyze pattern stmts instead, since
13167 the pattern stmts are already part of an SLP instance. */
13169 stmt_vec_info pattern_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
13170 if (!STMT_VINFO_RELEVANT_P (stmt_info)
13171 && !STMT_VINFO_LIVE_P (stmt_info))
13173 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
13174 && pattern_stmt_info
13175 && (STMT_VINFO_RELEVANT_P (pattern_stmt_info)
13176 || STMT_VINFO_LIVE_P (pattern_stmt_info)))
13178 /* Analyze PATTERN_STMT instead of the original stmt. */
13179 stmt_info = pattern_stmt_info;
13180 if (dump_enabled_p ())
13181 dump_printf_loc (MSG_NOTE, vect_location,
13182 "==> examining pattern statement: %G",
13183 stmt_info->stmt);
13185 else
13187 if (dump_enabled_p ())
13188 dump_printf_loc (MSG_NOTE, vect_location, "irrelevant.\n");
13190 return opt_result::success ();
13193 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
13194 && node == NULL
13195 && pattern_stmt_info
13196 && (STMT_VINFO_RELEVANT_P (pattern_stmt_info)
13197 || STMT_VINFO_LIVE_P (pattern_stmt_info)))
13199 /* Analyze PATTERN_STMT too. */
13200 if (dump_enabled_p ())
13201 dump_printf_loc (MSG_NOTE, vect_location,
13202 "==> examining pattern statement: %G",
13203 pattern_stmt_info->stmt);
13205 opt_result res
13206 = vect_analyze_stmt (vinfo, pattern_stmt_info, need_to_vectorize, node,
13207 node_instance, cost_vec);
13208 if (!res)
13209 return res;
13212 switch (STMT_VINFO_DEF_TYPE (stmt_info))
13214 case vect_internal_def:
13215 case vect_condition_def:
13216 break;
13218 case vect_reduction_def:
13219 case vect_nested_cycle:
13220 gcc_assert (!bb_vinfo
13221 && (relevance == vect_used_in_outer
13222 || relevance == vect_used_in_outer_by_reduction
13223 || relevance == vect_used_by_reduction
13224 || relevance == vect_unused_in_scope
13225 || relevance == vect_used_only_live));
13226 break;
13228 case vect_induction_def:
13229 case vect_first_order_recurrence:
13230 gcc_assert (!bb_vinfo);
13231 break;
13233 case vect_constant_def:
13234 case vect_external_def:
13235 case vect_unknown_def_type:
13236 default:
13237 gcc_unreachable ();
13240 tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info);
13241 if (node)
13242 STMT_VINFO_VECTYPE (stmt_info) = SLP_TREE_VECTYPE (node);
13244 if (STMT_VINFO_RELEVANT_P (stmt_info))
13246 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
13247 gcc_assert (STMT_VINFO_VECTYPE (stmt_info)
13248 || gimple_code (stmt_info->stmt) == GIMPLE_COND
13249 || (call && gimple_call_lhs (call) == NULL_TREE));
13250 *need_to_vectorize = true;
13253 if (PURE_SLP_STMT (stmt_info) && !node)
13255 if (dump_enabled_p ())
13256 dump_printf_loc (MSG_NOTE, vect_location,
13257 "handled only by SLP analysis\n");
13258 return opt_result::success ();
13261 ok = true;
13262 if (!bb_vinfo
13263 && (STMT_VINFO_RELEVANT_P (stmt_info)
13264 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
13265 /* Prefer vectorizable_call over vectorizable_simd_clone_call so
13266 -mveclibabi= takes precedence over library functions with
13267 the simd attribute. */
13268 ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13269 || vectorizable_simd_clone_call (vinfo, stmt_info, NULL, NULL, node,
13270 cost_vec)
13271 || vectorizable_conversion (vinfo, stmt_info,
13272 NULL, NULL, node, cost_vec)
13273 || vectorizable_operation (vinfo, stmt_info,
13274 NULL, NULL, node, cost_vec)
13275 || vectorizable_assignment (vinfo, stmt_info,
13276 NULL, NULL, node, cost_vec)
13277 || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13278 || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13279 || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
13280 node, node_instance, cost_vec)
13281 || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
13282 NULL, node, cost_vec)
13283 || vectorizable_shift (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13284 || vectorizable_condition (vinfo, stmt_info,
13285 NULL, NULL, node, cost_vec)
13286 || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
13287 cost_vec)
13288 || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
13289 stmt_info, NULL, node)
13290 || vectorizable_recurr (as_a <loop_vec_info> (vinfo),
13291 stmt_info, NULL, node, cost_vec)
13292 || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
13293 cost_vec));
13294 else
13296 if (bb_vinfo)
13297 ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13298 || vectorizable_simd_clone_call (vinfo, stmt_info,
13299 NULL, NULL, node, cost_vec)
13300 || vectorizable_conversion (vinfo, stmt_info, NULL, NULL, node,
13301 cost_vec)
13302 || vectorizable_shift (vinfo, stmt_info,
13303 NULL, NULL, node, cost_vec)
13304 || vectorizable_operation (vinfo, stmt_info,
13305 NULL, NULL, node, cost_vec)
13306 || vectorizable_assignment (vinfo, stmt_info, NULL, NULL, node,
13307 cost_vec)
13308 || vectorizable_load (vinfo, stmt_info,
13309 NULL, NULL, node, cost_vec)
13310 || vectorizable_store (vinfo, stmt_info,
13311 NULL, NULL, node, cost_vec)
13312 || vectorizable_condition (vinfo, stmt_info,
13313 NULL, NULL, node, cost_vec)
13314 || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
13315 cost_vec)
13316 || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec)
13317 || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
13318 cost_vec));
13322 if (node)
13323 STMT_VINFO_VECTYPE (stmt_info) = saved_vectype;
13325 if (!ok)
13326 return opt_result::failure_at (stmt_info->stmt,
13327 "not vectorized:"
13328 " relevant stmt not supported: %G",
13329 stmt_info->stmt);
13331 /* Stmts that are (also) "live" (i.e. - that are used out of the loop)
13332 need extra handling, except for vectorizable reductions. */
13333 if (!bb_vinfo
13334 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
13335 && STMT_VINFO_TYPE (stmt_info) != lc_phi_info_type
13336 && !can_vectorize_live_stmts (as_a <loop_vec_info> (vinfo),
13337 stmt_info, node, node_instance,
13338 false, cost_vec))
13339 return opt_result::failure_at (stmt_info->stmt,
13340 "not vectorized:"
13341 " live stmt not supported: %G",
13342 stmt_info->stmt);
13344 return opt_result::success ();
13348 /* Function vect_transform_stmt.
13350 Create a vectorized stmt to replace STMT_INFO, and insert it at GSI. */
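/* Returns true if STMT_INFO was a store.  Note that for a grouped
   (interleaved) non-SLP store the whole chain is emitted only once the
   last store of the group is reached; earlier stores of the group are
   skipped here.  */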
13352 bool
13353 vect_transform_stmt (vec_info *vinfo,
13354 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
13355 slp_tree slp_node, slp_instance slp_node_instance)
13357 bool is_store = false;
13358 gimple *vec_stmt = NULL;
13359 bool done;
13361 gcc_assert (slp_node || !PURE_SLP_STMT (stmt_info));
13363 tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info);
13364 if (slp_node)
13365 STMT_VINFO_VECTYPE (stmt_info) = SLP_TREE_VECTYPE (slp_node);
13367 switch (STMT_VINFO_TYPE (stmt_info))
13369 case type_demotion_vec_info_type:
13370 case type_promotion_vec_info_type:
13371 case type_conversion_vec_info_type:
13372 done = vectorizable_conversion (vinfo, stmt_info,
13373 gsi, &vec_stmt, slp_node, NULL);
13374 gcc_assert (done);
13375 break;
13377 case induc_vec_info_type:
13378 done = vectorizable_induction (as_a <loop_vec_info> (vinfo),
13379 stmt_info, &vec_stmt, slp_node,
13380 NULL);
13381 gcc_assert (done);
13382 break;
13384 case shift_vec_info_type:
13385 done = vectorizable_shift (vinfo, stmt_info,
13386 gsi, &vec_stmt, slp_node, NULL);
13387 gcc_assert (done);
13388 break;
13390 case op_vec_info_type:
13391 done = vectorizable_operation (vinfo, stmt_info, gsi, &vec_stmt, slp_node,
13392 NULL);
13393 gcc_assert (done);
13394 break;
13396 case assignment_vec_info_type:
13397 done = vectorizable_assignment (vinfo, stmt_info,
13398 gsi, &vec_stmt, slp_node, NULL);
13399 gcc_assert (done);
13400 break;
13402 case load_vec_info_type:
13403 done = vectorizable_load (vinfo, stmt_info, gsi, &vec_stmt, slp_node,
13404 NULL);
13405 gcc_assert (done);
13406 break;
13408 case store_vec_info_type:
13409 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
13410 && !slp_node
13411 && (++DR_GROUP_STORE_COUNT (DR_GROUP_FIRST_ELEMENT (stmt_info))
13412 < DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info))))
13413 /* In case of interleaving, the whole chain is vectorized when the
13414 last store in the chain is reached. Store stmts before the last
13415 one are skipped, and their vec_stmt_info shouldn't be freed
13416 meanwhile. */
13418 else
13420 done = vectorizable_store (vinfo, stmt_info,
13421 gsi, &vec_stmt, slp_node, NULL);
13422 gcc_assert (done);
13423 is_store = true;
13425 break;
13427 case condition_vec_info_type:
13428 done = vectorizable_condition (vinfo, stmt_info,
13429 gsi, &vec_stmt, slp_node, NULL);
13430 gcc_assert (done);
13431 break;
13433 case comparison_vec_info_type:
13434 done = vectorizable_comparison (vinfo, stmt_info, gsi, &vec_stmt,
13435 slp_node, NULL);
13436 gcc_assert (done);
13437 break;
13439 case call_vec_info_type:
13440 done = vectorizable_call (vinfo, stmt_info,
13441 gsi, &vec_stmt, slp_node, NULL);
13442 break;
13444 case call_simd_clone_vec_info_type:
13445 done = vectorizable_simd_clone_call (vinfo, stmt_info, gsi, &vec_stmt,
13446 slp_node, NULL);
13447 break;
13449 case reduc_vec_info_type:
13450 done = vect_transform_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
13451 gsi, &vec_stmt, slp_node);
13452 gcc_assert (done);
13453 break;
13455 case cycle_phi_info_type:
13456 done = vect_transform_cycle_phi (as_a <loop_vec_info> (vinfo), stmt_info,
13457 &vec_stmt, slp_node, slp_node_instance);
13458 gcc_assert (done);
13459 break;
13461 case lc_phi_info_type:
13462 done = vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
13463 stmt_info, &vec_stmt, slp_node);
13464 gcc_assert (done);
13465 break;
13467 case recurr_info_type:
13468 done = vectorizable_recurr (as_a <loop_vec_info> (vinfo),
13469 stmt_info, &vec_stmt, slp_node, NULL);
13470 gcc_assert (done);
13471 break;
13473 case phi_info_type:
13474 done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL);
13475 gcc_assert (done);
13476 break;
13478 case loop_exit_ctrl_vec_info_type:
13479 done = vectorizable_early_exit (vinfo, stmt_info, gsi, &vec_stmt,
13480 slp_node, NULL);
13481 gcc_assert (done);
13482 break;
13484 default:
13485 if (!STMT_VINFO_LIVE_P (stmt_info))
13487 if (dump_enabled_p ())
13488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13489 "stmt not supported.\n");
13490 gcc_unreachable ();
13492 done = true;
13495 if (!slp_node && vec_stmt)
13496 gcc_assert (STMT_VINFO_VEC_STMTS (stmt_info).exists ());
13498 if (STMT_VINFO_TYPE (stmt_info) != store_vec_info_type)
13500 /* Handle stmts whose DEF is used outside the loop-nest that is
13501 being vectorized. */
13502 done = can_vectorize_live_stmts (vinfo, stmt_info, slp_node,
13503 slp_node_instance, true, NULL);
13504 gcc_assert (done);
13507 if (slp_node)
13508 STMT_VINFO_VECTYPE (stmt_info) = saved_vectype;
13510 return is_store;
13514 /* Remove a group of stores (for SLP or interleaving), free their
13515 stmt_vec_info. */
13517 void
13518 vect_remove_stores (vec_info *vinfo, stmt_vec_info first_stmt_info)
13520 stmt_vec_info next_stmt_info = first_stmt_info;
13522 while (next_stmt_info)
13524 stmt_vec_info tmp = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
13525 next_stmt_info = vect_orig_stmt (next_stmt_info);
13526 /* Free the attached stmt_vec_info and remove the stmt. */
13527 vinfo->remove_stmt (next_stmt_info);
13528 next_stmt_info = tmp;
13532 /* If NUNITS is nonzero, return a vector type that contains NUNITS
13533 elements of type SCALAR_TYPE, or null if the target doesn't support
13534 such a type.
13536 If NUNITS is zero, return a vector type that contains elements of
13537 type SCALAR_TYPE, choosing whichever vector size the target prefers.
13539 If PREVAILING_MODE is VOIDmode, we have not yet chosen a vector mode
13540 for this vectorization region and want to "autodetect" the best choice.
13541 Otherwise, PREVAILING_MODE is a previously-chosen vector TYPE_MODE
13542 and we want the new type to be interoperable with it. PREVAILING_MODE
13543 in this case can be a scalar integer mode or a vector mode; when it
13544 is a vector mode, the function acts like a tree-level version of
13545 related_vector_mode. */
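/* As a purely illustrative example: with PREVAILING_MODE == VOIDmode and
   NUNITS == 0, a target whose preferred SIMD mode is 128 bits wide would
   return a 4-element vector type for a 4-byte SCALAR_TYPE such as int.  */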
13547 tree
13548 get_related_vectype_for_scalar_type (machine_mode prevailing_mode,
13549 tree scalar_type, poly_uint64 nunits)
13551 tree orig_scalar_type = scalar_type;
13552 scalar_mode inner_mode;
13553 machine_mode simd_mode;
13554 tree vectype;
13556 if ((!INTEGRAL_TYPE_P (scalar_type)
13557 && !POINTER_TYPE_P (scalar_type)
13558 && !SCALAR_FLOAT_TYPE_P (scalar_type))
13559 || (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode)
13560 && !is_float_mode (TYPE_MODE (scalar_type), &inner_mode)))
13561 return NULL_TREE;
13563 unsigned int nbytes = GET_MODE_SIZE (inner_mode);
13565 /* Interoperability between modes requires one to be a constant multiple
13566 of the other, so that the number of vectors required for each operation
13567 is a compile-time constant. */
13568 if (prevailing_mode != VOIDmode
13569 && !constant_multiple_p (nunits * nbytes,
13570 GET_MODE_SIZE (prevailing_mode))
13571 && !constant_multiple_p (GET_MODE_SIZE (prevailing_mode),
13572 nunits * nbytes))
13573 return NULL_TREE;
13575 /* For vector types of elements whose mode precision doesn't
13576 match their type's precision we use an element type of mode
13577 precision. The vectorization routines will have to make sure
13578 they support the proper result truncation/extension.
13579 We also make sure to build vector types with INTEGER_TYPE
13580 component type only. */
13581 if (INTEGRAL_TYPE_P (scalar_type)
13582 && (GET_MODE_BITSIZE (inner_mode) != TYPE_PRECISION (scalar_type)
13583 || TREE_CODE (scalar_type) != INTEGER_TYPE))
13584 scalar_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (inner_mode),
13585 TYPE_UNSIGNED (scalar_type));
13587 /* We shouldn't end up building VECTOR_TYPEs of non-scalar components.
13588 When the component mode passes the above test simply use a type
13589 corresponding to that mode. The theory is that any use that
13590 would cause problems with this will disable vectorization anyway. */
13591 else if (!SCALAR_FLOAT_TYPE_P (scalar_type)
13592 && !INTEGRAL_TYPE_P (scalar_type))
13593 scalar_type = lang_hooks.types.type_for_mode (inner_mode, 1);
13595 /* We can't build a vector type of elements with alignment bigger than
13596 their size. */
13597 else if (nbytes < TYPE_ALIGN_UNIT (scalar_type))
13598 scalar_type = lang_hooks.types.type_for_mode (inner_mode,
13599 TYPE_UNSIGNED (scalar_type));
13601 /* If we fell back to using the mode, fail if there was
13602 no scalar type for it. */
13603 if (scalar_type == NULL_TREE)
13604 return NULL_TREE;
13606 /* If no prevailing mode was supplied, use the mode the target prefers.
13607 Otherwise lookup a vector mode based on the prevailing mode. */
13608 if (prevailing_mode == VOIDmode)
13610 gcc_assert (known_eq (nunits, 0U));
13611 simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode);
13612 if (SCALAR_INT_MODE_P (simd_mode))
13614 /* Traditional behavior is not to take the integer mode
13615 literally, but simply to use it as a way of determining
13616 the vector size. It is up to mode_for_vector to decide
13617 what the TYPE_MODE should be.
13619 Note that nunits == 1 is allowed in order to support single
13620 element vector types. */
13621 if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits)
13622 || !mode_for_vector (inner_mode, nunits).exists (&simd_mode))
13623 return NULL_TREE;
13626 else if (SCALAR_INT_MODE_P (prevailing_mode)
13627 || !related_vector_mode (prevailing_mode,
13628 inner_mode, nunits).exists (&simd_mode))
13630 /* Fall back to using mode_for_vector, mostly in the hope of being
13631 able to use an integer mode. */
13632 if (known_eq (nunits, 0U)
13633 && !multiple_p (GET_MODE_SIZE (prevailing_mode), nbytes, &nunits))
13634 return NULL_TREE;
13636 if (!mode_for_vector (inner_mode, nunits).exists (&simd_mode))
13637 return NULL_TREE;
13640 vectype = build_vector_type_for_mode (scalar_type, simd_mode);
13642 /* In cases where the mode was chosen by mode_for_vector, check that
13643 the target actually supports the chosen mode, or that it at least
13644 allows the vector mode to be replaced by a like-sized integer. */
13645 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
13646 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
13647 return NULL_TREE;
13649 /* Re-attach the address-space qualifier if we canonicalized the scalar
13650 type. */
13651 if (TYPE_ADDR_SPACE (orig_scalar_type) != TYPE_ADDR_SPACE (vectype))
13652 return build_qualified_type
13653 (vectype, KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (orig_scalar_type)));
13655 return vectype;
13658 /* Function get_vectype_for_scalar_type.
13660 Returns the vector type corresponding to SCALAR_TYPE as supported
13661 by the target. If GROUP_SIZE is nonzero and we're performing BB
13662 vectorization, make sure that the number of elements in the vector
13663 is no bigger than GROUP_SIZE. */
13665 tree
13666 get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type,
13667 unsigned int group_size)
13669 /* For BB vectorization, we should always have a group size once we've
13670 constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
13671 are tentative requests during things like early data reference
13672 analysis and pattern recognition. */
13673 if (is_a <bb_vec_info> (vinfo))
13674 gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
13675 else
13676 group_size = 0;
13678 tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
13679 scalar_type);
13680 if (vectype && vinfo->vector_mode == VOIDmode)
13681 vinfo->vector_mode = TYPE_MODE (vectype);
13683 /* Register the natural choice of vector type, before the group size
13684 has been applied. */
13685 if (vectype)
13686 vinfo->used_vector_modes.add (TYPE_MODE (vectype));
13688 /* If the natural choice of vector type doesn't satisfy GROUP_SIZE,
13689 try again with an explicit number of elements. */
13690 if (vectype
13691 && group_size
13692 && maybe_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size))
13694 /* Start with the biggest number of units that fits within
13695 GROUP_SIZE and halve it until we find a valid vector type.
13696 Usually either the first attempt will succeed or all will
13697 fail (in the latter case because GROUP_SIZE is too small
13698 for the target), but it's possible that a target could have
13699 a hole between supported vector types.
13701 If GROUP_SIZE is not a power of 2, this has the effect of
13702 trying the largest power of 2 that fits within the group,
13703 even though the group is not a multiple of that vector size.
13704 The BB vectorizer will then try to carve up the group into
13705 smaller pieces. */
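/* For instance, a GROUP_SIZE of 7 tries 4-element vectors first and then
   2-element vectors, stopping as soon as one of them is supported.  */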
13706 unsigned int nunits = 1 << floor_log2 (group_size);
13709 vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
13710 scalar_type, nunits);
13711 nunits /= 2;
13713 while (nunits > 1 && !vectype);
13716 return vectype;
13719 /* Return the vector type corresponding to SCALAR_TYPE as supported
13720 by the target. NODE, if nonnull, is the SLP tree node that will
13721 use the returned vector type. */
13723 tree
13724 get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type, slp_tree node)
13726 unsigned int group_size = 0;
13727 if (node)
13728 group_size = SLP_TREE_LANES (node);
13729 return get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
13732 /* Function get_mask_type_for_scalar_type.
13734 Returns the mask type corresponding to a result of comparison
13735 of vectors of specified SCALAR_TYPE as supported by target.
13736 If GROUP_SIZE is nonzero and we're performing BB vectorization,
13737 make sure that the number of elements in the vector is no bigger
13738 than GROUP_SIZE. */
13740 tree
13741 get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type,
13742 unsigned int group_size)
13744 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
13746 if (!vectype)
13747 return NULL;
13749 return truth_type_for (vectype);
13752 /* Function get_mask_type_for_scalar_type.
13754 Returns the mask type corresponding to a result of comparison
13755 of vectors of specified SCALAR_TYPE as supported by target.
13756 NODE, if nonnull, is the SLP tree node that will use the returned
13757 vector type. */
13759 tree
13760 get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type,
13761 slp_tree node)
13763 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, node);
13765 if (!vectype)
13766 return NULL;
13768 return truth_type_for (vectype);
13771 /* Function get_same_sized_vectype
13773 Returns a vector type corresponding to SCALAR_TYPE of size
13774 VECTOR_TYPE if supported by the target. */
13776 tree
13777 get_same_sized_vectype (tree scalar_type, tree vector_type)
13779 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type))
13780 return truth_type_for (vector_type);
13782 poly_uint64 nunits;
13783 if (!multiple_p (GET_MODE_SIZE (TYPE_MODE (vector_type)),
13784 GET_MODE_SIZE (TYPE_MODE (scalar_type)), &nunits))
13785 return NULL_TREE;
13787 return get_related_vectype_for_scalar_type (TYPE_MODE (vector_type),
13788 scalar_type, nunits);
13791 /* Return true if replacing LOOP_VINFO->vector_mode with VECTOR_MODE
13792 would not change the chosen vector modes. */
13794 bool
13795 vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode)
13797 for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin ();
13798 i != vinfo->used_vector_modes.end (); ++i)
13799 if (!VECTOR_MODE_P (*i)
13800 || related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i)
13801 return false;
13802 return true;
13805 /* Function vect_is_simple_use.
13807 Input:
13808 VINFO - the vect info of the loop or basic block that is being vectorized.
13809 OPERAND - operand in the loop or bb.
13810 Output:
13811 DEF_STMT_INFO_OUT (optional) - information about the defining stmt in
13812 case OPERAND is an SSA_NAME that is defined in the vectorizable region
13813 DEF_STMT_OUT (optional) - the defining stmt in case OPERAND is an SSA_NAME;
13814 the definition could be anywhere in the function
13815 DT - the type of definition
13817 Returns whether a stmt with OPERAND can be vectorized.
13818 For loops, supportable operands are constants, loop invariants, and operands
13819 that are defined by the current iteration of the loop. Unsupportable
13820 operands are those that are defined by a previous iteration of the loop (as
13821 is the case in reduction/induction computations).
13822 For basic blocks, supportable operands are constants and bb invariants.
13823 For now, operands defined outside the basic block are not supported. */
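/* For example, in a vectorized loop containing x_1 = a_2 + 3, the operand
   a_2 is vect_internal_def if it is defined by a statement in the loop,
   vect_external_def if it is defined before the loop (a loop invariant),
   and the constant 3 is vect_constant_def.  The names are illustrative.  */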
13825 bool
13826 vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
13827 stmt_vec_info *def_stmt_info_out, gimple **def_stmt_out)
13829 if (def_stmt_info_out)
13830 *def_stmt_info_out = NULL;
13831 if (def_stmt_out)
13832 *def_stmt_out = NULL;
13833 *dt = vect_unknown_def_type;
13835 if (dump_enabled_p ())
13837 dump_printf_loc (MSG_NOTE, vect_location,
13838 "vect_is_simple_use: operand ");
13839 if (TREE_CODE (operand) == SSA_NAME
13840 && !SSA_NAME_IS_DEFAULT_DEF (operand))
13841 dump_gimple_expr (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (operand), 0);
13842 else
13843 dump_generic_expr (MSG_NOTE, TDF_SLIM, operand);
13846 if (CONSTANT_CLASS_P (operand))
13847 *dt = vect_constant_def;
13848 else if (is_gimple_min_invariant (operand))
13849 *dt = vect_external_def;
13850 else if (TREE_CODE (operand) != SSA_NAME)
13851 *dt = vect_unknown_def_type;
13852 else if (SSA_NAME_IS_DEFAULT_DEF (operand))
13853 *dt = vect_external_def;
13854 else
13856 gimple *def_stmt = SSA_NAME_DEF_STMT (operand);
13857 stmt_vec_info stmt_vinfo = vinfo->lookup_def (operand);
13858 if (!stmt_vinfo)
13859 *dt = vect_external_def;
13860 else
13862 stmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
13863 def_stmt = stmt_vinfo->stmt;
13864 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
13865 if (def_stmt_info_out)
13866 *def_stmt_info_out = stmt_vinfo;
13868 if (def_stmt_out)
13869 *def_stmt_out = def_stmt;
13872 if (dump_enabled_p ())
13874 dump_printf (MSG_NOTE, ", type of def: ");
13875 switch (*dt)
13877 case vect_uninitialized_def:
13878 dump_printf (MSG_NOTE, "uninitialized\n");
13879 break;
13880 case vect_constant_def:
13881 dump_printf (MSG_NOTE, "constant\n");
13882 break;
13883 case vect_external_def:
13884 dump_printf (MSG_NOTE, "external\n");
13885 break;
13886 case vect_internal_def:
13887 dump_printf (MSG_NOTE, "internal\n");
13888 break;
13889 case vect_induction_def:
13890 dump_printf (MSG_NOTE, "induction\n");
13891 break;
13892 case vect_reduction_def:
13893 dump_printf (MSG_NOTE, "reduction\n");
13894 break;
13895 case vect_double_reduction_def:
13896 dump_printf (MSG_NOTE, "double reduction\n");
13897 break;
13898 case vect_nested_cycle:
13899 dump_printf (MSG_NOTE, "nested cycle\n");
13900 break;
13901 case vect_first_order_recurrence:
13902 dump_printf (MSG_NOTE, "first order recurrence\n");
13903 break;
13904 case vect_condition_def:
13905 dump_printf (MSG_NOTE, "control flow\n");
13906 break;
13907 case vect_unknown_def_type:
13908 dump_printf (MSG_NOTE, "unknown\n");
13909 break;
13913 if (*dt == vect_unknown_def_type)
13915 if (dump_enabled_p ())
13916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13917 "Unsupported pattern.\n");
13918 return false;
13921 return true;
13924 /* Function vect_is_simple_use.
13926 Same as vect_is_simple_use but also determines the vector operand
13927 type of OPERAND and stores it to *VECTYPE. If the definition of
13928 OPERAND is vect_uninitialized_def, vect_constant_def or
13929 vect_external_def *VECTYPE will be set to NULL_TREE and the caller
13930 is responsible for computing the best suited vector type for the
13931 scalar operand. */
13933 bool
13934 vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
13935 tree *vectype, stmt_vec_info *def_stmt_info_out,
13936 gimple **def_stmt_out)
13938 stmt_vec_info def_stmt_info;
13939 gimple *def_stmt;
13940 if (!vect_is_simple_use (operand, vinfo, dt, &def_stmt_info, &def_stmt))
13941 return false;
13943 if (def_stmt_out)
13944 *def_stmt_out = def_stmt;
13945 if (def_stmt_info_out)
13946 *def_stmt_info_out = def_stmt_info;
13948 /* Now get a vector type if the def is internal, otherwise supply
13949 NULL_TREE and leave it up to the caller to figure out a proper
13950 type for the use stmt. */
13951 if (*dt == vect_internal_def
13952 || *dt == vect_induction_def
13953 || *dt == vect_reduction_def
13954 || *dt == vect_double_reduction_def
13955 || *dt == vect_nested_cycle
13956 || *dt == vect_first_order_recurrence)
13958 *vectype = STMT_VINFO_VECTYPE (def_stmt_info);
13959 gcc_assert (*vectype != NULL_TREE);
13960 if (dump_enabled_p ())
13961 dump_printf_loc (MSG_NOTE, vect_location,
13962 "vect_is_simple_use: vectype %T\n", *vectype);
13964 else if (*dt == vect_uninitialized_def
13965 || *dt == vect_constant_def
13966 || *dt == vect_external_def)
13967 *vectype = NULL_TREE;
13968 else
13969 gcc_unreachable ();
13971 return true;
13974 /* Function vect_is_simple_use.
13976 Same as vect_is_simple_use but determines the operand by operand
13977 position OPERAND from either STMT or SLP_NODE, filling in *OP
13978 and *SLP_DEF (when SLP_NODE is not NULL). */
13980 bool
13981 vect_is_simple_use (vec_info *vinfo, stmt_vec_info stmt, slp_tree slp_node,
13982 unsigned operand, tree *op, slp_tree *slp_def,
13983 enum vect_def_type *dt,
13984 tree *vectype, stmt_vec_info *def_stmt_info_out)
13986 if (slp_node)
13988 slp_tree child = SLP_TREE_CHILDREN (slp_node)[operand];
13989 *slp_def = child;
13990 *vectype = SLP_TREE_VECTYPE (child);
13991 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
13993 *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt);
13994 return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out);
13996 else
13998 if (def_stmt_info_out)
13999 *def_stmt_info_out = NULL;
14000 *op = SLP_TREE_SCALAR_OPS (child)[0];
14001 *dt = SLP_TREE_DEF_TYPE (child);
14002 return true;
14005 else
14007 *slp_def = NULL;
14008 if (gassign *ass = dyn_cast <gassign *> (stmt->stmt))
14010 if (gimple_assign_rhs_code (ass) == COND_EXPR
14011 && COMPARISON_CLASS_P (gimple_assign_rhs1 (ass)))
14013 if (operand < 2)
14014 *op = TREE_OPERAND (gimple_assign_rhs1 (ass), operand);
14015 else
14016 *op = gimple_op (ass, operand);
14018 else if (gimple_assign_rhs_code (ass) == VIEW_CONVERT_EXPR)
14019 *op = TREE_OPERAND (gimple_assign_rhs1 (ass), 0);
14020 else
14021 *op = gimple_op (ass, operand + 1);
14023 else if (gcond *cond = dyn_cast <gcond *> (stmt->stmt))
14024 *op = gimple_op (cond, operand);
14025 else if (gcall *call = dyn_cast <gcall *> (stmt->stmt))
14026 *op = gimple_call_arg (call, operand);
14027 else
14028 gcc_unreachable ();
14029 return vect_is_simple_use (*op, vinfo, dt, vectype, def_stmt_info_out);
14033 /* If OP is not NULL and is external or constant update its vector
14034 type with VECTYPE. Returns true if successful or false if not,
14035 for example when conflicting vector types are present. */
14037 bool
14038 vect_maybe_update_slp_op_vectype (slp_tree op, tree vectype)
14040 if (!op || SLP_TREE_DEF_TYPE (op) == vect_internal_def)
14041 return true;
14042 if (SLP_TREE_VECTYPE (op))
14043 return types_compatible_p (SLP_TREE_VECTYPE (op), vectype);
14044 /* For external defs refuse to produce VECTOR_BOOLEAN_TYPE_P, those
14045 should be handled by patterns. Allow vect_constant_def for now. */
14046 if (VECTOR_BOOLEAN_TYPE_P (vectype)
14047 && SLP_TREE_DEF_TYPE (op) == vect_external_def)
14048 return false;
14049 SLP_TREE_VECTYPE (op) = vectype;
14050 return true;
14053 /* Function supportable_widening_operation
14055 Check whether an operation represented by the code CODE is a
14056 widening operation that is supported by the target platform in
14057 vector form (i.e., when operating on arguments of type VECTYPE_IN
14058 producing a result of type VECTYPE_OUT).
14060 Widening operations we currently support are NOP (CONVERT), FLOAT,
14061 FIX_TRUNC and WIDEN_MULT. This function checks if these operations
14062 are supported by the target platform either directly (via vector
14063 tree-codes), or via target builtins.
14065 Output:
14066 - CODE1 and CODE2 are codes of vector operations to be used when
14067 vectorizing the operation, if available.
14068 - MULTI_STEP_CVT determines the number of required intermediate steps in
14069 case of multi-step conversion (like char->short->int - in that case
14070 MULTI_STEP_CVT will be 1).
14071 - INTERM_TYPES contains the intermediate type required to perform the
14072 widening operation (short in the above example). */
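/* For a plain widening conversion (CASE_CONVERT below), for instance,
   CODE1/CODE2 become VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR, i.e. each
   input vector is split into a low and a high half, each widened into a
   full-width output vector.  */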
14074 bool
14075 supportable_widening_operation (vec_info *vinfo,
14076 code_helper code,
14077 stmt_vec_info stmt_info,
14078 tree vectype_out, tree vectype_in,
14079 code_helper *code1,
14080 code_helper *code2,
14081 int *multi_step_cvt,
14082 vec<tree> *interm_types)
14084 loop_vec_info loop_info = dyn_cast <loop_vec_info> (vinfo);
14085 class loop *vect_loop = NULL;
14086 machine_mode vec_mode;
14087 enum insn_code icode1, icode2;
14088 optab optab1 = unknown_optab, optab2 = unknown_optab;
14089 tree vectype = vectype_in;
14090 tree wide_vectype = vectype_out;
14091 tree_code c1 = MAX_TREE_CODES, c2 = MAX_TREE_CODES;
14092 int i;
14093 tree prev_type, intermediate_type;
14094 machine_mode intermediate_mode, prev_mode;
14095 optab optab3, optab4;
14097 *multi_step_cvt = 0;
14098 if (loop_info)
14099 vect_loop = LOOP_VINFO_LOOP (loop_info);
14101 switch (code.safe_as_tree_code ())
14103 case MAX_TREE_CODES:
14104 /* Don't set c1 and c2 if code is not a tree_code. */
14105 break;
14107 case WIDEN_MULT_EXPR:
14108 /* The result of a vectorized widening operation usually requires
14109 two vectors (because the widened results do not fit into one vector).
14110 The generated vector results would normally be expected to be
14111 generated in the same order as in the original scalar computation,
14112 i.e. if 8 results are generated in each vector iteration, they are
14113 to be organized as follows:
14114 vect1: [res1,res2,res3,res4],
14115 vect2: [res5,res6,res7,res8].
14117 However, in the special case that the result of the widening
14118 operation is used in a reduction computation only, the order doesn't
14119 matter (because when vectorizing a reduction we change the order of
14120 the computation). Some targets can take advantage of this and
14121 generate more efficient code. For example, targets like Altivec,
14122 that support widen_mult using a sequence of {mult_even,mult_odd}
14123 generate the following vectors:
14124 vect1: [res1,res3,res5,res7],
14125 vect2: [res2,res4,res6,res8].
14127 When vectorizing outer-loops, we execute the inner-loop sequentially
14128 (each vectorized inner-loop iteration contributes to VF outer-loop
14129 iterations in parallel). We therefore don't allow the order of the
14130 computation in the inner-loop to change during outer-loop
14131 vectorization. */
14132 /* TODO: Another case in which order doesn't *really* matter is when we
14133 widen and then contract again, e.g. (short)((int)x * y >> 8).
14134 Normally, pack_trunc performs an even/odd permute, whereas the
14135 repack from an even/odd expansion would be an interleave, which
14136 would be significantly simpler for e.g. AVX2. */
14137 /* In any case, in order to avoid duplicating the code below, recurse
14138 on VEC_WIDEN_MULT_EVEN_EXPR. If it succeeds, all the return values
14139 are properly set up for the caller. If we fail, we'll continue with
14140 a VEC_WIDEN_MULT_LO/HI_EXPR check. */
14141 if (vect_loop
14142 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
14143 && !nested_in_vect_loop_p (vect_loop, stmt_info)
14144 && supportable_widening_operation (vinfo, VEC_WIDEN_MULT_EVEN_EXPR,
14145 stmt_info, vectype_out,
14146 vectype_in, code1,
14147 code2, multi_step_cvt,
14148 interm_types))
14150 /* Elements in a vector with the vect_used_by_reduction property cannot
14151 be reordered if the use chain with this property does not have the
14152 same operation. One such example is s += a * b, where elements
14153 in a and b cannot be reordered. Here we check if the vector defined
14154 by STMT is only directly used in the reduction statement. */
14155 tree lhs = gimple_assign_lhs (stmt_info->stmt);
14156 stmt_vec_info use_stmt_info = loop_info->lookup_single_use (lhs);
14157 if (use_stmt_info
14158 && STMT_VINFO_DEF_TYPE (use_stmt_info) == vect_reduction_def)
14159 return true;
14161 c1 = VEC_WIDEN_MULT_LO_EXPR;
14162 c2 = VEC_WIDEN_MULT_HI_EXPR;
14163 break;
14165 case DOT_PROD_EXPR:
14166 c1 = DOT_PROD_EXPR;
14167 c2 = DOT_PROD_EXPR;
14168 break;
14170 case SAD_EXPR:
14171 c1 = SAD_EXPR;
14172 c2 = SAD_EXPR;
14173 break;
14175 case VEC_WIDEN_MULT_EVEN_EXPR:
14176 /* Support the recursion induced just above. */
14177 c1 = VEC_WIDEN_MULT_EVEN_EXPR;
14178 c2 = VEC_WIDEN_MULT_ODD_EXPR;
14179 break;
14181 case WIDEN_LSHIFT_EXPR:
14182 c1 = VEC_WIDEN_LSHIFT_LO_EXPR;
14183 c2 = VEC_WIDEN_LSHIFT_HI_EXPR;
14184 break;
14186 CASE_CONVERT:
14187 c1 = VEC_UNPACK_LO_EXPR;
14188 c2 = VEC_UNPACK_HI_EXPR;
14189 break;
14191 case FLOAT_EXPR:
14192 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
14193 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
14194 break;
14196 case FIX_TRUNC_EXPR:
14197 c1 = VEC_UNPACK_FIX_TRUNC_LO_EXPR;
14198 c2 = VEC_UNPACK_FIX_TRUNC_HI_EXPR;
14199 break;
14201 default:
14202 gcc_unreachable ();
14205 if (BYTES_BIG_ENDIAN && c1 != VEC_WIDEN_MULT_EVEN_EXPR)
14206 std::swap (c1, c2);
14208 if (code == FIX_TRUNC_EXPR)
14210 /* The signedness is determined from the output operand. */
14211 optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14212 optab2 = optab_for_tree_code (c2, vectype_out, optab_default);
14214 else if (CONVERT_EXPR_CODE_P (code.safe_as_tree_code ())
14215 && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14216 && VECTOR_BOOLEAN_TYPE_P (vectype)
14217 && TYPE_MODE (wide_vectype) == TYPE_MODE (vectype)
14218 && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
14220 /* If the input and result modes are the same, a different optab
14221 is needed where we pass in the number of units in vectype. */
14222 optab1 = vec_unpacks_sbool_lo_optab;
14223 optab2 = vec_unpacks_sbool_hi_optab;
14226 vec_mode = TYPE_MODE (vectype);
14227 if (widening_fn_p (code))
14229 /* If this is an internal fn then we must check whether the target
14230 supports either a low-high split or an even-odd split. */
14231 internal_fn ifn = as_internal_fn ((combined_fn) code);
14233 internal_fn lo, hi, even, odd;
14234 lookup_hilo_internal_fn (ifn, &lo, &hi);
14235 *code1 = as_combined_fn (lo);
14236 *code2 = as_combined_fn (hi);
14237 optab1 = direct_internal_fn_optab (lo, {vectype, vectype});
14238 optab2 = direct_internal_fn_optab (hi, {vectype, vectype});
14240 /* If we don't support low-high, then check for even-odd. */
14241 if (!optab1
14242 || (icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing
14243 || !optab2
14244 || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
14246 lookup_evenodd_internal_fn (ifn, &even, &odd);
14247 *code1 = as_combined_fn (even);
14248 *code2 = as_combined_fn (odd);
14249 optab1 = direct_internal_fn_optab (even, {vectype, vectype});
14250 optab2 = direct_internal_fn_optab (odd, {vectype, vectype});
14253 else if (code.is_tree_code ())
14255 if (code == FIX_TRUNC_EXPR)
14257 /* The signedness is determined from the output operand. */
14258 optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14259 optab2 = optab_for_tree_code (c2, vectype_out, optab_default);
14261 else if (CONVERT_EXPR_CODE_P ((tree_code) code.safe_as_tree_code ())
14262 && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14263 && VECTOR_BOOLEAN_TYPE_P (vectype)
14264 && TYPE_MODE (wide_vectype) == TYPE_MODE (vectype)
14265 && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
14267 /* If the input and result modes are the same, a different optab
14268 is needed where we pass in the number of units in vectype. */
14269 optab1 = vec_unpacks_sbool_lo_optab;
14270 optab2 = vec_unpacks_sbool_hi_optab;
14272 else
14274 optab1 = optab_for_tree_code (c1, vectype, optab_default);
14275 optab2 = optab_for_tree_code (c2, vectype, optab_default);
14277 *code1 = c1;
14278 *code2 = c2;
14281 if (!optab1 || !optab2)
14282 return false;
14284 if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing
14285 || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
14286 return false;
14289 if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
14290 && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
14292 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14293 return true;
14294 /* For scalar masks we may have different boolean
14295 vector types with the same QImode. Thus we add
14296 an additional check on the number of elements. */
14297 if (known_eq (TYPE_VECTOR_SUBPARTS (vectype),
14298 TYPE_VECTOR_SUBPARTS (wide_vectype) * 2))
14299 return true;
14302 /* Check if it's a multi-step conversion that can be done using intermediate
14303 types. */
14305 prev_type = vectype;
14306 prev_mode = vec_mode;
14308 if (!CONVERT_EXPR_CODE_P (code.safe_as_tree_code ()))
14309 return false;
14311 /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
14312 intermediate steps in the promotion sequence. We try
14313 MAX_INTERM_CVT_STEPS to get to WIDE_VECTYPE, and fail if we do
14314 not. */
14315 interm_types->create (MAX_INTERM_CVT_STEPS);
14316 for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
14318 intermediate_mode = insn_data[icode1].operand[0].mode;
14319 if (VECTOR_BOOLEAN_TYPE_P (prev_type))
14320 intermediate_type
14321 = vect_halve_mask_nunits (prev_type, intermediate_mode);
14322 else if (VECTOR_MODE_P (intermediate_mode))
14324 tree intermediate_element_type
14325 = lang_hooks.types.type_for_mode (GET_MODE_INNER (intermediate_mode),
14326 TYPE_UNSIGNED (prev_type));
14327 intermediate_type
14328 = build_vector_type_for_mode (intermediate_element_type,
14329 intermediate_mode);
14331 else
14332 intermediate_type
14333 = lang_hooks.types.type_for_mode (intermediate_mode,
14334 TYPE_UNSIGNED (prev_type));
14336 if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
14337 && VECTOR_BOOLEAN_TYPE_P (prev_type)
14338 && intermediate_mode == prev_mode
14339 && SCALAR_INT_MODE_P (prev_mode))
14341 /* If the input and result modes are the same, a different optab
14342 is needed where we pass in the number of units in vectype. */
14343 optab3 = vec_unpacks_sbool_lo_optab;
14344 optab4 = vec_unpacks_sbool_hi_optab;
14346 else
14348 optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
14349 optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
14352 if (!optab3 || !optab4
14353 || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
14354 || insn_data[icode1].operand[0].mode != intermediate_mode
14355 || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
14356 || insn_data[icode2].operand[0].mode != intermediate_mode
14357 || ((icode1 = optab_handler (optab3, intermediate_mode))
14358 == CODE_FOR_nothing)
14359 || ((icode2 = optab_handler (optab4, intermediate_mode))
14360 == CODE_FOR_nothing))
14361 break;
14363 interm_types->quick_push (intermediate_type);
14364 (*multi_step_cvt)++;
14366 if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
14367 && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
14369 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14370 return true;
14371 if (known_eq (TYPE_VECTOR_SUBPARTS (intermediate_type),
14372 TYPE_VECTOR_SUBPARTS (wide_vectype) * 2))
14373 return true;
14376 prev_type = intermediate_type;
14377 prev_mode = intermediate_mode;
14380 interm_types->release ();
14381 return false;
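/* Illustrative aside, not part of the vectorizer: a scalar model of the
   lo/hi widening scheme whose target support is checked above, kept under
   #if 0 so it is never built.  unpack_lo_hi is a hypothetical stand-in for a
   VEC_UNPACK_LO_EXPR / VEC_UNPACK_HI_EXPR pair, and the char->short->int
   chain corresponds to the MULTI_STEP_CVT == 1 case described in the
   function comment, with short as the single intermediate type.  */
#if 0
#include <cstdint>
#include <cstdio>

/* Widen the low and high halves of IN into LO and HI, mirroring what a
   VEC_UNPACK_LO_EXPR / VEC_UNPACK_HI_EXPR pair does lane-wise.  */
template <typename NARROW, typename WIDE, int N>
static void
unpack_lo_hi (const NARROW (&in)[N], WIDE (&lo)[N / 2], WIDE (&hi)[N / 2])
{
  for (int i = 0; i < N / 2; i++)
    {
      lo[i] = (WIDE) in[i];
      hi[i] = (WIDE) in[N / 2 + i];
    }
}

int
main ()
{
  int8_t c[8] = { 1, -2, 3, -4, 5, -6, 7, -8 };

  int16_t s_lo[4], s_hi[4];
  unpack_lo_hi (c, s_lo, s_hi);		/* Step 1: char -> short.  */

  int32_t i0[2], i1[2], i2[2], i3[2];
  unpack_lo_hi (s_lo, i0, i1);		/* Step 2: short -> int.  */
  unpack_lo_hi (s_hi, i2, i3);

  printf ("%d %d %d %d\n", (int) i0[0], (int) i0[1], (int) i1[0], (int) i1[1]);
  return 0;
}
#endif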
14385 /* Function supportable_narrowing_operation
14387 Check whether an operation represented by the code CODE is a
14388 narrowing operation that is supported by the target platform in
14389 vector form (i.e., when operating on arguments of type VECTYPE_IN
14390 and producing a result of type VECTYPE_OUT).
14392 Narrowing operations we currently support are NOP (CONVERT), FIX_TRUNC
14393 and FLOAT. This function checks if these operations are supported by
14394 the target platform directly via vector tree-codes.
14396 Output:
14397 - CODE1 is the code of a vector operation to be used when
14398 vectorizing the operation, if available.
14399 - MULTI_STEP_CVT determines the number of required intermediate steps in
14400 case of multi-step conversion (like int->short->char - in that case
14401 MULTI_STEP_CVT will be 1).
14402 - INTERM_TYPES contains the intermediate type required to perform the
14403 narrowing operation (short in the above example). */
14405 bool
14406 supportable_narrowing_operation (code_helper code,
14407 tree vectype_out, tree vectype_in,
14408 code_helper *code1, int *multi_step_cvt,
14409 vec<tree> *interm_types)
14411 machine_mode vec_mode;
14412 enum insn_code icode1;
14413 optab optab1, interm_optab;
14414 tree vectype = vectype_in;
14415 tree narrow_vectype = vectype_out;
14416 enum tree_code c1;
14417 tree intermediate_type, prev_type;
14418 machine_mode intermediate_mode, prev_mode;
14419 int i;
14420 unsigned HOST_WIDE_INT n_elts;
14421 bool uns;
14423 if (!code.is_tree_code ())
14424 return false;
14426 *multi_step_cvt = 0;
14427 switch ((tree_code) code)
14429 CASE_CONVERT:
14430 c1 = VEC_PACK_TRUNC_EXPR;
14431 if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
14432 && VECTOR_BOOLEAN_TYPE_P (vectype)
14433 && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
14434 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
14435 && n_elts < BITS_PER_UNIT)
14436 optab1 = vec_pack_sbool_trunc_optab;
14437 else
14438 optab1 = optab_for_tree_code (c1, vectype, optab_default);
14439 break;
14441 case FIX_TRUNC_EXPR:
14442 c1 = VEC_PACK_FIX_TRUNC_EXPR;
14443 /* The signedness is determined from the output operand. */
14444 optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14445 break;
14447 case FLOAT_EXPR:
14448 c1 = VEC_PACK_FLOAT_EXPR;
14449 optab1 = optab_for_tree_code (c1, vectype, optab_default);
14450 break;
14452 default:
14453 gcc_unreachable ();
14456 if (!optab1)
14457 return false;
14459 vec_mode = TYPE_MODE (vectype);
14460 if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing)
14461 return false;
14463 *code1 = c1;
14465 if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
14467 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14468 return true;
14469 /* For scalar masks we may have different boolean
14470 vector types with the same QImode. Thus we add
14471 an additional check on the number of elements. */
14472 if (known_eq (TYPE_VECTOR_SUBPARTS (vectype) * 2,
14473 TYPE_VECTOR_SUBPARTS (narrow_vectype)))
14474 return true;
14477 if (code == FLOAT_EXPR)
14478 return false;
14480 /* Check if it's a multi-step conversion that can be done using intermediate
14481 types. */
14482 prev_mode = vec_mode;
14483 prev_type = vectype;
14484 if (code == FIX_TRUNC_EXPR)
14485 uns = TYPE_UNSIGNED (vectype_out);
14486 else
14487 uns = TYPE_UNSIGNED (vectype);
14489 /* For multi-step FIX_TRUNC_EXPR prefer signed floating to integer
14490 conversion over unsigned, as unsigned FIX_TRUNC_EXPR is often more
14491 costly than signed. */
14492 if (code == FIX_TRUNC_EXPR && uns)
14494 enum insn_code icode2;
14496 intermediate_type
14497 = lang_hooks.types.type_for_mode (TYPE_MODE (vectype_out), 0);
14498 interm_optab
14499 = optab_for_tree_code (c1, intermediate_type, optab_default);
14500 if (interm_optab != unknown_optab
14501 && (icode2 = optab_handler (optab1, vec_mode)) != CODE_FOR_nothing
14502 && insn_data[icode1].operand[0].mode
14503 == insn_data[icode2].operand[0].mode)
14505 uns = false;
14506 optab1 = interm_optab;
14507 icode1 = icode2;
14511 /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
14512 intermediate steps in the narrowing sequence. We try
14513 MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do not. */
14514 interm_types->create (MAX_INTERM_CVT_STEPS);
14515 for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
14517 intermediate_mode = insn_data[icode1].operand[0].mode;
14518 if (VECTOR_BOOLEAN_TYPE_P (prev_type))
14519 intermediate_type
14520 = vect_double_mask_nunits (prev_type, intermediate_mode);
14521 else
14522 intermediate_type
14523 = lang_hooks.types.type_for_mode (intermediate_mode, uns);
14524 if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
14525 && VECTOR_BOOLEAN_TYPE_P (prev_type)
14526 && SCALAR_INT_MODE_P (prev_mode)
14527 && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
14528 && n_elts < BITS_PER_UNIT)
14529 interm_optab = vec_pack_sbool_trunc_optab;
14530 else
14531 interm_optab
14532 = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, intermediate_type,
14533 optab_default);
14534 if (!interm_optab
14535 || ((icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing)
14536 || insn_data[icode1].operand[0].mode != intermediate_mode
14537 || ((icode1 = optab_handler (interm_optab, intermediate_mode))
14538 == CODE_FOR_nothing))
14539 break;
14541 interm_types->quick_push (intermediate_type);
14542 (*multi_step_cvt)++;
14544 if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
14546 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14547 return true;
14548 if (known_eq (TYPE_VECTOR_SUBPARTS (intermediate_type) * 2,
14549 TYPE_VECTOR_SUBPARTS (narrow_vectype)))
14550 return true;
14553 prev_mode = intermediate_mode;
14554 prev_type = intermediate_type;
14555 optab1 = interm_optab;
14558 interm_types->release ();
14559 return false;
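/* Illustrative aside, not part of the vectorizer: a scalar model of the
   pack-truncate scheme whose target support is checked above, kept under
   #if 0 so it is never built.  pack_trunc is a hypothetical stand-in for
   VEC_PACK_TRUNC_EXPR; the int->short->char chain is the MULTI_STEP_CVT == 1
   case from the function comment, with short as the intermediate type.  */
#if 0
#include <cstdint>
#include <cstdio>

/* Pack two vectors of N wide elements into one vector of 2*N narrower
   elements by truncation, mirroring VEC_PACK_TRUNC_EXPR lane-wise.  */
template <typename WIDE, typename NARROW, int N>
static void
pack_trunc (const WIDE (&a)[N], const WIDE (&b)[N], NARROW (&out)[2 * N])
{
  for (int i = 0; i < N; i++)
    {
      out[i] = (NARROW) a[i];
      out[N + i] = (NARROW) b[i];
    }
}

int
main ()
{
  int32_t a[2] = { 1, 2 }, b[2] = { 3, 4 };
  int32_t c[2] = { 5, 6 }, d[2] = { 7, 8 };

  int16_t ab[4], cd[4];
  pack_trunc (a, b, ab);	/* Step 1: int -> short.  */
  pack_trunc (c, d, cd);

  int8_t res[8];
  pack_trunc (ab, cd, res);	/* Step 2: short -> char.  */

  printf ("%d %d\n", (int) res[0], (int) res[7]);	/* Prints "1 8".  */
  return 0;
}
#endif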
14562 /* Generate and return a vector mask of MASK_TYPE such that
14563 mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
14564 Add the statements to SEQ. */
14566 tree
14567 vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
14568 tree end_index, const char *name)
14570 tree cmp_type = TREE_TYPE (start_index);
14571 gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
14572 cmp_type, mask_type,
14573 OPTIMIZE_FOR_SPEED));
14574 gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
14575 start_index, end_index,
14576 build_zero_cst (mask_type));
14577 tree tmp;
14578 if (name)
14579 tmp = make_temp_ssa_name (mask_type, NULL, name);
14580 else
14581 tmp = make_ssa_name (mask_type);
14582 gimple_call_set_lhs (call, tmp);
14583 gimple_seq_add_stmt (seq, call);
14584 return tmp;
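/* Illustrative aside, not part of the vectorizer: the mask built above via
   IFN_WHILE_ULT has a simple scalar meaning, sketched below and kept under
   #if 0 so it is never built.  while_ult is a hypothetical scalar stand-in:
   lane I is active iff START_INDEX + I < END_INDEX, which is equivalent to
   the "for all J <= I" formulation in the comment because the predicate is
   monotone in I.  */
#if 0
#include <cstdio>

static void
while_ult (unsigned start_index, unsigned end_index,
	   bool *mask, unsigned nunits)
{
  /* The first MIN (END_INDEX - START_INDEX, NUNITS) lanes come out true,
     the rest false.  */
  for (unsigned i = 0; i < nunits; i++)
    mask[i] = start_index + i < end_index;
}

int
main ()
{
  /* A 13-iteration loop with 8 lanes: the second vector iteration starts at
     index 8, so only the first 5 lanes are active.  */
  bool mask[8];
  while_ult (8, 13, mask, 8);
  for (int i = 0; i < 8; i++)
    printf ("%d", (int) mask[i]);
  printf ("\n");		/* Prints 11111000.  */
  return 0;
}
#endif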
14587 /* Generate a vector mask of type MASK_TYPE for which index I is false iff
14588 J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */
14590 tree
14591 vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
14592 tree end_index)
14594 tree tmp = vect_gen_while (seq, mask_type, start_index, end_index);
14595 return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp);
14598 /* Try to compute the vector types required to vectorize STMT_INFO,
14599 returning true on success and false if vectorization isn't possible.
14600 If GROUP_SIZE is nonzero and we're performing BB vectorization,
14601 make sure that the number of elements in the vectors is no bigger
14602 than GROUP_SIZE.
14604 On success:
14606 - Set *STMT_VECTYPE_OUT to:
14607 - NULL_TREE if the statement doesn't need to be vectorized;
14608 - the equivalent of STMT_VINFO_VECTYPE otherwise.
14610 - Set *NUNITS_VECTYPE_OUT to the vector type that contains the maximum
14611 number of units needed to vectorize STMT_INFO, or NULL_TREE if the
14612 statement does not help to determine the overall number of units. */
14614 opt_result
14615 vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
14616 tree *stmt_vectype_out,
14617 tree *nunits_vectype_out,
14618 unsigned int group_size)
14620 gimple *stmt = stmt_info->stmt;
14622 /* For BB vectorization, we should always have a group size once we've
14623 constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
14624 are tentative requests during things like early data reference
14625 analysis and pattern recognition. */
14626 if (is_a <bb_vec_info> (vinfo))
14627 gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
14628 else
14629 group_size = 0;
14631 *stmt_vectype_out = NULL_TREE;
14632 *nunits_vectype_out = NULL_TREE;
14634 if (gimple_get_lhs (stmt) == NULL_TREE
14635 /* Allow vector conditionals through here. */
14636 && !is_a <gcond *> (stmt)
14637 /* MASK_STORE has no lhs, but is ok. */
14638 && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
14640 if (is_a <gcall *> (stmt))
14642 /* Ignore calls with no lhs. These must be calls to
14643 #pragma omp simd functions, and the vectorization factor
14644 they really need can't be determined until
14645 vectorizable_simd_clone_call. */
14646 if (dump_enabled_p ())
14647 dump_printf_loc (MSG_NOTE, vect_location,
14648 "defer to SIMD clone analysis.\n");
14649 return opt_result::success ();
14652 return opt_result::failure_at (stmt,
14653 "not vectorized: irregular stmt: %G", stmt);
14656 tree vectype;
14657 tree scalar_type = NULL_TREE;
14658 if (group_size == 0 && STMT_VINFO_VECTYPE (stmt_info))
14660 vectype = STMT_VINFO_VECTYPE (stmt_info);
14661 if (dump_enabled_p ())
14662 dump_printf_loc (MSG_NOTE, vect_location,
14663 "precomputed vectype: %T\n", vectype);
14665 else if (vect_use_mask_type_p (stmt_info))
14667 unsigned int precision = stmt_info->mask_precision;
14668 scalar_type = build_nonstandard_integer_type (precision, 1);
14669 vectype = get_mask_type_for_scalar_type (vinfo, scalar_type, group_size);
14670 if (!vectype)
14671 return opt_result::failure_at (stmt, "not vectorized: unsupported"
14672 " data-type %T\n", scalar_type);
14673 if (dump_enabled_p ())
14674 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
14676 else
14678 /* If we got here with a gcond it means that the target had no available vector
14679 mode for the scalar type. We can't vectorize, so abort. */
14680 if (is_a <gcond *> (stmt))
14681 return opt_result::failure_at (stmt,
14682 "not vectorized:"
14683 " unsupported data-type for gcond %T\n",
14684 scalar_type);
14686 if (data_reference *dr = STMT_VINFO_DATA_REF (stmt_info))
14687 scalar_type = TREE_TYPE (DR_REF (dr));
14688 else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
14689 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
14690 else
14691 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
14693 if (dump_enabled_p ())
14695 if (group_size)
14696 dump_printf_loc (MSG_NOTE, vect_location,
14697 "get vectype for scalar type (group size %d):"
14698 " %T\n", group_size, scalar_type);
14699 else
14700 dump_printf_loc (MSG_NOTE, vect_location,
14701 "get vectype for scalar type: %T\n", scalar_type);
14703 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
14704 if (!vectype)
14705 return opt_result::failure_at (stmt,
14706 "not vectorized:"
14707 " unsupported data-type %T\n",
14708 scalar_type);
14710 if (dump_enabled_p ())
14711 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
14714 if (scalar_type && VECTOR_MODE_P (TYPE_MODE (scalar_type)))
14715 return opt_result::failure_at (stmt,
14716 "not vectorized: vector stmt in loop:%G",
14717 stmt);
14719 *stmt_vectype_out = vectype;
14721 /* Don't try to compute scalar types if the stmt produces a boolean
14722 vector; use the existing vector type instead. */
14723 tree nunits_vectype = vectype;
14724 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14726 /* The number of units is set according to the smallest scalar
14727 type (or the largest vector size, but we only support one
14728 vector size per vectorization). */
14729 scalar_type = vect_get_smallest_scalar_type (stmt_info,
14730 TREE_TYPE (vectype));
14731 if (scalar_type != TREE_TYPE (vectype))
14733 if (dump_enabled_p ())
14734 dump_printf_loc (MSG_NOTE, vect_location,
14735 "get vectype for smallest scalar type: %T\n",
14736 scalar_type);
14737 nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
14738 group_size);
14739 if (!nunits_vectype)
14740 return opt_result::failure_at
14741 (stmt, "not vectorized: unsupported data-type %T\n",
14742 scalar_type);
14743 if (dump_enabled_p ())
14744 dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
14745 nunits_vectype);
14749 if (!multiple_p (TYPE_VECTOR_SUBPARTS (nunits_vectype),
14750 TYPE_VECTOR_SUBPARTS (*stmt_vectype_out)))
14751 return opt_result::failure_at (stmt,
14752 "Not vectorized: Incompatible number "
14753 "of vector subparts between %T and %T\n",
14754 nunits_vectype, *stmt_vectype_out);
14756 if (dump_enabled_p ())
14758 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
14759 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (nunits_vectype));
14760 dump_printf (MSG_NOTE, "\n");
14763 *nunits_vectype_out = nunits_vectype;
14764 return opt_result::success ();
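/* Illustrative aside, not part of the vectorizer: a toy model, kept under
   #if 0, of the relation between the statement vectype and the nunits
   vectype computed above, for a fixed 16-byte vector.  The helper "lanes"
   and the byte sizes are hypothetical (assuming 4-byte int); the point is
   that a statement with an int result but a char operand gets a 4-lane
   statement vectype and a 16-lane nunits vectype, and the multiple_p check
   above requires the latter lane count to be a multiple of the former.  */
#if 0
#include <cassert>
#include <cstdio>

static unsigned
lanes (unsigned vector_bytes, unsigned elem_bytes)
{
  return vector_bytes / elem_bytes;
}

int
main ()
{
  const unsigned vector_bytes = 16;	/* e.g. one 128-bit vector.  */
  unsigned stmt_lanes = lanes (vector_bytes, (unsigned) sizeof (int));    /* 4 on typical targets.  */
  unsigned nunits_lanes = lanes (vector_bytes, (unsigned) sizeof (char)); /* 16.  */

  /* Mirror of the multiple_p check: four int vectors cover one char
     vector's worth of scalars.  */
  assert (nunits_lanes % stmt_lanes == 0);
  printf ("stmt vectype lanes: %u, nunits vectype lanes: %u\n",
	  stmt_lanes, nunits_lanes);
  return 0;
}
#endif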
14767 /* Generate and return a statement sequence that sets the vector length LEN as follows:
14769 min_of_start_and_end = min (START_INDEX, END_INDEX);
14770 left_len = END_INDEX - min_of_start_and_end;
14771 rhs = min (left_len, LEN_LIMIT);
14772 LEN = rhs;
14774 Note: the cost of the code generated by this function is modeled
14775 by vect_estimate_min_profitable_iters, so changes here may need
14776 corresponding changes there. */
14778 gimple_seq
14779 vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
14781 gimple_seq stmts = NULL;
14782 tree len_type = TREE_TYPE (len);
14783 gcc_assert (TREE_TYPE (start_index) == len_type);
14785 tree min = gimple_build (&stmts, MIN_EXPR, len_type, start_index, end_index);
14786 tree left_len = gimple_build (&stmts, MINUS_EXPR, len_type, end_index, min);
14787 tree rhs = gimple_build (&stmts, MIN_EXPR, len_type, left_len, len_limit);
14788 gimple* stmt = gimple_build_assign (len, rhs);
14789 gimple_seq_add_stmt (&stmts, stmt);
14791 return stmts;
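/* Illustrative aside, not part of the vectorizer: a scalar model of the
   length computation generated above, kept under #if 0 so it is never built.
   gen_len is a hypothetical stand-in that evaluates the same min/sub/min
   sequence directly.  */
#if 0
#include <cstdio>

static unsigned
gen_len (unsigned start_index, unsigned end_index, unsigned len_limit)
{
  /* min_of_start_and_end = min (start_index, end_index);
     left_len = end_index - min_of_start_and_end;
     len = min (left_len, len_limit);  */
  unsigned m = start_index < end_index ? start_index : end_index;
  unsigned left_len = end_index - m;
  return left_len < len_limit ? left_len : len_limit;
}

int
main ()
{
  /* A 13-iteration loop with at most 8 elements per vector: a full first
     iteration, then a 5-element tail.  */
  printf ("%u\n", gen_len (0, 13, 8));	/* Prints 8.  */
  printf ("%u\n", gen_len (8, 13, 8));	/* Prints 5.  */
  return 0;
}
#endif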