gcc/tree-vect-stmts.cc
/* Statement Analysis and Transformation for Vectorization
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "cfgloop.h"
#include "explow.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "builtins.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-range.h"
#include "tree-ssa-loop-niter.h"
#include "gimple-fold.h"
#include "regs.h"
#include "attribs.h"
#include "optabs-libfuncs.h"

/* For lang_hooks.types.type_for_mode.  */
#include "langhooks.h"
/* Return the vectorized type for the given statement.  */

tree
stmt_vectype (class _stmt_vec_info *stmt_info)
{
  return STMT_VINFO_VECTYPE (stmt_info);
}
/* Return TRUE iff the given statement is in an inner loop relative to
   the loop being vectorized.  */
bool
stmt_in_inner_loop_p (vec_info *vinfo, class _stmt_vec_info *stmt_info)
{
  gimple *stmt = STMT_VINFO_STMT (stmt_info);
  basic_block bb = gimple_bb (stmt);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop* loop;

  if (!loop_vinfo)
    return false;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  return (bb->loop_father == loop->inner);
}
/* Record the cost of a statement, either by directly informing the
   target model or by saving it in a vector for later processing.
   Return a preliminary estimate of the statement's cost.  */

static unsigned
record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
		  enum vect_cost_for_stmt kind,
		  stmt_vec_info stmt_info, slp_tree node,
		  tree vectype, int misalign,
		  enum vect_cost_model_location where)
{
  if ((kind == vector_load || kind == unaligned_load)
      && (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
    kind = vector_gather_load;
  if ((kind == vector_store || kind == unaligned_store)
      && (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
    kind = vector_scatter_store;

  stmt_info_for_cost si
    = { count, kind, where, stmt_info, node, vectype, misalign };
  body_cost_vec->safe_push (si);

  return (unsigned)
    (builtin_vectorization_cost (kind, vectype, misalign) * count);
}

unsigned
record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
		  enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
		  tree vectype, int misalign,
		  enum vect_cost_model_location where)
{
  return record_stmt_cost (body_cost_vec, count, kind, stmt_info, NULL,
			   vectype, misalign, where);
}

unsigned
record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
		  enum vect_cost_for_stmt kind, slp_tree node,
		  tree vectype, int misalign,
		  enum vect_cost_model_location where)
{
  return record_stmt_cost (body_cost_vec, count, kind, NULL, node,
			   vectype, misalign, where);
}

unsigned
record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
		  enum vect_cost_for_stmt kind,
		  enum vect_cost_model_location where)
{
  gcc_assert (kind == cond_branch_taken || kind == cond_branch_not_taken
	      || kind == scalar_stmt);
  return record_stmt_cost (body_cost_vec, count, kind, NULL, NULL,
			   NULL_TREE, 0, where);
}
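/* For illustration: a typical caller records the cost of NCOPIES copies of
   a vector statement in the loop body with

     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);

   and the setup of an invariant operand with a scalar_to_vec entry in
   vect_prologue, as vect_model_simple_cost below does.  */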
/* Return a variable of type ELEM_TYPE[NELEMS].  */

static tree
create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems)
{
  return create_tmp_var (build_array_type_nelts (elem_type, nelems),
			 "vect_array");
}
/* ARRAY is an array of vectors created by create_vector_array.
   Return an SSA_NAME for the vector in index N.  The reference
   is part of the vectorization of STMT_INFO and the vector is associated
   with scalar destination SCALAR_DEST.  */

static tree
read_vector_array (vec_info *vinfo,
		   stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
		   tree scalar_dest, tree array, unsigned HOST_WIDE_INT n)
{
  tree vect_type, vect, vect_name, array_ref;
  gimple *new_stmt;

  gcc_assert (TREE_CODE (TREE_TYPE (array)) == ARRAY_TYPE);
  vect_type = TREE_TYPE (TREE_TYPE (array));
  vect = vect_create_destination_var (scalar_dest, vect_type);
  array_ref = build4 (ARRAY_REF, vect_type, array,
		      build_int_cst (size_type_node, n),
		      NULL_TREE, NULL_TREE);

  new_stmt = gimple_build_assign (vect, array_ref);
  vect_name = make_ssa_name (vect, new_stmt);
  gimple_assign_set_lhs (new_stmt, vect_name);
  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);

  return vect_name;
}
/* ARRAY is an array of vectors created by create_vector_array.
   Emit code to store SSA_NAME VECT in index N of the array.
   The store is part of the vectorization of STMT_INFO.  */

static void
write_vector_array (vec_info *vinfo,
		    stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
		    tree vect, tree array, unsigned HOST_WIDE_INT n)
{
  tree array_ref;
  gimple *new_stmt;

  array_ref = build4 (ARRAY_REF, TREE_TYPE (vect), array,
		      build_int_cst (size_type_node, n),
		      NULL_TREE, NULL_TREE);

  new_stmt = gimple_build_assign (array_ref, vect);
  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
}
/* PTR is a pointer to an array of type TYPE.  Return a representation
   of *PTR.  The memory reference replaces those in FIRST_DR
   (and its group).  */

static tree
create_array_ref (tree type, tree ptr, tree alias_ptr_type)
{
  tree mem_ref;

  mem_ref = build2 (MEM_REF, type, ptr, build_int_cst (alias_ptr_type, 0));
  /* Arrays have the same alignment as their type.  */
  set_ptr_info_alignment (get_ptr_info (ptr), TYPE_ALIGN_UNIT (type), 0);
  return mem_ref;
}
/* Add a clobber of variable VAR to the vectorization of STMT_INFO.
   Emit the clobber before *GSI.  */

static void
vect_clobber_variable (vec_info *vinfo, stmt_vec_info stmt_info,
		       gimple_stmt_iterator *gsi, tree var)
{
  tree clobber = build_clobber (TREE_TYPE (var));
  gimple *new_stmt = gimple_build_assign (var, clobber);
  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
}
/* Utility functions used by vect_mark_stmts_to_be_vectorized.  */

/* Function vect_mark_relevant.

   Mark STMT_INFO as "relevant for vectorization" and add it to WORKLIST.  */

static void
vect_mark_relevant (vec<stmt_vec_info> *worklist, stmt_vec_info stmt_info,
		    enum vect_relevant relevant, bool live_p)
{
  enum vect_relevant save_relevant = STMT_VINFO_RELEVANT (stmt_info);
  bool save_live_p = STMT_VINFO_LIVE_P (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "mark relevant %d, live %d: %G", relevant, live_p,
		     stmt_info->stmt);

  /* If this stmt is an original stmt in a pattern, we might need to mark its
     related pattern stmt instead of the original stmt.  However, such stmts
     may have their own uses that are not in any pattern, in such cases the
     stmt itself should be marked.  */
  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
    {
      /* This is the last stmt in a sequence that was detected as a
	 pattern that can potentially be vectorized.  Don't mark the stmt
	 as relevant/live because it's not going to be vectorized.
	 Instead mark the pattern-stmt that replaces it.  */

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "last stmt in pattern. don't mark"
			 " relevant/live.\n");

      stmt_vec_info old_stmt_info = stmt_info;
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == old_stmt_info);
      save_relevant = STMT_VINFO_RELEVANT (stmt_info);
      save_live_p = STMT_VINFO_LIVE_P (stmt_info);

      if (live_p && relevant == vect_unused_in_scope)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "vec_stmt_relevant_p: forcing live pattern stmt "
			     "relevant.\n");
	  relevant = vect_used_only_live;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "mark relevant %d, live %d: %G", relevant, live_p,
			 stmt_info->stmt);
    }

  STMT_VINFO_LIVE_P (stmt_info) |= live_p;
  if (relevant > STMT_VINFO_RELEVANT (stmt_info))
    STMT_VINFO_RELEVANT (stmt_info) = relevant;

  if (STMT_VINFO_RELEVANT (stmt_info) == save_relevant
      && STMT_VINFO_LIVE_P (stmt_info) == save_live_p)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "already marked relevant/live.\n");
      return;
    }

  worklist->safe_push (stmt_info);
}
/* Function is_simple_and_all_uses_invariant

   Return true if STMT_INFO is simple and all uses of it are invariant.  */

bool
is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
				  loop_vec_info loop_vinfo)
{
  tree op;
  ssa_op_iter iter;

  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
  if (!stmt)
    return false;

  FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
    {
      enum vect_def_type dt = vect_uninitialized_def;

      if (!vect_is_simple_use (op, loop_vinfo, &dt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      if (dt != vect_external_def && dt != vect_constant_def)
	return false;
    }
  return true;
}
/* Function vect_stmt_relevant_p.

   Return true if STMT_INFO, in the loop that is represented by LOOP_VINFO,
   is "relevant for vectorization".

   A stmt is considered "relevant for vectorization" if:
   - it has uses outside the loop.
   - it has vdefs (it alters memory).
   - it is a control stmt in the loop (except for the exit condition).
   - it is an induction and we have multiple exits.

   CHECKME: what other side effects would the vectorizer allow?  */

static bool
vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
		      enum vect_relevant *relevant, bool *live_p)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  def_operand_p def_p;

  *relevant = vect_unused_in_scope;
  *live_p = false;

  /* cond stmt other than loop exit cond.  */
  gimple *stmt = STMT_VINFO_STMT (stmt_info);
  if (is_ctrl_stmt (stmt)
      && LOOP_VINFO_LOOP_IV_COND (loop_vinfo) != stmt
      && (!loop->inner || gimple_bb (stmt)->loop_father == loop))
    *relevant = vect_used_in_scope;

  /* changing memory.  */
  if (gimple_code (stmt_info->stmt) != GIMPLE_PHI)
    if (gimple_vdef (stmt_info->stmt)
	&& !gimple_clobber_p (stmt_info->stmt))
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vec_stmt_relevant_p: stmt has vdefs.\n");
	*relevant = vect_used_in_scope;
      }

  /* uses outside the loop.  */
  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb = gimple_bb (USE_STMT (use_p));
	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (is_gimple_debug (USE_STMT (use_p)))
		continue;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "vec_stmt_relevant_p: used out of loop.\n");

	      /* We expect all such uses to be in the loop exit phis
		 (because of loop closed form)   */
	      gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);

	      *live_p = true;
	    }
	}
    }

  /* Check if it's an induction and multiple exits.  In this case there will be
     a usage later on after peeling which is needed for the alternate exit.  */
  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vec_stmt_relevant_p: induction forced for "
			 "early break.\n");
      *live_p = true;
    }

  if (*live_p && *relevant == vect_unused_in_scope
      && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vec_stmt_relevant_p: stmt live but not relevant.\n");
      *relevant = vect_used_only_live;
    }

  return (*live_p || *relevant);
}
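/* For illustration: given

     for (i = 0; i < n; i++)
       s = a[i];
     ... = s;   <-- use after the loop

   the load "s = a[i]" is not otherwise relevant inside the loop, but its
   def is used outside the loop through the loop-closed exit phi, so it is
   marked live (and ultimately vect_used_only_live).  */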
/* Function exist_non_indexing_operands_for_use_p

   USE is one of the uses attached to STMT_INFO.  Check if USE is
   used in STMT_INFO for anything other than indexing an array.  */

static bool
exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
{
  tree operand;

  /* USE corresponds to some operand in STMT.  If there is no data
     reference in STMT, then any operand that corresponds to USE
     is not indexing an array.  */
  if (!STMT_VINFO_DATA_REF (stmt_info))
    return true;

  /* STMT has a data_ref.  FORNOW this means that it's of one of
     the following forms:
     -1- ARRAY_REF = var
     -2- var = ARRAY_REF
     (This should have been verified in analyze_data_refs).

     'var' in the second case corresponds to a def, not a use,
     so USE cannot correspond to any operands that are not used
     for array indexing.

     Therefore, all we need to check is if STMT falls into the
     first case, and whether var corresponds to USE.  */

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (!assign || !gimple_assign_copy_p (assign))
    {
      gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
      if (call && gimple_call_internal_p (call))
	{
	  internal_fn ifn = gimple_call_internal_fn (call);
	  int mask_index = internal_fn_mask_index (ifn);
	  if (mask_index >= 0
	      && use == gimple_call_arg (call, mask_index))
	    return true;
	  int stored_value_index = internal_fn_stored_value_index (ifn);
	  if (stored_value_index >= 0
	      && use == gimple_call_arg (call, stored_value_index))
	    return true;
	  if (internal_gather_scatter_fn_p (ifn)
	      && use == gimple_call_arg (call, 1))
	    return true;
	}
      return false;
    }

  if (TREE_CODE (gimple_assign_lhs (assign)) == SSA_NAME)
    return false;
  operand = gimple_assign_rhs1 (assign);
  if (TREE_CODE (operand) != SSA_NAME)
    return false;

  if (operand == use)
    return true;

  return false;
}
/*
   Function process_use.

   Inputs:
   - a USE in STMT_VINFO in a loop represented by LOOP_VINFO
   - RELEVANT - enum value to be set in the STMT_VINFO of the stmt
     that defined USE.  This is done by calling mark_relevant and passing it
     the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
   - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't
     be performed.

   Outputs:
   Generally, LIVE_P and RELEVANT are used to define the liveness and
   relevance info of the DEF_STMT of this USE:
       STMT_VINFO_LIVE_P (DEF_stmt_vinfo) <-- live_p
       STMT_VINFO_RELEVANT (DEF_stmt_vinfo) <-- relevant
   Exceptions:
   - case 1: If USE is used only for address computations (e.g. array indexing),
     which does not need to be directly vectorized, then the liveness/relevance
     of the respective DEF_STMT is left unchanged.
   - case 2: If STMT_VINFO is a reduction phi and DEF_STMT is a reduction stmt,
     we skip DEF_STMT because it had already been processed.
   - case 3: If DEF_STMT and STMT_VINFO are in different nests, then
     "relevant" will be modified accordingly.

   Return true if everything is as expected.  Return false otherwise.  */

static opt_result
process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo,
	     enum vect_relevant relevant, vec<stmt_vec_info> *worklist,
	     bool force)
{
  stmt_vec_info dstmt_vinfo;
  enum vect_def_type dt;

  /* case 1: we are only interested in uses that need to be vectorized.  Uses
     that are used for address computation are not considered relevant.  */
  if (!force && !exist_non_indexing_operands_for_use_p (use, stmt_vinfo))
    return opt_result::success ();

  if (!vect_is_simple_use (use, loop_vinfo, &dt, &dstmt_vinfo))
    return opt_result::failure_at (stmt_vinfo->stmt,
				   "not vectorized:"
				   " unsupported use in stmt.\n");

  if (!dstmt_vinfo)
    return opt_result::success ();

  basic_block def_bb = gimple_bb (dstmt_vinfo->stmt);
  basic_block bb = gimple_bb (stmt_vinfo->stmt);

  /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO).
     We have to force the stmt live since the epilogue loop needs it to
     continue computing the reduction.  */
  if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI
      && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
      && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI
      && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def
      && bb->loop_father == def_bb->loop_father)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reduc-stmt defining reduc-phi in the same nest.\n");
      vect_mark_relevant (worklist, dstmt_vinfo, relevant, true);
      return opt_result::success ();
    }

  /* case 3a: outer-loop stmt defining an inner-loop stmt:
	outer-loop-header-bb:
		d = dstmt_vinfo
	inner-loop:
		stmt # use (d)
	outer-loop-tail-bb:
		...		  */
  if (flow_loop_nested_p (def_bb->loop_father, bb->loop_father))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "outer-loop def-stmt defining inner-loop stmt.\n");

      switch (relevant)
	{
	case vect_unused_in_scope:
	  relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_nested_cycle) ?
		      vect_used_in_scope : vect_unused_in_scope;
	  break;

	case vect_used_in_outer_by_reduction:
	  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def);
	  relevant = vect_used_by_reduction;
	  break;

	case vect_used_in_outer:
	  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def);
	  relevant = vect_used_in_scope;
	  break;

	case vect_used_in_scope:
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  /* case 3b: inner-loop stmt defining an outer-loop stmt:
	outer-loop-header-bb:
		...
	inner-loop:
		d = dstmt_vinfo
	outer-loop-tail-bb (or outer-loop-exit-bb in double reduction):
		stmt # use (d)		*/
  else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "inner-loop def-stmt defining outer-loop stmt.\n");

      switch (relevant)
	{
	case vect_unused_in_scope:
	  relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
	      || STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_double_reduction_def) ?
		      vect_used_in_outer_by_reduction : vect_unused_in_scope;
	  break;

	case vect_used_by_reduction:
	case vect_used_only_live:
	  relevant = vect_used_in_outer_by_reduction;
	  break;

	case vect_used_in_scope:
	  relevant = vect_used_in_outer;
	  break;

	default:
	  gcc_unreachable ();
	}
    }
  /* We are also not interested in uses on loop PHI backedges that are
     inductions.  Otherwise we'll needlessly vectorize the IV increment
     and cause hybrid SLP for SLP inductions.  Unless the PHI is live
     of course.  */
  else if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI
	   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_induction_def
	   && ! STMT_VINFO_LIVE_P (stmt_vinfo)
	   && (PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
				      loop_latch_edge (bb->loop_father))
	       == use))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "induction value on backedge.\n");
      return opt_result::success ();
    }

  vect_mark_relevant (worklist, dstmt_vinfo, relevant, false);
  return opt_result::success ();
}
/* Function vect_mark_stmts_to_be_vectorized.

   Not all stmts in the loop need to be vectorized.  For example:

     for i...
       for j...
   1.    T0 = i + j
   2.    T1 = a[T0]

   3.    j = j + 1

   Stmts 1 and 3 do not need to be vectorized, because loop control and
   addressing of vectorized data-refs are handled differently.

   This pass detects such stmts.  */

opt_result
vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int i;
  basic_block bb;
  bool live_p;
  enum vect_relevant relevant;

  DUMP_VECT_SCOPE ("vect_mark_stmts_to_be_vectorized");

  auto_vec<stmt_vec_info, 64> worklist;

  /* 1. Init worklist.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info phi_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "init: phi relevant? %G",
			     phi_info->stmt);

	  if (vect_stmt_relevant_p (phi_info, loop_vinfo, &relevant, &live_p))
	    vect_mark_relevant (&worklist, phi_info, relevant, live_p);
	}
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "init: stmt relevant? %G", stmt_info->stmt);

	  if (vect_stmt_relevant_p (stmt_info, loop_vinfo, &relevant, &live_p))
	    vect_mark_relevant (&worklist, stmt_info, relevant, live_p);
	}
    }

  /* 2. Process_worklist */
  while (worklist.length () > 0)
    {
      use_operand_p use_p;
      ssa_op_iter iter;

      stmt_vec_info stmt_vinfo = worklist.pop ();
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "worklist: examine stmt: %G", stmt_vinfo->stmt);

      /* Examine the USEs of STMT.  For each USE, mark the stmt that defines it
	 (DEF_STMT) as relevant/irrelevant according to the relevance property
	 of STMT.  */
      relevant = STMT_VINFO_RELEVANT (stmt_vinfo);

      /* Generally, the relevance property of STMT (in STMT_VINFO_RELEVANT) is
	 propagated as is to the DEF_STMTs of its USEs.

	 One exception is when STMT has been identified as defining a reduction
	 variable; in this case we set the relevance to vect_used_by_reduction.
	 This is because we distinguish between two kinds of relevant stmts -
	 those that are used by a reduction computation, and those that are
	 (also) used by a regular computation.  This allows us later on to
	 identify stmts that are used solely by a reduction, and therefore the
	 order of the results that they produce does not have to be kept.  */

      switch (STMT_VINFO_DEF_TYPE (stmt_vinfo))
	{
	case vect_reduction_def:
	  gcc_assert (relevant != vect_unused_in_scope);
	  if (relevant != vect_unused_in_scope
	      && relevant != vect_used_in_scope
	      && relevant != vect_used_by_reduction
	      && relevant != vect_used_only_live)
	    return opt_result::failure_at
	      (stmt_vinfo->stmt, "unsupported use of reduction.\n");
	  break;

	case vect_nested_cycle:
	  if (relevant != vect_unused_in_scope
	      && relevant != vect_used_in_outer_by_reduction
	      && relevant != vect_used_in_outer)
	    return opt_result::failure_at
	      (stmt_vinfo->stmt, "unsupported use of nested cycle.\n");
	  break;

	case vect_double_reduction_def:
	  if (relevant != vect_unused_in_scope
	      && relevant != vect_used_by_reduction
	      && relevant != vect_used_only_live)
	    return opt_result::failure_at
	      (stmt_vinfo->stmt, "unsupported use of double reduction.\n");
	  break;

	default:
	  break;
	}

      if (is_pattern_stmt_p (stmt_vinfo))
	{
	  /* Pattern statements are not inserted into the code, so
	     FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we
	     have to scan the RHS or function arguments instead.  */
	  if (gassign *assign = dyn_cast <gassign *> (stmt_vinfo->stmt))
	    {
	      enum tree_code rhs_code = gimple_assign_rhs_code (assign);
	      tree op = gimple_assign_rhs1 (assign);

	      i = 1;
	      if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op))
		{
		  opt_result res
		    = process_use (stmt_vinfo, TREE_OPERAND (op, 0),
				   loop_vinfo, relevant, &worklist, false);
		  if (!res)
		    return res;
		  res = process_use (stmt_vinfo, TREE_OPERAND (op, 1),
				     loop_vinfo, relevant, &worklist, false);
		  if (!res)
		    return res;
		  i = 2;
		}
	      for (; i < gimple_num_ops (assign); i++)
		{
		  op = gimple_op (assign, i);
		  if (TREE_CODE (op) == SSA_NAME)
		    {
		      opt_result res
			= process_use (stmt_vinfo, op, loop_vinfo, relevant,
				       &worklist, false);
		      if (!res)
			return res;
		    }
		}
	    }
	  else if (gcond *cond = dyn_cast <gcond *> (stmt_vinfo->stmt))
	    {
	      tree_code rhs_code = gimple_cond_code (cond);
	      gcc_assert (TREE_CODE_CLASS (rhs_code) == tcc_comparison);
	      opt_result res
		= process_use (stmt_vinfo, gimple_cond_lhs (cond),
			       loop_vinfo, relevant, &worklist, false);
	      if (!res)
		return res;
	      res = process_use (stmt_vinfo, gimple_cond_rhs (cond),
				 loop_vinfo, relevant, &worklist, false);
	      if (!res)
		return res;
	    }
	  else if (gcall *call = dyn_cast <gcall *> (stmt_vinfo->stmt))
	    {
	      for (i = 0; i < gimple_call_num_args (call); i++)
		{
		  tree arg = gimple_call_arg (call, i);
		  opt_result res
		    = process_use (stmt_vinfo, arg, loop_vinfo, relevant,
				   &worklist, false);
		  if (!res)
		    return res;
		}
	    }
	  else
	    gcc_unreachable ();
	}
      else
	FOR_EACH_PHI_OR_STMT_USE (use_p, stmt_vinfo->stmt, iter, SSA_OP_USE)
	  {
	    tree op = USE_FROM_PTR (use_p);
	    opt_result res
	      = process_use (stmt_vinfo, op, loop_vinfo, relevant,
			     &worklist, false);
	    if (!res)
	      return res;
	  }

      if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
	{
	  gather_scatter_info gs_info;
	  if (!vect_check_gather_scatter (stmt_vinfo, loop_vinfo, &gs_info))
	    gcc_unreachable ();
	  opt_result res
	    = process_use (stmt_vinfo, gs_info.offset, loop_vinfo, relevant,
			   &worklist, true);
	  if (!res)
	    {
	      if (fatal)
		*fatal = false;
	      return res;
	    }
	}
    } /* while worklist */

  return opt_result::success ();
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

static void
vect_model_simple_cost (vec_info *,
			stmt_vec_info stmt_info, int ncopies,
			enum vect_def_type *dt,
			int ndts,
			slp_tree node,
			stmt_vector_for_cost *cost_vec,
			vect_cost_for_stmt kind = vector_stmt)
{
  int inside_cost = 0, prologue_cost = 0;

  gcc_assert (cost_vec != NULL);

  /* ??? Somehow we need to fix this at the callers.  */
  if (node)
    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (node);

  if (!node)
    /* Cost the "broadcast" of a scalar operand into a vector operand.
       Use scalar_to_vec to cost the broadcast, as elsewhere in the vector
       cost model.  */
    for (int i = 0; i < ndts; i++)
      if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
	prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
					   stmt_info, 0, vect_prologue);

  /* Pass the inside-of-loop statements to the target-specific cost model.  */
  inside_cost += record_stmt_cost (cost_vec, ncopies, kind,
				   stmt_info, 0, vect_body);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_model_simple_cost: inside_cost = %d, "
		     "prologue_cost = %d .\n", inside_cost, prologue_cost);
}
/* Model cost for type demotion and promotion operations.  PWR is
   normally zero for single-step promotions and demotions.  It will be
   one if two-step promotion/demotion is required, and so on.  NCOPIES
   is the number of vector results (and thus number of instructions)
   for the narrowest end of the operation chain.  Each additional
   step doubles the number of instructions required.  If WIDEN_ARITH
   is true the stmt is doing widening arithmetic.  */

static void
vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
				    enum vect_def_type *dt,
				    unsigned int ncopies, int pwr,
				    stmt_vector_for_cost *cost_vec,
				    bool widen_arith)
{
  int i;
  int inside_cost = 0, prologue_cost = 0;

  for (i = 0; i < pwr + 1; i++)
    {
      inside_cost += record_stmt_cost (cost_vec, ncopies,
				       widen_arith
				       ? vector_stmt : vec_promote_demote,
				       stmt_info, 0, vect_body);
      ncopies *= 2;
    }

  /* FORNOW: Assuming maximum 2 args per stmt.  */
  for (i = 0; i < 2; i++)
    if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
      prologue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
					 stmt_info, 0, vect_prologue);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_model_promotion_demotion_cost: inside_cost = %d, "
		     "prologue_cost = %d .\n", inside_cost, prologue_cost);
}
/* Returns true if the current function returns DECL.  */

static bool
cfun_returns (tree decl)
{
  edge_iterator ei;
  edge e;
  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
    {
      greturn *ret = safe_dyn_cast <greturn *> (*gsi_last_bb (e->src));
      if (!ret)
	continue;
      if (gimple_return_retval (ret) == decl)
	return true;
      /* We often end up with an aggregate copy to the result decl,
	 handle that case as well.  First skip intermediate clobbers
	 though.  */
      gimple *def = ret;
      do
	{
	  def = SSA_NAME_DEF_STMT (gimple_vuse (def));
	}
      while (gimple_clobber_p (def));
      if (is_a <gassign *> (def)
	  && gimple_assign_lhs (def) == gimple_return_retval (ret)
	  && gimple_assign_rhs1 (def) == decl)
	return true;
    }
  return false;
}
/* Calculate cost of DR's memory access.  */
void
vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
		     dr_alignment_support alignment_support_scheme,
		     int misalignment,
		     unsigned int *inside_cost,
		     stmt_vector_for_cost *body_cost_vec)
{
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
					  vector_store, stmt_info, 0,
					  vect_body);

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_store_cost: aligned.\n");
	break;
      }

    case dr_unaligned_supported:
      {
	/* Here, we assign an additional cost for the unaligned store.  */
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
					  unaligned_store, stmt_info,
					  misalignment, vect_body);
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_store_cost: unaligned supported by "
			   "hardware.\n");
	break;
      }

    case dr_unaligned_unsupported:
      {
	*inside_cost = VECT_MAX_COST;

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "vect_model_store_cost: unsupported access.\n");
	break;
      }

    default:
      gcc_unreachable ();
    }
}
/* Calculate cost of DR's memory access.  */
void
vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
		    dr_alignment_support alignment_support_scheme,
		    int misalignment,
		    bool add_realign_cost, unsigned int *inside_cost,
		    unsigned int *prologue_cost,
		    stmt_vector_for_cost *prologue_cost_vec,
		    stmt_vector_for_cost *body_cost_vec,
		    bool record_prologue_costs)
{
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
					  stmt_info, 0, vect_body);

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_load_cost: aligned.\n");

	break;
      }
    case dr_unaligned_supported:
      {
	/* Here, we assign an additional cost for the unaligned load.  */
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
					  unaligned_load, stmt_info,
					  misalignment, vect_body);

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_load_cost: unaligned supported by "
			   "hardware.\n");

	break;
      }
    case dr_explicit_realign:
      {
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
					  vector_load, stmt_info, 0, vect_body);
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
					  vec_perm, stmt_info, 0, vect_body);

	/* FIXME: If the misalignment remains fixed across the iterations of
	   the containing loop, the following cost should be added to the
	   prologue costs.  */
	if (targetm.vectorize.builtin_mask_for_load)
	  *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
					    stmt_info, 0, vect_body);

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_load_cost: explicit realign\n");

	break;
      }
    case dr_explicit_realign_optimized:
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_load_cost: unaligned software "
			   "pipelined.\n");

	/* Unaligned software pipeline has a load of an address, an initial
	   load, and possibly a mask operation to "prime" the loop.  However,
	   if this is an access in a group of loads, which provide grouped
	   access, then the above cost should only be considered for one
	   access in the group.  Inside the loop, there is a load op
	   and a realignment op.  */

	if (add_realign_cost && record_prologue_costs)
	  {
	    *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
						vector_stmt, stmt_info,
						0, vect_prologue);
	    if (targetm.vectorize.builtin_mask_for_load)
	      *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
						  vector_stmt, stmt_info,
						  0, vect_prologue);
	  }

	*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
					  stmt_info, 0, vect_body);
	*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
					  stmt_info, 0, vect_body);

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "vect_model_load_cost: explicit realign optimized"
			   "\n");

	break;
      }

    case dr_unaligned_unsupported:
      {
	*inside_cost = VECT_MAX_COST;

	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "vect_model_load_cost: unsupported access.\n");
	break;
      }

    default:
      gcc_unreachable ();
    }
}
/* Insert the new stmt NEW_STMT at *GSI or at the appropriate place in
   the loop preheader for the vectorized stmt STMT_VINFO.  */

static void
vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
		    gimple_stmt_iterator *gsi)
{
  if (gsi)
    vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi);
  else
    vinfo->insert_on_entry (stmt_vinfo, new_stmt);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "created new init_stmt: %G", new_stmt);
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new variable of type
   TYPE with the value VAL.  If TYPE is a vector type and VAL does not have
   vector type, a vector with all elements equal to VAL is created first.
   Place the initialization at GSI if it is not NULL.  Otherwise, place the
   initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT_INFO.  */

tree
vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
		  gimple_stmt_iterator *gsi)
{
  gimple *init_stmt;
  tree new_temp;

  /* We abuse this function to push sth to a SSA name with initial 'val'.  */
  if (! useless_type_conversion_p (type, TREE_TYPE (val)))
    {
      gcc_assert (VECTOR_TYPE_P (type));
      if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
	{
	  /* Scalar boolean value should be transformed into
	     all zeros or all ones value before building a vector.  */
	  if (VECTOR_BOOLEAN_TYPE_P (type))
	    {
	      tree true_val = build_all_ones_cst (TREE_TYPE (type));
	      tree false_val = build_zero_cst (TREE_TYPE (type));

	      if (CONSTANT_CLASS_P (val))
		val = integer_zerop (val) ? false_val : true_val;
	      else
		{
		  new_temp = make_ssa_name (TREE_TYPE (type));
		  init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						   val, true_val, false_val);
		  vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
		  val = new_temp;
		}
	    }
	  else
	    {
	      gimple_seq stmts = NULL;
	      if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
		val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
				    TREE_TYPE (type), val);
	      else
		/* ??? Condition vectorization expects us to do
		   promotion of invariant/external defs.  */
		val = gimple_convert (&stmts, TREE_TYPE (type), val);
	      for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
		   !gsi_end_p (gsi2); )
		{
		  init_stmt = gsi_stmt (gsi2);
		  gsi_remove (&gsi2, false);
		  vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
		}
	    }
	}
      val = build_vector_from_val (type, val);
    }

  new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_");
  init_stmt = gimple_build_assign (new_temp, val);
  vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
  return new_temp;
}
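/* For illustration: with TYPE V4SF and a scalar invariant VAL this emits
   (in the loop preheader when GSI is NULL) something along the lines of

     cst_1 = { val, val, val, val };

   and returns cst_1 for use as a vector operand.  */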
/* Function vect_get_vec_defs_for_operand.

   OP is an operand in STMT_VINFO.  This function returns a vector of
   NCOPIES defs that will be used in the vectorized stmts for STMT_VINFO.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMTS of the defining stmt holds the relevant defs.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  VECTYPE may be used to specify a required type for
   vector invariant.  */

void
vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info stmt_vinfo,
			       unsigned ncopies,
			       tree op, vec<tree> *vec_oprnds, tree vectype)
{
  gimple *def_stmt;
  enum vect_def_type dt;
  bool is_simple_use;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_vec_defs_for_operand: %T\n", op);

  stmt_vec_info def_stmt_info;
  is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt,
				      &def_stmt_info, &def_stmt);
  gcc_assert (is_simple_use);
  if (def_stmt && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "  def_stmt =  %G", def_stmt);

  vec_oprnds->create (ncopies);
  if (dt == vect_constant_def || dt == vect_external_def)
    {
      tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
      tree vector_type;

      if (vectype)
	vector_type = vectype;
      else if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
	       && VECTOR_BOOLEAN_TYPE_P (stmt_vectype))
	vector_type = truth_type_for (stmt_vectype);
      else
	vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op));

      gcc_assert (vector_type);
      tree vop = vect_init_vector (vinfo, stmt_vinfo, op, vector_type, NULL);
      while (ncopies--)
	vec_oprnds->quick_push (vop);
    }
  else
    {
      def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
      gcc_assert (STMT_VINFO_VEC_STMTS (def_stmt_info).length () == ncopies);
      for (unsigned i = 0; i < ncopies; ++i)
	vec_oprnds->quick_push (gimple_get_lhs
				  (STMT_VINFO_VEC_STMTS (def_stmt_info)[i]));
    }
}
/* Get vectorized definitions for OP0, OP1 and, when non-null, OP2 and OP3.  */

void
vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node,
		   unsigned ncopies,
		   tree op0, tree vectype0, vec<tree> *vec_oprnds0,
		   tree op1, tree vectype1, vec<tree> *vec_oprnds1,
		   tree op2, tree vectype2, vec<tree> *vec_oprnds2,
		   tree op3, tree vectype3, vec<tree> *vec_oprnds3)
{
  if (slp_node)
    {
      if (op0)
	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_oprnds0);
      if (op1)
	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[1], vec_oprnds1);
      if (op2)
	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[2], vec_oprnds2);
      if (op3)
	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[3], vec_oprnds3);
    }
  else
    {
      if (op0)
	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
				       op0, vec_oprnds0, vectype0);
      if (op1)
	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
				       op1, vec_oprnds1, vectype1);
      if (op2)
	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
				       op2, vec_oprnds2, vectype2);
      if (op3)
	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
				       op3, vec_oprnds3, vectype3);
    }
}

void
vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node,
		   unsigned ncopies,
		   tree op0, vec<tree> *vec_oprnds0,
		   tree op1, vec<tree> *vec_oprnds1,
		   tree op2, vec<tree> *vec_oprnds2,
		   tree op3, vec<tree> *vec_oprnds3)
{
  vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
		     op0, NULL_TREE, vec_oprnds0,
		     op1, NULL_TREE, vec_oprnds1,
		     op2, NULL_TREE, vec_oprnds2,
		     op3, NULL_TREE, vec_oprnds3);
}
/* Helper function called by vect_finish_replace_stmt and
   vect_finish_stmt_generation.  Set the location of the new
   statement and create and return a stmt_vec_info for it.  */

static void
vect_finish_stmt_generation_1 (vec_info *,
			       stmt_vec_info stmt_info, gimple *vec_stmt)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt);

  if (stmt_info)
    {
      gimple_set_location (vec_stmt, gimple_location (stmt_info->stmt));

      /* While EH edges will generally prevent vectorization, stmt might
	 e.g. be in a must-not-throw region.  Ensure newly created stmts
	 that could throw are part of the same region.  */
      int lp_nr = lookup_stmt_eh_lp (stmt_info->stmt);
      if (lp_nr != 0 && stmt_could_throw_p (cfun, vec_stmt))
	add_stmt_to_eh_lp (vec_stmt, lp_nr);
    }
  else
    gcc_assert (!stmt_could_throw_p (cfun, vec_stmt));
}

/* Replace the scalar statement STMT_INFO with a new vector statement VEC_STMT,
   which sets the same scalar result as STMT_INFO did.  Create and return a
   stmt_vec_info for VEC_STMT.  */

void
vect_finish_replace_stmt (vec_info *vinfo,
			  stmt_vec_info stmt_info, gimple *vec_stmt)
{
  gimple *scalar_stmt = vect_orig_stmt (stmt_info)->stmt;
  gcc_assert (gimple_get_lhs (scalar_stmt) == gimple_get_lhs (vec_stmt));

  gimple_stmt_iterator gsi = gsi_for_stmt (scalar_stmt);
  gsi_replace (&gsi, vec_stmt, true);

  vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
}
/* Add VEC_STMT to the vectorized implementation of STMT_INFO and insert it
   before *GSI.  Create and return a stmt_vec_info for VEC_STMT.  */

void
vect_finish_stmt_generation (vec_info *vinfo,
			     stmt_vec_info stmt_info, gimple *vec_stmt,
			     gimple_stmt_iterator *gsi)
{
  gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);

  if (!gsi_end_p (*gsi)
      && gimple_has_mem_ops (vec_stmt))
    {
      gimple *at_stmt = gsi_stmt (*gsi);
      tree vuse = gimple_vuse (at_stmt);
      if (vuse && TREE_CODE (vuse) == SSA_NAME)
	{
	  tree vdef = gimple_vdef (at_stmt);
	  gimple_set_vuse (vec_stmt, gimple_vuse (at_stmt));
	  gimple_set_modified (vec_stmt, true);
	  /* If we have an SSA vuse and insert a store, update virtual
	     SSA form to avoid triggering the renamer.  Do so only
	     if we can easily see all uses - which is what almost always
	     happens with the way vectorized stmts are inserted.  */
	  if ((vdef && TREE_CODE (vdef) == SSA_NAME)
	      && ((is_gimple_assign (vec_stmt)
		   && !is_gimple_reg (gimple_assign_lhs (vec_stmt)))
		  || (is_gimple_call (vec_stmt)
		      && (!(gimple_call_flags (vec_stmt)
			    & (ECF_CONST|ECF_PURE|ECF_NOVOPS))
			  || (gimple_call_lhs (vec_stmt)
			      && !is_gimple_reg (gimple_call_lhs (vec_stmt)))))))
	    {
	      tree new_vdef = copy_ssa_name (vuse, vec_stmt);
	      gimple_set_vdef (vec_stmt, new_vdef);
	      SET_USE (gimple_vuse_op (at_stmt), new_vdef);
	    }
	}
    }
  gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
  vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
}
/* We want to vectorize a call to combined function CFN with function
   decl FNDECL, using VECTYPE_OUT as the type of the output and VECTYPE_IN
   as the types of all inputs.  Check whether this is possible using
   an internal function, returning its code if so or IFN_LAST if not.  */

static internal_fn
vectorizable_internal_function (combined_fn cfn, tree fndecl,
				tree vectype_out, tree vectype_in)
{
  internal_fn ifn;
  if (internal_fn_p (cfn))
    ifn = as_internal_fn (cfn);
  else
    ifn = associated_internal_fn (fndecl);
  if (ifn != IFN_LAST && direct_internal_fn_p (ifn))
    {
      const direct_internal_fn_info &info = direct_internal_fn (ifn);
      if (info.vectorizable)
	{
	  bool same_size_p = TYPE_SIZE (vectype_in) == TYPE_SIZE (vectype_out);
	  tree type0 = (info.type0 < 0 ? vectype_out : vectype_in);
	  tree type1 = (info.type1 < 0 ? vectype_out : vectype_in);

	  /* The type sizes of vectype_in and vectype_out must be exactly
	     the same when vectype_out does not participate in the optab
	     query; there is no size restriction when vectype_out is part
	     of the optab query.  */
	  if (type0 != vectype_out && type1 != vectype_out && !same_size_p)
	    return IFN_LAST;

	  if (direct_internal_fn_supported_p (ifn, tree_pair (type0, type1),
					      OPTIMIZE_FOR_SPEED))
	    return ifn;
	}
    }
  return IFN_LAST;
}
static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
				  gimple_stmt_iterator *);

/* Check whether a load or store statement in the loop described by
   LOOP_VINFO is possible in a loop using partial vectors.  This is
   testing whether the vectorizer pass has the appropriate support,
   as well as whether the target does.

   VLS_TYPE says whether the statement is a load or store and VECTYPE
   is the type of the vector being loaded or stored.  SLP_NODE is the SLP
   node that contains the statement, or null if none.  MEMORY_ACCESS_TYPE
   says how the load or store is going to be implemented and GROUP_SIZE
   is the number of load or store statements in the containing group.
   If the access is a gather load or scatter store, GS_INFO describes
   its arguments.  If the load or store is conditional, SCALAR_MASK is the
   condition under which it occurs.

   Clear LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P if a loop using partial
   vectors is not supported, otherwise record the required rgroup control
   types.  */

static void
check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
				      slp_tree slp_node,
				      vec_load_store_type vls_type,
				      int group_size,
				      vect_memory_access_type
				      memory_access_type,
				      gather_scatter_info *gs_info,
				      tree scalar_mask)
{
  /* Invariant loads need no special support.  */
  if (memory_access_type == VMAT_INVARIANT)
    return;

  unsigned int nvectors;
  if (slp_node)
    nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  else
    nvectors = vect_get_num_copies (loop_vinfo, vectype);

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  machine_mode vecmode = TYPE_MODE (vectype);
  bool is_load = (vls_type == VLS_LOAD);
  if (memory_access_type == VMAT_LOAD_STORE_LANES)
    {
      internal_fn ifn
	= (is_load ? vect_load_lanes_supported (vectype, group_size, true)
		   : vect_store_lanes_supported (vectype, group_size, true));
      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
			       scalar_mask);
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "can't operate on partial vectors because"
			     " the target doesn't have an appropriate"
			     " load/store-lanes instruction.\n");
	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
	}
      return;
    }

  if (memory_access_type == VMAT_GATHER_SCATTER)
    {
      internal_fn ifn = (is_load
			 ? IFN_MASK_GATHER_LOAD
			 : IFN_MASK_SCATTER_STORE);
      internal_fn len_ifn = (is_load
			     ? IFN_MASK_LEN_GATHER_LOAD
			     : IFN_MASK_LEN_SCATTER_STORE);
      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
						  gs_info->memory_type,
						  gs_info->offset_vectype,
						  gs_info->scale))
	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
      else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
						       gs_info->memory_type,
						       gs_info->offset_vectype,
						       gs_info->scale))
	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
			       scalar_mask);
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "can't operate on partial vectors because"
			     " the target doesn't have an appropriate"
			     " gather load or scatter store instruction.\n");
	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
	}
      return;
    }

  if (memory_access_type != VMAT_CONTIGUOUS
      && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
    {
      /* Element X of the data must come from iteration i * VF + X of the
	 scalar loop.  We need more work to support other mappings.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because an"
			 " access isn't contiguous.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
      return;
    }

  if (!VECTOR_MODE_P (vecmode))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors when emulating"
			 " vector operations.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
      return;
    }

  /* We might load more scalars than we need for permuting SLP loads.
     We checked in get_group_load_store_type that the extra elements
     don't leak into a new vector.  */
  auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits)
  {
    unsigned int nvectors;
    if (can_div_away_from_zero_p (size, nunits, &nvectors))
      return nvectors;
    gcc_unreachable ();
  };

  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  machine_mode mask_mode;
  machine_mode vmode;
  bool using_partial_vectors_p = false;
  if (get_len_load_store_mode (vecmode, is_load).exists (&vmode))
    {
      nvectors = group_memory_nvectors (group_size * vf, nunits);
      unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
      using_partial_vectors_p = true;
    }
  else if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
	   && can_vec_mask_load_store_p (vecmode, mask_mode, is_load))
    {
      nvectors = group_memory_nvectors (group_size * vf, nunits);
      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
      using_partial_vectors_p = true;
    }

  if (!using_partial_vectors_p)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because the"
			 " target doesn't have the appropriate partial"
			 " vectorization load or store.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
}
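/* For illustration (assuming a fully-masked target such as AArch64 SVE):
   once a loop mask has been recorded here, the transform phase emits
   accesses along the lines of

     loop_mask_1 = .WHILE_ULT (i, n, { 0, ... });
     vect__2 = .MASK_LOAD (addr, align, loop_mask_1);

   so the final iteration can operate on a partial vector instead of
   requiring a scalar epilogue.  Length-based targets record a loop
   length and use the corresponding LEN_ internal functions instead.  */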
/* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
   form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
   that needs to be applied to all loads and stores in a vectorized loop.
   Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
   otherwise return VEC_MASK & LOOP_MASK.

   MASK_TYPE is the type of both masks.  If new statements are needed,
   insert them before GSI.  */

tree
prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type, tree loop_mask,
		  tree vec_mask, gimple_stmt_iterator *gsi)
{
  gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
  if (!loop_mask)
    return vec_mask;

  gcc_assert (TREE_TYPE (loop_mask) == mask_type);

  if (loop_vinfo->vec_cond_masked_set.contains ({ vec_mask, loop_mask }))
    return vec_mask;

  tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
  gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
					  vec_mask, loop_mask);

  gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
  return and_res;
}
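/* For illustration: when both masks are needed this emits

     vec_mask_and_1 = vec_mask & loop_mask;

   and the combined mask is what the masked load or store then consumes.  */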
/* Determine whether we can use a gather load or scatter store to vectorize
   strided load or store STMT_INFO by truncating the current offset to a
   smaller width.  We need to be able to construct an offset vector:

     { 0, X, X*2, X*3, ... }

   without loss of precision, where X is STMT_INFO's DR_STEP.

   Return true if this is possible, describing the gather load or scatter
   store in GS_INFO.  MASKED_P is true if the load or store is conditional.  */

static bool
vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
				     loop_vec_info loop_vinfo, bool masked_p,
				     gather_scatter_info *gs_info)
{
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  data_reference *dr = dr_info->dr;
  tree step = DR_STEP (dr);
  if (TREE_CODE (step) != INTEGER_CST)
    {
      /* ??? Perhaps we could use range information here?  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "cannot truncate variable step.\n");
      return false;
    }

  /* Get the number of bits in an element.  */
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);

  /* Set COUNT to the upper limit on the number of elements - 1.
     Start with the maximum vectorization factor.  */
  unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;

  /* Try lowering COUNT to the number of scalar latch iterations.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  widest_int max_iters;
  if (max_loop_iterations (loop, &max_iters)
      && max_iters < count)
    count = max_iters.to_shwi ();

  /* Try scales of 1 and the element size.  */
  int scales[] = { 1, vect_get_scalar_dr_size (dr_info) };
  wi::overflow_type overflow = wi::OVF_NONE;
  for (int i = 0; i < 2; ++i)
    {
      int scale = scales[i];
      widest_int factor;
      if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
	continue;

      /* Determine the minimum precision of (COUNT - 1) * STEP / SCALE.  */
      widest_int range = wi::mul (count, factor, SIGNED, &overflow);
      if (overflow)
	continue;
      signop sign = range >= 0 ? UNSIGNED : SIGNED;
      unsigned int min_offset_bits = wi::min_precision (range, sign);

      /* Find the narrowest viable offset type.  */
      unsigned int offset_bits = 1U << ceil_log2 (min_offset_bits);
      tree offset_type = build_nonstandard_integer_type (offset_bits,
							 sign == UNSIGNED);

      /* See whether the target supports the operation with an offset
	 no narrower than OFFSET_TYPE.  */
      tree memory_type = TREE_TYPE (DR_REF (dr));
      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
				     vectype, memory_type, offset_type, scale,
				     &gs_info->ifn, &gs_info->offset_vectype)
	  || gs_info->ifn == IFN_LAST)
	continue;

      gs_info->decl = NULL_TREE;
      /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
	 but we don't need to store that here.  */
      gs_info->base = NULL_TREE;
      gs_info->element_type = TREE_TYPE (vectype);
      gs_info->offset = fold_convert (offset_type, step);
      gs_info->offset_dt = vect_constant_def;
      gs_info->scale = scale;
      gs_info->memory_type = memory_type;
      return true;
    }

  if (overflow && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "truncating gather/scatter offset to %d bits"
		     " might change its value.\n", element_bits);

  return false;
}
1762 /* Return true if we can use gather/scatter internal functions to
1763 vectorize STMT_INFO, which is a grouped or strided load or store.
1764 MASKED_P is true if the load or store is conditional. When returning
1765 true, fill in GS_INFO with the information required to perform the
1766 operation. */
1768 static bool
1769 vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
1770 loop_vec_info loop_vinfo, bool masked_p,
1771 gather_scatter_info *gs_info)
1773 if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info)
1774 || gs_info->ifn == IFN_LAST)
1775 return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
1776 masked_p, gs_info);
1778 tree old_offset_type = TREE_TYPE (gs_info->offset);
1779 tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
1781 gcc_assert (TYPE_PRECISION (new_offset_type)
1782 >= TYPE_PRECISION (old_offset_type));
1783 gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_NOTE, vect_location,
1787 "using gather/scatter for strided/grouped access,"
1788 " scale = %d\n", gs_info->scale);
1790 return true;
1793 /* STMT_INFO is a non-strided load or store, meaning that it accesses
1794 elements with a known constant step. Return -1 if that step
1795 is negative, 0 if it is zero, and 1 if it is greater than zero. */
1797 static int
1798 compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
1800 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1801 return tree_int_cst_compare (vect_dr_behavior (vinfo, dr_info)->step,
1802 size_zero_node);
1805 /* If the target supports a permute mask that reverses the elements in
1806 a vector of type VECTYPE, return that mask, otherwise return null. */
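/* For example, for V4SI the required selector is { 3, 2, 1, 0 }.  The
   builder below encodes it as a single pattern of three elements
   { NUNITS-1, NUNITS-2, NUNITS-3 } stepping down by one, which also
   describes the reversal of variable-length vectors. */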
1808 tree
1809 perm_mask_for_reverse (tree vectype)
1811 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1813 /* The encoding has a single stepped pattern. */
1814 vec_perm_builder sel (nunits, 1, 3);
1815 for (int i = 0; i < 3; ++i)
1816 sel.quick_push (nunits - 1 - i);
1818 vec_perm_indices indices (sel, 1, nunits);
1819 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
1820 indices))
1821 return NULL_TREE;
1822 return vect_gen_perm_mask_checked (vectype, indices);
1825 /* A subroutine of get_load_store_type, with a subset of the same
1826 arguments. Handle the case where STMT_INFO is a load or store that
1827 accesses consecutive elements with a negative step. Sets *POFFSET
1828 to the offset to be applied to the DR for the first access. */
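/* For example, for a V4SI access with 4-byte elements and a step of -4,
   the first vector load covers the element addressed by the DR and the
   three elements before it, so *POFFSET is set to -12 bytes.  If the
   target can reverse a V4SI vector the access can use
   VMAT_CONTIGUOUS_REVERSE; otherwise the function falls back to
   VMAT_ELEMENTWISE. */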
1830 static vect_memory_access_type
1831 get_negative_load_store_type (vec_info *vinfo,
1832 stmt_vec_info stmt_info, tree vectype,
1833 vec_load_store_type vls_type,
1834 unsigned int ncopies, poly_int64 *poffset)
1836 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1837 dr_alignment_support alignment_support_scheme;
1839 if (ncopies > 1)
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "multiple types with negative step.\n");
1844 return VMAT_ELEMENTWISE;
1847 /* For backward running DRs the first access in vectype actually is
1848 N-1 elements before the address of the DR. */
1849 *poffset = ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1850 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1852 int misalignment = dr_misalignment (dr_info, vectype, *poffset);
1853 alignment_support_scheme
1854 = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);
1855 if (alignment_support_scheme != dr_aligned
1856 && alignment_support_scheme != dr_unaligned_supported)
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "negative step but alignment required.\n");
1861 *poffset = 0;
1862 return VMAT_ELEMENTWISE;
1865 if (vls_type == VLS_STORE_INVARIANT)
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_NOTE, vect_location,
1869 "negative step with invariant source;"
1870 " no permute needed.\n");
1871 return VMAT_CONTIGUOUS_DOWN;
1874 if (!perm_mask_for_reverse (vectype))
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 "negative step and reversing not supported.\n");
1879 *poffset = 0;
1880 return VMAT_ELEMENTWISE;
1883 return VMAT_CONTIGUOUS_REVERSE;
1886 /* STMT_INFO is either a masked or unconditional store. Return the value
1887 being stored. */
1889 tree
1890 vect_get_store_rhs (stmt_vec_info stmt_info)
1892 if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
1894 gcc_assert (gimple_assign_single_p (assign));
1895 return gimple_assign_rhs1 (assign);
1897 if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
1899 internal_fn ifn = gimple_call_internal_fn (call);
1900 int index = internal_fn_stored_value_index (ifn);
1901 gcc_assert (index >= 0);
1902 return gimple_call_arg (call, index);
1904 gcc_unreachable ();
1907 /* Function VECTOR_VECTOR_COMPOSITION_TYPE
1909 This function returns a vector type which can be composed from NELTS pieces,
1910 whose type is recorded in PTYPE. VTYPE should be a vector type, and has the
1911 same vector size as the return vector. It first checks whether the target
1912 supports a pieces-sized vector mode for the construction; if the target does
1913 not, it then checks a pieces-sized scalar mode. It returns NULL_TREE if no
1914 available composition can be found.
1916 For example, for (vtype=V16QI, nelts=4), we can probably get:
1917 - V16QI with PTYPE V4QI.
1918 - V4SI with PTYPE SI.
1919 - NULL_TREE. */
1921 static tree
1922 vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
1924 gcc_assert (VECTOR_TYPE_P (vtype));
1925 gcc_assert (known_gt (nelts, 0U));
1927 machine_mode vmode = TYPE_MODE (vtype);
1928 if (!VECTOR_MODE_P (vmode))
1929 return NULL_TREE;
1931 /* When we are asked to compose the vector from its components, let
1932 that happen directly. */
1933 if (known_eq (TYPE_VECTOR_SUBPARTS (vtype), nelts))
1935 *ptype = TREE_TYPE (vtype);
1936 return vtype;
1939 poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
1940 unsigned int pbsize;
1941 if (constant_multiple_p (vbsize, nelts, &pbsize))
1943 /* First check if vec_init optab supports construction from
1944 vector pieces directly. */
1945 scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
1946 poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
1947 machine_mode rmode;
1948 if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
1949 && (convert_optab_handler (vec_init_optab, vmode, rmode)
1950 != CODE_FOR_nothing))
1952 *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
1953 return vtype;
1956 /* Otherwise check whether an integer type of the same piece size exists
1957 and whether the vec_init optab supports construction from it directly. */
1958 if (int_mode_for_size (pbsize, 0).exists (&elmode)
1959 && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
1960 && (convert_optab_handler (vec_init_optab, rmode, elmode)
1961 != CODE_FOR_nothing))
1963 *ptype = build_nonstandard_integer_type (pbsize, 1);
1964 return build_vector_type (*ptype, nelts);
1968 return NULL_TREE;
1971 /* A subroutine of get_load_store_type, with a subset of the same
1972 arguments. Handle the case where STMT_INFO is part of a grouped load
1973 or store.
1975 For stores, the statements in the group are all consecutive
1976 and there is no gap at the end. For loads, the statements in the
1977 group might not be consecutive; there can be gaps between statements
1978 as well as at the end. */
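/* For example, if a grouped load is known to be 16-byte aligned and the
   elements are 4 bytes wide, a trailing gap of up to three elements is
   harmless: each vector access is then guaranteed to touch a non-gap
   element in the same 16-byte block, so the overrun cannot fault and no
   peeling for gaps is required.  The checks below apply this reasoning
   and fall back to peeling or to elementwise accesses when it does not
   hold. */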
1980 static bool
1981 get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
1982 tree vectype, slp_tree slp_node,
1983 bool masked_p, vec_load_store_type vls_type,
1984 vect_memory_access_type *memory_access_type,
1985 poly_int64 *poffset,
1986 dr_alignment_support *alignment_support_scheme,
1987 int *misalignment,
1988 gather_scatter_info *gs_info,
1989 internal_fn *lanes_ifn)
1991 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1992 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1993 stmt_vec_info first_stmt_info;
1994 unsigned int group_size;
1995 unsigned HOST_WIDE_INT gap;
1996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1998 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1999 group_size = DR_GROUP_SIZE (first_stmt_info);
2000 gap = DR_GROUP_GAP (first_stmt_info);
2002 else
2004 first_stmt_info = stmt_info;
2005 group_size = 1;
2006 gap = 0;
2008 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2009 bool single_element_p = (stmt_info == first_stmt_info
2010 && !DR_GROUP_NEXT_ELEMENT (stmt_info));
2011 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2013 /* True if the vectorized statements would access beyond the last
2014 statement in the group. */
2015 bool overrun_p = false;
2017 /* True if we can cope with such overrun by peeling for gaps, so that
2018 there is at least one final scalar iteration after the vector loop. */
2019 bool can_overrun_p = (!masked_p
2020 && vls_type == VLS_LOAD
2021 && loop_vinfo
2022 && !loop->inner);
2024 /* There can only be a gap at the end of the group if the stride is
2025 known at compile time. */
2026 gcc_assert (!STMT_VINFO_STRIDED_P (first_stmt_info) || gap == 0);
2028 /* Stores can't yet have gaps. */
2029 gcc_assert (slp_node || vls_type == VLS_LOAD || gap == 0);
2031 if (slp_node)
2033 /* For SLP vectorization we directly vectorize a subchain
2034 without permutation. */
2035 if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
2036 first_dr_info
2037 = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
2038 if (STMT_VINFO_STRIDED_P (first_stmt_info))
2039 /* Try to use consecutive accesses of as many elements as possible,
2040 separated by the stride, until we have a complete vector.
2041 Fall back to scalar accesses if that isn't possible. */
2042 *memory_access_type = VMAT_STRIDED_SLP;
2043 else
2045 int cmp = compare_step_with_zero (vinfo, stmt_info);
2046 if (cmp < 0)
2048 if (single_element_p)
2049 /* ??? The VMAT_CONTIGUOUS_REVERSE code generation is
2050 only correct for single element "interleaving" SLP. */
2051 *memory_access_type = get_negative_load_store_type
2052 (vinfo, stmt_info, vectype, vls_type, 1, poffset);
2053 else
2055 /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
2056 separated by the stride, until we have a complete vector.
2057 Fall back to scalar accesses if that isn't possible. */
2058 if (multiple_p (nunits, group_size))
2059 *memory_access_type = VMAT_STRIDED_SLP;
2060 else
2061 *memory_access_type = VMAT_ELEMENTWISE;
2064 else if (cmp == 0 && loop_vinfo)
2066 gcc_assert (vls_type == VLS_LOAD);
2067 *memory_access_type = VMAT_INVARIANT;
2068 /* Invariant accesses perform only component accesses, alignment
2069 is irrelevant for them. */
2070 *alignment_support_scheme = dr_unaligned_supported;
2072 else
2073 *memory_access_type = VMAT_CONTIGUOUS;
2075 overrun_p = loop_vinfo && gap != 0;
2076 if (overrun_p && vls_type != VLS_LOAD)
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "Grouped store with gaps requires"
2080 " non-consecutive accesses\n");
2081 return false;
2083 /* An overrun is fine if the trailing elements are smaller
2084 than the alignment boundary B. Every vector access will
2085 be a multiple of B and so we are guaranteed to access a
2086 non-gap element in the same B-sized block. */
2087 if (overrun_p
2088 && gap < (vect_known_alignment_in_bytes (first_dr_info,
2089 vectype)
2090 / vect_get_scalar_dr_size (first_dr_info)))
2091 overrun_p = false;
2093 /* When we have a contiguous access across loop iterations
2094 but the access in the loop doesn't cover the full vector
2095 we can end up with no gap recorded but still excess
2096 elements accessed, see PR103116. Make sure we peel for
2097 gaps if necessary and sufficient and give up if not.
2099 If there is a combination of the access not covering the full
2100 vector and a gap recorded then we may need to peel twice. */
2101 if (loop_vinfo
2102 && (*memory_access_type == VMAT_CONTIGUOUS
2103 || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
2104 && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2105 && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2106 nunits))
2107 overrun_p = true;
2109 /* If the gap splits the vector in half and the target
2110 can do half-vector operations avoid the epilogue peeling
2111 by simply loading half of the vector only. Usually
2112 the construction with an upper zero half will be elided. */
2113 dr_alignment_support alss;
2114 int misalign = dr_misalignment (first_dr_info, vectype);
2115 tree half_vtype;
2116 poly_uint64 remain;
2117 unsigned HOST_WIDE_INT tem, num;
2118 if (overrun_p
2119 && !masked_p
2120 && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
2121 vectype, misalign)))
2122 == dr_aligned
2123 || alss == dr_unaligned_supported)
2124 && can_div_trunc_p (group_size
2125 * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
2126 nunits, &tem, &remain)
2127 && (known_eq (remain, 0u)
2128 || (constant_multiple_p (nunits, remain, &num)
2129 && (vector_vector_composition_type (vectype, num,
2130 &half_vtype)
2131 != NULL_TREE))))
2132 overrun_p = false;
2134 if (overrun_p && !can_overrun_p)
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138 "Peeling for outer loop is not supported\n");
2139 return false;
2141 /* Peeling for gaps assumes that a single scalar iteration
2142 is enough to make sure the last vector iteration doesn't
2143 access excess elements. */
2144 if (overrun_p
2145 && (!can_div_trunc_p (group_size
2146 * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
2147 nunits, &tem, &remain)
2148 || maybe_lt (remain + group_size, nunits)))
2150 /* But peeling a single scalar iteration is enough if
2151 we can use the next power-of-two sized partial
2152 access and that is sufficiently small to be covered
2153 by the single scalar iteration. */
2154 unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
2155 if (!nunits.is_constant (&cnunits)
2156 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
2157 || (((cremain = group_size * cvf - gap % cnunits), true)
2158 && ((cpart_size = (1 << ceil_log2 (cremain))) != cnunits)
2159 && (cremain + group_size < cpart_size
2160 || vector_vector_composition_type
2161 (vectype, cnunits / cpart_size,
2162 &half_vtype) == NULL_TREE)))
2164 if (dump_enabled_p ())
2165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166 "peeling for gaps insufficient for "
2167 "access\n");
2168 return false;
2172 /* If this is single-element interleaving with an element
2173 distance that leaves unused vector loads around, punt - we
2174 at least create very sub-optimal code in that case (and
2175 blow up memory, see PR65518). */
2176 if (loop_vinfo
2177 && *memory_access_type == VMAT_CONTIGUOUS
2178 && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2179 && single_element_p
2180 && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "single-element interleaving not supported "
2185 "for not adjacent vector loads\n");
2186 return false;
2190 else
2192 /* We can always handle this case using elementwise accesses,
2193 but see if something more efficient is available. */
2194 *memory_access_type = VMAT_ELEMENTWISE;
2196 /* If there is a gap at the end of the group then these optimizations
2197 would access excess elements in the last iteration. */
2198 bool would_overrun_p = (gap != 0);
2199 /* An overrun is fine if the trailing elements are smaller than the
2200 alignment boundary B. Every vector access will be a multiple of B
2201 and so we are guaranteed to access a non-gap element in the
2202 same B-sized block. */
2203 if (would_overrun_p
2204 && !masked_p
2205 && gap < (vect_known_alignment_in_bytes (first_dr_info, vectype)
2206 / vect_get_scalar_dr_size (first_dr_info)))
2207 would_overrun_p = false;
2209 if (!STMT_VINFO_STRIDED_P (first_stmt_info)
2210 && (can_overrun_p || !would_overrun_p)
2211 && compare_step_with_zero (vinfo, stmt_info) > 0)
2213 /* First cope with the degenerate case of a single-element
2214 vector. */
2215 if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
2218 else
2220 /* Otherwise try using LOAD/STORE_LANES. */
2221 *lanes_ifn
2222 = vls_type == VLS_LOAD
2223 ? vect_load_lanes_supported (vectype, group_size, masked_p)
2224 : vect_store_lanes_supported (vectype, group_size,
2225 masked_p);
2226 if (*lanes_ifn != IFN_LAST)
2228 *memory_access_type = VMAT_LOAD_STORE_LANES;
2229 overrun_p = would_overrun_p;
2232 /* If that fails, try using permuting loads. */
2233 else if (vls_type == VLS_LOAD
2234 ? vect_grouped_load_supported (vectype,
2235 single_element_p,
2236 group_size)
2237 : vect_grouped_store_supported (vectype, group_size))
2239 *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
2240 overrun_p = would_overrun_p;
2245 /* As a last resort, try using a gather load or scatter store.
2247 ??? Although the code can handle all group sizes correctly,
2248 it probably isn't a win to use separate strided accesses based
2249 on nearby locations. Or, even if it's a win over scalar code,
2250 it might not be a win over vectorizing at a lower VF, if that
2251 allows us to use contiguous accesses. */
2252 if (*memory_access_type == VMAT_ELEMENTWISE
2253 && single_element_p
2254 && loop_vinfo
2255 && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
2256 masked_p, gs_info))
2257 *memory_access_type = VMAT_GATHER_SCATTER;
2260 if (*memory_access_type == VMAT_GATHER_SCATTER
2261 || *memory_access_type == VMAT_ELEMENTWISE)
2263 *alignment_support_scheme = dr_unaligned_supported;
2264 *misalignment = DR_MISALIGNMENT_UNKNOWN;
2266 else
2268 *misalignment = dr_misalignment (first_dr_info, vectype, *poffset);
2269 *alignment_support_scheme
2270 = vect_supportable_dr_alignment (vinfo, first_dr_info, vectype,
2271 *misalignment);
2274 if (vls_type != VLS_LOAD && first_stmt_info == stmt_info)
2276 /* STMT is the leader of the group. Check the operands of all the
2277 stmts of the group. */
2278 stmt_vec_info next_stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2279 while (next_stmt_info)
2281 tree op = vect_get_store_rhs (next_stmt_info);
2282 enum vect_def_type dt;
2283 if (!vect_is_simple_use (op, vinfo, &dt))
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "use not simple.\n");
2288 return false;
2290 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
2294 if (overrun_p)
2296 gcc_assert (can_overrun_p);
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "Data access with gaps requires scalar "
2300 "epilogue loop\n");
2301 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2304 return true;
2307 /* Analyze load or store statement STMT_INFO of type VLS_TYPE. Return true
2308 if there is a memory access type that the vectorized form can use,
2309 storing it in *MEMORY_ACCESS_TYPE if so. If we decide to use gathers
2310 or scatters, fill in GS_INFO accordingly. In addition
2311 *ALIGNMENT_SUPPORT_SCHEME is filled out and false is returned if
2312 the target does not support the alignment scheme. *MISALIGNMENT
2313 is set according to the alignment of the access (including
2314 DR_MISALIGNMENT_UNKNOWN when it is unknown).
2316 SLP says whether we're performing SLP rather than loop vectorization.
2317 MASKED_P is true if the statement is conditional on a vectorized mask.
2318 VECTYPE is the vector type that the vectorized statements will use.
2319 NCOPIES is the number of vector statements that will be needed. */
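/* Roughly, the classification below proceeds as follows: explicit
   gather/scatter accesses become VMAT_GATHER_SCATTER, grouped or SLP
   accesses are handled by get_group_load_store_type, strided accesses
   become VMAT_GATHER_SCATTER or VMAT_ELEMENTWISE, and the remaining
   accesses become VMAT_INVARIANT (zero step), VMAT_CONTIGUOUS (positive
   step) or a negative-step variant. */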
2321 static bool
2322 get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
2323 tree vectype, slp_tree slp_node,
2324 bool masked_p, vec_load_store_type vls_type,
2325 unsigned int ncopies,
2326 vect_memory_access_type *memory_access_type,
2327 poly_int64 *poffset,
2328 dr_alignment_support *alignment_support_scheme,
2329 int *misalignment,
2330 gather_scatter_info *gs_info,
2331 internal_fn *lanes_ifn)
2333 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2334 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2335 *misalignment = DR_MISALIGNMENT_UNKNOWN;
2336 *poffset = 0;
2337 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2339 *memory_access_type = VMAT_GATHER_SCATTER;
2340 if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info))
2341 gcc_unreachable ();
2342 /* When using internal functions, we rely on pattern recognition
2343 to convert the type of the offset to the type that the target
2344 requires, with the result being a call to an internal function.
2345 If that failed for some reason (e.g. because another pattern
2346 took priority), just handle cases in which the offset already
2347 has the right type. */
2348 else if (gs_info->ifn != IFN_LAST
2349 && !is_gimple_call (stmt_info->stmt)
2350 && !tree_nop_conversion_p (TREE_TYPE (gs_info->offset),
2351 TREE_TYPE (gs_info->offset_vectype)))
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355 "%s offset requires a conversion\n",
2356 vls_type == VLS_LOAD ? "gather" : "scatter");
2357 return false;
2359 else if (!vect_is_simple_use (gs_info->offset, vinfo,
2360 &gs_info->offset_dt,
2361 &gs_info->offset_vectype))
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365 "%s index use not simple.\n",
2366 vls_type == VLS_LOAD ? "gather" : "scatter");
2367 return false;
2369 else if (gs_info->ifn == IFN_LAST && !gs_info->decl)
2371 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
2372 || !TYPE_VECTOR_SUBPARTS (gs_info->offset_vectype).is_constant ()
2373 || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
2374 (gs_info->offset_vectype),
2375 TYPE_VECTOR_SUBPARTS (vectype)))
2377 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379 "unsupported vector types for emulated "
2380 "gather.\n");
2381 return false;
2384 /* Gather-scatter accesses perform only component accesses, alignment
2385 is irrelevant for them. */
2386 *alignment_support_scheme = dr_unaligned_supported;
2388 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node)
2390 if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node,
2391 masked_p,
2392 vls_type, memory_access_type, poffset,
2393 alignment_support_scheme,
2394 misalignment, gs_info, lanes_ifn))
2395 return false;
2397 else if (STMT_VINFO_STRIDED_P (stmt_info))
2399 gcc_assert (!slp_node);
2400 if (loop_vinfo
2401 && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
2402 masked_p, gs_info))
2403 *memory_access_type = VMAT_GATHER_SCATTER;
2404 else
2405 *memory_access_type = VMAT_ELEMENTWISE;
2406 /* Alignment is irrelevant here. */
2407 *alignment_support_scheme = dr_unaligned_supported;
2409 else
2411 int cmp = compare_step_with_zero (vinfo, stmt_info);
2412 if (cmp == 0)
2414 gcc_assert (vls_type == VLS_LOAD);
2415 *memory_access_type = VMAT_INVARIANT;
2416 /* Invariant accesses perform only component accesses, alignment
2417 is irrelevant for them. */
2418 *alignment_support_scheme = dr_unaligned_supported;
2420 else
2422 if (cmp < 0)
2423 *memory_access_type = get_negative_load_store_type
2424 (vinfo, stmt_info, vectype, vls_type, ncopies, poffset);
2425 else
2426 *memory_access_type = VMAT_CONTIGUOUS;
2427 *misalignment = dr_misalignment (STMT_VINFO_DR_INFO (stmt_info),
2428 vectype, *poffset);
2429 *alignment_support_scheme
2430 = vect_supportable_dr_alignment (vinfo,
2431 STMT_VINFO_DR_INFO (stmt_info),
2432 vectype, *misalignment);
2436 if ((*memory_access_type == VMAT_ELEMENTWISE
2437 || *memory_access_type == VMAT_STRIDED_SLP)
2438 && !nunits.is_constant ())
2440 if (dump_enabled_p ())
2441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2442 "Not using elementwise accesses due to variable "
2443 "vectorization factor.\n");
2444 return false;
2447 if (*alignment_support_scheme == dr_unaligned_unsupported)
2449 if (dump_enabled_p ())
2450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2451 "unsupported unaligned access\n");
2452 return false;
2455 /* FIXME: At the moment the cost model seems to underestimate the
2456 cost of using elementwise accesses. This check preserves the
2457 traditional behavior until that can be fixed. */
2458 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
2459 if (!first_stmt_info)
2460 first_stmt_info = stmt_info;
2461 if (*memory_access_type == VMAT_ELEMENTWISE
2462 && !STMT_VINFO_STRIDED_P (first_stmt_info)
2463 && !(stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info)
2464 && !DR_GROUP_NEXT_ELEMENT (stmt_info)
2465 && !pow2p_hwi (DR_GROUP_SIZE (stmt_info))))
2467 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2469 "not falling back to elementwise accesses\n");
2470 return false;
2472 return true;
2475 /* Return true if boolean argument at MASK_INDEX is suitable for vectorizing
2476 conditional operation STMT_INFO. When returning true, store the mask
2477 in *MASK, the type of its definition in *MASK_DT_OUT, the type of the
2478 vectorized mask in *MASK_VECTYPE_OUT and the SLP node corresponding
2479 to the mask in *MASK_NODE if MASK_NODE is not NULL. */
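/* For example, for a conditional store arising from
   "if (c[i]) a[i] = b[i];" the mask operand is typically the boolean
   result of "c[i] != 0"; it must have a scalar boolean type and its
   chosen mask vector type must have the same number of subparts as the
   data vector type, which is what the checks below enforce. */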
2481 static bool
2482 vect_check_scalar_mask (vec_info *vinfo, stmt_vec_info stmt_info,
2483 slp_tree slp_node, unsigned mask_index,
2484 tree *mask, slp_tree *mask_node,
2485 vect_def_type *mask_dt_out, tree *mask_vectype_out)
2487 enum vect_def_type mask_dt;
2488 tree mask_vectype;
2489 slp_tree mask_node_1;
2490 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, mask_index,
2491 mask, &mask_node_1, &mask_dt, &mask_vectype))
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "mask use not simple.\n");
2496 return false;
2499 if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (*mask)))
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2503 "mask argument is not a boolean.\n");
2504 return false;
2507 /* If the caller is not prepared for adjusting an external/constant
2508 SLP mask vector type fail. */
2509 if (slp_node
2510 && !mask_node
2511 && SLP_TREE_DEF_TYPE (mask_node_1) != vect_internal_def)
2513 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2515 "SLP mask argument is not vectorized.\n");
2516 return false;
2519 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2520 if (!mask_vectype)
2521 mask_vectype = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (vectype),
2522 mask_node_1);
2524 if (!mask_vectype || !VECTOR_BOOLEAN_TYPE_P (mask_vectype))
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "could not find an appropriate vector mask type.\n");
2529 return false;
2532 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_vectype),
2533 TYPE_VECTOR_SUBPARTS (vectype)))
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537 "vector mask type %T"
2538 " does not match vector data type %T.\n",
2539 mask_vectype, vectype);
2541 return false;
2544 *mask_dt_out = mask_dt;
2545 *mask_vectype_out = mask_vectype;
2546 if (mask_node)
2547 *mask_node = mask_node_1;
2548 return true;
2551 /* Return true if the stored value is suitable for vectorizing store
2552 statement STMT_INFO. When returning true, store the scalar stored
2553 in *RHS and *RHS_NODE, the type of the definition in *RHS_DT_OUT,
2554 the type of the vectorized store value in
2555 *RHS_VECTYPE_OUT and the type of the store in *VLS_TYPE_OUT. */
2557 static bool
2558 vect_check_store_rhs (vec_info *vinfo, stmt_vec_info stmt_info,
2559 slp_tree slp_node, tree *rhs, slp_tree *rhs_node,
2560 vect_def_type *rhs_dt_out, tree *rhs_vectype_out,
2561 vec_load_store_type *vls_type_out)
2563 int op_no = 0;
2564 if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2566 if (gimple_call_internal_p (call)
2567 && internal_store_fn_p (gimple_call_internal_fn (call)))
2568 op_no = internal_fn_stored_value_index (gimple_call_internal_fn (call));
2570 if (slp_node)
2571 op_no = vect_slp_child_index_for_operand
2572 (stmt_info->stmt, op_no, STMT_VINFO_GATHER_SCATTER_P (stmt_info));
2574 enum vect_def_type rhs_dt;
2575 tree rhs_vectype;
2576 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, op_no,
2577 rhs, rhs_node, &rhs_dt, &rhs_vectype))
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2581 "use not simple.\n");
2582 return false;
2585 /* In case this is a store from a constant, make sure
2586 native_encode_expr can handle it. */
2587 if (rhs_dt == vect_constant_def
2588 && CONSTANT_CLASS_P (*rhs) && native_encode_expr (*rhs, NULL, 64) == 0)
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2592 "cannot encode constant as a byte sequence.\n");
2593 return false;
2596 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2597 if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601 "incompatible vector types.\n");
2602 return false;
2605 *rhs_dt_out = rhs_dt;
2606 *rhs_vectype_out = rhs_vectype;
2607 if (rhs_dt == vect_constant_def || rhs_dt == vect_external_def)
2608 *vls_type_out = VLS_STORE_INVARIANT;
2609 else
2610 *vls_type_out = VLS_STORE;
2611 return true;
2614 /* Build an all-ones vector mask of type MASKTYPE while vectorizing STMT_INFO.
2615 Note that we support masks with floating-point type, in which case the
2616 floats are interpreted as a bitmask. */
2618 static tree
2619 vect_build_all_ones_mask (vec_info *vinfo,
2620 stmt_vec_info stmt_info, tree masktype)
2622 if (TREE_CODE (masktype) == INTEGER_TYPE)
2623 return build_int_cst (masktype, -1);
2624 else if (VECTOR_BOOLEAN_TYPE_P (masktype)
2625 || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
2627 tree mask = build_int_cst (TREE_TYPE (masktype), -1);
2628 mask = build_vector_from_val (masktype, mask);
2629 return vect_init_vector (vinfo, stmt_info, mask, masktype, NULL);
2631 else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
2633 REAL_VALUE_TYPE r;
2634 long tmp[6];
2635 for (int j = 0; j < 6; ++j)
2636 tmp[j] = -1;
2637 real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
2638 tree mask = build_real (TREE_TYPE (masktype), r);
2639 mask = build_vector_from_val (masktype, mask);
2640 return vect_init_vector (vinfo, stmt_info, mask, masktype, NULL);
2642 gcc_unreachable ();
2645 /* Build an all-zero merge value of type VECTYPE while vectorizing
2646 STMT_INFO as a gather load. */
2648 static tree
2649 vect_build_zero_merge_argument (vec_info *vinfo,
2650 stmt_vec_info stmt_info, tree vectype)
2652 tree merge;
2653 if (TREE_CODE (TREE_TYPE (vectype)) == INTEGER_TYPE)
2654 merge = build_int_cst (TREE_TYPE (vectype), 0);
2655 else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (vectype)))
2657 REAL_VALUE_TYPE r;
2658 long tmp[6];
2659 for (int j = 0; j < 6; ++j)
2660 tmp[j] = 0;
2661 real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (vectype)));
2662 merge = build_real (TREE_TYPE (vectype), r);
2664 else
2665 gcc_unreachable ();
2666 merge = build_vector_from_val (vectype, merge);
2667 return vect_init_vector (vinfo, stmt_info, merge, vectype, NULL);
2670 /* Build a gather load call while vectorizing STMT_INFO. Insert new
2671 instructions before GSI and add them to VEC_STMT. GS_INFO describes
2672 the gather load operation. If the load is conditional, MASK is the
2673 vectorized condition, otherwise MASK is null. PTR is the base
2674 pointer and OFFSET is the vectorized offset. */
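/* This path is used when GS_INFO->DECL is a target-provided builtin
   gather rather than an internal function; the call built below has the
   form DECL (SRC, PTR, IDX, MASK, SCALE), with VIEW_CONVERT_EXPRs
   inserted where the offset, mask or result types do not match the
   builtin's signature. */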
2676 static gimple *
2677 vect_build_one_gather_load_call (vec_info *vinfo, stmt_vec_info stmt_info,
2678 gimple_stmt_iterator *gsi,
2679 gather_scatter_info *gs_info,
2680 tree ptr, tree offset, tree mask)
2682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2683 tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info->decl));
2684 tree rettype = TREE_TYPE (TREE_TYPE (gs_info->decl));
2685 tree srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2686 /* ptrtype */ arglist = TREE_CHAIN (arglist);
2687 tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2688 tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2689 tree scaletype = TREE_VALUE (arglist);
2690 tree var;
2691 gcc_checking_assert (types_compatible_p (srctype, rettype)
2692 && (!mask
2693 || TREE_CODE (masktype) == INTEGER_TYPE
2694 || types_compatible_p (srctype, masktype)));
2696 tree op = offset;
2697 if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
2699 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)),
2700 TYPE_VECTOR_SUBPARTS (idxtype)));
2701 var = vect_get_new_ssa_name (idxtype, vect_simple_var);
2702 op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
2703 gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
2704 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2705 op = var;
2708 tree src_op = NULL_TREE;
2709 tree mask_op = NULL_TREE;
2710 if (mask)
2712 if (!useless_type_conversion_p (masktype, TREE_TYPE (mask)))
2714 tree utype, optype = TREE_TYPE (mask);
2715 if (VECTOR_TYPE_P (masktype)
2716 || TYPE_MODE (masktype) == TYPE_MODE (optype))
2717 utype = masktype;
2718 else
2719 utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
2720 var = vect_get_new_ssa_name (utype, vect_scalar_var);
2721 tree mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask);
2722 gassign *new_stmt
2723 = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
2724 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2725 mask_arg = var;
2726 if (!useless_type_conversion_p (masktype, utype))
2728 gcc_assert (TYPE_PRECISION (utype)
2729 <= TYPE_PRECISION (masktype));
2730 var = vect_get_new_ssa_name (masktype, vect_scalar_var);
2731 new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
2732 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2733 mask_arg = var;
2735 src_op = build_zero_cst (srctype);
2736 mask_op = mask_arg;
2738 else
2740 src_op = mask;
2741 mask_op = mask;
2744 else
2746 src_op = vect_build_zero_merge_argument (vinfo, stmt_info, rettype);
2747 mask_op = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
2750 tree scale = build_int_cst (scaletype, gs_info->scale);
2751 gimple *new_stmt = gimple_build_call (gs_info->decl, 5, src_op, ptr, op,
2752 mask_op, scale);
2754 if (!useless_type_conversion_p (vectype, rettype))
2756 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype),
2757 TYPE_VECTOR_SUBPARTS (rettype)));
2758 op = vect_get_new_ssa_name (rettype, vect_simple_var);
2759 gimple_call_set_lhs (new_stmt, op);
2760 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2761 op = build1 (VIEW_CONVERT_EXPR, vectype, op);
2762 new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR, op);
2765 return new_stmt;
2768 /* Build a scatter store call while vectorizing STMT_INFO. Insert new
2769 instructions before GSI. GS_INFO describes the scatter store operation.
2770 PTR is the base pointer, OFFSET the vectorized offsets and OPRND the
2771 vectorized data to store.
2772 If the store is conditional, MASK is the vectorized condition, otherwise
2773 MASK is null. */
2775 static gimple *
2776 vect_build_one_scatter_store_call (vec_info *vinfo, stmt_vec_info stmt_info,
2777 gimple_stmt_iterator *gsi,
2778 gather_scatter_info *gs_info,
2779 tree ptr, tree offset, tree oprnd, tree mask)
2781 tree rettype = TREE_TYPE (TREE_TYPE (gs_info->decl));
2782 tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info->decl));
2783 /* tree ptrtype = TREE_VALUE (arglist); */ arglist = TREE_CHAIN (arglist);
2784 tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2785 tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2786 tree srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2787 tree scaletype = TREE_VALUE (arglist);
2788 gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
2789 && TREE_CODE (rettype) == VOID_TYPE);
2791 tree mask_arg = NULL_TREE;
2792 if (mask)
2794 mask_arg = mask;
2795 tree optype = TREE_TYPE (mask_arg);
2796 tree utype;
2797 if (TYPE_MODE (masktype) == TYPE_MODE (optype))
2798 utype = masktype;
2799 else
2800 utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
2801 tree var = vect_get_new_ssa_name (utype, vect_scalar_var);
2802 mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg);
2803 gassign *new_stmt
2804 = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
2805 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2806 mask_arg = var;
2807 if (!useless_type_conversion_p (masktype, utype))
2809 gcc_assert (TYPE_PRECISION (utype) <= TYPE_PRECISION (masktype));
2810 tree var = vect_get_new_ssa_name (masktype, vect_scalar_var);
2811 new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
2812 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2813 mask_arg = var;
2816 else
2818 mask_arg = build_int_cst (masktype, -1);
2819 mask_arg = vect_init_vector (vinfo, stmt_info, mask_arg, masktype, NULL);
2822 tree src = oprnd;
2823 if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
2825 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src)),
2826 TYPE_VECTOR_SUBPARTS (srctype)));
2827 tree var = vect_get_new_ssa_name (srctype, vect_simple_var);
2828 src = build1 (VIEW_CONVERT_EXPR, srctype, src);
2829 gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, src);
2830 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2831 src = var;
2834 tree op = offset;
2835 if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
2837 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)),
2838 TYPE_VECTOR_SUBPARTS (idxtype)));
2839 tree var = vect_get_new_ssa_name (idxtype, vect_simple_var);
2840 op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
2841 gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
2842 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2843 op = var;
2846 tree scale = build_int_cst (scaletype, gs_info->scale);
2847 gcall *new_stmt
2848 = gimple_build_call (gs_info->decl, 5, ptr, mask_arg, op, src, scale);
2849 return new_stmt;
2852 /* Prepare the base and offset in GS_INFO for vectorization.
2853 Set *DATAREF_PTR to the loop-invariant base address and *VEC_OFFSET
2854 to the vectorized offset argument for the first copy of STMT_INFO.
2855 STMT_INFO is the statement described by GS_INFO and LOOP is the
2856 containing loop. */
2858 static void
2859 vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
2860 class loop *loop, stmt_vec_info stmt_info,
2861 slp_tree slp_node, gather_scatter_info *gs_info,
2862 tree *dataref_ptr, vec<tree> *vec_offset)
2864 gimple_seq stmts = NULL;
2865 *dataref_ptr = force_gimple_operand (gs_info->base, &stmts, true, NULL_TREE);
2866 if (stmts != NULL)
2868 basic_block new_bb;
2869 edge pe = loop_preheader_edge (loop);
2870 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2871 gcc_assert (!new_bb);
2873 if (slp_node)
2874 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_offset);
2875 else
2877 unsigned ncopies
2878 = vect_get_num_copies (loop_vinfo, gs_info->offset_vectype);
2879 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, ncopies,
2880 gs_info->offset, vec_offset,
2881 gs_info->offset_vectype);
2885 /* Prepare to implement a grouped or strided load or store using
2886 the gather load or scatter store operation described by GS_INFO.
2887 STMT_INFO is the load or store statement.
2889 Set *DATAREF_BUMP to the amount that should be added to the base
2890 address after each copy of the vectorized statement. Set *VEC_OFFSET
2891 to an invariant offset vector in which element I has the value
2892 I * DR_STEP / SCALE. */
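/* For example, with DR_STEP 12 bytes, SCALE 4 and four elements per
   vector, X is 3, so *VEC_OFFSET becomes { 0, 3, 6, 9 } and, when no
   SELECT_VL is used, *DATAREF_BUMP is 12 * 4 = 48 bytes per copy.  The
   concrete numbers are only illustrative. */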
2894 static void
2895 vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
2896 loop_vec_info loop_vinfo,
2897 gimple_stmt_iterator *gsi,
2898 gather_scatter_info *gs_info,
2899 tree *dataref_bump, tree *vec_offset,
2900 vec_loop_lens *loop_lens)
2902 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2903 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2905 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2907 /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
2908 ivtmp_8 = _31 * 16 (step in bytes);
2909 .MASK_LEN_SCATTER_STORE (vectp_a.9_7, ... );
2910 vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
2911 tree loop_len
2912 = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
2913 tree tmp
2914 = fold_build2 (MULT_EXPR, sizetype,
2915 fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
2916 loop_len);
2917 *dataref_bump = force_gimple_operand_gsi (gsi, tmp, true, NULL_TREE, true,
2918 GSI_SAME_STMT);
2920 else
2922 tree bump
2923 = size_binop (MULT_EXPR,
2924 fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
2925 size_int (TYPE_VECTOR_SUBPARTS (vectype)));
2926 *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
2929 /* The offset given in GS_INFO can have pointer type, so use the element
2930 type of the vector instead. */
2931 tree offset_type = TREE_TYPE (gs_info->offset_vectype);
2933 /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. */
2934 tree step = size_binop (EXACT_DIV_EXPR, unshare_expr (DR_STEP (dr)),
2935 ssize_int (gs_info->scale));
2936 step = fold_convert (offset_type, step);
2938 /* Create {0, X, X*2, X*3, ...}. */
2939 tree offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
2940 build_zero_cst (offset_type), step);
2941 *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
2944 /* Prepare the pointer IVs which need to be updated by a variable amount.
2945 That variable amount is the outcome of .SELECT_VL. In this case, each
2946 iteration can process a flexible number of elements, as long as that
2947 number is <= VF elements.
2949 Return the data reference pointer increment according to SELECT_VL.
2950 If new statements are needed, insert them before GSI. */
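/* For instance, if .SELECT_VL returns 5 for an iteration and the step
   is 4 bytes, the pointer IVs are bumped by 5 * 4 = 20 bytes for that
   iteration; the multiplication below computes exactly this product. */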
2952 static tree
2953 vect_get_loop_variant_data_ptr_increment (
2954 vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
2955 vec_loop_lens *loop_lens, dr_vec_info *dr_info,
2956 vect_memory_access_type memory_access_type)
2958 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
2959 tree step = vect_dr_behavior (vinfo, dr_info)->step;
2961 /* Gather/scatter accesses never reach here. */
2962 gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
2964 /* When the SELECT_VL pattern is used, we dynamically adjust
2965 the memory address by the .SELECT_VL result.
2967 The result of .SELECT_VL is the number of elements to
2968 be processed in each iteration. So the memory address
2969 adjustment operation should be:
2971 addr = addr + .SELECT_VL (ARG..) * step;
2973 tree loop_len
2974 = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0);
2975 tree len_type = TREE_TYPE (loop_len);
2976 /* Since the outcome of .SELECT_VL is a number of elements, scale it
2977 by the step to get a byte size that can be used to adjust the
2978 variable-amount address pointer IVs. */
2979 tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
2980 wide_int_to_tree (len_type, wi::to_widest (step)));
2981 tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
2982 gassign *assign = gimple_build_assign (bump, tmp);
2983 gsi_insert_before (gsi, assign, GSI_SAME_STMT);
2984 return bump;
2987 /* Return the amount that should be added to a vector pointer to move
2988 to the next or previous copy of AGGR_TYPE. DR_INFO is the data reference
2989 being vectorized and MEMORY_ACCESS_TYPE describes the type of
2990 vectorization. */
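/* For a contiguous access the increment is simply the size of AGGR_TYPE,
   e.g. 16 bytes when AGGR_TYPE is a 16-byte vector, negated when the
   step is negative; invariant accesses use a zero increment and
   SELECT_VL loops use the variable increment computed by
   vect_get_loop_variant_data_ptr_increment above. */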
2992 static tree
2993 vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
2994 dr_vec_info *dr_info, tree aggr_type,
2995 vect_memory_access_type memory_access_type,
2996 vec_loop_lens *loop_lens = nullptr)
2998 if (memory_access_type == VMAT_INVARIANT)
2999 return size_zero_node;
3001 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
3002 if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
3003 return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
3004 loop_lens, dr_info,
3005 memory_access_type);
3007 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3008 tree step = vect_dr_behavior (vinfo, dr_info)->step;
3009 if (tree_int_cst_sgn (step) == -1)
3010 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3011 return iv_step;
3014 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}. */
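/* For example, __builtin_bswap32 on a V4SI vector is implemented as a
   byte permutation on the V16QI view of the operand using the selector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, followed by
   a VIEW_CONVERT_EXPR back to V4SI. */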
3016 static bool
3017 vectorizable_bswap (vec_info *vinfo,
3018 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
3019 gimple **vec_stmt, slp_tree slp_node,
3020 slp_tree *slp_op,
3021 tree vectype_in, stmt_vector_for_cost *cost_vec)
3023 tree op, vectype;
3024 gcall *stmt = as_a <gcall *> (stmt_info->stmt);
3025 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3026 unsigned ncopies;
3028 op = gimple_call_arg (stmt, 0);
3029 vectype = STMT_VINFO_VECTYPE (stmt_info);
3030 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3032 /* Multiple types in SLP are handled by creating the appropriate number of
3033 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3034 case of SLP. */
3035 if (slp_node)
3036 ncopies = 1;
3037 else
3038 ncopies = vect_get_num_copies (loop_vinfo, vectype);
3040 gcc_assert (ncopies >= 1);
3042 if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype))
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 "mismatched vector sizes %T and %T\n",
3047 vectype_in, vectype);
3048 return false;
3051 tree char_vectype = get_same_sized_vectype (char_type_node, vectype_in);
3052 if (! char_vectype)
3053 return false;
3055 poly_uint64 num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);
3056 unsigned word_bytes;
3057 if (!constant_multiple_p (num_bytes, nunits, &word_bytes))
3058 return false;
3060 /* The encoding uses one stepped pattern for each byte in the word. */
3061 vec_perm_builder elts (num_bytes, word_bytes, 3);
3062 for (unsigned i = 0; i < 3; ++i)
3063 for (unsigned j = 0; j < word_bytes; ++j)
3064 elts.quick_push ((i + 1) * word_bytes - j - 1);
3066 vec_perm_indices indices (elts, 1, num_bytes);
3067 machine_mode vmode = TYPE_MODE (char_vectype);
3068 if (!can_vec_perm_const_p (vmode, vmode, indices))
3069 return false;
3071 if (! vec_stmt)
3073 if (slp_node
3074 && !vect_maybe_update_slp_op_vectype (slp_op[0], vectype_in))
3076 if (dump_enabled_p ())
3077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3078 "incompatible vector types for invariants\n");
3079 return false;
3082 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3083 DUMP_VECT_SCOPE ("vectorizable_bswap");
3084 record_stmt_cost (cost_vec,
3085 1, vector_stmt, stmt_info, 0, vect_prologue);
3086 record_stmt_cost (cost_vec,
3087 slp_node
3088 ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies,
3089 vec_perm, stmt_info, 0, vect_body);
3090 return true;
3093 tree bswap_vconst = vec_perm_indices_to_tree (char_vectype, indices);
3095 /* Transform. */
3096 vec<tree> vec_oprnds = vNULL;
3097 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
3098 op, &vec_oprnds);
3099 /* Arguments are ready. Create the new vector stmt. */
3100 unsigned i;
3101 tree vop;
3102 FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
3104 gimple *new_stmt;
3105 tree tem = make_ssa_name (char_vectype);
3106 new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
3107 char_vectype, vop));
3108 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3109 tree tem2 = make_ssa_name (char_vectype);
3110 new_stmt = gimple_build_assign (tem2, VEC_PERM_EXPR,
3111 tem, tem, bswap_vconst);
3112 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3113 tem = make_ssa_name (vectype);
3114 new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
3115 vectype, tem2));
3116 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3117 if (slp_node)
3118 slp_node->push_vec_def (new_stmt);
3119 else
3120 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
3123 if (!slp_node)
3124 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
3126 vec_oprnds.release ();
3127 return true;
3130 /* Return true if vector types VECTYPE_IN and VECTYPE_OUT have
3131 integer elements and if we can narrow VECTYPE_IN to VECTYPE_OUT
3132 in a single step. On success, store the binary pack code in
3133 *CONVERT_CODE. */
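/* For example, narrowing a V4SI input to a V8HI output in one step is
   typically done with VEC_PACK_TRUNC_EXPR; multi-step narrowings are
   rejected here. */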
3135 static bool
3136 simple_integer_narrowing (tree vectype_out, tree vectype_in,
3137 code_helper *convert_code)
3139 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype_out))
3140 || !INTEGRAL_TYPE_P (TREE_TYPE (vectype_in)))
3141 return false;
3143 code_helper code;
3144 int multi_step_cvt = 0;
3145 auto_vec <tree, 8> interm_types;
3146 if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in,
3147 &code, &multi_step_cvt, &interm_types)
3148 || multi_step_cvt)
3149 return false;
3151 *convert_code = code;
3152 return true;
3155 /* Function vectorizable_call.
3157 Check if STMT_INFO performs a function call that can be vectorized.
3158 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
3159 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3160 Return true if STMT_INFO is vectorizable in this way. */
3162 static bool
3163 vectorizable_call (vec_info *vinfo,
3164 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
3165 gimple **vec_stmt, slp_tree slp_node,
3166 stmt_vector_for_cost *cost_vec)
3168 gcall *stmt;
3169 tree vec_dest;
3170 tree scalar_dest;
3171 tree op;
3172 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3173 tree vectype_out, vectype_in;
3174 poly_uint64 nunits_in;
3175 poly_uint64 nunits_out;
3176 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3177 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3178 tree fndecl, new_temp, rhs_type;
3179 enum vect_def_type dt[4]
3180 = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
3181 vect_unknown_def_type };
3182 tree vectypes[ARRAY_SIZE (dt)] = {};
3183 slp_tree slp_op[ARRAY_SIZE (dt)] = {};
3184 int ndts = ARRAY_SIZE (dt);
3185 int ncopies, j;
3186 auto_vec<tree, 8> vargs;
3187 enum { NARROW, NONE, WIDEN } modifier;
3188 size_t i, nargs;
3189 tree lhs;
3190 tree clz_ctz_arg1 = NULL_TREE;
3192 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
3193 return false;
3195 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
3196 && ! vec_stmt)
3197 return false;
3199 /* Is STMT_INFO a vectorizable call? */
3200 stmt = dyn_cast <gcall *> (stmt_info->stmt);
3201 if (!stmt)
3202 return false;
3204 if (gimple_call_internal_p (stmt)
3205 && (internal_load_fn_p (gimple_call_internal_fn (stmt))
3206 || internal_store_fn_p (gimple_call_internal_fn (stmt))))
3207 /* Handled by vectorizable_load and vectorizable_store. */
3208 return false;
3210 if (gimple_call_lhs (stmt) == NULL_TREE
3211 || TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3212 return false;
3214 gcc_checking_assert (!stmt_can_throw_internal (cfun, stmt));
3216 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3218 /* Process function arguments. */
3219 rhs_type = NULL_TREE;
3220 vectype_in = NULL_TREE;
3221 nargs = gimple_call_num_args (stmt);
3223 /* Bail out if the function has more than four arguments; we do not have
3224 interesting builtin functions to vectorize with more than two arguments
3225 except for fma. Having no arguments is also not good. */
3226 if (nargs == 0 || nargs > 4)
3227 return false;
3229 /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
3230 combined_fn cfn = gimple_call_combined_fn (stmt);
3231 if (cfn == CFN_GOMP_SIMD_LANE)
3233 nargs = 0;
3234 rhs_type = unsigned_type_node;
3236 /* Similarly pretend IFN_CLZ and IFN_CTZ only have one argument; the second
3237 argument just says whether the operation is well-defined at zero and what
3238 value should be returned for it. */
3239 if ((cfn == CFN_CLZ || cfn == CFN_CTZ) && nargs == 2)
3241 nargs = 1;
3242 clz_ctz_arg1 = gimple_call_arg (stmt, 1);
3245 int mask_opno = -1;
3246 if (internal_fn_p (cfn))
3247 mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
3249 for (i = 0; i < nargs; i++)
3251 if ((int) i == mask_opno)
3253 if (!vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_opno,
3254 &op, &slp_op[i], &dt[i], &vectypes[i]))
3255 return false;
3256 continue;
3259 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
3260 i, &op, &slp_op[i], &dt[i], &vectypes[i]))
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3264 "use not simple.\n");
3265 return false;
3268 /* We can only handle calls with arguments of the same type. */
3269 if (rhs_type
3270 && !types_compatible_p (rhs_type, TREE_TYPE (op)))
3272 if (dump_enabled_p ())
3273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3274 "argument types differ.\n");
3275 return false;
3277 if (!rhs_type)
3278 rhs_type = TREE_TYPE (op);
3280 if (!vectype_in)
3281 vectype_in = vectypes[i];
3282 else if (vectypes[i]
3283 && !types_compatible_p (vectypes[i], vectype_in))
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3287 "argument vector types differ.\n");
3288 return false;
3291 /* If all arguments are external or constant defs, infer the vector type
3292 from the scalar type. */
3293 if (!vectype_in)
3294 vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
3295 if (vec_stmt)
3296 gcc_assert (vectype_in);
3297 if (!vectype_in)
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "no vectype for scalar type %T\n", rhs_type);
3303 return false;
3306 if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
3307 != VECTOR_BOOLEAN_TYPE_P (vectype_in))
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3311 "mixed mask and nonmask vector types\n");
3312 return false;
3315 if (vect_emulated_vector_p (vectype_in) || vect_emulated_vector_p (vectype_out))
3317 if (dump_enabled_p ())
3318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3319 "use emulated vector type for call\n");
3320 return false;
3323 /* FORNOW */
3324 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3325 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3326 if (known_eq (nunits_in * 2, nunits_out))
3327 modifier = NARROW;
3328 else if (known_eq (nunits_out, nunits_in))
3329 modifier = NONE;
3330 else if (known_eq (nunits_out * 2, nunits_in))
3331 modifier = WIDEN;
3332 else
3333 return false;
3335 /* We only handle functions that do not read or clobber memory. */
3336 if (gimple_vuse (stmt))
3338 if (dump_enabled_p ())
3339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3340 "function reads from or writes to memory.\n");
3341 return false;
3344 /* For now, we only vectorize functions if a target specific builtin
3345 is available. TODO -- in some cases, it might be profitable to
3346 insert the calls for pieces of the vector, in order to be able
3347 to vectorize other operations in the loop. */
3348 fndecl = NULL_TREE;
3349 internal_fn ifn = IFN_LAST;
3350 tree callee = gimple_call_fndecl (stmt);
3352 /* First try using an internal function. */
3353 code_helper convert_code = MAX_TREE_CODES;
3354 if (cfn != CFN_LAST
3355 && (modifier == NONE
3356 || (modifier == NARROW
3357 && simple_integer_narrowing (vectype_out, vectype_in,
3358 &convert_code))))
3359 ifn = vectorizable_internal_function (cfn, callee, vectype_out,
3360 vectype_in);
3362 /* If that fails, try asking for a target-specific built-in function. */
3363 if (ifn == IFN_LAST)
3365 if (cfn != CFN_LAST)
3366 fndecl = targetm.vectorize.builtin_vectorized_function
3367 (cfn, vectype_out, vectype_in);
3368 else if (callee && fndecl_built_in_p (callee, BUILT_IN_MD))
3369 fndecl = targetm.vectorize.builtin_md_vectorized_function
3370 (callee, vectype_out, vectype_in);
3373 if (ifn == IFN_LAST && !fndecl)
3375 if (cfn == CFN_GOMP_SIMD_LANE
3376 && !slp_node
3377 && loop_vinfo
3378 && LOOP_VINFO_LOOP (loop_vinfo)->simduid
3379 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
3380 && LOOP_VINFO_LOOP (loop_vinfo)->simduid
3381 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
3383 /* We can handle IFN_GOMP_SIMD_LANE by returning a
3384 { 0, 1, 2, ... vf - 1 } vector. */
3385 gcc_assert (nargs == 0);
3387 else if (modifier == NONE
3388 && (gimple_call_builtin_p (stmt, BUILT_IN_BSWAP16)
3389 || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP32)
3390 || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP64)
3391 || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP128)))
3392 return vectorizable_bswap (vinfo, stmt_info, gsi, vec_stmt, slp_node,
3393 slp_op, vectype_in, cost_vec);
3394 else
3396 if (dump_enabled_p ())
3397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3398 "function is not vectorizable.\n");
3399 return false;
3403 if (slp_node)
3404 ncopies = 1;
3405 else if (modifier == NARROW && ifn == IFN_LAST)
3406 ncopies = vect_get_num_copies (loop_vinfo, vectype_out);
3407 else
3408 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
3410 /* Sanity check: make sure that at least one copy of the vectorized stmt
3411 needs to be generated. */
3412 gcc_assert (ncopies >= 1);
3414 int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
3415 internal_fn cond_fn = get_conditional_internal_fn (ifn);
3416 internal_fn cond_len_fn = get_len_internal_fn (ifn);
3417 int len_opno = internal_fn_len_index (cond_len_fn);
3418 vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
3419 vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
3420 if (!vec_stmt) /* transformation not required. */
3422 if (slp_node)
3423 for (i = 0; i < nargs; ++i)
3424 if (!vect_maybe_update_slp_op_vectype (slp_op[i],
3425 vectypes[i]
3426 ? vectypes[i] : vectype_in))
3428 if (dump_enabled_p ())
3429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3430 "incompatible vector types for invariants\n");
3431 return false;
3433 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3434 DUMP_VECT_SCOPE ("vectorizable_call");
3435 vect_model_simple_cost (vinfo, stmt_info,
3436 ncopies, dt, ndts, slp_node, cost_vec);
3437 if (ifn != IFN_LAST && modifier == NARROW && !slp_node)
3438 record_stmt_cost (cost_vec, ncopies / 2,
3439 vec_promote_demote, stmt_info, 0, vect_body);
3441 if (loop_vinfo
3442 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3443 && (reduc_idx >= 0 || mask_opno >= 0))
3445 if (reduc_idx >= 0
3446 && (cond_fn == IFN_LAST
3447 || !direct_internal_fn_supported_p (cond_fn, vectype_out,
3448 OPTIMIZE_FOR_SPEED))
3449 && (cond_len_fn == IFN_LAST
3450 || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3451 OPTIMIZE_FOR_SPEED)))
3453 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3455 "can't use a fully-masked loop because no"
3456 " conditional operation is available.\n");
3457 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3459 else
3461 unsigned int nvectors
3462 = (slp_node
3463 ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3464 : ncopies);
3465 tree scalar_mask = NULL_TREE;
3466 if (mask_opno >= 0)
3467 scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
3468 if (cond_len_fn != IFN_LAST
3469 && direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3470 OPTIMIZE_FOR_SPEED))
3471 vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out,
3473 else
3474 vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
3475 scalar_mask);
3478 return true;
3481 /* Transform. */
3483 if (dump_enabled_p ())
3484 dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
3486 /* Handle def. */
3487 scalar_dest = gimple_call_lhs (stmt);
3488 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3490 bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
3491 bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
3492 unsigned int vect_nargs = nargs;
3493 if (len_loop_p)
3495 if (len_opno >= 0)
3497 ifn = cond_len_fn;
3498 /* COND_* -> COND_LEN_* takes 2 extra arguments: LEN, BIAS. */
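/* Illustrative sketch (the usual COND_* -> COND_LEN_* argument layout,
   not specific to this statement): a masked addition

     IFN_COND_ADD (MASK, A, B, ELSE)

   becomes, in a length-controlled loop,

     IFN_COND_LEN_ADD (MASK, A, B, ELSE, LEN, BIAS)

   with LEN placed at the index given by internal_fn_len_index and BIAS
   immediately after it.  */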
3499 vect_nargs += 2;
3501 else if (reduc_idx >= 0)
3502 gcc_unreachable ();
3504 else if (masked_loop_p && reduc_idx >= 0)
3506 ifn = cond_fn;
3507 vect_nargs += 2;
3509 if (clz_ctz_arg1)
3510 ++vect_nargs;
3512 if (modifier == NONE || ifn != IFN_LAST)
3514 tree prev_res = NULL_TREE;
3515 vargs.safe_grow (vect_nargs, true);
3516 auto_vec<vec<tree> > vec_defs (nargs);
3517 for (j = 0; j < ncopies; ++j)
3519 /* Build argument list for the vectorized call. */
3520 if (slp_node)
3522 vec<tree> vec_oprnds0;
3524 vect_get_slp_defs (vinfo, slp_node, &vec_defs);
3525 vec_oprnds0 = vec_defs[0];
3527 /* Arguments are ready. Create the new vector stmt. */
3528 FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0)
3530 int varg = 0;
3531 if (masked_loop_p && reduc_idx >= 0)
3533 unsigned int vec_num = vec_oprnds0.length ();
3534 /* Always true for SLP. */
3535 gcc_assert (ncopies == 1);
3536 vargs[varg++] = vect_get_loop_mask (loop_vinfo,
3537 gsi, masks, vec_num,
3538 vectype_out, i);
3540 size_t k;
3541 for (k = 0; k < nargs; k++)
3543 vec<tree> vec_oprndsk = vec_defs[k];
3544 vargs[varg++] = vec_oprndsk[i];
3546 if (masked_loop_p && reduc_idx >= 0)
3547 vargs[varg++] = vargs[reduc_idx + 1];
3548 if (clz_ctz_arg1)
3549 vargs[varg++] = clz_ctz_arg1;
3551 gimple *new_stmt;
3552 if (modifier == NARROW)
3554 /* We don't define any narrowing conditional functions
3555 at present. */
3556 gcc_assert (mask_opno < 0);
3557 tree half_res = make_ssa_name (vectype_in);
3558 gcall *call
3559 = gimple_build_call_internal_vec (ifn, vargs);
3560 gimple_call_set_lhs (call, half_res);
3561 gimple_call_set_nothrow (call, true);
3562 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3563 if ((i & 1) == 0)
3565 prev_res = half_res;
3566 continue;
3568 new_temp = make_ssa_name (vec_dest);
3569 new_stmt = vect_gimple_build (new_temp, convert_code,
3570 prev_res, half_res);
3571 vect_finish_stmt_generation (vinfo, stmt_info,
3572 new_stmt, gsi);
3574 else
3576 if (len_opno >= 0 && len_loop_p)
3578 unsigned int vec_num = vec_oprnds0.length ();
3579 /* Always true for SLP. */
3580 gcc_assert (ncopies == 1);
3581 tree len
3582 = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num,
3583 vectype_out, i, 1);
3584 signed char biasval
3585 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
3586 tree bias = build_int_cst (intQI_type_node, biasval);
3587 vargs[len_opno] = len;
3588 vargs[len_opno + 1] = bias;
3590 else if (mask_opno >= 0 && masked_loop_p)
3592 unsigned int vec_num = vec_oprnds0.length ();
3593 /* Always true for SLP. */
3594 gcc_assert (ncopies == 1);
3595 tree mask = vect_get_loop_mask (loop_vinfo,
3596 gsi, masks, vec_num,
3597 vectype_out, i);
3598 vargs[mask_opno] = prepare_vec_mask
3599 (loop_vinfo, TREE_TYPE (mask), mask,
3600 vargs[mask_opno], gsi);
3603 gcall *call;
3604 if (ifn != IFN_LAST)
3605 call = gimple_build_call_internal_vec (ifn, vargs);
3606 else
3607 call = gimple_build_call_vec (fndecl, vargs);
3608 new_temp = make_ssa_name (vec_dest, call);
3609 gimple_call_set_lhs (call, new_temp);
3610 gimple_call_set_nothrow (call, true);
3611 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3612 new_stmt = call;
3614 slp_node->push_vec_def (new_stmt);
3616 continue;
3619 int varg = 0;
3620 if (masked_loop_p && reduc_idx >= 0)
3621 vargs[varg++] = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies,
3622 vectype_out, j);
3623 for (i = 0; i < nargs; i++)
3625 op = gimple_call_arg (stmt, i);
3626 if (j == 0)
3628 vec_defs.quick_push (vNULL);
3629 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
3630 op, &vec_defs[i],
3631 vectypes[i]);
3633 vargs[varg++] = vec_defs[i][j];
3635 if (masked_loop_p && reduc_idx >= 0)
3636 vargs[varg++] = vargs[reduc_idx + 1];
3637 if (clz_ctz_arg1)
3638 vargs[varg++] = clz_ctz_arg1;
3640 if (len_opno >= 0 && len_loop_p)
3642 tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
3643 vectype_out, j, 1);
3644 signed char biasval
3645 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
3646 tree bias = build_int_cst (intQI_type_node, biasval);
3647 vargs[len_opno] = len;
3648 vargs[len_opno + 1] = bias;
3650 else if (mask_opno >= 0 && masked_loop_p)
3652 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies,
3653 vectype_out, j);
3654 vargs[mask_opno]
3655 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
3656 vargs[mask_opno], gsi);
3659 gimple *new_stmt;
3660 if (cfn == CFN_GOMP_SIMD_LANE)
3662 tree cst = build_index_vector (vectype_out, j * nunits_out, 1);
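/* For example (nunits_out == 8 assumed purely for illustration), copy j
   receives the lane-index vector { 8*j, 8*j + 1, ..., 8*j + 7 }.  */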
3663 tree new_var
3664 = vect_get_new_ssa_name (vectype_out, vect_simple_var, "cst_");
3665 gimple *init_stmt = gimple_build_assign (new_var, cst);
3666 vect_init_vector_1 (vinfo, stmt_info, init_stmt, NULL);
3667 new_temp = make_ssa_name (vec_dest);
3668 new_stmt = gimple_build_assign (new_temp, new_var);
3669 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3671 else if (modifier == NARROW)
3673 /* We don't define any narrowing conditional functions at
3674 present. */
3675 gcc_assert (mask_opno < 0);
3676 tree half_res = make_ssa_name (vectype_in);
3677 gcall *call = gimple_build_call_internal_vec (ifn, vargs);
3678 gimple_call_set_lhs (call, half_res);
3679 gimple_call_set_nothrow (call, true);
3680 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3681 if ((j & 1) == 0)
3683 prev_res = half_res;
3684 continue;
3686 new_temp = make_ssa_name (vec_dest);
3687 new_stmt = vect_gimple_build (new_temp, convert_code, prev_res,
3688 half_res);
3689 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3691 else
3693 gcall *call;
3694 if (ifn != IFN_LAST)
3695 call = gimple_build_call_internal_vec (ifn, vargs);
3696 else
3697 call = gimple_build_call_vec (fndecl, vargs);
3698 new_temp = make_ssa_name (vec_dest, call);
3699 gimple_call_set_lhs (call, new_temp);
3700 gimple_call_set_nothrow (call, true);
3701 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3702 new_stmt = call;
3705 if (j == (modifier == NARROW ? 1 : 0))
3706 *vec_stmt = new_stmt;
3707 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
3709 for (i = 0; i < nargs; i++)
3711 vec<tree> vec_oprndsi = vec_defs[i];
3712 vec_oprndsi.release ();
3715 else if (modifier == NARROW)
3717 auto_vec<vec<tree> > vec_defs (nargs);
3718 /* We don't define any narrowing conditional functions at present. */
3719 gcc_assert (mask_opno < 0);
3720 for (j = 0; j < ncopies; ++j)
3722 /* Build argument list for the vectorized call. */
3723 if (j == 0)
3724 vargs.create (nargs * 2);
3725 else
3726 vargs.truncate (0);
3728 if (slp_node)
3730 vec<tree> vec_oprnds0;
3732 vect_get_slp_defs (vinfo, slp_node, &vec_defs);
3733 vec_oprnds0 = vec_defs[0];
3735 /* Arguments are ready. Create the new vector stmt. */
3736 for (i = 0; vec_oprnds0.iterate (i, &vec_oprnd0); i += 2)
3738 size_t k;
3739 vargs.truncate (0);
3740 for (k = 0; k < nargs; k++)
3742 vec<tree> vec_oprndsk = vec_defs[k];
3743 vargs.quick_push (vec_oprndsk[i]);
3744 vargs.quick_push (vec_oprndsk[i + 1]);
3746 gcall *call;
3747 if (ifn != IFN_LAST)
3748 call = gimple_build_call_internal_vec (ifn, vargs);
3749 else
3750 call = gimple_build_call_vec (fndecl, vargs);
3751 new_temp = make_ssa_name (vec_dest, call);
3752 gimple_call_set_lhs (call, new_temp);
3753 gimple_call_set_nothrow (call, true);
3754 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3755 slp_node->push_vec_def (call);
3757 continue;
3760 for (i = 0; i < nargs; i++)
3762 op = gimple_call_arg (stmt, i);
3763 if (j == 0)
3765 vec_defs.quick_push (vNULL);
3766 vect_get_vec_defs_for_operand (vinfo, stmt_info, 2 * ncopies,
3767 op, &vec_defs[i], vectypes[i]);
3769 vec_oprnd0 = vec_defs[i][2*j];
3770 vec_oprnd1 = vec_defs[i][2*j+1];
3772 vargs.quick_push (vec_oprnd0);
3773 vargs.quick_push (vec_oprnd1);
3776 gcall *new_stmt = gimple_build_call_vec (fndecl, vargs);
3777 new_temp = make_ssa_name (vec_dest, new_stmt);
3778 gimple_call_set_lhs (new_stmt, new_temp);
3779 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3781 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
3784 if (!slp_node)
3785 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
3787 for (i = 0; i < nargs; i++)
3789 vec<tree> vec_oprndsi = vec_defs[i];
3790 vec_oprndsi.release ();
3793 else
3794 /* No current target implements this case. */
3795 return false;
3797 vargs.release ();
3799 /* The call in STMT might prevent it from being removed in dce.
3800 However, we cannot remove it here, due to the way the ssa name
3801 it defines is mapped to the new definition. So just replace the
3802 rhs of the statement with something harmless. */
3804 if (slp_node)
3805 return true;
3807 stmt_info = vect_orig_stmt (stmt_info);
3808 lhs = gimple_get_lhs (stmt_info->stmt);
3810 gassign *new_stmt
3811 = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3812 vinfo->replace_stmt (gsi, stmt_info, new_stmt);
3814 return true;
3818 struct simd_call_arg_info
3820 tree vectype;			/* Vector type of the argument.  */
3821 tree op;				/* The operand; for linear arguments, its base.  */
3822 HOST_WIDE_INT linear_step;	/* Step of a linear argument, 0 otherwise.  */
3823 enum vect_def_type dt;		/* Definition type of the operand.  */
3824 unsigned int align;		/* Known pointer alignment in bytes, or 0.  */
3825 bool simd_lane_linear;		/* True if linear only within a simd lane.  */
3828 /* Helper function of vectorizable_simd_clone_call. If OP, an SSA_NAME,
3829 is linear within a simd lane (but not within the whole loop), note it in
3830 *ARGINFO. */
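/* A minimal sketch of the pattern the helper below matches (hypothetical
   GIMPLE, for illustration only):

     _1 = GOMP_SIMD_LANE (simduid.0_7 (D));
     _2 = _1 * 4;
     op_3 = &privatized_array + _2;

   Here OP is linear within the simd lane with base &privatized_array and
   linear step 4, which is what vect_simd_lane_linear records in
   *ARGINFO.  */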
3832 static void
3833 vect_simd_lane_linear (tree op, class loop *loop,
3834 struct simd_call_arg_info *arginfo)
3836 gimple *def_stmt = SSA_NAME_DEF_STMT (op);
3838 if (!is_gimple_assign (def_stmt)
3839 || gimple_assign_rhs_code (def_stmt) != POINTER_PLUS_EXPR
3840 || !is_gimple_min_invariant (gimple_assign_rhs1 (def_stmt)))
3841 return;
3843 tree base = gimple_assign_rhs1 (def_stmt);
3844 HOST_WIDE_INT linear_step = 0;
3845 tree v = gimple_assign_rhs2 (def_stmt);
3846 while (TREE_CODE (v) == SSA_NAME)
3848 tree t;
3849 def_stmt = SSA_NAME_DEF_STMT (v);
3850 if (is_gimple_assign (def_stmt))
3851 switch (gimple_assign_rhs_code (def_stmt))
3853 case PLUS_EXPR:
3854 t = gimple_assign_rhs2 (def_stmt);
3855 if (linear_step || TREE_CODE (t) != INTEGER_CST)
3856 return;
3857 base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base), base, t);
3858 v = gimple_assign_rhs1 (def_stmt);
3859 continue;
3860 case MULT_EXPR:
3861 t = gimple_assign_rhs2 (def_stmt);
3862 if (linear_step || !tree_fits_shwi_p (t) || integer_zerop (t))
3863 return;
3864 linear_step = tree_to_shwi (t);
3865 v = gimple_assign_rhs1 (def_stmt);
3866 continue;
3867 CASE_CONVERT:
3868 t = gimple_assign_rhs1 (def_stmt);
3869 if (TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE
3870 || (TYPE_PRECISION (TREE_TYPE (v))
3871 < TYPE_PRECISION (TREE_TYPE (t))))
3872 return;
3873 if (!linear_step)
3874 linear_step = 1;
3875 v = t;
3876 continue;
3877 default:
3878 return;
3880 else if (gimple_call_internal_p (def_stmt, IFN_GOMP_SIMD_LANE)
3881 && loop->simduid
3882 && TREE_CODE (gimple_call_arg (def_stmt, 0)) == SSA_NAME
3883 && (SSA_NAME_VAR (gimple_call_arg (def_stmt, 0))
3884 == loop->simduid))
3886 if (!linear_step)
3887 linear_step = 1;
3888 arginfo->linear_step = linear_step;
3889 arginfo->op = base;
3890 arginfo->simd_lane_linear = true;
3891 return;
3896 /* Function vectorizable_simd_clone_call.
3898 Check if STMT_INFO performs a function call that can be vectorized
3899 by calling a simd clone of the function.
3900 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
3901 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3902 Return true if STMT_INFO is vectorizable in this way. */
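/* A user-level sketch of the situation this handles (illustrative only;
   the clone name below follows the vector-ABI mangling scheme and is
   hypothetical):

     #pragma omp declare simd notinbranch
     float foo (float x, float y);

     for (int i = 0; i < n; i++)
       a[i] = foo (b[i], c[i]);

   Given a clone such as _ZGVbN4vv_foo taking and returning vectors of
   four floats, the loop body can be vectorized into one call to the
   clone per four scalar iterations.  */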
3904 static bool
3905 vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
3906 gimple_stmt_iterator *gsi,
3907 gimple **vec_stmt, slp_tree slp_node,
3908 stmt_vector_for_cost *)
3910 tree vec_dest;
3911 tree scalar_dest;
3912 tree op, type;
3913 tree vec_oprnd0 = NULL_TREE;
3914 tree vectype;
3915 poly_uint64 nunits;
3916 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3917 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3918 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
3919 tree fndecl, new_temp;
3920 int ncopies, j;
3921 auto_vec<simd_call_arg_info> arginfo;
3922 vec<tree> vargs = vNULL;
3923 size_t i, nargs;
3924 tree lhs, rtype, ratype;
3925 vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
3926 int masked_call_offset = 0;
3928 /* Is STMT a vectorizable call? */
3929 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
3930 if (!stmt)
3931 return false;
3933 fndecl = gimple_call_fndecl (stmt);
3934 if (fndecl == NULL_TREE
3935 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
3937 fndecl = gimple_call_arg (stmt, 0);
3938 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
3939 fndecl = TREE_OPERAND (fndecl, 0);
3940 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
3941 masked_call_offset = 1;
3943 if (fndecl == NULL_TREE)
3944 return false;
3946 struct cgraph_node *node = cgraph_node::get (fndecl);
3947 if (node == NULL || node->simd_clones == NULL)
3948 return false;
3950 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
3951 return false;
3953 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
3954 && ! vec_stmt)
3955 return false;
3957 if (gimple_call_lhs (stmt)
3958 && TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3959 return false;
3961 gcc_checking_assert (!stmt_can_throw_internal (cfun, stmt));
3963 vectype = STMT_VINFO_VECTYPE (stmt_info);
3965 if (loop_vinfo && nested_in_vect_loop_p (loop, stmt_info))
3966 return false;
3968 /* Process function arguments. */
3969 nargs = gimple_call_num_args (stmt) - masked_call_offset;
3971 /* Bail out if the function has zero arguments. */
3972 if (nargs == 0)
3973 return false;
3975 vec<tree>& simd_clone_info = (slp_node ? SLP_TREE_SIMD_CLONE_INFO (slp_node)
3976 : STMT_VINFO_SIMD_CLONE_INFO (stmt_info));
3977 arginfo.reserve (nargs, true);
3978 auto_vec<slp_tree> slp_op;
3979 slp_op.safe_grow_cleared (nargs);
3981 for (i = 0; i < nargs; i++)
3983 simd_call_arg_info thisarginfo;
3984 affine_iv iv;
3986 thisarginfo.linear_step = 0;
3987 thisarginfo.align = 0;
3988 thisarginfo.op = NULL_TREE;
3989 thisarginfo.simd_lane_linear = false;
3991 int op_no = i + masked_call_offset;
3992 if (slp_node)
3993 op_no = vect_slp_child_index_for_operand (stmt, op_no, false);
3994 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
3995 op_no, &op, &slp_op[i],
3996 &thisarginfo.dt, &thisarginfo.vectype)
3997 || thisarginfo.dt == vect_uninitialized_def)
3999 if (dump_enabled_p ())
4000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4001 "use not simple.\n");
4002 return false;
4005 if (thisarginfo.dt == vect_constant_def
4006 || thisarginfo.dt == vect_external_def)
4008 /* With SLP we determine the vector type of constants/externals
4009 at analysis time, handling conflicts via
4010 vect_maybe_update_slp_op_vectype. At transform time
4011 we have a vector type recorded for SLP. */
4012 gcc_assert (!vec_stmt
4013 || !slp_node
4014 || thisarginfo.vectype != NULL_TREE);
4015 if (!vec_stmt)
4016 thisarginfo.vectype = get_vectype_for_scalar_type (vinfo,
4017 TREE_TYPE (op),
4018 slp_node);
4020 else
4021 gcc_assert (thisarginfo.vectype != NULL_TREE);
4023 /* For linear arguments, the analyze phase should have saved
4024 the base and step in {STMT_VINFO,SLP_TREE}_SIMD_CLONE_INFO. */
4025 if (i * 3 + 4 <= simd_clone_info.length ()
4026 && simd_clone_info[i * 3 + 2])
4028 gcc_assert (vec_stmt);
4029 thisarginfo.linear_step = tree_to_shwi (simd_clone_info[i * 3 + 2]);
4030 thisarginfo.op = simd_clone_info[i * 3 + 1];
4031 thisarginfo.simd_lane_linear
4032 = (simd_clone_info[i * 3 + 3] == boolean_true_node);
4033 /* If the loop has been peeled for alignment, we need to adjust the linear base accordingly. */
4034 tree n1 = LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
4035 tree n2 = LOOP_VINFO_NITERS (loop_vinfo);
4036 if (n1 != n2 && !thisarginfo.simd_lane_linear)
4038 tree bias = fold_build2 (MINUS_EXPR, TREE_TYPE (n1), n1, n2);
4039 tree step = simd_clone_info[i * 3 + 2];
4040 tree opt = TREE_TYPE (thisarginfo.op);
4041 bias = fold_convert (TREE_TYPE (step), bias);
4042 bias = fold_build2 (MULT_EXPR, TREE_TYPE (step), bias, step);
4043 thisarginfo.op
4044 = fold_build2 (POINTER_TYPE_P (opt)
4045 ? POINTER_PLUS_EXPR : PLUS_EXPR, opt,
4046 thisarginfo.op, bias);
4049 else if (!vec_stmt
4050 && thisarginfo.dt != vect_constant_def
4051 && thisarginfo.dt != vect_external_def
4052 && loop_vinfo
4053 && TREE_CODE (op) == SSA_NAME
4054 && simple_iv (loop, loop_containing_stmt (stmt), op,
4055 &iv, false)
4056 && tree_fits_shwi_p (iv.step))
4058 thisarginfo.linear_step = tree_to_shwi (iv.step);
4059 thisarginfo.op = iv.base;
4061 else if ((thisarginfo.dt == vect_constant_def
4062 || thisarginfo.dt == vect_external_def)
4063 && POINTER_TYPE_P (TREE_TYPE (op)))
4064 thisarginfo.align = get_pointer_alignment (op) / BITS_PER_UNIT;
4065 /* Addresses of array elements indexed by GOMP_SIMD_LANE are
4066 linear too. */
4067 if (POINTER_TYPE_P (TREE_TYPE (op))
4068 && !thisarginfo.linear_step
4069 && !vec_stmt
4070 && thisarginfo.dt != vect_constant_def
4071 && thisarginfo.dt != vect_external_def
4072 && loop_vinfo
4073 && TREE_CODE (op) == SSA_NAME)
4074 vect_simd_lane_linear (op, loop, &thisarginfo);
4076 arginfo.quick_push (thisarginfo);
4079 poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
4080 unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 1;
4081 unsigned int badness = 0;
4082 struct cgraph_node *bestn = NULL;
4083 if (simd_clone_info.exists ())
4084 bestn = cgraph_node::get (simd_clone_info[0]);
4085 else
4086 for (struct cgraph_node *n = node->simd_clones; n != NULL;
4087 n = n->simdclone->next_clone)
4089 unsigned int this_badness = 0;
4090 unsigned int num_calls;
4091 /* The number of arguments in the call and the number of parameters in
4092 the simdclone should match. However, when the simdclone is
4093 'inbranch', it could have one more parameter than nargs when using
4094 an inbranch simdclone for a non-inbranch call, either in a
4095 non-masked loop using an all-true constant mask, or inside a masked
4096 loop using its mask. */
4097 size_t simd_nargs = n->simdclone->nargs;
4098 if (!masked_call_offset && n->simdclone->inbranch)
4099 simd_nargs--;
4100 if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
4101 &num_calls)
4102 || (!n->simdclone->inbranch && (masked_call_offset > 0))
4103 || (nargs != simd_nargs))
4104 continue;
4105 if (num_calls != 1)
4106 this_badness += floor_log2 (num_calls) * 4096;
4107 if (n->simdclone->inbranch)
4108 this_badness += 8192;
4109 int target_badness = targetm.simd_clone.usable (n);
4110 if (target_badness < 0)
4111 continue;
4112 this_badness += target_badness * 512;
4113 for (i = 0; i < nargs; i++)
4115 switch (n->simdclone->args[i].arg_type)
4117 case SIMD_CLONE_ARG_TYPE_VECTOR:
4118 if (!useless_type_conversion_p
4119 (n->simdclone->args[i].orig_type,
4120 TREE_TYPE (gimple_call_arg (stmt,
4121 i + masked_call_offset))))
4122 i = -1;
4123 else if (arginfo[i].dt == vect_constant_def
4124 || arginfo[i].dt == vect_external_def
4125 || arginfo[i].linear_step)
4126 this_badness += 64;
4127 break;
4128 case SIMD_CLONE_ARG_TYPE_UNIFORM:
4129 if (arginfo[i].dt != vect_constant_def
4130 && arginfo[i].dt != vect_external_def)
4131 i = -1;
4132 break;
4133 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4134 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4135 if (arginfo[i].dt == vect_constant_def
4136 || arginfo[i].dt == vect_external_def
4137 || (arginfo[i].linear_step
4138 != n->simdclone->args[i].linear_step))
4139 i = -1;
4140 break;
4141 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
4142 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
4143 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
4144 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_VARIABLE_STEP:
4145 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
4146 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
4147 /* FORNOW */
4148 i = -1;
4149 break;
4150 case SIMD_CLONE_ARG_TYPE_MASK:
4151 /* While we can create a traditional data vector from
4152 an incoming integer mode mask, we have no good way to
4153 force the generation of an integer mode mask from a traditional
4154 boolean vector input. */
4155 if (SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4156 && !SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
4157 i = -1;
4158 else if (!SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4159 && SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
4160 this_badness += 2048;
4161 break;
4163 if (i == (size_t) -1)
4164 break;
4165 if (n->simdclone->args[i].alignment > arginfo[i].align)
4167 i = -1;
4168 break;
4170 if (arginfo[i].align)
4171 this_badness += (exact_log2 (arginfo[i].align)
4172 - exact_log2 (n->simdclone->args[i].alignment));
4174 if (i == (size_t) -1)
4175 continue;
4176 if (masked_call_offset == 0
4177 && n->simdclone->inbranch
4178 && n->simdclone->nargs > nargs)
4180 gcc_assert (n->simdclone->args[n->simdclone->nargs - 1].arg_type ==
4181 SIMD_CLONE_ARG_TYPE_MASK);
4182 /* Penalize using a masked SIMD clone for a call that is not in a branch
4183 when the loop is not masked either, as we'd have to construct an all-true mask. */
4184 if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4185 this_badness += 64;
4187 if (bestn == NULL || this_badness < badness)
4189 bestn = n;
4190 badness = this_badness;
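/* Worked example of the scoring above (hypothetical scenario, assuming
   targetm.simd_clone.usable returns 0 and no per-argument penalties
   apply): with vf * group_size == 8, an inbranch clone of simdlen 4
   scores floor_log2 (2) * 4096 + 8192 == 12288, whereas a notinbranch
   clone of simdlen 8 scores 0 and is therefore preferred.  */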
4194 if (bestn == NULL)
4195 return false;
4197 unsigned int num_mask_args = 0;
4198 if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4199 for (i = 0; i < nargs; i++)
4200 if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
4201 num_mask_args++;
4203 for (i = 0; i < nargs; i++)
4205 if ((arginfo[i].dt == vect_constant_def
4206 || arginfo[i].dt == vect_external_def)
4207 && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
4209 tree arg_type = TREE_TYPE (gimple_call_arg (stmt,
4210 i + masked_call_offset));
4211 arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
4212 slp_node);
4213 if (arginfo[i].vectype == NULL
4214 || !constant_multiple_p (bestn->simdclone->simdlen,
4215 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4216 return false;
4219 if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR
4220 && VECTOR_BOOLEAN_TYPE_P (bestn->simdclone->args[i].vector_type))
4222 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "vector mask arguments are not supported.\n");
4225 return false;
4228 if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
4230 tree clone_arg_vectype = bestn->simdclone->args[i].vector_type;
4231 if (bestn->simdclone->mask_mode == VOIDmode)
4233 if (maybe_ne (TYPE_VECTOR_SUBPARTS (clone_arg_vectype),
4234 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4236 /* FORNOW we only have partial support for vector-type masks
4237 that can't hold all of simdlen. */
4238 if (dump_enabled_p ())
4239 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4240 vect_location,
4241 "in-branch vector clones are not yet"
4242 " supported for mismatched vector sizes.\n");
4243 return false;
4245 if (!expand_vec_cond_expr_p (clone_arg_vectype,
4246 arginfo[i].vectype, ERROR_MARK))
4248 if (dump_enabled_p ())
4249 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4250 vect_location,
4251 "cannot compute mask argument for"
4252 " in-branch vector clones.\n");
4253 return false;
4256 else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4258 if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
4259 || maybe_ne (exact_div (bestn->simdclone->simdlen,
4260 num_mask_args),
4261 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4263 /* FORNOW we only have partial support for integer-type masks
4264 that represent the same number of lanes as the
4265 vectorized mask inputs. */
4266 if (dump_enabled_p ())
4267 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4268 vect_location,
4269 "in-branch vector clones are not yet "
4270 "supported for mismatched vector sizes.\n");
4271 return false;
4274 else
4276 if (dump_enabled_p ())
4277 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4278 vect_location,
4279 "in-branch vector clones not supported"
4280 " on this target.\n");
4281 return false;
4286 fndecl = bestn->decl;
4287 nunits = bestn->simdclone->simdlen;
4288 if (slp_node)
4289 ncopies = vector_unroll_factor (vf * group_size, nunits);
4290 else
4291 ncopies = vector_unroll_factor (vf, nunits);
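/* For example (hypothetical numbers): with vf == 8, group_size == 2 and a
   chosen simdlen of 4, the SLP case needs 8 * 2 / 4 == 4 copies of the
   clone call, while the non-SLP case with the same vf and simdlen needs
   8 / 4 == 2.  */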
4293 /* If the function isn't const, only allow it in simd loops where the user
4294 has asserted that at least nunits consecutive iterations can be
4295 performed using SIMD instructions. */
4296 if ((loop == NULL || maybe_lt ((unsigned) loop->safelen, nunits))
4297 && gimple_vuse (stmt))
4298 return false;
4300 /* Sanity check: make sure that at least one copy of the vectorized stmt
4301 needs to be generated. */
4302 gcc_assert (ncopies >= 1);
4304 if (!vec_stmt) /* transformation not required. */
4306 if (slp_node)
4307 for (unsigned i = 0; i < nargs; ++i)
4308 if (!vect_maybe_update_slp_op_vectype (slp_op[i], arginfo[i].vectype))
4310 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4312 "incompatible vector types for invariants\n");
4313 return false;
4315 /* When the original call is pure or const but the SIMD ABI dictates
4316 an aggregate return we will have to use a virtual definition and
4317 in a loop eventually even need to add a virtual PHI. That's
4318 not straightforward, so allow this to be fixed up via renaming. */
4319 if (gimple_call_lhs (stmt)
4320 && !gimple_vdef (stmt)
4321 && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
4322 vinfo->any_known_not_updated_vssa = true;
4323 /* ??? For SLP code-gen we end up inserting after the last
4324 vector argument def rather than at the original call position
4325 so automagic virtual operand updating doesn't work. */
4326 if (gimple_vuse (stmt) && slp_node)
4327 vinfo->any_known_not_updated_vssa = true;
4328 simd_clone_info.safe_push (bestn->decl);
4329 for (i = 0; i < bestn->simdclone->nargs; i++)
4331 switch (bestn->simdclone->args[i].arg_type)
4333 default:
4334 continue;
4335 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4336 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4338 simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
4339 simd_clone_info.safe_push (arginfo[i].op);
4340 tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
4341 ? size_type_node : TREE_TYPE (arginfo[i].op);
4342 tree ls = build_int_cst (lst, arginfo[i].linear_step);
4343 simd_clone_info.safe_push (ls);
4344 tree sll = arginfo[i].simd_lane_linear
4345 ? boolean_true_node : boolean_false_node;
4346 simd_clone_info.safe_push (sll);
4348 break;
4349 case SIMD_CLONE_ARG_TYPE_MASK:
4350 if (loop_vinfo
4351 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
4353 unsigned nmasks
4354 = exact_div (ncopies * bestn->simdclone->simdlen,
4355 TYPE_VECTOR_SUBPARTS (vectype)).to_constant ();
4356 vect_record_loop_mask (loop_vinfo,
4357 &LOOP_VINFO_MASKS (loop_vinfo),
4358 nmasks, vectype, op);
4361 break;
4365 if (!bestn->simdclone->inbranch && loop_vinfo)
4367 if (dump_enabled_p ()
4368 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
4369 dump_printf_loc (MSG_NOTE, vect_location,
4370 "can't use a fully-masked loop because a"
4371 " non-masked simd clone was selected.\n");
4372 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
4375 STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
4376 DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
4377 /* vect_model_simple_cost (vinfo, stmt_info, ncopies,
4378 dt, slp_node, cost_vec); */
4379 return true;
4382 /* Transform. */
4384 if (dump_enabled_p ())
4385 dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
4387 /* Handle def. */
4388 scalar_dest = gimple_call_lhs (stmt);
4389 vec_dest = NULL_TREE;
4390 rtype = NULL_TREE;
4391 ratype = NULL_TREE;
4392 if (scalar_dest)
4394 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4395 rtype = TREE_TYPE (TREE_TYPE (fndecl));
4396 if (TREE_CODE (rtype) == ARRAY_TYPE)
4398 ratype = rtype;
4399 rtype = TREE_TYPE (ratype);
4403 auto_vec<vec<tree> > vec_oprnds;
4404 auto_vec<unsigned> vec_oprnds_i;
4405 vec_oprnds_i.safe_grow_cleared (nargs, true);
4406 if (slp_node)
4408 vec_oprnds.reserve_exact (nargs);
4409 vect_get_slp_defs (vinfo, slp_node, &vec_oprnds);
4411 else
4412 vec_oprnds.safe_grow_cleared (nargs, true);
4413 for (j = 0; j < ncopies; ++j)
4415 poly_uint64 callee_nelements;
4416 poly_uint64 caller_nelements;
4417 /* Build argument list for the vectorized call. */
4418 if (j == 0)
4419 vargs.create (nargs);
4420 else
4421 vargs.truncate (0);
4423 for (i = 0; i < nargs; i++)
4425 unsigned int k, l, m, o;
4426 tree atype;
4427 op = gimple_call_arg (stmt, i + masked_call_offset);
4428 switch (bestn->simdclone->args[i].arg_type)
4430 case SIMD_CLONE_ARG_TYPE_VECTOR:
4431 atype = bestn->simdclone->args[i].vector_type;
4432 caller_nelements = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
4433 callee_nelements = TYPE_VECTOR_SUBPARTS (atype);
4434 o = vector_unroll_factor (nunits, callee_nelements);
4435 for (m = j * o; m < (j + 1) * o; m++)
4437 if (known_lt (callee_nelements, caller_nelements))
4439 poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (atype));
4440 if (!constant_multiple_p (caller_nelements,
4441 callee_nelements, &k))
4442 gcc_unreachable ();
4444 gcc_assert ((k & (k - 1)) == 0);
4445 if (m == 0)
4447 if (!slp_node)
4448 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4449 ncopies * o / k, op,
4450 &vec_oprnds[i]);
4451 vec_oprnds_i[i] = 0;
4452 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4454 else
4456 vec_oprnd0 = arginfo[i].op;
4457 if ((m & (k - 1)) == 0)
4458 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4460 arginfo[i].op = vec_oprnd0;
4461 vec_oprnd0
4462 = build3 (BIT_FIELD_REF, atype, vec_oprnd0,
4463 bitsize_int (prec),
4464 bitsize_int ((m & (k - 1)) * prec));
4465 gassign *new_stmt
4466 = gimple_build_assign (make_ssa_name (atype),
4467 vec_oprnd0);
4468 vect_finish_stmt_generation (vinfo, stmt_info,
4469 new_stmt, gsi);
4470 vargs.safe_push (gimple_assign_lhs (new_stmt));
4472 else
4474 if (!constant_multiple_p (callee_nelements,
4475 caller_nelements, &k))
4476 gcc_unreachable ();
4477 gcc_assert ((k & (k - 1)) == 0);
4478 vec<constructor_elt, va_gc> *ctor_elts;
4479 if (k != 1)
4480 vec_alloc (ctor_elts, k);
4481 else
4482 ctor_elts = NULL;
4483 for (l = 0; l < k; l++)
4485 if (m == 0 && l == 0)
4487 if (!slp_node)
4488 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4489 k * o * ncopies,
4491 &vec_oprnds[i]);
4492 vec_oprnds_i[i] = 0;
4493 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4495 else
4496 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4497 arginfo[i].op = vec_oprnd0;
4498 if (k == 1)
4499 break;
4500 CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE,
4501 vec_oprnd0);
4503 if (k == 1)
4504 if (!useless_type_conversion_p (TREE_TYPE (vec_oprnd0),
4505 atype))
4507 vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, atype,
4508 vec_oprnd0);
4509 gassign *new_stmt
4510 = gimple_build_assign (make_ssa_name (atype),
4511 vec_oprnd0);
4512 vect_finish_stmt_generation (vinfo, stmt_info,
4513 new_stmt, gsi);
4514 vargs.safe_push (gimple_get_lhs (new_stmt));
4516 else
4517 vargs.safe_push (vec_oprnd0);
4518 else
4520 vec_oprnd0 = build_constructor (atype, ctor_elts);
4521 gassign *new_stmt
4522 = gimple_build_assign (make_ssa_name (atype),
4523 vec_oprnd0);
4524 vect_finish_stmt_generation (vinfo, stmt_info,
4525 new_stmt, gsi);
4526 vargs.safe_push (gimple_assign_lhs (new_stmt));
4530 break;
4531 case SIMD_CLONE_ARG_TYPE_MASK:
4532 if (bestn->simdclone->mask_mode == VOIDmode)
4534 atype = bestn->simdclone->args[i].vector_type;
4535 tree elt_type = TREE_TYPE (atype);
4536 tree one = fold_convert (elt_type, integer_one_node);
4537 tree zero = fold_convert (elt_type, integer_zero_node);
4538 callee_nelements = TYPE_VECTOR_SUBPARTS (atype);
4539 caller_nelements = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
4540 o = vector_unroll_factor (nunits, callee_nelements);
4541 for (m = j * o; m < (j + 1) * o; m++)
4543 if (maybe_lt (callee_nelements, caller_nelements))
4545 /* The mask type has fewer elements than simdlen. */
4547 /* FORNOW */
4548 gcc_unreachable ();
4550 else if (known_eq (callee_nelements, caller_nelements))
4552 /* The SIMD clone function has the same number of
4553 elements as the current function. */
4554 if (m == 0)
4556 if (!slp_node)
4557 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4558 o * ncopies,
4560 &vec_oprnds[i]);
4561 vec_oprnds_i[i] = 0;
4563 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4564 if (loop_vinfo
4565 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4567 vec_loop_masks *loop_masks
4568 = &LOOP_VINFO_MASKS (loop_vinfo);
4569 tree loop_mask
4570 = vect_get_loop_mask (loop_vinfo, gsi,
4571 loop_masks, ncopies,
4572 vectype, j);
4573 vec_oprnd0
4574 = prepare_vec_mask (loop_vinfo,
4575 TREE_TYPE (loop_mask),
4576 loop_mask, vec_oprnd0,
4577 gsi);
4578 loop_vinfo->vec_cond_masked_set.add ({ vec_oprnd0,
4579 loop_mask });
4582 vec_oprnd0
4583 = build3 (VEC_COND_EXPR, atype, vec_oprnd0,
4584 build_vector_from_val (atype, one),
4585 build_vector_from_val (atype, zero));
4586 gassign *new_stmt
4587 = gimple_build_assign (make_ssa_name (atype),
4588 vec_oprnd0);
4589 vect_finish_stmt_generation (vinfo, stmt_info,
4590 new_stmt, gsi);
4591 vargs.safe_push (gimple_assign_lhs (new_stmt));
4593 else
4595 /* The mask type has more elements than simdlen. */
4597 /* FORNOW */
4598 gcc_unreachable ();
4602 else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4604 atype = bestn->simdclone->args[i].vector_type;
4605 /* Guess the number of lanes represented by atype. */
4606 poly_uint64 atype_subparts
4607 = exact_div (bestn->simdclone->simdlen,
4608 num_mask_args);
4609 o = vector_unroll_factor (nunits, atype_subparts);
4610 for (m = j * o; m < (j + 1) * o; m++)
4612 if (m == 0)
4614 if (!slp_node)
4615 vect_get_vec_defs_for_operand (vinfo, stmt_info,
4616 o * ncopies,
4618 &vec_oprnds[i]);
4619 vec_oprnds_i[i] = 0;
4621 if (maybe_lt (atype_subparts,
4622 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4624 /* The mask argument has fewer elements than the
4625 input vector. */
4626 /* FORNOW */
4627 gcc_unreachable ();
4629 else if (known_eq (atype_subparts,
4630 TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4632 /* The vector mask argument matches the input
4633 in the number of lanes, but not necessarily
4634 in the mode. */
4635 vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4636 tree st = lang_hooks.types.type_for_mode
4637 (TYPE_MODE (TREE_TYPE (vec_oprnd0)), 1);
4638 vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, st,
4639 vec_oprnd0);
4640 gassign *new_stmt
4641 = gimple_build_assign (make_ssa_name (st),
4642 vec_oprnd0);
4643 vect_finish_stmt_generation (vinfo, stmt_info,
4644 new_stmt, gsi);
4645 if (!types_compatible_p (atype, st))
4647 new_stmt
4648 = gimple_build_assign (make_ssa_name (atype),
4649 NOP_EXPR,
4650 gimple_assign_lhs
4651 (new_stmt));
4652 vect_finish_stmt_generation (vinfo, stmt_info,
4653 new_stmt, gsi);
4655 vargs.safe_push (gimple_assign_lhs (new_stmt));
4657 else
4659 /* The mask argument has more elements than the
4660 input vector. */
4661 /* FORNOW */
4662 gcc_unreachable ();
4666 else
4667 gcc_unreachable ();
4668 break;
4669 case SIMD_CLONE_ARG_TYPE_UNIFORM:
4670 vargs.safe_push (op);
4671 break;
4672 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4673 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4674 if (j == 0)
4676 gimple_seq stmts;
4677 arginfo[i].op
4678 = force_gimple_operand (unshare_expr (arginfo[i].op),
4679 &stmts, true, NULL_TREE);
4680 if (stmts != NULL)
4682 basic_block new_bb;
4683 edge pe = loop_preheader_edge (loop);
4684 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4685 gcc_assert (!new_bb);
4687 if (arginfo[i].simd_lane_linear)
4689 vargs.safe_push (arginfo[i].op);
4690 break;
4692 tree phi_res = copy_ssa_name (op);
4693 gphi *new_phi = create_phi_node (phi_res, loop->header);
4694 add_phi_arg (new_phi, arginfo[i].op,
4695 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4696 enum tree_code code
4697 = POINTER_TYPE_P (TREE_TYPE (op))
4698 ? POINTER_PLUS_EXPR : PLUS_EXPR;
4699 tree type = POINTER_TYPE_P (TREE_TYPE (op))
4700 ? sizetype : TREE_TYPE (op);
4701 poly_widest_int cst
4702 = wi::mul (bestn->simdclone->args[i].linear_step,
4703 ncopies * nunits);
4704 tree tcst = wide_int_to_tree (type, cst);
4705 tree phi_arg = copy_ssa_name (op);
4706 gassign *new_stmt
4707 = gimple_build_assign (phi_arg, code, phi_res, tcst);
4708 gimple_stmt_iterator si = gsi_after_labels (loop->header);
4709 gsi_insert_after (&si, new_stmt, GSI_NEW_STMT);
4710 add_phi_arg (new_phi, phi_arg, loop_latch_edge (loop),
4711 UNKNOWN_LOCATION);
4712 arginfo[i].op = phi_res;
4713 vargs.safe_push (phi_res);
4715 else
4717 enum tree_code code
4718 = POINTER_TYPE_P (TREE_TYPE (op))
4719 ? POINTER_PLUS_EXPR : PLUS_EXPR;
4720 tree type = POINTER_TYPE_P (TREE_TYPE (op))
4721 ? sizetype : TREE_TYPE (op);
4722 poly_widest_int cst
4723 = wi::mul (bestn->simdclone->args[i].linear_step,
4724 j * nunits);
4725 tree tcst = wide_int_to_tree (type, cst);
4726 new_temp = make_ssa_name (TREE_TYPE (op));
4727 gassign *new_stmt
4728 = gimple_build_assign (new_temp, code,
4729 arginfo[i].op, tcst);
4730 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4731 vargs.safe_push (new_temp);
4733 break;
4734 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
4735 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
4736 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
4737 case SIMD_CLONE_ARG_TYPE_LINEAR_REF_VARIABLE_STEP:
4738 case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
4739 case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
4740 default:
4741 gcc_unreachable ();
4745 if (masked_call_offset == 0
4746 && bestn->simdclone->inbranch
4747 && bestn->simdclone->nargs > nargs)
4749 unsigned long m, o;
4750 size_t mask_i = bestn->simdclone->nargs - 1;
4751 tree mask;
4752 gcc_assert (bestn->simdclone->args[mask_i].arg_type ==
4753 SIMD_CLONE_ARG_TYPE_MASK);
4755 tree masktype = bestn->simdclone->args[mask_i].vector_type;
4756 if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4757 /* Guess the number of lanes represented by masktype. */
4758 callee_nelements = exact_div (bestn->simdclone->simdlen,
4759 bestn->simdclone->nargs - nargs);
4760 else
4761 callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
4762 o = vector_unroll_factor (nunits, callee_nelements);
4763 for (m = j * o; m < (j + 1) * o; m++)
4765 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4767 vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
4768 mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
4769 ncopies, vectype, j);
4771 else
4772 mask = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
4774 gassign *new_stmt;
4775 if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4777 /* This means we are dealing with integer mask modes.
4778 First convert to an integer type with the same size as
4779 the current vector type. */
4780 unsigned HOST_WIDE_INT intermediate_size
4781 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (mask)));
4782 tree mid_int_type =
4783 build_nonstandard_integer_type (intermediate_size, 1);
4784 mask = build1 (VIEW_CONVERT_EXPR, mid_int_type, mask);
4785 new_stmt
4786 = gimple_build_assign (make_ssa_name (mid_int_type),
4787 mask);
4788 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4789 /* Then zero-extend to the mask mode. */
4790 mask = fold_build1 (NOP_EXPR, masktype,
4791 gimple_get_lhs (new_stmt));
4793 else if (bestn->simdclone->mask_mode == VOIDmode)
4795 tree one = fold_convert (TREE_TYPE (masktype),
4796 integer_one_node);
4797 tree zero = fold_convert (TREE_TYPE (masktype),
4798 integer_zero_node);
4799 mask = build3 (VEC_COND_EXPR, masktype, mask,
4800 build_vector_from_val (masktype, one),
4801 build_vector_from_val (masktype, zero));
4803 else
4804 gcc_unreachable ();
4806 new_stmt = gimple_build_assign (make_ssa_name (masktype), mask);
4807 vect_finish_stmt_generation (vinfo, stmt_info,
4808 new_stmt, gsi);
4809 mask = gimple_assign_lhs (new_stmt);
4810 vargs.safe_push (mask);
4814 gcall *new_call = gimple_build_call_vec (fndecl, vargs);
4815 if (vec_dest)
4817 gcc_assert (ratype
4818 || known_eq (TYPE_VECTOR_SUBPARTS (rtype), nunits));
4819 if (ratype)
4820 new_temp = create_tmp_var (ratype);
4821 else if (useless_type_conversion_p (vectype, rtype))
4822 new_temp = make_ssa_name (vec_dest, new_call);
4823 else
4824 new_temp = make_ssa_name (rtype, new_call);
4825 gimple_call_set_lhs (new_call, new_temp);
4827 vect_finish_stmt_generation (vinfo, stmt_info, new_call, gsi);
4828 gimple *new_stmt = new_call;
4830 if (vec_dest)
4832 if (!multiple_p (TYPE_VECTOR_SUBPARTS (vectype), nunits))
4834 unsigned int k, l;
4835 poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (vectype));
4836 poly_uint64 bytes = GET_MODE_SIZE (TYPE_MODE (vectype));
4837 k = vector_unroll_factor (nunits,
4838 TYPE_VECTOR_SUBPARTS (vectype));
4839 gcc_assert ((k & (k - 1)) == 0);
4840 for (l = 0; l < k; l++)
4842 tree t;
4843 if (ratype)
4845 t = build_fold_addr_expr (new_temp);
4846 t = build2 (MEM_REF, vectype, t,
4847 build_int_cst (TREE_TYPE (t), l * bytes));
4849 else
4850 t = build3 (BIT_FIELD_REF, vectype, new_temp,
4851 bitsize_int (prec), bitsize_int (l * prec));
4852 new_stmt = gimple_build_assign (make_ssa_name (vectype), t);
4853 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4855 if (j == 0 && l == 0)
4856 *vec_stmt = new_stmt;
4857 if (slp_node)
4858 SLP_TREE_VEC_DEFS (slp_node)
4859 .quick_push (gimple_assign_lhs (new_stmt));
4860 else
4861 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
4864 if (ratype)
4865 vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
4866 continue;
4868 else if (!multiple_p (nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4870 unsigned int k;
4871 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
4872 TYPE_VECTOR_SUBPARTS (rtype), &k))
4873 gcc_unreachable ();
4874 gcc_assert ((k & (k - 1)) == 0);
4875 if ((j & (k - 1)) == 0)
4876 vec_alloc (ret_ctor_elts, k);
4877 if (ratype)
4879 unsigned int m, o;
4880 o = vector_unroll_factor (nunits,
4881 TYPE_VECTOR_SUBPARTS (rtype));
4882 for (m = 0; m < o; m++)
4884 tree tem = build4 (ARRAY_REF, rtype, new_temp,
4885 size_int (m), NULL_TREE, NULL_TREE);
4886 new_stmt = gimple_build_assign (make_ssa_name (rtype),
4887 tem);
4888 vect_finish_stmt_generation (vinfo, stmt_info,
4889 new_stmt, gsi);
4890 CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE,
4891 gimple_assign_lhs (new_stmt));
4893 vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
4895 else
4896 CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE, new_temp);
4897 if ((j & (k - 1)) != k - 1)
4898 continue;
4899 vec_oprnd0 = build_constructor (vectype, ret_ctor_elts);
4900 new_stmt
4901 = gimple_build_assign (make_ssa_name (vec_dest), vec_oprnd0);
4902 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4904 if ((unsigned) j == k - 1)
4905 *vec_stmt = new_stmt;
4906 if (slp_node)
4907 SLP_TREE_VEC_DEFS (slp_node)
4908 .quick_push (gimple_assign_lhs (new_stmt));
4909 else
4910 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
4911 continue;
4913 else if (ratype)
4915 tree t = build_fold_addr_expr (new_temp);
4916 t = build2 (MEM_REF, vectype, t,
4917 build_int_cst (TREE_TYPE (t), 0));
4918 new_stmt = gimple_build_assign (make_ssa_name (vec_dest), t);
4919 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4920 vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
4922 else if (!useless_type_conversion_p (vectype, rtype))
4924 vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, vectype, new_temp);
4925 new_stmt
4926 = gimple_build_assign (make_ssa_name (vec_dest), vec_oprnd0);
4927 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4931 if (j == 0)
4932 *vec_stmt = new_stmt;
4933 if (slp_node)
4934 SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
4935 else
4936 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
4939 for (i = 0; i < nargs; ++i)
4941 vec<tree> oprndsi = vec_oprnds[i];
4942 oprndsi.release ();
4944 vargs.release ();
4946 /* Mark the clone as no longer being a candidate for GC. */
4947 bestn->gc_candidate = false;
4949 /* The call in STMT might prevent it from being removed in dce.
4950 However, we cannot remove it here, due to the way the ssa name
4951 it defines is mapped to the new definition. So just replace the
4952 rhs of the statement with something harmless. */
4954 if (slp_node)
4955 return true;
4957 gimple *new_stmt;
4958 if (scalar_dest)
4960 type = TREE_TYPE (scalar_dest);
4961 lhs = gimple_call_lhs (vect_orig_stmt (stmt_info)->stmt);
4962 new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
4964 else
4965 new_stmt = gimple_build_nop ();
4966 vinfo->replace_stmt (gsi, vect_orig_stmt (stmt_info), new_stmt);
4967 unlink_stmt_vdef (stmt);
4969 return true;
4973 /* Function vect_gen_widened_results_half
4975 Create a vector stmt whose code, number of operands, and result
4976 variable are CH, OP_TYPE, and VEC_DEST, and whose arguments are
4977 VEC_OPRND0 and VEC_OPRND1 (the latter only when OP_TYPE is binary_op).
4978 The new vector stmt is to be inserted at GSI. CH may be a tree code
4979 or an internal function (see vect_gimple_build).
4980 STMT_INFO is the original scalar stmt that we are vectorizing. */
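/* Illustrative example (128-bit vectors assumed, not taken from this
   file): a widening multiply of two V8HI operands produces its V4SI
   result in two halves,

     low_5  = VEC_WIDEN_MULT_LO_EXPR <a_1, b_2>;
     high_6 = VEC_WIDEN_MULT_HI_EXPR <a_1, b_2>;

   each covering half of the input lanes (which half is LO and which is HI
   depends on the target's endianness); each call to this function emits
   one such half.  */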
4982 static gimple *
4983 vect_gen_widened_results_half (vec_info *vinfo, code_helper ch,
4984 tree vec_oprnd0, tree vec_oprnd1, int op_type,
4985 tree vec_dest, gimple_stmt_iterator *gsi,
4986 stmt_vec_info stmt_info)
4988 gimple *new_stmt;
4989 tree new_temp;
4991 /* Generate half of the widened result: */
4992 if (op_type != binary_op)
4993 vec_oprnd1 = NULL;
4994 new_stmt = vect_gimple_build (vec_dest, ch, vec_oprnd0, vec_oprnd1);
4995 new_temp = make_ssa_name (vec_dest, new_stmt);
4996 gimple_set_lhs (new_stmt, new_temp);
4997 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4999 return new_stmt;
5003 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
5004 For multi-step conversions store the resulting vectors and call the function
5005 recursively. When NARROW_SRC_P is true, there is still a conversion after
5006 the narrowing, so don't store the vectors in the SLP_NODE or in the vector
5007 info of the scalar statement (or in the STMT_VINFO_RELATED_STMT chain). */
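/* Illustrative example (128-bit vectors assumed, not from any particular
   testcase): a single-step int -> short demotion packs a pair of V4SI
   operands into one V8HI result,

     v8hi_5 = VEC_PACK_TRUNC_EXPR <v4si_1, v4si_2>;

   while a multi-step int -> char demotion first packs pairs of V4SI into
   V8HI and then, in the recursive call, packs pairs of those V8HI results
   into V16QI.  */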
5009 static void
5010 vect_create_vectorized_demotion_stmts (vec_info *vinfo, vec<tree> *vec_oprnds,
5011 int multi_step_cvt,
5012 stmt_vec_info stmt_info,
5013 vec<tree> &vec_dsts,
5014 gimple_stmt_iterator *gsi,
5015 slp_tree slp_node, code_helper code,
5016 bool narrow_src_p)
5018 unsigned int i;
5019 tree vop0, vop1, new_tmp, vec_dest;
5021 vec_dest = vec_dsts.pop ();
5023 for (i = 0; i < vec_oprnds->length (); i += 2)
5025 /* Create demotion operation. */
5026 vop0 = (*vec_oprnds)[i];
5027 vop1 = (*vec_oprnds)[i + 1];
5028 gimple *new_stmt = vect_gimple_build (vec_dest, code, vop0, vop1);
5029 new_tmp = make_ssa_name (vec_dest, new_stmt);
5030 gimple_set_lhs (new_stmt, new_tmp);
5031 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5032 if (multi_step_cvt || narrow_src_p)
5033 /* Store the resulting vector for next recursive call,
5034 or return the resulting vector_tmp for NARROW FLOAT_EXPR. */
5035 (*vec_oprnds)[i/2] = new_tmp;
5036 else
5038 /* This is the last step of the conversion sequence. Store the
5039 vectors in SLP_NODE or in vector info of the scalar statement
5040 (or in STMT_VINFO_RELATED_STMT chain). */
5041 if (slp_node)
5042 slp_node->push_vec_def (new_stmt);
5043 else
5044 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5048 /* For multi-step demotion operations we first generate demotion operations
5049 from the source type to the intermediate types, and then combine the
5050 results (stored in VEC_OPRNDS) with a demotion operation to the destination
5051 type. */
5052 if (multi_step_cvt)
5054 /* At each level of recursion we have half of the operands we had at the
5055 previous level. */
5056 vec_oprnds->truncate ((i+1)/2);
5057 vect_create_vectorized_demotion_stmts (vinfo, vec_oprnds,
5058 multi_step_cvt - 1,
5059 stmt_info, vec_dsts, gsi,
5060 slp_node, VEC_PACK_TRUNC_EXPR,
5061 narrow_src_p);
5064 vec_dsts.quick_push (vec_dest);
5068 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
5069 and VEC_OPRNDS1, for a binary operation associated with scalar statement
5070 STMT_INFO. For multi-step conversions store the resulting vectors and
5071 call the function recursively. */
5073 static void
5074 vect_create_vectorized_promotion_stmts (vec_info *vinfo,
5075 vec<tree> *vec_oprnds0,
5076 vec<tree> *vec_oprnds1,
5077 stmt_vec_info stmt_info, tree vec_dest,
5078 gimple_stmt_iterator *gsi,
5079 code_helper ch1,
5080 code_helper ch2, int op_type)
5082 int i;
5083 tree vop0, vop1, new_tmp1, new_tmp2;
5084 gimple *new_stmt1, *new_stmt2;
5085 vec<tree> vec_tmp = vNULL;
5087 vec_tmp.create (vec_oprnds0->length () * 2);
5088 FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
5090 if (op_type == binary_op)
5091 vop1 = (*vec_oprnds1)[i];
5092 else
5093 vop1 = NULL_TREE;
5095 /* Generate the two halves of the promotion operation. */
5096 new_stmt1 = vect_gen_widened_results_half (vinfo, ch1, vop0, vop1,
5097 op_type, vec_dest, gsi,
5098 stmt_info);
5099 new_stmt2 = vect_gen_widened_results_half (vinfo, ch2, vop0, vop1,
5100 op_type, vec_dest, gsi,
5101 stmt_info);
5102 if (is_gimple_call (new_stmt1))
5104 new_tmp1 = gimple_call_lhs (new_stmt1);
5105 new_tmp2 = gimple_call_lhs (new_stmt2);
5107 else
5109 new_tmp1 = gimple_assign_lhs (new_stmt1);
5110 new_tmp2 = gimple_assign_lhs (new_stmt2);
5113 /* Store the results for the next step. */
5114 vec_tmp.quick_push (new_tmp1);
5115 vec_tmp.quick_push (new_tmp2);
5118 vec_oprnds0->release ();
5119 *vec_oprnds0 = vec_tmp;
5122 /* Create vectorized promotion stmts for widening stmts using only half the
5123 potential vector size for input. */
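/* Illustrative example (an assumption for exposition, not from this file):
   a widening multiply

     int_c[i] = short_a[i] w* short_b[i];

   can be vectorized with V4HI inputs and a V4SI output by widening each
   input with an ordinary conversion and then multiplying in the wider
   type,

     v4si_1 = (vector(4) int) v4hi_a;
     v4si_2 = (vector(4) int) v4hi_b;
     v4si_c = v4si_1 * v4si_2;

   i.e. only half of a full V8HI vector's lanes feed each result vector.  */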
5124 static void
5125 vect_create_half_widening_stmts (vec_info *vinfo,
5126 vec<tree> *vec_oprnds0,
5127 vec<tree> *vec_oprnds1,
5128 stmt_vec_info stmt_info, tree vec_dest,
5129 gimple_stmt_iterator *gsi,
5130 code_helper code1,
5131 int op_type)
5133 int i;
5134 tree vop0, vop1;
5135 gimple *new_stmt1;
5136 gimple *new_stmt2;
5137 gimple *new_stmt3;
5138 vec<tree> vec_tmp = vNULL;
5140 vec_tmp.create (vec_oprnds0->length ());
5141 FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
5143 tree new_tmp1, new_tmp2, new_tmp3, out_type;
5145 gcc_assert (op_type == binary_op);
5146 vop1 = (*vec_oprnds1)[i];
5148 /* Widen the first vector input. */
5149 out_type = TREE_TYPE (vec_dest);
5150 new_tmp1 = make_ssa_name (out_type);
5151 new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
5152 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
5153 if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
5155 /* Widen the second vector input. */
5156 new_tmp2 = make_ssa_name (out_type);
5157 new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
5158 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
5159 /* Perform the operation with both vector inputs widened. */
5160 new_stmt3 = vect_gimple_build (vec_dest, code1, new_tmp1, new_tmp2);
5162 else
5164 /* Perform the operation with the single vector input widened. */
5165 new_stmt3 = vect_gimple_build (vec_dest, code1, new_tmp1, vop1);
5168 new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
5169 gimple_assign_set_lhs (new_stmt3, new_tmp3);
5170 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
5172 /* Store the results for the next step. */
5173 vec_tmp.quick_push (new_tmp3);
5176 vec_oprnds0->release ();
5177 *vec_oprnds0 = vec_tmp;
5181 /* Check if STMT_INFO performs a conversion operation that can be vectorized.
5182 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5183 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5184 Return true if STMT_INFO is vectorizable in this way. */
5186 static bool
5187 vectorizable_conversion (vec_info *vinfo,
5188 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5189 gimple **vec_stmt, slp_tree slp_node,
5190 stmt_vector_for_cost *cost_vec)
5192 tree vec_dest, cvt_op = NULL_TREE;
5193 tree scalar_dest;
5194 tree op0, op1 = NULL_TREE;
5195 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5196 tree_code tc1;
5197 code_helper code, code1, code2;
5198 code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
5199 tree new_temp;
5200 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
5201 int ndts = 2;
5202 poly_uint64 nunits_in;
5203 poly_uint64 nunits_out;
5204 tree vectype_out, vectype_in;
5205 int ncopies, i;
5206 tree lhs_type, rhs_type;
5207 /* For conversions between floating point and integer, there are two NARROW
5208 cases. NARROW_SRC is for FLOAT_EXPR, meaning
5209 integer --DEMOTION--> integer --FLOAT_EXPR--> floating point.
5210 This is safe when the range of the source integer fits into the lower
5211 precision. NARROW_DST is for FIX_TRUNC_EXPR, meaning
5212 floating point --FIX_TRUNC_EXPR--> integer --DEMOTION--> integer.
5213 For other conversions, when there is narrowing, NARROW_DST is used by
5214 default. */
5215 enum { NARROW_SRC, NARROW_DST, NONE, WIDEN } modifier;
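/* For example (hypothetical types, for illustration): vectorizing
   "float f = (float) l" for a 64-bit long whose values are known to fit
   in int can use NARROW_SRC, i.e. long -> int -> float, so the FLOAT_EXPR
   operates on the narrower integer vectors; vectorizing
   "short s = (short) d" for a double uses NARROW_DST, i.e.
   double -> int -> short, doing the FIX_TRUNC_EXPR first and demoting the
   integer result afterwards.  */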
5216 vec<tree> vec_oprnds0 = vNULL;
5217 vec<tree> vec_oprnds1 = vNULL;
5218 tree vop0;
5219 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5220 int multi_step_cvt = 0;
5221 vec<tree> interm_types = vNULL;
5222 tree intermediate_type, cvt_type = NULL_TREE;
5223 int op_type;
5224 unsigned short fltsz;
5226 /* Is STMT a vectorizable conversion? */
5228 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
5229 return false;
5231 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
5232 && ! vec_stmt)
5233 return false;
5235 gimple* stmt = stmt_info->stmt;
5236 if (!(is_gimple_assign (stmt) || is_gimple_call (stmt)))
5237 return false;
5239 if (gimple_get_lhs (stmt) == NULL_TREE
5240 || TREE_CODE (gimple_get_lhs (stmt)) != SSA_NAME)
5241 return false;
5246 if (is_gimple_assign (stmt))
5248 code = gimple_assign_rhs_code (stmt);
5249 op_type = TREE_CODE_LENGTH ((tree_code) code);
5251 else if (gimple_call_internal_p (stmt))
5253 code = gimple_call_internal_fn (stmt);
5254 op_type = gimple_call_num_args (stmt);
5256 else
5257 return false;
5259 bool widen_arith = (code == WIDEN_MULT_EXPR
5260 || code == WIDEN_LSHIFT_EXPR
5261 || widening_fn_p (code));
5263 if (!widen_arith
5264 && !CONVERT_EXPR_CODE_P (code)
5265 && code != FIX_TRUNC_EXPR
5266 && code != FLOAT_EXPR)
5267 return false;
5269 /* Check types of lhs and rhs. */
5270 scalar_dest = gimple_get_lhs (stmt);
5271 lhs_type = TREE_TYPE (scalar_dest);
5272 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5274 /* Check the operands of the operation. */
5275 slp_tree slp_op0, slp_op1 = NULL;
5276 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
5277 0, &op0, &slp_op0, &dt[0], &vectype_in))
5279 if (dump_enabled_p ())
5280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5281 "use not simple.\n");
5282 return false;
5285 rhs_type = TREE_TYPE (op0);
5286 if ((code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
5287 && !((INTEGRAL_TYPE_P (lhs_type)
5288 && INTEGRAL_TYPE_P (rhs_type))
5289 || (SCALAR_FLOAT_TYPE_P (lhs_type)
5290 && SCALAR_FLOAT_TYPE_P (rhs_type))))
5291 return false;
5293 if (!VECTOR_BOOLEAN_TYPE_P (vectype_out)
5294 && ((INTEGRAL_TYPE_P (lhs_type)
5295 && !type_has_mode_precision_p (lhs_type))
5296 || (INTEGRAL_TYPE_P (rhs_type)
5297 && !type_has_mode_precision_p (rhs_type))))
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5301 "type conversion to/from bit-precision unsupported."
5302 "\n");
5303 return false;
5306 if (op_type == binary_op)
5308 gcc_assert (code == WIDEN_MULT_EXPR
5309 || code == WIDEN_LSHIFT_EXPR
5310 || widening_fn_p (code));
5312 op1 = is_gimple_assign (stmt) ? gimple_assign_rhs2 (stmt) :
5313 gimple_call_arg (stmt, 0);
5314 tree vectype1_in;
5315 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1,
5316 &op1, &slp_op1, &dt[1], &vectype1_in))
5318 if (dump_enabled_p ())
5319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5320 "use not simple.\n");
5321 return false;
5323 /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
5324 OP1. */
5325 if (!vectype_in)
5326 vectype_in = vectype1_in;
5329 /* If op0 is an external or constant def, infer the vector type
5330 from the scalar type. */
5331 if (!vectype_in)
5332 vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
5333 if (vec_stmt)
5334 gcc_assert (vectype_in);
5335 if (!vectype_in)
5337 if (dump_enabled_p ())
5338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5339 "no vectype for scalar type %T\n", rhs_type);
5341 return false;
5344 if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
5345 && !VECTOR_BOOLEAN_TYPE_P (vectype_in))
5347 if (dump_enabled_p ())
5348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5349 "can't convert between boolean and non "
5350 "boolean vectors %T\n", rhs_type);
5352 return false;
5355 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
5356 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5357 if (known_eq (nunits_out, nunits_in))
5358 if (widen_arith)
5359 modifier = WIDEN;
5360 else
5361 modifier = NONE;
5362 else if (multiple_p (nunits_out, nunits_in))
5363 modifier = NARROW_DST;
5364 else
5366 gcc_checking_assert (multiple_p (nunits_in, nunits_out));
5367 modifier = WIDEN;
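/* For instance, assuming 128-bit vectors: a V4SI -> V2DF conversion has
   nunits_in = 4 and nunits_out = 2, so the modifier is WIDEN, while the
   reverse V2DF -> V4SI conversion has nunits_out = 4 and nunits_in = 2,
   giving NARROW_DST.  The vector modes here are only illustrative.  */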
5370 /* Multiple types in SLP are handled by creating the appropriate number of
5371 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5372 case of SLP. */
5373 if (slp_node)
5374 ncopies = 1;
5375 else if (modifier == NARROW_DST)
5376 ncopies = vect_get_num_copies (loop_vinfo, vectype_out);
5377 else
5378 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5380 /* Sanity check: make sure that at least one copy of the vectorized stmt
5381 needs to be generated. */
5382 gcc_assert (ncopies >= 1);
5384 bool found_mode = false;
5385 scalar_mode lhs_mode = SCALAR_TYPE_MODE (lhs_type);
5386 scalar_mode rhs_mode = SCALAR_TYPE_MODE (rhs_type);
5387 opt_scalar_mode rhs_mode_iter;
5388 vec<std::pair<tree, tree_code> > converts = vNULL;
5390 /* Supportable by target? */
5391 switch (modifier)
5393 case NONE:
5394 if (code != FIX_TRUNC_EXPR
5395 && code != FLOAT_EXPR
5396 && !CONVERT_EXPR_CODE_P (code))
5397 return false;
5398 gcc_assert (code.is_tree_code ());
5399 if (supportable_indirect_convert_operation (code,
5400 vectype_out,
5401 vectype_in,
5402 &converts,
5403 op0))
5405 gcc_assert (converts.length () <= 2);
5406 if (converts.length () == 1)
5407 code1 = converts[0].second;
5408 else
5410 cvt_type = NULL_TREE;
5411 multi_step_cvt = converts.length () - 1;
5412 codecvt1 = converts[0].second;
5413 code1 = converts[1].second;
5414 interm_types.safe_push (converts[0].first);
5416 break;
5419 /* FALLTHRU */
5420 unsupported:
5421 if (dump_enabled_p ())
5422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5423 "conversion not supported by target.\n");
5424 return false;
5426 case WIDEN:
5427 if (known_eq (nunits_in, nunits_out))
5429 if (!(code.is_tree_code ()
5430 && supportable_half_widening_operation ((tree_code) code,
5431 vectype_out, vectype_in,
5432 &tc1)))
5433 goto unsupported;
5434 code1 = tc1;
5435 gcc_assert (!(multi_step_cvt && op_type == binary_op));
5436 break;
5438 if (supportable_widening_operation (vinfo, code, stmt_info,
5439 vectype_out, vectype_in, &code1,
5440 &code2, &multi_step_cvt,
5441 &interm_types))
5443 /* A binary widening operation can only be supported directly by the
5444 architecture. */
5445 gcc_assert (!(multi_step_cvt && op_type == binary_op));
5446 break;
5449 if (code != FLOAT_EXPR
5450 || GET_MODE_SIZE (lhs_mode) <= GET_MODE_SIZE (rhs_mode))
5451 goto unsupported;
5453 fltsz = GET_MODE_SIZE (lhs_mode);
5454 FOR_EACH_2XWIDER_MODE (rhs_mode_iter, rhs_mode)
5456 rhs_mode = rhs_mode_iter.require ();
5457 if (GET_MODE_SIZE (rhs_mode) > fltsz)
5458 break;
5460 cvt_type
5461 = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
5462 cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
5463 if (cvt_type == NULL_TREE)
5464 goto unsupported;
5466 if (GET_MODE_SIZE (rhs_mode) == fltsz)
5468 tc1 = ERROR_MARK;
5469 gcc_assert (code.is_tree_code ());
5470 if (!supportable_convert_operation ((tree_code) code, vectype_out,
5471 cvt_type, &tc1))
5472 goto unsupported;
5473 codecvt1 = tc1;
5475 else if (!supportable_widening_operation (vinfo, code,
5476 stmt_info, vectype_out,
5477 cvt_type, &codecvt1,
5478 &codecvt2, &multi_step_cvt,
5479 &interm_types))
5480 continue;
5481 else
5482 gcc_assert (multi_step_cvt == 0);
5484 if (supportable_widening_operation (vinfo, NOP_EXPR, stmt_info,
5485 cvt_type,
5486 vectype_in, &code1,
5487 &code2, &multi_step_cvt,
5488 &interm_types))
5490 found_mode = true;
5491 break;
5495 if (!found_mode)
5496 goto unsupported;
5498 if (GET_MODE_SIZE (rhs_mode) == fltsz)
5499 codecvt2 = ERROR_MARK;
5500 else
5502 multi_step_cvt++;
5503 interm_types.safe_push (cvt_type);
5504 cvt_type = NULL_TREE;
5506 break;
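/* One possible, target-dependent path through the loop above: for a
   FLOAT_EXPR from short to double with no direct HImode -> DFmode
   widening float, the shorts are first widened to SImode integers
   (CVT_TYPE / INTERM_TYPES, CODE1/CODE2) and the result then converted
   with an SImode -> DFmode widening float (CODECVT1/CODECVT2).  */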
5508 case NARROW_DST:
5509 gcc_assert (op_type == unary_op);
5510 if (supportable_narrowing_operation (code, vectype_out, vectype_in,
5511 &code1, &multi_step_cvt,
5512 &interm_types))
5513 break;
5515 if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
5516 goto unsupported;
5518 if (code == FIX_TRUNC_EXPR)
5520 cvt_type
5521 = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
5522 cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
5523 if (cvt_type == NULL_TREE)
5524 goto unsupported;
5525 if (supportable_convert_operation ((tree_code) code, cvt_type, vectype_in,
5526 &tc1))
5527 codecvt1 = tc1;
5528 else
5529 goto unsupported;
5530 if (supportable_narrowing_operation (NOP_EXPR, vectype_out, cvt_type,
5531 &code1, &multi_step_cvt,
5532 &interm_types))
5533 break;
5535 /* If op0 can be represented with a low-precision integer,
5536 truncate it to cvt_type and then do FLOAT_EXPR. */
5537 else if (code == FLOAT_EXPR)
5539 wide_int op_min_value, op_max_value;
5540 if (!vect_get_range_info (op0, &op_min_value, &op_max_value))
5541 goto unsupported;
5543 cvt_type
5544 = build_nonstandard_integer_type (GET_MODE_BITSIZE (lhs_mode), 0);
5545 if (cvt_type == NULL_TREE
5546 || (wi::min_precision (op_max_value, SIGNED)
5547 > TYPE_PRECISION (cvt_type))
5548 || (wi::min_precision (op_min_value, SIGNED)
5549 > TYPE_PRECISION (cvt_type)))
5550 goto unsupported;
5552 cvt_type = get_same_sized_vectype (cvt_type, vectype_out);
5553 if (cvt_type == NULL_TREE)
5554 goto unsupported;
5555 if (!supportable_narrowing_operation (NOP_EXPR, cvt_type, vectype_in,
5556 &code1, &multi_step_cvt,
5557 &interm_types))
5558 goto unsupported;
5559 if (supportable_convert_operation ((tree_code) code, vectype_out,
5560 cvt_type, &tc1))
5562 codecvt1 = tc1;
5563 modifier = NARROW_SRC;
5564 break;
5568 goto unsupported;
5570 default:
5571 gcc_unreachable ();
5574 if (!vec_stmt) /* transformation not required. */
5576 if (slp_node
5577 && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype_in)
5578 || !vect_maybe_update_slp_op_vectype (slp_op1, vectype_in)))
5580 if (dump_enabled_p ())
5581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5582 "incompatible vector types for invariants\n");
5583 return false;
5585 DUMP_VECT_SCOPE ("vectorizable_conversion");
5586 if (modifier == NONE)
5588 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
5589 vect_model_simple_cost (vinfo, stmt_info,
5590 ncopies * (1 + multi_step_cvt),
5591 dt, ndts, slp_node, cost_vec);
5593 else if (modifier == NARROW_SRC || modifier == NARROW_DST)
5595 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
5596 /* The final packing step produces one vector result per copy. */
5597 unsigned int nvectors
5598 = (slp_node ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies);
5599 vect_model_promotion_demotion_cost (stmt_info, dt, nvectors,
5600 multi_step_cvt, cost_vec,
5601 widen_arith);
5603 else
5605 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
5606 /* The initial unpacking step produces two vector results
5607 per copy. MULTI_STEP_CVT is 0 for a single conversion,
5608 so >> MULTI_STEP_CVT divides by 2^(number of steps - 1). */
5609 unsigned int nvectors
5610 = (slp_node
5611 ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) >> multi_step_cvt
5612 : ncopies * 2);
5613 vect_model_promotion_demotion_cost (stmt_info, dt, nvectors,
5614 multi_step_cvt, cost_vec,
5615 widen_arith);
5617 interm_types.release ();
5618 return true;
5621 /* Transform. */
5622 if (dump_enabled_p ())
5623 dump_printf_loc (MSG_NOTE, vect_location,
5624 "transform conversion. ncopies = %d.\n", ncopies);
5626 if (op_type == binary_op)
5628 if (CONSTANT_CLASS_P (op0))
5629 op0 = fold_convert (TREE_TYPE (op1), op0);
5630 else if (CONSTANT_CLASS_P (op1))
5631 op1 = fold_convert (TREE_TYPE (op0), op1);
5634 /* In case of multi-step conversion, we first generate conversion operations
5635 to the intermediate types, and then from those types to the final one.
5636 We create vector destinations for the intermediate type (TYPES) received
5637 from supportable_*_operation, and store them in the correct order
5638 for future use in vect_create_vectorized_*_stmts (). */
5639 auto_vec<tree> vec_dsts (multi_step_cvt + 1);
5640 bool widen_or_narrow_float_p
5641 = cvt_type && (modifier == WIDEN || modifier == NARROW_SRC);
5642 vec_dest = vect_create_destination_var (scalar_dest,
5643 widen_or_narrow_float_p
5644 ? cvt_type : vectype_out);
5645 vec_dsts.quick_push (vec_dest);
5647 if (multi_step_cvt)
5649 for (i = interm_types.length () - 1;
5650 interm_types.iterate (i, &intermediate_type); i--)
5652 vec_dest = vect_create_destination_var (scalar_dest,
5653 intermediate_type);
5654 vec_dsts.quick_push (vec_dest);
5658 if (cvt_type)
5659 vec_dest = vect_create_destination_var (scalar_dest,
5660 widen_or_narrow_float_p
5661 ? vectype_out : cvt_type);
5663 int ninputs = 1;
5664 if (!slp_node)
5666 if (modifier == WIDEN)
5668 else if (modifier == NARROW_SRC || modifier == NARROW_DST)
5670 if (multi_step_cvt)
5671 ninputs = vect_pow2 (multi_step_cvt);
5672 ninputs *= 2;
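/* E.g. a two-step narrowing (multi_step_cvt == 1) therefore needs
   ninputs == 4 input vectors for each copy of the final result.  */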
5676 switch (modifier)
5678 case NONE:
5679 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
5680 op0, vectype_in, &vec_oprnds0);
5681 /* When multi_step_cvt, vec_dest holds the intermediate-type destination. */
5682 if (multi_step_cvt)
5684 cvt_op = vec_dest;
5685 vec_dest = vec_dsts[0];
5688 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5690 /* Arguments are ready, create the new vector stmt. */
5691 gimple* new_stmt;
5692 if (multi_step_cvt)
5694 gcc_assert (multi_step_cvt == 1);
5695 new_stmt = vect_gimple_build (cvt_op, codecvt1, vop0);
5696 new_temp = make_ssa_name (cvt_op, new_stmt);
5697 gimple_assign_set_lhs (new_stmt, new_temp);
5698 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5699 vop0 = new_temp;
5701 new_stmt = vect_gimple_build (vec_dest, code1, vop0);
5702 new_temp = make_ssa_name (vec_dest, new_stmt);
5703 gimple_set_lhs (new_stmt, new_temp);
5704 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5706 if (slp_node)
5707 slp_node->push_vec_def (new_stmt);
5708 else
5709 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5711 break;
5713 case WIDEN:
5714 /* In case the vectorization factor (VF) is bigger than the number
5715 of elements that we can fit in a vectype (nunits), we have to
5716 generate more than one vector stmt, i.e. we need to "unroll"
5717 the vector stmt by a factor VF/nunits. */
5718 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies * ninputs,
5719 op0, vectype_in, &vec_oprnds0,
5720 code == WIDEN_LSHIFT_EXPR ? NULL_TREE : op1,
5721 vectype_in, &vec_oprnds1);
5722 if (code == WIDEN_LSHIFT_EXPR)
5724 int oprnds_size = vec_oprnds0.length ();
5725 vec_oprnds1.create (oprnds_size);
5726 for (i = 0; i < oprnds_size; ++i)
5727 vec_oprnds1.quick_push (op1);
5729 /* Arguments are ready. Create the new vector stmts. */
5730 for (i = multi_step_cvt; i >= 0; i--)
5732 tree this_dest = vec_dsts[i];
5733 code_helper c1 = code1, c2 = code2;
5734 if (i == 0 && codecvt2 != ERROR_MARK)
5736 c1 = codecvt1;
5737 c2 = codecvt2;
5739 if (known_eq (nunits_out, nunits_in))
5740 vect_create_half_widening_stmts (vinfo, &vec_oprnds0, &vec_oprnds1,
5741 stmt_info, this_dest, gsi, c1,
5742 op_type);
5743 else
5744 vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
5745 &vec_oprnds1, stmt_info,
5746 this_dest, gsi,
5747 c1, c2, op_type);
5750 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5752 gimple *new_stmt;
5753 if (cvt_type)
5755 new_temp = make_ssa_name (vec_dest);
5756 new_stmt = vect_gimple_build (new_temp, codecvt1, vop0);
5757 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5759 else
5760 new_stmt = SSA_NAME_DEF_STMT (vop0);
5762 if (slp_node)
5763 slp_node->push_vec_def (new_stmt);
5764 else
5765 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5767 break;
5769 case NARROW_SRC:
5770 case NARROW_DST:
5771 /* In case the vectorization factor (VF) is bigger than the number
5772 of elements that we can fit in a vectype (nunits), we have to
5773 generate more than one vector stmt, i.e. we need to "unroll"
5774 the vector stmt by a factor VF/nunits. */
5775 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies * ninputs,
5776 op0, vectype_in, &vec_oprnds0);
5777 /* Arguments are ready. Create the new vector stmts. */
5778 if (cvt_type && modifier == NARROW_DST)
5779 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5781 new_temp = make_ssa_name (vec_dest);
5782 gimple *new_stmt = vect_gimple_build (new_temp, codecvt1, vop0);
5783 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5784 vec_oprnds0[i] = new_temp;
5787 vect_create_vectorized_demotion_stmts (vinfo, &vec_oprnds0,
5788 multi_step_cvt,
5789 stmt_info, vec_dsts, gsi,
5790 slp_node, code1,
5791 modifier == NARROW_SRC);
5792 /* After demoting op0 to cvt_type, convert it to dest. */
5793 if (cvt_type && code == FLOAT_EXPR)
5795 for (unsigned int i = 0; i != vec_oprnds0.length() / 2; i++)
5797 /* Arguments are ready, create the new vector stmt. */
5798 gcc_assert (TREE_CODE_LENGTH ((tree_code) codecvt1) == unary_op);
5799 gimple *new_stmt
5800 = vect_gimple_build (vec_dest, codecvt1, vec_oprnds0[i]);
5801 new_temp = make_ssa_name (vec_dest, new_stmt);
5802 gimple_set_lhs (new_stmt, new_temp);
5803 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5805 /* This is the last step of the conversion sequence. Store the
5806 vectors in SLP_NODE or in vector info of the scalar statement
5807 (or in STMT_VINFO_RELATED_STMT chain). */
5808 if (slp_node)
5809 slp_node->push_vec_def (new_stmt);
5810 else
5811 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5814 break;
5816 if (!slp_node)
5817 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
5819 vec_oprnds0.release ();
5820 vec_oprnds1.release ();
5821 interm_types.release ();
5823 return true;
5826 /* Return true if we can assume from the scalar form of STMT_INFO that
5827 neither the scalar nor the vector forms will generate code. STMT_INFO
5828 is known not to involve a data reference. */
5830 bool
5831 vect_nop_conversion_p (stmt_vec_info stmt_info)
5833 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
5834 if (!stmt)
5835 return false;
5837 tree lhs = gimple_assign_lhs (stmt);
5838 tree_code code = gimple_assign_rhs_code (stmt);
5839 tree rhs = gimple_assign_rhs1 (stmt);
5841 if (code == SSA_NAME || code == VIEW_CONVERT_EXPR)
5842 return true;
5844 if (CONVERT_EXPR_CODE_P (code))
5845 return tree_nop_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs));
5847 return false;
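/* For example, vect_nop_conversion_p holds for a cast between int and
   unsigned int (same precision and mode), for a VIEW_CONVERT_EXPR and
   for a plain SSA_NAME copy, but not for int -> short.  */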
5850 /* Function vectorizable_assignment.
5852 Check if STMT_INFO performs an assignment (copy) that can be vectorized.
5853 If VEC_STMT is also passed, vectorize the STMT_INFO: create a vectorized
5854 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5855 Return true if STMT_INFO is vectorizable in this way. */
5857 static bool
5858 vectorizable_assignment (vec_info *vinfo,
5859 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5860 gimple **vec_stmt, slp_tree slp_node,
5861 stmt_vector_for_cost *cost_vec)
5863 tree vec_dest;
5864 tree scalar_dest;
5865 tree op;
5866 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5867 tree new_temp;
5868 enum vect_def_type dt[1] = {vect_unknown_def_type};
5869 int ndts = 1;
5870 int ncopies;
5871 int i;
5872 vec<tree> vec_oprnds = vNULL;
5873 tree vop;
5874 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5875 enum tree_code code;
5876 tree vectype_in;
5878 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
5879 return false;
5881 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
5882 && ! vec_stmt)
5883 return false;
5885 /* Is vectorizable assignment? */
5886 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
5887 if (!stmt)
5888 return false;
5890 scalar_dest = gimple_assign_lhs (stmt);
5891 if (TREE_CODE (scalar_dest) != SSA_NAME)
5892 return false;
5894 if (STMT_VINFO_DATA_REF (stmt_info))
5895 return false;
5897 code = gimple_assign_rhs_code (stmt);
5898 if (!(gimple_assign_single_p (stmt)
5899 || code == PAREN_EXPR
5900 || CONVERT_EXPR_CODE_P (code)))
5901 return false;
5903 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5904 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5906 /* Multiple types in SLP are handled by creating the appropriate number of
5907 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5908 case of SLP. */
5909 if (slp_node)
5910 ncopies = 1;
5911 else
5912 ncopies = vect_get_num_copies (loop_vinfo, vectype);
5914 gcc_assert (ncopies >= 1);
5916 slp_tree slp_op;
5917 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &op, &slp_op,
5918 &dt[0], &vectype_in))
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5922 "use not simple.\n");
5923 return false;
5925 if (!vectype_in)
5926 vectype_in = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), slp_node);
5928 /* We can handle VIEW_CONVERT conversions that do not change the number
5929 of elements or the vector size, or other conversions when the component
5930 types are nop-convertible. */
5931 if (!vectype_in
5932 || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits)
5933 || (code == VIEW_CONVERT_EXPR
5934 && maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
5935 GET_MODE_SIZE (TYPE_MODE (vectype_in))))
5936 || (CONVERT_EXPR_CODE_P (code)
5937 && !tree_nop_conversion_p (TREE_TYPE (vectype),
5938 TREE_TYPE (vectype_in))))
5939 return false;
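/* For example, a scalar copy such as 'i = (int) u' where u is unsigned
   int is a nop conversion, so it can be vectorized by the code below as
   a single VIEW_CONVERT_EXPR of the whole input vector.  */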
5941 if (VECTOR_BOOLEAN_TYPE_P (vectype) != VECTOR_BOOLEAN_TYPE_P (vectype_in))
5943 if (dump_enabled_p ())
5944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5945 "can't convert between boolean and non "
5946 "boolean vectors %T\n", TREE_TYPE (op));
5948 return false;
5951 /* We do not handle bit-precision changes. */
5952 if ((CONVERT_EXPR_CODE_P (code)
5953 || code == VIEW_CONVERT_EXPR)
5954 && ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
5955 && !type_has_mode_precision_p (TREE_TYPE (scalar_dest)))
5956 || (INTEGRAL_TYPE_P (TREE_TYPE (op))
5957 && !type_has_mode_precision_p (TREE_TYPE (op))))
5958 /* But a conversion that does not change the bit-pattern is ok. */
5959 && !(INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
5960 && INTEGRAL_TYPE_P (TREE_TYPE (op))
5961 && (((TYPE_PRECISION (TREE_TYPE (scalar_dest))
5962 > TYPE_PRECISION (TREE_TYPE (op)))
5963 && TYPE_UNSIGNED (TREE_TYPE (op)))
5964 || (TYPE_PRECISION (TREE_TYPE (scalar_dest))
5965 == TYPE_PRECISION (TREE_TYPE (op))))))
5967 if (dump_enabled_p ())
5968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5969 "type conversion to/from bit-precision "
5970 "unsupported.\n");
5971 return false;
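/* Illustration of the rule above: converting a 32-bit int to a 3-bit
   bit-field type is rejected, whereas zero-extending an unsigned
   bit-field to a wider integer, or converting between types of equal
   precision, keeps the bit pattern and is allowed.  */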
5974 if (!vec_stmt) /* transformation not required. */
5976 if (slp_node
5977 && !vect_maybe_update_slp_op_vectype (slp_op, vectype_in))
5979 if (dump_enabled_p ())
5980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5981 "incompatible vector types for invariants\n");
5982 return false;
5984 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
5985 DUMP_VECT_SCOPE ("vectorizable_assignment");
5986 if (!vect_nop_conversion_p (stmt_info))
5987 vect_model_simple_cost (vinfo, stmt_info, ncopies, dt, ndts, slp_node,
5988 cost_vec);
5989 return true;
5992 /* Transform. */
5993 if (dump_enabled_p ())
5994 dump_printf_loc (MSG_NOTE, vect_location, "transform assignment.\n");
5996 /* Handle def. */
5997 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5999 /* Handle use. */
6000 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, op, &vec_oprnds);
6002 /* Arguments are ready. Create the new vector stmt. */
6003 FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
6005 if (CONVERT_EXPR_CODE_P (code)
6006 || code == VIEW_CONVERT_EXPR)
6007 vop = build1 (VIEW_CONVERT_EXPR, vectype, vop);
6008 gassign *new_stmt = gimple_build_assign (vec_dest, vop);
6009 new_temp = make_ssa_name (vec_dest, new_stmt);
6010 gimple_assign_set_lhs (new_stmt, new_temp);
6011 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6012 if (slp_node)
6013 slp_node->push_vec_def (new_stmt);
6014 else
6015 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6017 if (!slp_node)
6018 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6020 vec_oprnds.release ();
6021 return true;
6025 /* Return TRUE if CODE (a shift operation) is supported for SCALAR_TYPE
6026 either as shift by a scalar or by a vector. */
6028 bool
6029 vect_supportable_shift (vec_info *vinfo, enum tree_code code, tree scalar_type)
6032 machine_mode vec_mode;
6033 optab optab;
6034 int icode;
6035 tree vectype;
6037 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
6038 if (!vectype)
6039 return false;
6041 optab = optab_for_tree_code (code, vectype, optab_scalar);
6042 if (!optab
6043 || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
6045 optab = optab_for_tree_code (code, vectype, optab_vector);
6046 if (!optab
6047 || (optab_handler (optab, TYPE_MODE (vectype))
6048 == CODE_FOR_nothing))
6049 return false;
6052 vec_mode = TYPE_MODE (vectype);
6053 icode = (int) optab_handler (optab, vec_mode);
6054 if (icode == CODE_FOR_nothing)
6055 return false;
6057 return true;
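/* For example, vect_supportable_shift (vinfo, LSHIFT_EXPR, scalar_type)
   returns true when the target provides either a vector-by-scalar or a
   vector-by-vector left shift for the vector type chosen for
   scalar_type; the vector-by-scalar optab is queried first and the
   vector-by-vector optab is the fallback.  */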
6061 /* Function vectorizable_shift.
6063 Check if STMT_INFO performs a shift operation that can be vectorized.
6064 If VEC_STMT is also passed, vectorize the STMT_INFO: create a vectorized
6065 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6066 Return true if STMT_INFO is vectorizable in this way. */
6068 static bool
6069 vectorizable_shift (vec_info *vinfo,
6070 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6071 gimple **vec_stmt, slp_tree slp_node,
6072 stmt_vector_for_cost *cost_vec)
6074 tree vec_dest;
6075 tree scalar_dest;
6076 tree op0, op1 = NULL;
6077 tree vec_oprnd1 = NULL_TREE;
6078 tree vectype;
6079 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6080 enum tree_code code;
6081 machine_mode vec_mode;
6082 tree new_temp;
6083 optab optab;
6084 int icode;
6085 machine_mode optab_op2_mode;
6086 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
6087 int ndts = 2;
6088 poly_uint64 nunits_in;
6089 poly_uint64 nunits_out;
6090 tree vectype_out;
6091 tree op1_vectype;
6092 int ncopies;
6093 int i;
6094 vec<tree> vec_oprnds0 = vNULL;
6095 vec<tree> vec_oprnds1 = vNULL;
6096 tree vop0, vop1;
6097 unsigned int k;
6098 bool scalar_shift_arg = true;
6099 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
6100 bool incompatible_op1_vectype_p = false;
6102 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
6103 return false;
6105 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6106 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle
6107 && ! vec_stmt)
6108 return false;
6110 /* Is STMT a vectorizable binary/unary operation? */
6111 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
6112 if (!stmt)
6113 return false;
6115 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6116 return false;
6118 code = gimple_assign_rhs_code (stmt);
6120 if (!(code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
6121 || code == RROTATE_EXPR))
6122 return false;
6124 scalar_dest = gimple_assign_lhs (stmt);
6125 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6126 if (!type_has_mode_precision_p (TREE_TYPE (scalar_dest)))
6128 if (dump_enabled_p ())
6129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6130 "bit-precision shifts not supported.\n");
6131 return false;
6134 slp_tree slp_op0;
6135 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6136 0, &op0, &slp_op0, &dt[0], &vectype))
6138 if (dump_enabled_p ())
6139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6140 "use not simple.\n");
6141 return false;
6143 /* If op0 is an external or constant def, infer the vector type
6144 from the scalar type. */
6145 if (!vectype)
6146 vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0), slp_node);
6147 if (vec_stmt)
6148 gcc_assert (vectype);
6149 if (!vectype)
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "no vectype for scalar type\n");
6154 return false;
6157 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6158 nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
6159 if (maybe_ne (nunits_out, nunits_in))
6160 return false;
6162 stmt_vec_info op1_def_stmt_info;
6163 slp_tree slp_op1;
6164 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1, &op1, &slp_op1,
6165 &dt[1], &op1_vectype, &op1_def_stmt_info))
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169 "use not simple.\n");
6170 return false;
6173 /* Multiple types in SLP are handled by creating the appropriate number of
6174 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6175 case of SLP. */
6176 if (slp_node)
6177 ncopies = 1;
6178 else
6179 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6181 gcc_assert (ncopies >= 1);
6183 /* Determine whether the shift amount is a vector or a scalar. If the
6184 shift/rotate amount is a vector, use the vector/vector shift optabs. */
6186 if ((dt[1] == vect_internal_def
6187 || dt[1] == vect_induction_def
6188 || dt[1] == vect_nested_cycle)
6189 && (!slp_node || SLP_TREE_LANES (slp_node) == 1))
6190 scalar_shift_arg = false;
6191 else if (dt[1] == vect_constant_def
6192 || dt[1] == vect_external_def
6193 || dt[1] == vect_internal_def)
6195 /* In SLP we need to check whether the shift count is the same
6196 across all statements; in loops, if it is a constant or
6197 invariant, it is always a scalar shift. */
6198 if (slp_node)
6200 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6201 stmt_vec_info slpstmt_info;
6203 FOR_EACH_VEC_ELT (stmts, k, slpstmt_info)
6204 if (slpstmt_info)
6206 gassign *slpstmt = as_a <gassign *> (slpstmt_info->stmt);
6207 if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
6208 scalar_shift_arg = false;
6211 /* For internal SLP defs we have to make sure we see scalar stmts
6212 for all vector elements.
6213 ??? For different vectors we could resort to a different
6214 scalar shift operand but code-generation below simply always
6215 takes the first. */
6216 if (dt[1] == vect_internal_def
6217 && maybe_ne (nunits_out * SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
6218 stmts.length ()))
6219 scalar_shift_arg = false;
6222 /* If the shift amount is computed by a pattern stmt we cannot
6224 use the scalar amount directly, so give up and use a vector
6224 shift. */
6225 if (op1_def_stmt_info && is_pattern_stmt_p (op1_def_stmt_info))
6226 scalar_shift_arg = false;
6228 else
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "operand mode requires invariant argument.\n");
6233 return false;
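/* Rough examples of the classification above: in 'a[i] << 3' or a shift
   by a loop invariant the amount is a scalar (scalar_shift_arg stays
   true), whereas in 'a[i] << b[i]' the amount varies per element,
   dt[1] is vect_internal_def and a vector/vector shift is needed.  */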
6236 /* Vector shifted by vector. */
6237 bool was_scalar_shift_arg = scalar_shift_arg;
6238 if (!scalar_shift_arg)
6240 optab = optab_for_tree_code (code, vectype, optab_vector);
6241 if (dump_enabled_p ())
6242 dump_printf_loc (MSG_NOTE, vect_location,
6243 "vector/vector shift/rotate found.\n");
6245 if (!op1_vectype)
6246 op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1),
6247 slp_op1);
6248 incompatible_op1_vectype_p
6249 = (op1_vectype == NULL_TREE
6250 || maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype),
6251 TYPE_VECTOR_SUBPARTS (vectype))
6252 || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype));
6253 if (incompatible_op1_vectype_p
6254 && (!slp_node
6255 || SLP_TREE_DEF_TYPE (slp_op1) != vect_constant_def
6256 || slp_op1->refcnt != 1))
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 "unusable type for last operand in"
6261 " vector/vector shift/rotate.\n");
6262 return false;
6265 /* See if the machine has a vector shifted by scalar insn and if not
6266 then see if it has a vector shifted by vector insn. */
6267 else
6269 optab = optab_for_tree_code (code, vectype, optab_scalar);
6270 if (optab
6271 && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_NOTE, vect_location,
6275 "vector/scalar shift/rotate found.\n");
6277 else
6279 optab = optab_for_tree_code (code, vectype, optab_vector);
6280 if (optab
6281 && (optab_handler (optab, TYPE_MODE (vectype))
6282 != CODE_FOR_nothing))
6284 scalar_shift_arg = false;
6286 if (dump_enabled_p ())
6287 dump_printf_loc (MSG_NOTE, vect_location,
6288 "vector/vector shift/rotate found.\n");
6290 if (!op1_vectype)
6291 op1_vectype = get_vectype_for_scalar_type (vinfo,
6292 TREE_TYPE (op1),
6293 slp_op1);
6295 /* Unlike the other binary operators, shifts/rotates have
6296 an rhs of type int rather than of the same type as the lhs,
6297 so make sure the scalar is of the right type if we are
6298 dealing with vectors of long long/long/short/char. */
6299 incompatible_op1_vectype_p
6300 = (!op1_vectype
6301 || !tree_nop_conversion_p (TREE_TYPE (vectype),
6302 TREE_TYPE (op1)));
6303 if (incompatible_op1_vectype_p
6304 && dt[1] == vect_internal_def)
6306 if (dump_enabled_p ())
6307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6308 "unusable type for last operand in"
6309 " vector/vector shift/rotate.\n");
6310 return false;
6316 /* Supportable by target? */
6317 if (!optab)
6319 if (dump_enabled_p ())
6320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6321 "no optab.\n");
6322 return false;
6324 vec_mode = TYPE_MODE (vectype);
6325 icode = (int) optab_handler (optab, vec_mode);
6326 if (icode == CODE_FOR_nothing)
6328 if (dump_enabled_p ())
6329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6330 "op not supported by target.\n");
6331 return false;
6333 /* vector lowering cannot optimize vector shifts using word arithmetic. */
6334 if (vect_emulated_vector_p (vectype))
6335 return false;
6337 if (!vec_stmt) /* transformation not required. */
6339 if (slp_node
6340 && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
6341 || ((!scalar_shift_arg || dt[1] == vect_internal_def)
6342 && (!incompatible_op1_vectype_p
6343 || dt[1] == vect_constant_def)
6344 && !vect_maybe_update_slp_op_vectype
6345 (slp_op1,
6346 incompatible_op1_vectype_p ? vectype : op1_vectype))))
6348 if (dump_enabled_p ())
6349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6350 "incompatible vector types for invariants\n");
6351 return false;
6353 /* Now adjust the constant shift amount in place. */
6354 if (slp_node
6355 && incompatible_op1_vectype_p
6356 && dt[1] == vect_constant_def)
6358 for (unsigned i = 0;
6359 i < SLP_TREE_SCALAR_OPS (slp_op1).length (); ++i)
6361 SLP_TREE_SCALAR_OPS (slp_op1)[i]
6362 = fold_convert (TREE_TYPE (vectype),
6363 SLP_TREE_SCALAR_OPS (slp_op1)[i]);
6364 gcc_assert ((TREE_CODE (SLP_TREE_SCALAR_OPS (slp_op1)[i])
6365 == INTEGER_CST));
6368 STMT_VINFO_TYPE (stmt_info) = shift_vec_info_type;
6369 DUMP_VECT_SCOPE ("vectorizable_shift");
6370 vect_model_simple_cost (vinfo, stmt_info, ncopies, dt,
6371 scalar_shift_arg ? 1 : ndts, slp_node, cost_vec);
6372 return true;
6375 /* Transform. */
6377 if (dump_enabled_p ())
6378 dump_printf_loc (MSG_NOTE, vect_location,
6379 "transform binary/unary operation.\n");
6381 if (incompatible_op1_vectype_p && !slp_node)
6383 gcc_assert (!scalar_shift_arg && was_scalar_shift_arg);
6384 op1 = fold_convert (TREE_TYPE (vectype), op1);
6385 if (dt[1] != vect_constant_def)
6386 op1 = vect_init_vector (vinfo, stmt_info, op1,
6387 TREE_TYPE (vectype), NULL);
6390 /* Handle def. */
6391 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6393 if (scalar_shift_arg && dt[1] != vect_internal_def)
6395 /* Vector shl and shr insn patterns can be defined with scalar
6396 operand 2 (shift operand). In this case, use constant or loop
6397 invariant op1 directly, without extending it to vector mode
6398 first. */
6399 optab_op2_mode = insn_data[icode].operand[2].mode;
6400 if (!VECTOR_MODE_P (optab_op2_mode))
6402 if (dump_enabled_p ())
6403 dump_printf_loc (MSG_NOTE, vect_location,
6404 "operand 1 using scalar mode.\n");
6405 vec_oprnd1 = op1;
6406 vec_oprnds1.create (slp_node ? slp_node->vec_stmts_size : ncopies);
6407 vec_oprnds1.quick_push (vec_oprnd1);
6408 /* Store vec_oprnd1 for every vector stmt to be created.
6409 We check during the analysis that all the shift arguments
6410 are the same.
6411 TODO: Allow different constants for different vector
6412 stmts generated for an SLP instance. */
6413 for (k = 0;
6414 k < (slp_node ? slp_node->vec_stmts_size - 1 : ncopies - 1); k++)
6415 vec_oprnds1.quick_push (vec_oprnd1);
6418 else if (!scalar_shift_arg && slp_node && incompatible_op1_vectype_p)
6420 if (was_scalar_shift_arg)
6422 /* If the argument was the same in all lanes create
6423 the correctly typed vector shift amount directly. */
6424 op1 = fold_convert (TREE_TYPE (vectype), op1);
6425 op1 = vect_init_vector (vinfo, stmt_info, op1, TREE_TYPE (vectype),
6426 !loop_vinfo ? gsi : NULL);
6427 vec_oprnd1 = vect_init_vector (vinfo, stmt_info, op1, vectype,
6428 !loop_vinfo ? gsi : NULL);
6429 vec_oprnds1.create (slp_node->vec_stmts_size);
6430 for (k = 0; k < slp_node->vec_stmts_size; k++)
6431 vec_oprnds1.quick_push (vec_oprnd1);
6433 else if (dt[1] == vect_constant_def)
6434 /* The constant shift amount has been adjusted in place. */
6436 else
6437 gcc_assert (TYPE_MODE (op1_vectype) == TYPE_MODE (vectype));
6440 /* vec_oprnd1 is available if operand 1 should be of a scalar type
6441 (a special case for certain kinds of vector shifts); otherwise,
6442 operand 1 should be of a vector type (the usual case). */
6443 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
6444 op0, &vec_oprnds0,
6445 vec_oprnd1 ? NULL_TREE : op1, &vec_oprnds1);
6447 /* Arguments are ready. Create the new vector stmt. */
6448 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
6450 /* For internal defs where we need to use a scalar shift arg,
6451 extract the first lane. */
6452 if (scalar_shift_arg && dt[1] == vect_internal_def)
6454 vop1 = vec_oprnds1[0];
6455 new_temp = make_ssa_name (TREE_TYPE (TREE_TYPE (vop1)));
6456 gassign *new_stmt
6457 = gimple_build_assign (new_temp,
6458 build3 (BIT_FIELD_REF, TREE_TYPE (new_temp),
6459 vop1,
6460 TYPE_SIZE (TREE_TYPE (new_temp)),
6461 bitsize_zero_node));
6462 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6463 vop1 = new_temp;
6465 else
6466 vop1 = vec_oprnds1[i];
6467 gassign *new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1);
6468 new_temp = make_ssa_name (vec_dest, new_stmt);
6469 gimple_assign_set_lhs (new_stmt, new_temp);
6470 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6471 if (slp_node)
6472 slp_node->push_vec_def (new_stmt);
6473 else
6474 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6477 if (!slp_node)
6478 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6480 vec_oprnds0.release ();
6481 vec_oprnds1.release ();
6483 return true;
6486 /* Function vectorizable_operation.
6488 Check if STMT_INFO performs a binary, unary or ternary operation that can
6489 be vectorized.
6490 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6491 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6492 Return true if STMT_INFO is vectorizable in this way. */
6494 static bool
6495 vectorizable_operation (vec_info *vinfo,
6496 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6497 gimple **vec_stmt, slp_tree slp_node,
6498 stmt_vector_for_cost *cost_vec)
6500 tree vec_dest;
6501 tree scalar_dest;
6502 tree op0, op1 = NULL_TREE, op2 = NULL_TREE;
6503 tree vectype;
6504 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6505 enum tree_code code, orig_code;
6506 machine_mode vec_mode;
6507 tree new_temp;
6508 int op_type;
6509 optab optab;
6510 bool target_support_p;
6511 enum vect_def_type dt[3]
6512 = {vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type};
6513 int ndts = 3;
6514 poly_uint64 nunits_in;
6515 poly_uint64 nunits_out;
6516 tree vectype_out;
6517 int ncopies, vec_num;
6518 int i;
6519 vec<tree> vec_oprnds0 = vNULL;
6520 vec<tree> vec_oprnds1 = vNULL;
6521 vec<tree> vec_oprnds2 = vNULL;
6522 tree vop0, vop1, vop2;
6523 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
6525 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
6526 return false;
6528 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6529 && ! vec_stmt)
6530 return false;
6532 /* Is STMT a vectorizable binary/unary operation? */
6533 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
6534 if (!stmt)
6535 return false;
6537 /* Loads and stores are handled in vectorizable_{load,store}. */
6538 if (STMT_VINFO_DATA_REF (stmt_info))
6539 return false;
6541 orig_code = code = gimple_assign_rhs_code (stmt);
6543 /* Shifts are handled in vectorizable_shift. */
6544 if (code == LSHIFT_EXPR
6545 || code == RSHIFT_EXPR
6546 || code == LROTATE_EXPR
6547 || code == RROTATE_EXPR)
6548 return false;
6550 /* Comparisons are handled in vectorizable_comparison. */
6551 if (TREE_CODE_CLASS (code) == tcc_comparison)
6552 return false;
6554 /* Conditions are handled in vectorizable_condition. */
6555 if (code == COND_EXPR)
6556 return false;
6558 /* For pointer addition and subtraction, we should use the normal
6559 plus and minus for the vector operation. */
6560 if (code == POINTER_PLUS_EXPR)
6561 code = PLUS_EXPR;
6562 if (code == POINTER_DIFF_EXPR)
6563 code = MINUS_EXPR;
6565 /* Support only unary or binary operations. */
6566 op_type = TREE_CODE_LENGTH (code);
6567 if (op_type != unary_op && op_type != binary_op && op_type != ternary_op)
6569 if (dump_enabled_p ())
6570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6571 "num. args = %d (not unary/binary/ternary op).\n",
6572 op_type);
6573 return false;
6576 scalar_dest = gimple_assign_lhs (stmt);
6577 vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6579 /* Most operations cannot handle bit-precision types without extra
6580 truncations. */
6581 bool mask_op_p = VECTOR_BOOLEAN_TYPE_P (vectype_out);
6582 if (!mask_op_p
6583 && !type_has_mode_precision_p (TREE_TYPE (scalar_dest))
6584 /* Exceptions are the bitwise binary operations. */
6585 && code != BIT_IOR_EXPR
6586 && code != BIT_XOR_EXPR
6587 && code != BIT_AND_EXPR)
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "bit-precision arithmetic not supported.\n");
6592 return false;
6595 slp_tree slp_op0;
6596 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6597 0, &op0, &slp_op0, &dt[0], &vectype))
6599 if (dump_enabled_p ())
6600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6601 "use not simple.\n");
6602 return false;
6604 bool is_invariant = (dt[0] == vect_external_def
6605 || dt[0] == vect_constant_def);
6606 /* If op0 is an external or constant def, infer the vector type
6607 from the scalar type. */
6608 if (!vectype)
6610 /* For a boolean type we cannot determine the vectype from an
6611 invariant value (we don't know whether it is a vector
6612 of booleans or a vector of integers). We use the output
6613 vectype because operations on booleans don't change
6614 the type. */
6615 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op0)))
6617 if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (scalar_dest)))
6619 if (dump_enabled_p ())
6620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6621 "not supported operation on bool value.\n");
6622 return false;
6624 vectype = vectype_out;
6626 else
6627 vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0),
6628 slp_node);
6630 if (vec_stmt)
6631 gcc_assert (vectype);
6632 if (!vectype)
6634 if (dump_enabled_p ())
6635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6636 "no vectype for scalar type %T\n",
6637 TREE_TYPE (op0));
6639 return false;
6642 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6643 nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
6644 if (maybe_ne (nunits_out, nunits_in)
6645 || !tree_nop_conversion_p (TREE_TYPE (vectype_out), TREE_TYPE (vectype)))
6646 return false;
6648 tree vectype2 = NULL_TREE, vectype3 = NULL_TREE;
6649 slp_tree slp_op1 = NULL, slp_op2 = NULL;
6650 if (op_type == binary_op || op_type == ternary_op)
6652 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6653 1, &op1, &slp_op1, &dt[1], &vectype2))
6655 if (dump_enabled_p ())
6656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6657 "use not simple.\n");
6658 return false;
6660 is_invariant &= (dt[1] == vect_external_def
6661 || dt[1] == vect_constant_def);
6662 if (vectype2
6663 && (maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype2))
6664 || !tree_nop_conversion_p (TREE_TYPE (vectype_out),
6665 TREE_TYPE (vectype2))))
6666 return false;
6668 if (op_type == ternary_op)
6670 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
6671 2, &op2, &slp_op2, &dt[2], &vectype3))
6673 if (dump_enabled_p ())
6674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675 "use not simple.\n");
6676 return false;
6678 is_invariant &= (dt[2] == vect_external_def
6679 || dt[2] == vect_constant_def);
6680 if (vectype3
6681 && (maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype3))
6682 || !tree_nop_conversion_p (TREE_TYPE (vectype_out),
6683 TREE_TYPE (vectype3))))
6684 return false;
6687 /* Multiple types in SLP are handled by creating the appropriate number of
6688 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6689 case of SLP. */
6690 if (slp_node)
6692 ncopies = 1;
6693 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6695 else
6697 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6698 vec_num = 1;
6701 gcc_assert (ncopies >= 1);
6703 /* Reject attempts to combine mask types with nonmask types, e.g. if
6704 we have an AND between a (nonmask) boolean loaded from memory and
6705 a (mask) boolean result of a comparison.
6707 TODO: We could easily fix these cases up using pattern statements. */
6708 if (VECTOR_BOOLEAN_TYPE_P (vectype) != mask_op_p
6709 || (vectype2 && VECTOR_BOOLEAN_TYPE_P (vectype2) != mask_op_p)
6710 || (vectype3 && VECTOR_BOOLEAN_TYPE_P (vectype3) != mask_op_p))
6712 if (dump_enabled_p ())
6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 "mixed mask and nonmask vector types\n");
6715 return false;
6718 /* Supportable by target? */
6720 vec_mode = TYPE_MODE (vectype);
6721 if (code == MULT_HIGHPART_EXPR)
6722 target_support_p = can_mult_highpart_p (vec_mode, TYPE_UNSIGNED (vectype));
6723 else
6725 optab = optab_for_tree_code (code, vectype, optab_default);
6726 if (!optab)
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "no optab.\n");
6731 return false;
6733 target_support_p = (optab_handler (optab, vec_mode) != CODE_FOR_nothing
6734 || optab_libfunc (optab, vec_mode));
6737 bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
6738 if (!target_support_p || using_emulated_vectors_p)
6740 if (dump_enabled_p ())
6741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6742 "op not supported by target.\n");
6743 /* When vec_mode is not a vector mode and the ops we do not
6744 have to lower (like AND) are natively supported, let
6745 those through even when the mode isn't word_mode. For
6746 ops we do have to lower, the lowering code assumes we are
6747 dealing with word_mode. */
6748 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))
6749 || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
6750 || !target_support_p)
6751 && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD))
6752 /* Check only during analysis. */
6753 || (!vec_stmt && !vect_can_vectorize_without_simd_p (code)))
6755 if (dump_enabled_p ())
6756 dump_printf (MSG_NOTE, "using word mode not possible.\n");
6757 return false;
6759 if (dump_enabled_p ())
6760 dump_printf_loc (MSG_NOTE, vect_location,
6761 "proceeding using word mode.\n");
6762 using_emulated_vectors_p = true;
6765 int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
6766 vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
6767 vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
6768 internal_fn cond_fn = get_conditional_internal_fn (code);
6769 internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
6771 /* If operating on inactive elements could generate spurious traps,
6772 we need to restrict the operation to active lanes. Note that this
6773 specifically doesn't apply to unhoisted invariants, since they
6774 operate on the same value for every lane.
6776 Similarly, if this operation is part of a reduction, a fully-masked
6777 loop should only change the active lanes of the reduction chain,
6778 keeping the inactive lanes as-is. */
6779 bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
6780 || reduc_idx >= 0);
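/* For example, assuming the target supports the conditional form, a
   division in a fully-masked loop is emitted below as an internal call
   such as .COND_DIV (loop_mask, a, b, else_value), so inactive lanes
   never execute the potentially trapping operation.  */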
6782 if (!vec_stmt) /* transformation not required. */
6784 if (loop_vinfo
6785 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
6786 && mask_out_inactive)
6788 if (cond_len_fn != IFN_LAST
6789 && direct_internal_fn_supported_p (cond_len_fn, vectype,
6790 OPTIMIZE_FOR_SPEED))
6791 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
6793 else if (cond_fn != IFN_LAST
6794 && direct_internal_fn_supported_p (cond_fn, vectype,
6795 OPTIMIZE_FOR_SPEED))
6796 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6797 vectype, NULL);
6798 else
6800 if (dump_enabled_p ())
6801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6802 "can't use a fully-masked loop because no"
6803 " conditional operation is available.\n");
6804 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6808 /* Put types on constant and invariant SLP children. */
6809 if (slp_node
6810 && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
6811 || !vect_maybe_update_slp_op_vectype (slp_op1, vectype)
6812 || !vect_maybe_update_slp_op_vectype (slp_op2, vectype)))
6814 if (dump_enabled_p ())
6815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6816 "incompatible vector types for invariants\n");
6817 return false;
6820 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
6821 DUMP_VECT_SCOPE ("vectorizable_operation");
6822 vect_model_simple_cost (vinfo, stmt_info,
6823 ncopies, dt, ndts, slp_node, cost_vec);
6824 if (using_emulated_vectors_p)
6826 /* The above vect_model_simple_cost call handles constants
6827 in the prologue and (mis-)costs one of the stmts as a
6828 vector stmt. See below for the actual lowering that will
6829 be applied. */
6830 unsigned n
6831 = slp_node ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies;
6832 switch (code)
6834 case PLUS_EXPR:
6835 n *= 5;
6836 break;
6837 case MINUS_EXPR:
6838 n *= 6;
6839 break;
6840 case NEGATE_EXPR:
6841 n *= 4;
6842 break;
6843 default:
6844 /* Bit operations do not have extra cost and are accounted
6845 as a vector stmt by vect_model_simple_cost. */
6846 n = 0;
6847 break;
6849 if (n != 0)
6851 /* We also need to materialize two large constants. */
6852 record_stmt_cost (cost_vec, 2, scalar_stmt, stmt_info,
6853 0, vect_prologue);
6854 record_stmt_cost (cost_vec, n, scalar_stmt, stmt_info,
6855 0, vect_body);
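/* Concretely, the factors above charge five extra word-mode stmts per
   vector stmt for an emulated PLUS_EXPR (six for MINUS_EXPR, four for
   NEGATE_EXPR), and the two prologue scalar stmts correspond to
   materializing constants such as the low_bits/high_bits masks used by
   the lowering further down.  */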
6858 return true;
6861 /* Transform. */
6863 if (dump_enabled_p ())
6864 dump_printf_loc (MSG_NOTE, vect_location,
6865 "transform binary/unary operation.\n");
6867 bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6868 bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
6870 /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
6871 vectors with unsigned elements, but the result is signed. So, we
6872 need to compute the MINUS_EXPR into a vectype temporary and
6873 VIEW_CONVERT_EXPR it into the final vectype_out result. */
6874 tree vec_cvt_dest = NULL_TREE;
6875 if (orig_code == POINTER_DIFF_EXPR)
6877 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6878 vec_cvt_dest = vect_create_destination_var (scalar_dest, vectype_out);
6880 /* Handle def. */
6881 else
6882 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6884 /* In case the vectorization factor (VF) is bigger than the number
6885 of elements that we can fit in a vectype (nunits), we have to generate
6886 more than one vector stmt, i.e. we need to "unroll" the
6887 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6888 from one copy of the vector stmt to the next, in the field
6889 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6890 stages to find the correct vector defs to be used when vectorizing
6891 stmts that use the defs of the current stmt. The example below
6892 illustrates the vectorization process when VF=16 and nunits=4 (i.e.,
6893 we need to create 4 vectorized stmts):
6895 before vectorization:
6896 RELATED_STMT VEC_STMT
6897 S1: x = memref - -
6898 S2: z = x + 1 - -
6900 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
6901 there):
6902 RELATED_STMT VEC_STMT
6903 VS1_0: vx0 = memref0 VS1_1 -
6904 VS1_1: vx1 = memref1 VS1_2 -
6905 VS1_2: vx2 = memref2 VS1_3 -
6906 VS1_3: vx3 = memref3 - -
6907 S1: x = load - VS1_0
6908 S2: z = x + 1 - -
6910 step2: vectorize stmt S2 (done here):
6911 To vectorize stmt S2 we first need to find the relevant vector
6912 def for the first operand 'x'. This is, as usual, obtained from
6913 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
6914 that defines 'x' (S1). This way we find the stmt VS1_0, and the
6915 relevant vector def 'vx0'. Having found 'vx0' we can generate
6916 the vector stmt VS2_0, and as usual, record it in the
6917 STMT_VINFO_VEC_STMT of stmt S2.
6918 When creating the second copy (VS2_1), we obtain the relevant vector
6919 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
6920 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
6921 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
6922 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
6923 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
6924 chain of stmts and pointers:
6925 RELATED_STMT VEC_STMT
6926 VS1_0: vx0 = memref0 VS1_1 -
6927 VS1_1: vx1 = memref1 VS1_2 -
6928 VS1_2: vx2 = memref2 VS1_3 -
6929 VS1_3: vx3 = memref3 - -
6930 S1: x = load - VS1_0
6931 VS2_0: vz0 = vx0 + v1 VS2_1 -
6932 VS2_1: vz1 = vx1 + v1 VS2_2 -
6933 VS2_2: vz2 = vx2 + v1 VS2_3 -
6934 VS2_3: vz3 = vx3 + v1 - -
6935 S2: z = x + 1 - VS2_0 */
6937 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
6938 op0, &vec_oprnds0, op1, &vec_oprnds1, op2, &vec_oprnds2);
6939 /* Arguments are ready. Create the new vector stmt. */
6940 FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
6942 gimple *new_stmt = NULL;
6943 vop1 = ((op_type == binary_op || op_type == ternary_op)
6944 ? vec_oprnds1[i] : NULL_TREE);
6945 vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE);
6946 if (using_emulated_vectors_p
6947 && (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR))
6949 /* Lower the operation. This follows vector lowering. */
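/* Sketch of the lowering for PLUS_EXPR (the other codes are analogous):
   with 8-bit elements in a 32-bit word, low_bits is 0x7f7f7f7f and
   high_bits is 0x80808080, and the per-element sum is computed as
     ((a & low_bits) + (b & low_bits)) ^ ((a ^ b) & high_bits)
   so that no carry can propagate across element boundaries.  */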
6950 unsigned int width = vector_element_bits (vectype);
6951 tree inner_type = TREE_TYPE (vectype);
6952 tree word_type
6953 = build_nonstandard_integer_type (GET_MODE_BITSIZE (word_mode), 1);
6954 HOST_WIDE_INT max = GET_MODE_MASK (TYPE_MODE (inner_type));
6955 tree low_bits = build_replicated_int_cst (word_type, width, max >> 1);
6956 tree high_bits
6957 = build_replicated_int_cst (word_type, width, max & ~(max >> 1));
6958 tree wvop0 = make_ssa_name (word_type);
6959 new_stmt = gimple_build_assign (wvop0, VIEW_CONVERT_EXPR,
6960 build1 (VIEW_CONVERT_EXPR,
6961 word_type, vop0));
6962 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6963 tree result_low, signs;
6964 if (code == PLUS_EXPR || code == MINUS_EXPR)
6966 tree wvop1 = make_ssa_name (word_type);
6967 new_stmt = gimple_build_assign (wvop1, VIEW_CONVERT_EXPR,
6968 build1 (VIEW_CONVERT_EXPR,
6969 word_type, vop1));
6970 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6971 signs = make_ssa_name (word_type);
6972 new_stmt = gimple_build_assign (signs,
6973 BIT_XOR_EXPR, wvop0, wvop1);
6974 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6975 tree b_low = make_ssa_name (word_type);
6976 new_stmt = gimple_build_assign (b_low,
6977 BIT_AND_EXPR, wvop1, low_bits);
6978 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6979 tree a_low = make_ssa_name (word_type);
6980 if (code == PLUS_EXPR)
6981 new_stmt = gimple_build_assign (a_low,
6982 BIT_AND_EXPR, wvop0, low_bits);
6983 else
6984 new_stmt = gimple_build_assign (a_low,
6985 BIT_IOR_EXPR, wvop0, high_bits);
6986 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6987 if (code == MINUS_EXPR)
6989 new_stmt = gimple_build_assign (NULL_TREE,
6990 BIT_NOT_EXPR, signs);
6991 signs = make_ssa_name (word_type);
6992 gimple_assign_set_lhs (new_stmt, signs);
6993 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6995 new_stmt = gimple_build_assign (NULL_TREE,
6996 BIT_AND_EXPR, signs, high_bits);
6997 signs = make_ssa_name (word_type);
6998 gimple_assign_set_lhs (new_stmt, signs);
6999 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7000 result_low = make_ssa_name (word_type);
7001 new_stmt = gimple_build_assign (result_low, code, a_low, b_low);
7002 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7004 else
7006 tree a_low = make_ssa_name (word_type);
7007 new_stmt = gimple_build_assign (a_low,
7008 BIT_AND_EXPR, wvop0, low_bits);
7009 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7010 signs = make_ssa_name (word_type);
7011 new_stmt = gimple_build_assign (signs, BIT_NOT_EXPR, wvop0);
7012 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7013 new_stmt = gimple_build_assign (NULL_TREE,
7014 BIT_AND_EXPR, signs, high_bits);
7015 signs = make_ssa_name (word_type);
7016 gimple_assign_set_lhs (new_stmt, signs);
7017 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7018 result_low = make_ssa_name (word_type);
7019 new_stmt = gimple_build_assign (result_low,
7020 MINUS_EXPR, high_bits, a_low);
7021 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7023 new_stmt = gimple_build_assign (NULL_TREE, BIT_XOR_EXPR, result_low,
7024 signs);
7025 result_low = make_ssa_name (word_type);
7026 gimple_assign_set_lhs (new_stmt, result_low);
7027 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7028 new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR,
7029 build1 (VIEW_CONVERT_EXPR,
7030 vectype, result_low));
7031 new_temp = make_ssa_name (vectype);
7032 gimple_assign_set_lhs (new_stmt, new_temp);
7033 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7035 else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
7037 tree mask;
7038 if (masked_loop_p)
7039 mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7040 vec_num * ncopies, vectype, i);
7041 else
7042 /* Dummy mask. */
7043 mask = build_minus_one_cst (truth_type_for (vectype));
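/* The COND_LEN_* internal functions still take a mask operand, so for the
   length-only case pass an all-ones mask and let the length/bias arguments
   do the masking.  */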
7044 auto_vec<tree> vops (6);
7045 vops.quick_push (mask);
7046 vops.quick_push (vop0);
7047 if (vop1)
7048 vops.quick_push (vop1);
7049 if (vop2)
7050 vops.quick_push (vop2);
7051 if (reduc_idx >= 0)
7053 /* Perform the operation on active elements only and take
7054 inactive elements from the reduction chain input. */
7055 gcc_assert (!vop2);
7056 vops.quick_push (reduc_idx == 1 ? vop1 : vop0);
7058 else
7060 auto else_value = targetm.preferred_else_value
7061 (cond_fn, vectype, vops.length () - 1, &vops[1]);
7062 vops.quick_push (else_value);
7064 if (len_loop_p)
7066 tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
7067 vec_num * ncopies, vectype, i, 1);
7068 signed char biasval
7069 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7070 tree bias = build_int_cst (intQI_type_node, biasval);
7071 vops.quick_push (len);
7072 vops.quick_push (bias);
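/* At this point VOPS holds, in order, the mask, the operation's operands,
   the else value and, for the length variant, the length and bias, matching
   e.g. .COND_ADD (mask, a, b, else) or
   .COND_LEN_ADD (mask, a, b, else, len, bias).  */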
7074 gcall *call
7075 = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
7076 : cond_len_fn,
7077 vops);
7078 new_temp = make_ssa_name (vec_dest, call);
7079 gimple_call_set_lhs (call, new_temp);
7080 gimple_call_set_nothrow (call, true);
7081 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
7082 new_stmt = call;
7084 else
7086 tree mask = NULL_TREE;
7087 /* When combining two masks, check if either of them is elsewhere
7088 combined with a loop mask; if that's the case we can mark that the
7089 new combined mask doesn't need to be combined with a loop mask. */
7090 if (masked_loop_p
7091 && code == BIT_AND_EXPR
7092 && VECTOR_BOOLEAN_TYPE_P (vectype))
7094 if (loop_vinfo->scalar_cond_masked_set.contains ({ op0,
7095 ncopies}))
7097 mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7098 vec_num * ncopies, vectype, i);
7100 vop0 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
7101 vop0, gsi);
7104 if (loop_vinfo->scalar_cond_masked_set.contains ({ op1,
7105 ncopies }))
7107 mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7108 vec_num * ncopies, vectype, i);
7110 vop1 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
7111 vop1, gsi);
7115 new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
7116 new_temp = make_ssa_name (vec_dest, new_stmt);
7117 gimple_assign_set_lhs (new_stmt, new_temp);
7118 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7119 if (using_emulated_vectors_p)
7120 suppress_warning (new_stmt, OPT_Wvector_operation_performance);
7122 /* Enter the combined value into the vector cond hash so we don't
7123 AND it with a loop mask again. */
7124 if (mask)
7125 loop_vinfo->vec_cond_masked_set.add ({ new_temp, mask });
7128 if (vec_cvt_dest)
7130 new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
7131 new_stmt = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR,
7132 new_temp);
7133 new_temp = make_ssa_name (vec_cvt_dest, new_stmt);
7134 gimple_assign_set_lhs (new_stmt, new_temp);
7135 vect_finish_stmt_generation (vinfo, stmt_info,
7136 new_stmt, gsi);
7139 if (slp_node)
7140 slp_node->push_vec_def (new_stmt);
7141 else
7142 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7145 if (!slp_node)
7146 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7148 vec_oprnds0.release ();
7149 vec_oprnds1.release ();
7150 vec_oprnds2.release ();
7152 return true;
7155 /* A helper function to ensure data reference DR_INFO's base alignment. */
7157 static void
7158 ensure_base_align (dr_vec_info *dr_info)
7160 /* Alignment is only analyzed for the first element of a DR group;
7161 use that to determine the base alignment we need to enforce. */
7162 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
7163 dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
7165 gcc_assert (dr_info->misalignment != DR_MISALIGNMENT_UNINITIALIZED);
7167 if (dr_info->base_misaligned)
7169 tree base_decl = dr_info->base_decl;
7171 // We should only be able to increase the alignment of a base object if
7172 // we know what its new alignment should be at compile time.
7173 unsigned HOST_WIDE_INT align_base_to =
7174 DR_TARGET_ALIGNMENT (dr_info).to_constant () * BITS_PER_UNIT;
7176 if (decl_in_symtab_p (base_decl))
7177 symtab_node::get (base_decl)->increase_alignment (align_base_to);
7178 else if (DECL_ALIGN (base_decl) < align_base_to)
7180 SET_DECL_ALIGN (base_decl, align_base_to);
7181 DECL_USER_ALIGN (base_decl) = 1;
7183 dr_info->base_misaligned = false;
7188 /* Function get_group_alias_ptr_type.
7190 Return the alias type for the group starting at FIRST_STMT_INFO. */
7192 static tree
7193 get_group_alias_ptr_type (stmt_vec_info first_stmt_info)
7195 struct data_reference *first_dr, *next_dr;
7197 first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
7198 stmt_vec_info next_stmt_info = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
7199 while (next_stmt_info)
7201 next_dr = STMT_VINFO_DATA_REF (next_stmt_info);
7202 if (get_alias_set (DR_REF (first_dr))
7203 != get_alias_set (DR_REF (next_dr)))
7205 if (dump_enabled_p ())
7206 dump_printf_loc (MSG_NOTE, vect_location,
7207 "conflicting alias set types.\n");
7208 return ptr_type_node;
7210 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7212 return reference_alias_ptr_type (DR_REF (first_dr));
7216 /* Function scan_operand_equal_p.
7218 Helper function for check_scan_store. Compare two references
7219 with .GOMP_SIMD_LANE bases. */
7221 static bool
7222 scan_operand_equal_p (tree ref1, tree ref2)
7224 tree ref[2] = { ref1, ref2 };
7225 poly_int64 bitsize[2], bitpos[2];
7226 tree offset[2], base[2];
7227 for (int i = 0; i < 2; ++i)
7229 machine_mode mode;
7230 int unsignedp, reversep, volatilep = 0;
7231 base[i] = get_inner_reference (ref[i], &bitsize[i], &bitpos[i],
7232 &offset[i], &mode, &unsignedp,
7233 &reversep, &volatilep);
7234 if (reversep || volatilep || maybe_ne (bitpos[i], 0))
7235 return false;
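/* If the base is a MEM_REF of a pointer defined as &VAR p+ OFF, strip it
   back to VAR and treat OFF as the offset, so that two accesses to the
   same "omp simd array" indexed by the same .GOMP_SIMD_LANE lane compare
   equal.  */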
7236 if (TREE_CODE (base[i]) == MEM_REF
7237 && offset[i] == NULL_TREE
7238 && TREE_CODE (TREE_OPERAND (base[i], 0)) == SSA_NAME)
7240 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base[i], 0));
7241 if (is_gimple_assign (def_stmt)
7242 && gimple_assign_rhs_code (def_stmt) == POINTER_PLUS_EXPR
7243 && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == ADDR_EXPR
7244 && TREE_CODE (gimple_assign_rhs2 (def_stmt)) == SSA_NAME)
7246 if (maybe_ne (mem_ref_offset (base[i]), 0))
7247 return false;
7248 base[i] = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
7249 offset[i] = gimple_assign_rhs2 (def_stmt);
7254 if (!operand_equal_p (base[0], base[1], 0))
7255 return false;
7256 if (maybe_ne (bitsize[0], bitsize[1]))
7257 return false;
7258 if (offset[0] != offset[1])
7260 if (!offset[0] || !offset[1])
7261 return false;
7262 if (!operand_equal_p (offset[0], offset[1], 0))
7264 tree step[2];
7265 for (int i = 0; i < 2; ++i)
7267 step[i] = integer_one_node;
7268 if (TREE_CODE (offset[i]) == SSA_NAME)
7270 gimple *def_stmt = SSA_NAME_DEF_STMT (offset[i]);
7271 if (is_gimple_assign (def_stmt)
7272 && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
7273 && (TREE_CODE (gimple_assign_rhs2 (def_stmt))
7274 == INTEGER_CST))
7276 step[i] = gimple_assign_rhs2 (def_stmt);
7277 offset[i] = gimple_assign_rhs1 (def_stmt);
7280 else if (TREE_CODE (offset[i]) == MULT_EXPR)
7282 step[i] = TREE_OPERAND (offset[i], 1);
7283 offset[i] = TREE_OPERAND (offset[i], 0);
7285 tree rhs1 = NULL_TREE;
7286 if (TREE_CODE (offset[i]) == SSA_NAME)
7288 gimple *def_stmt = SSA_NAME_DEF_STMT (offset[i]);
7289 if (gimple_assign_cast_p (def_stmt))
7290 rhs1 = gimple_assign_rhs1 (def_stmt);
7292 else if (CONVERT_EXPR_P (offset[i]))
7293 rhs1 = TREE_OPERAND (offset[i], 0);
7294 if (rhs1
7295 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
7296 && INTEGRAL_TYPE_P (TREE_TYPE (offset[i]))
7297 && (TYPE_PRECISION (TREE_TYPE (offset[i]))
7298 >= TYPE_PRECISION (TREE_TYPE (rhs1))))
7299 offset[i] = rhs1;
7301 if (!operand_equal_p (offset[0], offset[1], 0)
7302 || !operand_equal_p (step[0], step[1], 0))
7303 return false;
7306 return true;
7310 enum scan_store_kind {
7311 /* Normal permutation. */
7312 scan_store_kind_perm,
7314 /* Whole vector left shift permutation with zero init. */
7315 scan_store_kind_lshift_zero,
7317 /* Whole vector left shift permutation and VEC_COND_EXPR. */
7318 scan_store_kind_lshift_cond
7321 /* Function scan_store_can_perm_p.
7323 Verify if we can perform the needed permutations or whole vector shifts.
7324 Return -1 on failure, otherwise exact log2 of vectype's nunits.
7325 If USE_WHOLE_VECTOR is nonnull, record in it which scan_store_kind
7326 to use at each step (left empty if plain permutations suffice). */
7328 static int
7329 scan_store_can_perm_p (tree vectype, tree init,
7330 vec<enum scan_store_kind> *use_whole_vector = NULL)
7332 enum machine_mode vec_mode = TYPE_MODE (vectype);
7333 unsigned HOST_WIDE_INT nunits;
7334 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
7335 return -1;
7336 int units_log2 = exact_log2 (nunits);
7337 if (units_log2 <= 0)
7338 return -1;
7340 int i;
7341 enum scan_store_kind whole_vector_shift_kind = scan_store_kind_perm;
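/* Step I (for I < UNITS_LOG2) needs a permutation that shifts the vector
   up by 2**I lanes, taking the low lanes from the init vector; the final
   step broadcasts the last lane.  If the permutation isn't supported, fall
   back to a whole-vector shift (plus a VEC_COND_EXPR when the init value
   isn't zero).  */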
7342 for (i = 0; i <= units_log2; ++i)
7344 unsigned HOST_WIDE_INT j, k;
7345 enum scan_store_kind kind = scan_store_kind_perm;
7346 vec_perm_builder sel (nunits, nunits, 1);
7347 sel.quick_grow (nunits);
7348 if (i == units_log2)
7350 for (j = 0; j < nunits; ++j)
7351 sel[j] = nunits - 1;
7353 else
7355 for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
7356 sel[j] = j;
7357 for (k = 0; j < nunits; ++j, ++k)
7358 sel[j] = nunits + k;
7360 vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
7361 if (!can_vec_perm_const_p (vec_mode, vec_mode, indices))
7363 if (i == units_log2)
7364 return -1;
7366 if (whole_vector_shift_kind == scan_store_kind_perm)
7368 if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing)
7369 return -1;
7370 whole_vector_shift_kind = scan_store_kind_lshift_zero;
7371 /* Whole vector shifts shift in zeros, so if init is an all-zero
7372 constant, there is no need to do anything further. */
7373 if ((TREE_CODE (init) != INTEGER_CST
7374 && TREE_CODE (init) != REAL_CST)
7375 || !initializer_zerop (init))
7377 tree masktype = truth_type_for (vectype);
7378 if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST))
7379 return -1;
7380 whole_vector_shift_kind = scan_store_kind_lshift_cond;
7383 kind = whole_vector_shift_kind;
7385 if (use_whole_vector)
7387 if (kind != scan_store_kind_perm && use_whole_vector->is_empty ())
7388 use_whole_vector->safe_grow_cleared (i, true);
7389 if (kind != scan_store_kind_perm || !use_whole_vector->is_empty ())
7390 use_whole_vector->safe_push (kind);
7394 return units_log2;
7398 /* Function check_scan_store.
7400 Check magic stores for #pragma omp scan {in,ex}clusive reductions. */
7402 static bool
7403 check_scan_store (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
7404 enum vect_def_type rhs_dt, bool slp, tree mask,
7405 vect_memory_access_type memory_access_type)
7407 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7408 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
7409 tree ref_type;
7411 gcc_assert (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1);
7412 if (slp
7413 || mask
7414 || memory_access_type != VMAT_CONTIGUOUS
7415 || TREE_CODE (DR_BASE_ADDRESS (dr_info->dr)) != ADDR_EXPR
7416 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0))
7417 || loop_vinfo == NULL
7418 || LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7419 || STMT_VINFO_GROUPED_ACCESS (stmt_info)
7420 || !integer_zerop (get_dr_vinfo_offset (vinfo, dr_info))
7421 || !integer_zerop (DR_INIT (dr_info->dr))
7422 || !(ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr)))
7423 || !alias_sets_conflict_p (get_alias_set (vectype),
7424 get_alias_set (TREE_TYPE (ref_type))))
7426 if (dump_enabled_p ())
7427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7428 "unsupported OpenMP scan store.\n");
7429 return false;
7432 /* We need to pattern match code built by OpenMP lowering and simplified
7433 by following optimizations into something we can handle.
7434 #pragma omp simd reduction(inscan,+:r)
7435 for (...)
7437 r += something ();
7438 #pragma omp scan inclusive (r)
7439 use (r);
7441 shall have body with:
7442 // Initialization for input phase, store the reduction initializer:
7443 _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
7444 _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
7445 D.2042[_21] = 0;
7446 // Actual input phase:
7448 r.0_5 = D.2042[_20];
7449 _6 = _4 + r.0_5;
7450 D.2042[_20] = _6;
7451 // Initialization for scan phase:
7452 _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 2);
7453 _26 = D.2043[_25];
7454 _27 = D.2042[_25];
7455 _28 = _26 + _27;
7456 D.2043[_25] = _28;
7457 D.2042[_25] = _28;
7458 // Actual scan phase:
7460 r.1_8 = D.2042[_20];
7462 The "omp simd array" variable D.2042 holds the privatized copy used
7463 inside of the loop and D.2043 is another one that holds copies of
7464 the current original list item. The separate GOMP_SIMD_LANE ifn
7465 kinds are there in order to allow optimizing the initializer store
7466 and combiner sequence, e.g. if it is originally some C++ish user
7467 defined reduction, but allow the vectorizer to pattern recognize it
7468 and turn into the appropriate vectorized scan.
7470 For exclusive scan, this is slightly different:
7471 #pragma omp simd reduction(inscan,+:r)
7472 for (...)
7474 use (r);
7475 #pragma omp scan exclusive (r)
7476 r += something ();
7478 shall have body with:
7479 // Initialization for input phase, store the reduction initializer:
7480 _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
7481 _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
7482 D.2042[_21] = 0;
7483 // Actual input phase:
7485 r.0_5 = D.2042[_20];
7486 _6 = _4 + r.0_5;
7487 D.2042[_20] = _6;
7488 // Initialization for scan phase:
7489 _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3);
7490 _26 = D.2043[_25];
7491 D.2044[_25] = _26;
7492 _27 = D.2042[_25];
7493 _28 = _26 + _27;
7494 D.2043[_25] = _28;
7495 // Actual scan phase:
7497 r.1_8 = D.2044[_20];
7498 ... */
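/* Judging from the IL above, STMT_VINFO_SIMD_LANE_ACCESS_P is 2 for the
   initializer store D.2042[_21] = 0, 3 for the inclusive scan stores and
   4 for the exclusive scan stores; the checks below key off those
   values.  */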
7500 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2)
7502 /* Match the D.2042[_21] = 0; store above. Just require that
7503 it is a constant or external definition store. */
7504 if (rhs_dt != vect_constant_def && rhs_dt != vect_external_def)
7506 fail_init:
7507 if (dump_enabled_p ())
7508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7509 "unsupported OpenMP scan initializer store.\n");
7510 return false;
7513 if (! loop_vinfo->scan_map)
7514 loop_vinfo->scan_map = new hash_map<tree, tree>;
7515 tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7516 tree &cached = loop_vinfo->scan_map->get_or_insert (var);
7517 if (cached)
7518 goto fail_init;
7519 cached = gimple_assign_rhs1 (STMT_VINFO_STMT (stmt_info));
7521 /* These stores can be vectorized normally. */
7522 return true;
7525 if (rhs_dt != vect_internal_def)
7527 fail:
7528 if (dump_enabled_p ())
7529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7530 "unsupported OpenMP scan combiner pattern.\n");
7531 return false;
7534 gimple *stmt = STMT_VINFO_STMT (stmt_info);
7535 tree rhs = gimple_assign_rhs1 (stmt);
7536 if (TREE_CODE (rhs) != SSA_NAME)
7537 goto fail;
7539 gimple *other_store_stmt = NULL;
7540 tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7541 bool inscan_var_store
7542 = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
7544 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7546 if (!inscan_var_store)
7548 use_operand_p use_p;
7549 imm_use_iterator iter;
7550 FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7552 gimple *use_stmt = USE_STMT (use_p);
7553 if (use_stmt == stmt || is_gimple_debug (use_stmt))
7554 continue;
7555 if (gimple_bb (use_stmt) != gimple_bb (stmt)
7556 || !is_gimple_assign (use_stmt)
7557 || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS
7558 || other_store_stmt
7559 || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME)
7560 goto fail;
7561 other_store_stmt = use_stmt;
7563 if (other_store_stmt == NULL)
7564 goto fail;
7565 rhs = gimple_assign_lhs (other_store_stmt);
7566 if (!single_imm_use (rhs, &use_p, &other_store_stmt))
7567 goto fail;
7570 else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
7572 use_operand_p use_p;
7573 imm_use_iterator iter;
7574 FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7576 gimple *use_stmt = USE_STMT (use_p);
7577 if (use_stmt == stmt || is_gimple_debug (use_stmt))
7578 continue;
7579 if (other_store_stmt)
7580 goto fail;
7581 other_store_stmt = use_stmt;
7584 else
7585 goto fail;
7587 gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
7588 if (gimple_bb (def_stmt) != gimple_bb (stmt)
7589 || !is_gimple_assign (def_stmt)
7590 || gimple_assign_rhs_class (def_stmt) != GIMPLE_BINARY_RHS)
7591 goto fail;
7593 enum tree_code code = gimple_assign_rhs_code (def_stmt);
7594 /* For pointer addition, we should use the normal plus for the vector
7595 operation. */
7596 switch (code)
7598 case POINTER_PLUS_EXPR:
7599 code = PLUS_EXPR;
7600 break;
7601 case MULT_HIGHPART_EXPR:
7602 goto fail;
7603 default:
7604 break;
7606 if (TREE_CODE_LENGTH (code) != binary_op || !commutative_tree_code (code))
7607 goto fail;
7609 tree rhs1 = gimple_assign_rhs1 (def_stmt);
7610 tree rhs2 = gimple_assign_rhs2 (def_stmt);
7611 if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME)
7612 goto fail;
7614 gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
7615 gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
7616 if (gimple_bb (load1_stmt) != gimple_bb (stmt)
7617 || !gimple_assign_load_p (load1_stmt)
7618 || gimple_bb (load2_stmt) != gimple_bb (stmt)
7619 || !gimple_assign_load_p (load2_stmt))
7620 goto fail;
7622 stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
7623 stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
7624 if (load1_stmt_info == NULL
7625 || load2_stmt_info == NULL
7626 || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info)
7627 != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))
7628 || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info)
7629 != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
7630 goto fail;
7632 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store)
7634 dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
7635 if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR
7636 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0)))
7637 goto fail;
7638 tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
7639 tree lrhs;
7640 if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7641 lrhs = rhs1;
7642 else
7643 lrhs = rhs2;
7644 use_operand_p use_p;
7645 imm_use_iterator iter;
7646 FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs)
7648 gimple *use_stmt = USE_STMT (use_p);
7649 if (use_stmt == def_stmt || is_gimple_debug (use_stmt))
7650 continue;
7651 if (other_store_stmt)
7652 goto fail;
7653 other_store_stmt = use_stmt;
7657 if (other_store_stmt == NULL)
7658 goto fail;
7659 if (gimple_bb (other_store_stmt) != gimple_bb (stmt)
7660 || !gimple_store_p (other_store_stmt))
7661 goto fail;
7663 stmt_vec_info other_store_stmt_info
7664 = loop_vinfo->lookup_stmt (other_store_stmt);
7665 if (other_store_stmt_info == NULL
7666 || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info)
7667 != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
7668 goto fail;
7670 gimple *stmt1 = stmt;
7671 gimple *stmt2 = other_store_stmt;
7672 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7673 std::swap (stmt1, stmt2);
7674 if (scan_operand_equal_p (gimple_assign_lhs (stmt1),
7675 gimple_assign_rhs1 (load2_stmt)))
7677 std::swap (rhs1, rhs2);
7678 std::swap (load1_stmt, load2_stmt);
7679 std::swap (load1_stmt_info, load2_stmt_info);
7681 if (!scan_operand_equal_p (gimple_assign_lhs (stmt1),
7682 gimple_assign_rhs1 (load1_stmt)))
7683 goto fail;
7685 tree var3 = NULL_TREE;
7686 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3
7687 && !scan_operand_equal_p (gimple_assign_lhs (stmt2),
7688 gimple_assign_rhs1 (load2_stmt)))
7689 goto fail;
7690 else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7692 dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
7693 if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR
7694 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0)))
7695 goto fail;
7696 var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
7697 if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3))
7698 || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3))
7699 || lookup_attribute ("omp simd inscan exclusive",
7700 DECL_ATTRIBUTES (var3)))
7701 goto fail;
7704 dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info);
7705 if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR
7706 || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0)))
7707 goto fail;
7709 tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7710 tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0);
7711 if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var1))
7712 || !lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var2))
7713 || (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7714 == (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var2))))
7715 goto fail;
7717 if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7718 std::swap (var1, var2);
7720 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7722 if (!lookup_attribute ("omp simd inscan exclusive",
7723 DECL_ATTRIBUTES (var1)))
7724 goto fail;
7725 var1 = var3;
7728 if (loop_vinfo->scan_map == NULL)
7729 goto fail;
7730 tree *init = loop_vinfo->scan_map->get (var1);
7731 if (init == NULL)
7732 goto fail;
7734 /* The IL is as expected, now check if we can actually vectorize it.
7735 Inclusive scan:
7736 _26 = D.2043[_25];
7737 _27 = D.2042[_25];
7738 _28 = _26 + _27;
7739 D.2043[_25] = _28;
7740 D.2042[_25] = _28;
7741 should be vectorized as (where _40 is the vectorized rhs
7742 from the D.2042[_21] = 0; store):
7743 _30 = MEM <vector(8) int> [(int *)&D.2043];
7744 _31 = MEM <vector(8) int> [(int *)&D.2042];
7745 _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7746 _33 = _31 + _32;
7747 // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] };
7748 _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
7749 _35 = _33 + _34;
7750 // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7751 // _31[1]+.._31[4], ... _31[4]+.._31[7] };
7752 _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
7753 _37 = _35 + _36;
7754 // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7755 // _31[0]+.._31[4], ... _31[0]+.._31[7] };
7756 _38 = _30 + _37;
7757 _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
7758 MEM <vector(8) int> [(int *)&D.2043] = _39;
7759 MEM <vector(8) int> [(int *)&D.2042] = _38;
7760 Exclusive scan:
7761 _26 = D.2043[_25];
7762 D.2044[_25] = _26;
7763 _27 = D.2042[_25];
7764 _28 = _26 + _27;
7765 D.2043[_25] = _28;
7766 should be vectorized as (where _40 is the vectorized rhs
7767 from the D.2042[_21] = 0; store):
7768 _30 = MEM <vector(8) int> [(int *)&D.2043];
7769 _31 = MEM <vector(8) int> [(int *)&D.2042];
7770 _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7771 _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7772 _34 = _32 + _33;
7773 // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3],
7774 // _31[3]+_31[4], ... _31[5]+.._31[6] };
7775 _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
7776 _36 = _34 + _35;
7777 // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7778 // _31[1]+.._31[4], ... _31[3]+.._31[6] };
7779 _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
7780 _38 = _36 + _37;
7781 // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7782 // _31[0]+.._31[4], ... _31[0]+.._31[6] };
7783 _39 = _30 + _38;
7784 _50 = _31 + _39;
7785 _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
7786 MEM <vector(8) int> [(int *)&D.2044] = _39;
7787 MEM <vector(8) int> [(int *)&D.2042] = _51; */
7788 enum machine_mode vec_mode = TYPE_MODE (vectype);
7789 optab optab = optab_for_tree_code (code, vectype, optab_default);
7790 if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7791 goto fail;
7793 int units_log2 = scan_store_can_perm_p (vectype, *init);
7794 if (units_log2 == -1)
7795 goto fail;
7797 return true;
7801 /* Function vectorizable_scan_store.
7803 Helper of vectorizable_store, arguments like on vectorizable_store.
7804 Handle only the transformation; the checking is done in check_scan_store. */
7806 static bool
7807 vectorizable_scan_store (vec_info *vinfo,
7808 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7809 gimple **vec_stmt, int ncopies)
7811 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7812 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
7813 tree ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
7814 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7816 if (dump_enabled_p ())
7817 dump_printf_loc (MSG_NOTE, vect_location,
7818 "transform scan store. ncopies = %d\n", ncopies);
7820 gimple *stmt = STMT_VINFO_STMT (stmt_info);
7821 tree rhs = gimple_assign_rhs1 (stmt);
7822 gcc_assert (TREE_CODE (rhs) == SSA_NAME);
7824 tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7825 bool inscan_var_store
7826 = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
7828 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7830 use_operand_p use_p;
7831 imm_use_iterator iter;
7832 FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7834 gimple *use_stmt = USE_STMT (use_p);
7835 if (use_stmt == stmt || is_gimple_debug (use_stmt))
7836 continue;
7837 rhs = gimple_assign_lhs (use_stmt);
7838 break;
7842 gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
7843 enum tree_code code = gimple_assign_rhs_code (def_stmt);
7844 if (code == POINTER_PLUS_EXPR)
7845 code = PLUS_EXPR;
7846 gcc_assert (TREE_CODE_LENGTH (code) == binary_op
7847 && commutative_tree_code (code));
7848 tree rhs1 = gimple_assign_rhs1 (def_stmt);
7849 tree rhs2 = gimple_assign_rhs2 (def_stmt);
7850 gcc_assert (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == SSA_NAME);
7851 gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
7852 gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
7853 stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
7854 stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
7855 dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
7856 dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
7857 tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
7858 tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
7860 if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7862 std::swap (rhs1, rhs2);
7863 std::swap (var1, var2);
7864 std::swap (load1_dr_info, load2_dr_info);
7867 tree *init = loop_vinfo->scan_map->get (var1);
7868 gcc_assert (init);
7870 unsigned HOST_WIDE_INT nunits;
7871 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
7872 gcc_unreachable ();
7873 auto_vec<enum scan_store_kind, 16> use_whole_vector;
7874 int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector);
7875 gcc_assert (units_log2 > 0);
7876 auto_vec<tree, 16> perms;
7877 perms.quick_grow (units_log2 + 1);
7878 tree zero_vec = NULL_TREE, masktype = NULL_TREE;
7879 for (int i = 0; i <= units_log2; ++i)
7881 unsigned HOST_WIDE_INT j, k;
7882 vec_perm_builder sel (nunits, nunits, 1);
7883 sel.quick_grow (nunits);
7884 if (i == units_log2)
7885 for (j = 0; j < nunits; ++j)
7886 sel[j] = nunits - 1;
7887 else
7889 for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
7890 sel[j] = j;
7891 for (k = 0; j < nunits; ++j, ++k)
7892 sel[j] = nunits + k;
7894 vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
7895 if (!use_whole_vector.is_empty ()
7896 && use_whole_vector[i] != scan_store_kind_perm)
7898 if (zero_vec == NULL_TREE)
7899 zero_vec = build_zero_cst (vectype);
7900 if (masktype == NULL_TREE
7901 && use_whole_vector[i] == scan_store_kind_lshift_cond)
7902 masktype = truth_type_for (vectype);
7903 perms[i] = vect_gen_perm_mask_any (vectype, indices);
7905 else
7906 perms[i] = vect_gen_perm_mask_checked (vectype, indices);
7909 tree vec_oprnd1 = NULL_TREE;
7910 tree vec_oprnd2 = NULL_TREE;
7911 tree vec_oprnd3 = NULL_TREE;
7912 tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
7913 tree dataref_offset = build_int_cst (ref_type, 0);
7914 tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
7915 vectype, VMAT_CONTIGUOUS);
7916 tree ldataref_ptr = NULL_TREE;
7917 tree orig = NULL_TREE;
7918 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7919 ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr);
7920 auto_vec<tree> vec_oprnds1;
7921 auto_vec<tree> vec_oprnds2;
7922 auto_vec<tree> vec_oprnds3;
7923 vect_get_vec_defs (vinfo, stmt_info, NULL, ncopies,
7924 *init, &vec_oprnds1,
7925 ldataref_ptr == NULL ? rhs1 : NULL, &vec_oprnds2,
7926 rhs2, &vec_oprnds3);
7927 for (int j = 0; j < ncopies; j++)
7929 vec_oprnd1 = vec_oprnds1[j];
7930 if (ldataref_ptr == NULL)
7931 vec_oprnd2 = vec_oprnds2[j];
7932 vec_oprnd3 = vec_oprnds3[j];
7933 if (j == 0)
7934 orig = vec_oprnd3;
7935 else if (!inscan_var_store)
7936 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
7938 if (ldataref_ptr)
7940 vec_oprnd2 = make_ssa_name (vectype);
7941 tree data_ref = fold_build2 (MEM_REF, vectype,
7942 unshare_expr (ldataref_ptr),
7943 dataref_offset);
7944 vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr));
7945 gimple *g = gimple_build_assign (vec_oprnd2, data_ref);
7946 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
7947 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
7948 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7951 tree v = vec_oprnd2;
7952 for (int i = 0; i < units_log2; ++i)
7954 tree new_temp = make_ssa_name (vectype);
7955 gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR,
7956 (zero_vec
7957 && (use_whole_vector[i]
7958 != scan_store_kind_perm))
7959 ? zero_vec : vec_oprnd1, v,
7960 perms[i]);
7961 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
7962 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
7963 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7965 if (zero_vec && use_whole_vector[i] == scan_store_kind_lshift_cond)
7967 /* The whole-vector shift shifted in zero bits, but if *init
7968 is not an all-zeros constant (initializer_zerop), we need to
7969 replace those elements with elements from vec_oprnd1. */
7970 tree_vector_builder vb (masktype, nunits, 1);
7971 for (unsigned HOST_WIDE_INT k = 0; k < nunits; ++k)
7972 vb.quick_push (k < (HOST_WIDE_INT_1U << i)
7973 ? boolean_false_node : boolean_true_node);
7975 tree new_temp2 = make_ssa_name (vectype);
7976 g = gimple_build_assign (new_temp2, VEC_COND_EXPR, vb.build (),
7977 new_temp, vec_oprnd1);
7978 vect_finish_stmt_generation (vinfo, stmt_info,
7979 g, gsi);
7980 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
7981 new_temp = new_temp2;
7984 /* For exclusive scan, perform the perms[i] permutation once
7985 more. */
7986 if (i == 0
7987 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4
7988 && v == vec_oprnd2)
7990 v = new_temp;
7991 --i;
7992 continue;
7995 tree new_temp2 = make_ssa_name (vectype);
7996 g = gimple_build_assign (new_temp2, code, v, new_temp);
7997 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
7998 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8000 v = new_temp2;
8003 tree new_temp = make_ssa_name (vectype);
8004 gimple *g = gimple_build_assign (new_temp, code, orig, v);
8005 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8006 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8008 tree last_perm_arg = new_temp;
8009 /* For exclusive scan, new_temp computed above is the exclusive scan
8010 prefix sum. Turn it into an inclusive prefix sum for the broadcast
8011 of the last element into orig. */
8012 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
8014 last_perm_arg = make_ssa_name (vectype);
8015 g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2);
8016 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8017 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8020 orig = make_ssa_name (vectype);
8021 g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg,
8022 last_perm_arg, perms[units_log2]);
8023 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8024 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8026 if (!inscan_var_store)
8028 tree data_ref = fold_build2 (MEM_REF, vectype,
8029 unshare_expr (dataref_ptr),
8030 dataref_offset);
8031 vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
8032 g = gimple_build_assign (data_ref, new_temp);
8033 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8034 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8038 if (inscan_var_store)
8039 for (int j = 0; j < ncopies; j++)
8041 if (j != 0)
8042 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
8044 tree data_ref = fold_build2 (MEM_REF, vectype,
8045 unshare_expr (dataref_ptr),
8046 dataref_offset);
8047 vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
8048 gimple *g = gimple_build_assign (data_ref, orig);
8049 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8050 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (g);
8052 return true;
8056 /* Function vectorizable_store.
8058 Check if STMT_INFO defines a non-scalar data-ref (array/pointer/structure)
8059 that can be vectorized.
8060 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
8061 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
8062 Return true if STMT_INFO is vectorizable in this way. */
8064 static bool
8065 vectorizable_store (vec_info *vinfo,
8066 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8067 gimple **vec_stmt, slp_tree slp_node,
8068 stmt_vector_for_cost *cost_vec)
8070 tree data_ref;
8071 tree vec_oprnd = NULL_TREE;
8072 tree elem_type;
8073 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8074 class loop *loop = NULL;
8075 machine_mode vec_mode;
8076 tree dummy;
8077 enum vect_def_type rhs_dt = vect_unknown_def_type;
8078 enum vect_def_type mask_dt = vect_unknown_def_type;
8079 tree dataref_ptr = NULL_TREE;
8080 tree dataref_offset = NULL_TREE;
8081 gimple *ptr_incr = NULL;
8082 int ncopies;
8083 int j;
8084 stmt_vec_info first_stmt_info;
8085 bool grouped_store;
8086 unsigned int group_size, i;
8087 bool slp = (slp_node != NULL);
8088 unsigned int vec_num;
8089 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
8090 tree aggr_type;
8091 gather_scatter_info gs_info;
8092 poly_uint64 vf;
8093 vec_load_store_type vls_type;
8094 tree ref_type;
8096 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
8097 return false;
8099 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8100 && ! vec_stmt)
8101 return false;
8103 /* Is vectorizable store? */
8105 tree mask = NULL_TREE, mask_vectype = NULL_TREE;
8106 slp_tree mask_node = NULL;
8107 if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
8109 tree scalar_dest = gimple_assign_lhs (assign);
8110 if (TREE_CODE (scalar_dest) == VIEW_CONVERT_EXPR
8111 && is_pattern_stmt_p (stmt_info))
8112 scalar_dest = TREE_OPERAND (scalar_dest, 0);
8113 if (TREE_CODE (scalar_dest) != ARRAY_REF
8114 && TREE_CODE (scalar_dest) != BIT_FIELD_REF
8115 && TREE_CODE (scalar_dest) != INDIRECT_REF
8116 && TREE_CODE (scalar_dest) != COMPONENT_REF
8117 && TREE_CODE (scalar_dest) != IMAGPART_EXPR
8118 && TREE_CODE (scalar_dest) != REALPART_EXPR
8119 && TREE_CODE (scalar_dest) != MEM_REF)
8120 return false;
8122 else
8124 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
8125 if (!call || !gimple_call_internal_p (call))
8126 return false;
8128 internal_fn ifn = gimple_call_internal_fn (call);
8129 if (!internal_store_fn_p (ifn))
8130 return false;
8132 int mask_index = internal_fn_mask_index (ifn);
8133 if (mask_index >= 0 && slp_node)
8134 mask_index = vect_slp_child_index_for_operand
8135 (call, mask_index, STMT_VINFO_GATHER_SCATTER_P (stmt_info));
8136 if (mask_index >= 0
8137 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
8138 &mask, &mask_node, &mask_dt,
8139 &mask_vectype))
8140 return false;
8143 /* Cannot have hybrid store SLP -- that would mean storing to the
8144 same location twice. */
8145 gcc_assert (slp == PURE_SLP_STMT (stmt_info));
8147 tree vectype = STMT_VINFO_VECTYPE (stmt_info), rhs_vectype = NULL_TREE;
8148 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8150 if (loop_vinfo)
8152 loop = LOOP_VINFO_LOOP (loop_vinfo);
8153 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8155 else
8156 vf = 1;
8158 /* Multiple types in SLP are handled by creating the appropriate number of
8159 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
8160 case of SLP. */
8161 if (slp)
8162 ncopies = 1;
8163 else
8164 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8166 gcc_assert (ncopies >= 1);
8168 /* FORNOW. This restriction should be relaxed. */
8169 if (loop
8170 && nested_in_vect_loop_p (loop, stmt_info)
8171 && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
8173 if (dump_enabled_p ())
8174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8175 "multiple types in nested loop.\n");
8176 return false;
8179 tree op;
8180 slp_tree op_node;
8181 if (!vect_check_store_rhs (vinfo, stmt_info, slp_node,
8182 &op, &op_node, &rhs_dt, &rhs_vectype, &vls_type))
8183 return false;
8185 elem_type = TREE_TYPE (vectype);
8186 vec_mode = TYPE_MODE (vectype);
8188 if (!STMT_VINFO_DATA_REF (stmt_info))
8189 return false;
8191 vect_memory_access_type memory_access_type;
8192 enum dr_alignment_support alignment_support_scheme;
8193 int misalignment;
8194 poly_int64 poffset;
8195 internal_fn lanes_ifn;
8196 if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
8197 ncopies, &memory_access_type, &poffset,
8198 &alignment_support_scheme, &misalignment, &gs_info,
8199 &lanes_ifn))
8200 return false;
8202 if (mask)
8204 if (memory_access_type == VMAT_CONTIGUOUS)
8206 if (!VECTOR_MODE_P (vec_mode)
8207 || !can_vec_mask_load_store_p (vec_mode,
8208 TYPE_MODE (mask_vectype), false))
8209 return false;
8211 else if (memory_access_type != VMAT_LOAD_STORE_LANES
8212 && (memory_access_type != VMAT_GATHER_SCATTER
8213 || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype))))
8215 if (dump_enabled_p ())
8216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8217 "unsupported access type for masked store.\n");
8218 return false;
8220 else if (memory_access_type == VMAT_GATHER_SCATTER
8221 && gs_info.ifn == IFN_LAST
8222 && !gs_info.decl)
8224 if (dump_enabled_p ())
8225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8226 "unsupported masked emulated scatter.\n");
8227 return false;
8230 else
8232 /* FORNOW. In some cases we can vectorize even if the data type is not
8233 supported (e.g. array initialization with 0). */
8234 if (optab_handler (mov_optab, vec_mode) == CODE_FOR_nothing)
8235 return false;
8238 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
8239 grouped_store = (STMT_VINFO_GROUPED_ACCESS (stmt_info)
8240 && memory_access_type != VMAT_GATHER_SCATTER
8241 && (slp || memory_access_type != VMAT_CONTIGUOUS));
8242 if (grouped_store)
8244 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8245 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
8246 group_size = DR_GROUP_SIZE (first_stmt_info);
8248 else
8250 first_stmt_info = stmt_info;
8251 first_dr_info = dr_info;
8252 group_size = vec_num = 1;
8255 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1 && !vec_stmt)
8257 if (!check_scan_store (vinfo, stmt_info, vectype, rhs_dt, slp, mask,
8258 memory_access_type))
8259 return false;
8262 bool costing_p = !vec_stmt;
8263 if (costing_p) /* transformation not required. */
8265 STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
8267 if (loop_vinfo
8268 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8269 check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
8270 vls_type, group_size,
8271 memory_access_type, &gs_info,
8272 mask);
8274 if (slp_node
8275 && (!vect_maybe_update_slp_op_vectype (op_node, vectype)
8276 || (mask
8277 && !vect_maybe_update_slp_op_vectype (mask_node,
8278 mask_vectype))))
8280 if (dump_enabled_p ())
8281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8282 "incompatible vector types for invariants\n");
8283 return false;
8286 if (dump_enabled_p ()
8287 && memory_access_type != VMAT_ELEMENTWISE
8288 && memory_access_type != VMAT_GATHER_SCATTER
8289 && alignment_support_scheme != dr_aligned)
8290 dump_printf_loc (MSG_NOTE, vect_location,
8291 "Vectorizing an unaligned access.\n");
8293 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
8295 /* As function vect_transform_stmt shows, for interleaving stores
8296 the whole chain is vectorized when the last store in the chain
8297 is reached; the other stores in the group are skipped. So we
8298 only want to cost the last one here, but since it is not trivial
8299 to get hold of the last one and costing the first one is
8300 equivalent, use the first one instead. */
8301 if (grouped_store
8302 && !slp
8303 && first_stmt_info != stmt_info)
8304 return true;
8306 gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
8308 /* Transform. */
8310 ensure_base_align (dr_info);
8312 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
8314 gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
8315 gcc_assert (!slp);
8316 if (costing_p)
8318 unsigned int inside_cost = 0, prologue_cost = 0;
8319 if (vls_type == VLS_STORE_INVARIANT)
8320 prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
8321 stmt_info, 0, vect_prologue);
8322 vect_get_store_cost (vinfo, stmt_info, ncopies,
8323 alignment_support_scheme, misalignment,
8324 &inside_cost, cost_vec);
8326 if (dump_enabled_p ())
8327 dump_printf_loc (MSG_NOTE, vect_location,
8328 "vect_model_store_cost: inside_cost = %d, "
8329 "prologue_cost = %d .\n",
8330 inside_cost, prologue_cost);
8332 return true;
8334 return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, ncopies);
8337 if (grouped_store)
8339 /* FORNOW */
8340 gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt_info));
8342 if (slp)
8344 grouped_store = false;
8345 /* VEC_NUM is the number of vect stmts to be created for this
8346 group. */
8347 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8348 first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
8349 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_stmt_info)
8350 == first_stmt_info);
8351 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
8352 op = vect_get_store_rhs (first_stmt_info);
8354 else
8355 /* VEC_NUM is the number of vect stmts to be created for this
8356 group. */
8357 vec_num = group_size;
8359 ref_type = get_group_alias_ptr_type (first_stmt_info);
8361 else
8362 ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr));
8364 if (!costing_p && dump_enabled_p ())
8365 dump_printf_loc (MSG_NOTE, vect_location, "transform store. ncopies = %d\n",
8366 ncopies);
8368 /* Check if we need to update the prologue cost for an invariant,
8369 and update it accordingly if so. If it's not for an
8370 interleaving store, we can just check vls_type; but if it's
8371 for an interleaving store, we need to check the def_type of
8372 the stored value since the current vls_type is only for
8373 first_stmt_info. */
8374 auto update_prologue_cost = [&](unsigned *prologue_cost, tree store_rhs)
8376 gcc_assert (costing_p);
8377 if (slp)
8378 return;
8379 if (grouped_store)
8381 gcc_assert (store_rhs);
8382 enum vect_def_type cdt;
8383 gcc_assert (vect_is_simple_use (store_rhs, vinfo, &cdt));
8384 if (cdt != vect_constant_def && cdt != vect_external_def)
8385 return;
8387 else if (vls_type != VLS_STORE_INVARIANT)
8388 return;
8389 *prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
8390 0, vect_prologue);
8393 if (memory_access_type == VMAT_ELEMENTWISE
8394 || memory_access_type == VMAT_STRIDED_SLP)
8396 unsigned inside_cost = 0, prologue_cost = 0;
8397 gimple_stmt_iterator incr_gsi;
8398 bool insert_after;
8399 gimple *incr;
8400 tree offvar;
8401 tree ivstep;
8402 tree running_off;
8403 tree stride_base, stride_step, alias_off;
8404 tree vec_oprnd = NULL_TREE;
8405 tree dr_offset;
8406 unsigned int g;
8407 /* Checked by get_load_store_type. */
8408 unsigned int const_nunits = nunits.to_constant ();
8410 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8411 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
8413 dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
8414 stride_base
8415 = fold_build_pointer_plus
8416 (DR_BASE_ADDRESS (first_dr_info->dr),
8417 size_binop (PLUS_EXPR,
8418 convert_to_ptrofftype (dr_offset),
8419 convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
8420 stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
8422 /* For a store with loop-invariant (but other than power-of-2)
8423 stride (i.e. not a grouped access) like so:
8425 for (i = 0; i < n; i += stride)
8426 array[i] = ...;
8428 we generate a new induction variable and new stores from
8429 the components of the (vectorized) rhs:
8431 for (j = 0; ; j += VF*stride)
8432 vectemp = ...;
8433 tmp1 = vectemp[0];
8434 array[j] = tmp1;
8435 tmp2 = vectemp[1];
8436 array[j + stride] = tmp2;
8440 unsigned nstores = const_nunits;
8441 unsigned lnel = 1;
8442 tree ltype = elem_type;
8443 tree lvectype = vectype;
8444 if (slp)
8446 HOST_WIDE_INT n = gcd (group_size, const_nunits);
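/* N is the largest count that divides both the group size and the number
   of vector lanes, so each store generated below can write N consecutive
   elements at once.  */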
8447 if (n == const_nunits)
8449 int mis_align = dr_misalignment (first_dr_info, vectype);
8450 dr_alignment_support dr_align
8451 = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
8452 mis_align);
8453 if (dr_align == dr_aligned
8454 || dr_align == dr_unaligned_supported)
8456 nstores = 1;
8457 lnel = const_nunits;
8458 ltype = vectype;
8459 lvectype = vectype;
8460 alignment_support_scheme = dr_align;
8461 misalignment = mis_align;
8464 else if (n > 1)
8466 nstores = const_nunits / n;
8467 lnel = n;
8468 ltype = build_vector_type (elem_type, n);
8469 lvectype = vectype;
8471 /* First check whether the vec_extract optab supports extraction
8472 of the vector elts directly; if not, try the fallback below. */
8473 scalar_mode elmode = SCALAR_TYPE_MODE (elem_type);
8474 machine_mode vmode;
8475 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
8476 || !related_vector_mode (TYPE_MODE (vectype), elmode,
8477 n).exists (&vmode)
8478 || (convert_optab_handler (vec_extract_optab,
8479 TYPE_MODE (vectype), vmode)
8480 == CODE_FOR_nothing))
8482 /* Try to avoid emitting an extract of vector elements
8483 by performing the extracts using an integer type of the
8484 same size, extracting from the source vector re-interpreted
8485 as a vector of those integers, if the target supports
8486 that.
8487 unsigned lsize
8488 = n * GET_MODE_BITSIZE (elmode);
8489 unsigned int lnunits = const_nunits / n;
8490 /* If we can't construct such a vector fall back to
8491 element extracts from the original vector type and
8492 element size stores. */
8493 if (int_mode_for_size (lsize, 0).exists (&elmode)
8494 && VECTOR_MODE_P (TYPE_MODE (vectype))
8495 && related_vector_mode (TYPE_MODE (vectype), elmode,
8496 lnunits).exists (&vmode)
8497 && (convert_optab_handler (vec_extract_optab,
8498 vmode, elmode)
8499 != CODE_FOR_nothing))
8501 nstores = lnunits;
8502 lnel = n;
8503 ltype = build_nonstandard_integer_type (lsize, 1);
8504 lvectype = build_vector_type (ltype, nstores);
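/* E.g. when storing groups of four QImode elements out of a V16QI vector
   and V4QI extraction is not supported, this extracts and stores four
   SImode chunks from a V4SI view of the vector instead.  */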
8506 /* Else fall back to vector extraction anyway.
8507 Fewer stores are more important than avoiding spilling
8508 of the vector we extract from. Compared to the
8509 construction case in vectorizable_load no store-forwarding
8510 issue exists here for reasonable archs. */
8513 ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
8514 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8517 if (!costing_p)
8519 ivstep = stride_step;
8520 ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
8521 build_int_cst (TREE_TYPE (ivstep), vf));
8523 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
8525 stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
8526 ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
8527 create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi,
8528 insert_after, &offvar, NULL);
8529 incr = gsi_stmt (incr_gsi);
8531 stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
8534 alias_off = build_int_cst (ref_type, 0);
8535 stmt_vec_info next_stmt_info = first_stmt_info;
8536 auto_vec<tree> vec_oprnds;
8537 /* When costing several adjacent vector stores, we'd like to cost them
8538 once with their total number instead of costing each one by one. */
8539 unsigned int n_adjacent_stores = 0;
8540 for (g = 0; g < group_size; g++)
8542 running_off = offvar;
8543 if (!costing_p)
8545 if (g)
8547 tree size = TYPE_SIZE_UNIT (ltype);
8548 tree pos
8549 = fold_build2 (MULT_EXPR, sizetype, size_int (g), size);
8550 tree newoff = copy_ssa_name (running_off, NULL);
8551 incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
8552 running_off, pos);
8553 vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
8554 running_off = newoff;
8557 if (!slp)
8558 op = vect_get_store_rhs (next_stmt_info);
8559 if (!costing_p)
8560 vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op,
8561 &vec_oprnds);
8562 else
8563 update_prologue_cost (&prologue_cost, op);
8564 unsigned int group_el = 0;
8565 unsigned HOST_WIDE_INT
8566 elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
8567 for (j = 0; j < ncopies; j++)
8569 if (!costing_p)
8571 vec_oprnd = vec_oprnds[j];
8572 /* Pun the vector to extract from if necessary. */
8573 if (lvectype != vectype)
8575 tree tem = make_ssa_name (lvectype);
8576 tree cvt
8577 = build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd);
8578 gimple *pun = gimple_build_assign (tem, cvt);
8579 vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
8580 vec_oprnd = tem;
8583 for (i = 0; i < nstores; i++)
8585 if (costing_p)
8587 /* We only need vector extraction when there is more
8588 than one store. */
8589 if (nstores > 1)
8590 inside_cost
8591 += record_stmt_cost (cost_vec, 1, vec_to_scalar,
8592 stmt_info, 0, vect_body);
8593 /* Treat a single-lane vector type store as a scalar
8594 store to avoid an ICE like PR110776. */
8595 if (VECTOR_TYPE_P (ltype)
8596 && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
8597 n_adjacent_stores++;
8598 else
8599 inside_cost
8600 += record_stmt_cost (cost_vec, 1, scalar_store,
8601 stmt_info, 0, vect_body);
8602 continue;
8604 tree newref, newoff;
8605 gimple *incr, *assign;
8606 tree size = TYPE_SIZE (ltype);
8607 /* Extract the i'th component. */
8608 tree pos = fold_build2 (MULT_EXPR, bitsizetype,
8609 bitsize_int (i), size);
8610 tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
8611 size, pos);
8613 elem = force_gimple_operand_gsi (gsi, elem, true,
8614 NULL_TREE, true,
8615 GSI_SAME_STMT);
8617 tree this_off = build_int_cst (TREE_TYPE (alias_off),
8618 group_el * elsz);
8619 newref = build2 (MEM_REF, ltype,
8620 running_off, this_off);
8621 vect_copy_ref_info (newref, DR_REF (first_dr_info->dr));
8623 /* And store it to *running_off. */
8624 assign = gimple_build_assign (newref, elem);
8625 vect_finish_stmt_generation (vinfo, stmt_info, assign, gsi);
8627 group_el += lnel;
8628 if (! slp
8629 || group_el == group_size)
8631 newoff = copy_ssa_name (running_off, NULL);
8632 incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
8633 running_off, stride_step);
8634 vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
8636 running_off = newoff;
8637 group_el = 0;
8639 if (g == group_size - 1
8640 && !slp)
8642 if (j == 0 && i == 0)
8643 *vec_stmt = assign;
8644 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (assign);
8648 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
8649 vec_oprnds.truncate(0);
8650 if (slp)
8651 break;
8654 if (costing_p)
8656 if (n_adjacent_stores > 0)
8657 vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
8658 alignment_support_scheme, misalignment,
8659 &inside_cost, cost_vec);
8660 if (dump_enabled_p ())
8661 dump_printf_loc (MSG_NOTE, vect_location,
8662 "vect_model_store_cost: inside_cost = %d, "
8663 "prologue_cost = %d .\n",
8664 inside_cost, prologue_cost);
8667 return true;
8670 gcc_assert (alignment_support_scheme);
8671 vec_loop_masks *loop_masks
8672 = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8673 ? &LOOP_VINFO_MASKS (loop_vinfo)
8674 : NULL);
8675 vec_loop_lens *loop_lens
8676 = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
8677 ? &LOOP_VINFO_LENS (loop_vinfo)
8678 : NULL);
8680 /* Both vect_transform_stmt and vect_analyze_stmt reach this point,
8681 but with one difference: we cannot enable both lens and masks
8682 during the transform, although that is allowed during analysis.
8683 We shouldn't use the length-based approach if fully masked. */
8684 if (cost_vec == NULL)
8685 /* The cost_vec is NULL during the transform. */
8686 gcc_assert ((!loop_lens || !loop_masks));
8688 /* Targets with store-lane instructions must not require explicit
8689 realignment. vect_supportable_dr_alignment always returns either
8690 dr_aligned or dr_unaligned_supported for masked operations. */
8691 gcc_assert ((memory_access_type != VMAT_LOAD_STORE_LANES
8692 && !mask
8693 && !loop_masks)
8694 || alignment_support_scheme == dr_aligned
8695 || alignment_support_scheme == dr_unaligned_supported);
8697 tree offset = NULL_TREE;
8698 if (!known_eq (poffset, 0))
8699 offset = size_int (poffset);
8701 tree bump;
8702 tree vec_offset = NULL_TREE;
8703 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
8705 aggr_type = NULL_TREE;
8706 bump = NULL_TREE;
8708 else if (memory_access_type == VMAT_GATHER_SCATTER)
8710 aggr_type = elem_type;
8711 if (!costing_p)
8712 vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
8713 &bump, &vec_offset, loop_lens);
8715 else
8717 if (memory_access_type == VMAT_LOAD_STORE_LANES)
8718 aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
8719 else
8720 aggr_type = vectype;
8721 bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
8722 memory_access_type, loop_lens);
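/* For VMAT_LOAD_STORE_LANES the aggregate type chosen above covers the
   whole group of VEC_NUM vectors, so a single pointer bump steps over all
   of them; otherwise the pointer is bumped one vector at a time.  */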
8725 if (mask && !costing_p)
8726 LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
8728 /* In case the vectorization factor (VF) is bigger than the number
8729 of elements that we can fit in a vectype (nunits), we have to generate
8730 more than one vector stmt, i.e., we need to "unroll" the
8731 vector stmt by a factor VF/nunits. */
8733 /* In case of interleaving (non-unit grouped access):
8735 S1: &base + 2 = x2
8736 S2: &base = x0
8737 S3: &base + 1 = x1
8738 S4: &base + 3 = x3
8740 We create vectorized stores starting from base address (the access of the
8741 first stmt in the chain (S2 in the above example), when the last store stmt
8742 of the chain (S4) is reached:
8744 VS1: &base = vx2
8745 VS2: &base + vec_size*1 = vx0
8746 VS3: &base + vec_size*2 = vx1
8747 VS4: &base + vec_size*3 = vx3
8749 Then permutation statements are generated:
8751 VS5: vx5 = VEC_PERM_EXPR < vx0, vx3, {0, 8, 1, 9, 2, 10, 3, 11} >
8752 VS6: vx6 = VEC_PERM_EXPR < vx0, vx3, {4, 12, 5, 13, 6, 14, 7, 15} >
8755 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
8756 (the order of the data-refs in the output of vect_permute_store_chain
8757 corresponds to the order of scalar stmts in the interleaving chain - see
8758 the documentation of vect_permute_store_chain()).
8760 In case of both multiple types and interleaving, above vector stores and
8761 permutation stmts are created for every copy. The result vector stmts are
8762 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
8763 STMT_VINFO_RELATED_STMT for the next copies.
8766 auto_vec<tree> dr_chain (group_size);
8767 auto_vec<tree> vec_masks;
8768 tree vec_mask = NULL;
8769 auto_delete_vec<auto_vec<tree>> gvec_oprnds (group_size);
8770 for (i = 0; i < group_size; i++)
8771 gvec_oprnds.quick_push (new auto_vec<tree> ());
8773 if (memory_access_type == VMAT_LOAD_STORE_LANES)
8775 gcc_assert (!slp && grouped_store);
8776 unsigned inside_cost = 0, prologue_cost = 0;
8777 /* For costing some adjacent vector stores, we'd like to cost them
8778 once with their total number instead of costing each one by one. */
8779 unsigned int n_adjacent_stores = 0;
8780 for (j = 0; j < ncopies; j++)
8782 gimple *new_stmt;
8783 if (j == 0)
8785 /* For interleaved stores we collect vectorized defs for all
8786 the stores in the group in DR_CHAIN. DR_CHAIN is then used
8787 as an input to vect_permute_store_chain(). */
8788 stmt_vec_info next_stmt_info = first_stmt_info;
8789 for (i = 0; i < group_size; i++)
8791 /* Since gaps are not supported for interleaved stores,
8792 DR_GROUP_SIZE is the exact number of stmts in the
8793 chain. Therefore, NEXT_STMT_INFO can't be NULL. */
8794 op = vect_get_store_rhs (next_stmt_info);
8795 if (costing_p)
8796 update_prologue_cost (&prologue_cost, op);
8797 else
8799 vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
8800 ncopies, op,
8801 gvec_oprnds[i]);
8802 vec_oprnd = (*gvec_oprnds[i])[0];
8803 dr_chain.quick_push (vec_oprnd);
8805 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
8808 if (!costing_p)
8810 if (mask)
8812 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
8813 mask, &vec_masks,
8814 mask_vectype);
8815 vec_mask = vec_masks[0];
8818 /* We should have caught mismatched types earlier. */
8819 gcc_assert (
8820 useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
8821 dataref_ptr
8822 = vect_create_data_ref_ptr (vinfo, first_stmt_info,
8823 aggr_type, NULL, offset, &dummy,
8824 gsi, &ptr_incr, false, bump);
8827 else if (!costing_p)
8829 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
8830 /* DR_CHAIN is then used as an input to
8831 vect_permute_store_chain(). */
8832 for (i = 0; i < group_size; i++)
8834 vec_oprnd = (*gvec_oprnds[i])[j];
8835 dr_chain[i] = vec_oprnd;
8837 if (mask)
8838 vec_mask = vec_masks[j];
8839 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
8840 stmt_info, bump);
8843 if (costing_p)
8845 n_adjacent_stores += vec_num;
8846 continue;
8849 /* Get an array into which we can store the individual vectors. */
8850 tree vec_array = create_vector_array (vectype, vec_num);
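/* VEC_ARRAY holds VEC_NUM vectors of VECTYPE and becomes the last
   argument of the *STORE_LANES internal function calls below. */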
8852 /* Invalidate the current contents of VEC_ARRAY. This should
8853 become an RTL clobber too, which prevents the vector registers
8854 from being upward-exposed. */
8855 vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
8857 /* Store the individual vectors into the array. */
8858 for (i = 0; i < vec_num; i++)
8860 vec_oprnd = dr_chain[i];
8861 write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
8865 tree final_mask = NULL;
8866 tree final_len = NULL;
8867 tree bias = NULL;
8868 if (loop_masks)
8869 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
8870 ncopies, vectype, j);
8871 if (vec_mask)
8872 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
8873 vec_mask, gsi);
8875 if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
8877 if (loop_lens)
8878 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
8879 ncopies, vectype, j, 1);
8880 else
8881 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
8882 signed char biasval
8883 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
8884 bias = build_int_cst (intQI_type_node, biasval);
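/* BIASVAL is the target's partial load/store bias, either 0 or -1. */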
8885 if (!final_mask)
8887 mask_vectype = truth_type_for (vectype);
8888 final_mask = build_minus_one_cst (mask_vectype);
8892 gcall *call;
8893 if (final_len && final_mask)
8895 /* Emit:
8896 MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
8897 LEN, BIAS, VEC_ARRAY). */
8898 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
8899 tree alias_ptr = build_int_cst (ref_type, align);
8900 call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
8901 dataref_ptr, alias_ptr,
8902 final_mask, final_len, bias,
8903 vec_array);
8905 else if (final_mask)
8907 /* Emit:
8908 MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
8909 VEC_ARRAY). */
8910 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
8911 tree alias_ptr = build_int_cst (ref_type, align);
8912 call = gimple_build_call_internal (IFN_MASK_STORE_LANES, 4,
8913 dataref_ptr, alias_ptr,
8914 final_mask, vec_array);
8916 else
8918 /* Emit:
8919 MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). */
8920 data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
8921 call = gimple_build_call_internal (IFN_STORE_LANES, 1, vec_array);
8922 gimple_call_set_lhs (call, data_ref);
8924 gimple_call_set_nothrow (call, true);
8925 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
8926 new_stmt = call;
8928 /* Record that VEC_ARRAY is now dead. */
8929 vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
8930 if (j == 0)
8931 *vec_stmt = new_stmt;
8932 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8935 if (costing_p)
8937 if (n_adjacent_stores > 0)
8938 vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
8939 alignment_support_scheme, misalignment,
8940 &inside_cost, cost_vec);
8941 if (dump_enabled_p ())
8942 dump_printf_loc (MSG_NOTE, vect_location,
8943 "vect_model_store_cost: inside_cost = %d, "
8944 "prologue_cost = %d .\n",
8945 inside_cost, prologue_cost);
8948 return true;
8951 if (memory_access_type == VMAT_GATHER_SCATTER)
8953 gcc_assert (!grouped_store);
8954 auto_vec<tree> vec_offsets;
8955 unsigned int inside_cost = 0, prologue_cost = 0;
8956 for (j = 0; j < ncopies; j++)
8958 gimple *new_stmt;
8959 if (j == 0)
8961 if (costing_p && vls_type == VLS_STORE_INVARIANT)
8962 prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
8963 stmt_info, 0, vect_prologue);
8964 else if (!costing_p)
8966 /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
8967 DR_CHAIN is of size 1. */
8968 gcc_assert (group_size == 1);
8969 if (slp_node)
8970 vect_get_slp_defs (op_node, gvec_oprnds[0]);
8971 else
8972 vect_get_vec_defs_for_operand (vinfo, first_stmt_info,
8973 ncopies, op, gvec_oprnds[0]);
8974 if (mask)
8976 if (slp_node)
8977 vect_get_slp_defs (mask_node, &vec_masks);
8978 else
8979 vect_get_vec_defs_for_operand (vinfo, stmt_info,
8980 ncopies,
8981 mask, &vec_masks,
8982 mask_vectype);
8985 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
8986 vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
8987 slp_node, &gs_info,
8988 &dataref_ptr, &vec_offsets);
8989 else
8990 dataref_ptr
8991 = vect_create_data_ref_ptr (vinfo, first_stmt_info,
8992 aggr_type, NULL, offset,
8993 &dummy, gsi, &ptr_incr, false,
8994 bump);
8997 else if (!costing_p)
8999 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
9000 if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
9001 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
9002 gsi, stmt_info, bump);
9005 new_stmt = NULL;
9006 for (i = 0; i < vec_num; ++i)
9008 if (!costing_p)
9010 vec_oprnd = (*gvec_oprnds[0])[vec_num * j + i];
9011 if (mask)
9012 vec_mask = vec_masks[vec_num * j + i];
9013 /* We should have caught mismatched types earlier. */
9014 gcc_assert (useless_type_conversion_p (vectype,
9015 TREE_TYPE (vec_oprnd)));
9017 unsigned HOST_WIDE_INT align;
9018 tree final_mask = NULL_TREE;
9019 tree final_len = NULL_TREE;
9020 tree bias = NULL_TREE;
9021 if (!costing_p)
9023 if (loop_masks)
9024 final_mask = vect_get_loop_mask (loop_vinfo, gsi,
9025 loop_masks, ncopies,
9026 vectype, j);
9027 if (vec_mask)
9028 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
9029 final_mask, vec_mask, gsi);
9032 if (gs_info.ifn != IFN_LAST)
9034 if (costing_p)
9036 unsigned int cnunits = vect_nunits_for_cost (vectype);
9037 inside_cost
9038 += record_stmt_cost (cost_vec, cnunits, scalar_store,
9039 stmt_info, 0, vect_body);
9040 continue;
9043 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
9044 vec_offset = vec_offsets[vec_num * j + i];
9045 tree scale = size_int (gs_info.scale);
9047 if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
9049 if (loop_lens)
9050 final_len = vect_get_loop_len (loop_vinfo, gsi,
9051 loop_lens, ncopies,
9052 vectype, j, 1);
9053 else
9054 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
9055 signed char biasval
9056 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9057 bias = build_int_cst (intQI_type_node, biasval);
9058 if (!final_mask)
9060 mask_vectype = truth_type_for (vectype);
9061 final_mask = build_minus_one_cst (mask_vectype);
9065 gcall *call;
9066 if (final_len && final_mask)
9067 call = gimple_build_call_internal
9068 (IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
9069 vec_offset, scale, vec_oprnd, final_mask,
9070 final_len, bias);
9071 else if (final_mask)
9072 call = gimple_build_call_internal
9073 (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
9074 vec_offset, scale, vec_oprnd, final_mask);
9075 else
9076 call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
9077 dataref_ptr, vec_offset,
9078 scale, vec_oprnd);
9079 gimple_call_set_nothrow (call, true);
9080 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9081 new_stmt = call;
9083 else if (gs_info.decl)
9085 /* The builtin decls path for scatter is legacy, x86 only. */
9086 gcc_assert (nunits.is_constant ()
9087 && (!final_mask
9088 || SCALAR_INT_MODE_P
9089 (TYPE_MODE (TREE_TYPE (final_mask)))));
9090 if (costing_p)
9092 unsigned int cnunits = vect_nunits_for_cost (vectype);
9093 inside_cost
9094 += record_stmt_cost (cost_vec, cnunits, scalar_store,
9095 stmt_info, 0, vect_body);
9096 continue;
9098 poly_uint64 offset_nunits
9099 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
9100 if (known_eq (nunits, offset_nunits))
9102 new_stmt = vect_build_one_scatter_store_call
9103 (vinfo, stmt_info, gsi, &gs_info,
9104 dataref_ptr, vec_offsets[vec_num * j + i],
9105 vec_oprnd, final_mask);
9106 vect_finish_stmt_generation (vinfo, stmt_info,
9107 new_stmt, gsi);
9109 else if (known_eq (nunits, offset_nunits * 2))
9111 /* We have an offset vector with half the number of
9112 lanes, but the builtins will store full vectype
9113 data from the lower lanes. */
9114 new_stmt = vect_build_one_scatter_store_call
9115 (vinfo, stmt_info, gsi, &gs_info,
9116 dataref_ptr,
9117 vec_offsets[2 * vec_num * j + 2 * i],
9118 vec_oprnd, final_mask);
9119 vect_finish_stmt_generation (vinfo, stmt_info,
9120 new_stmt, gsi);
9121 int count = nunits.to_constant ();
9122 vec_perm_builder sel (count, count, 1);
9123 sel.quick_grow (count);
9124 for (int i = 0; i < count; ++i)
9125 sel[i] = i | (count / 2);
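/* E.g. for COUNT == 8 the selector is {4,5,6,7,4,5,6,7}: it moves the
   upper half of VEC_OPRND into the lower lanes so that the second
   builtin call below can store it. */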
9126 vec_perm_indices indices (sel, 2, count);
9127 tree perm_mask
9128 = vect_gen_perm_mask_checked (vectype, indices);
9129 new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
9130 vec_oprnd, vec_oprnd,
9131 perm_mask);
9132 vec_oprnd = make_ssa_name (vectype);
9133 gimple_set_lhs (new_stmt, vec_oprnd);
9134 vect_finish_stmt_generation (vinfo, stmt_info,
9135 new_stmt, gsi);
9136 if (final_mask)
9138 new_stmt = gimple_build_assign (NULL_TREE,
9139 VEC_UNPACK_HI_EXPR,
9140 final_mask);
9141 final_mask = make_ssa_name
9142 (truth_type_for (gs_info.offset_vectype));
9143 gimple_set_lhs (new_stmt, final_mask);
9144 vect_finish_stmt_generation (vinfo, stmt_info,
9145 new_stmt, gsi);
9147 new_stmt = vect_build_one_scatter_store_call
9148 (vinfo, stmt_info, gsi, &gs_info,
9149 dataref_ptr,
9150 vec_offsets[2 * vec_num * j + 2 * i + 1],
9151 vec_oprnd, final_mask);
9152 vect_finish_stmt_generation (vinfo, stmt_info,
9153 new_stmt, gsi);
9155 else if (known_eq (nunits * 2, offset_nunits))
9157 /* We have an offset vector with double the number of
9158 lanes. Select the low/high part accordingly. */
9159 vec_offset = vec_offsets[(vec_num * j + i) / 2];
9160 if ((vec_num * j + i) & 1)
9162 int count = offset_nunits.to_constant ();
9163 vec_perm_builder sel (count, count, 1);
9164 sel.quick_grow (count);
9165 for (int i = 0; i < count; ++i)
9166 sel[i] = i | (count / 2);
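/* E.g. for COUNT == 8 the selector is {4,5,6,7,4,5,6,7}, selecting the
   upper half of VEC_OFFSET into its lower lanes for the odd data
   vectors. */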
9167 vec_perm_indices indices (sel, 2, count);
9168 tree perm_mask = vect_gen_perm_mask_checked
9169 (TREE_TYPE (vec_offset), indices);
9170 new_stmt = gimple_build_assign (NULL_TREE,
9171 VEC_PERM_EXPR,
9172 vec_offset,
9173 vec_offset,
9174 perm_mask);
9175 vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
9176 gimple_set_lhs (new_stmt, vec_offset);
9177 vect_finish_stmt_generation (vinfo, stmt_info,
9178 new_stmt, gsi);
9180 new_stmt = vect_build_one_scatter_store_call
9181 (vinfo, stmt_info, gsi, &gs_info,
9182 dataref_ptr, vec_offset,
9183 vec_oprnd, final_mask);
9184 vect_finish_stmt_generation (vinfo, stmt_info,
9185 new_stmt, gsi);
9187 else
9188 gcc_unreachable ();
9190 else
9192 /* Emulated scatter. */
9193 gcc_assert (!final_mask);
9194 if (costing_p)
9196 unsigned int cnunits = vect_nunits_for_cost (vectype);
9197 /* For emulated scatter N offset vector element extracts
9198 (we assume the scalar scaling and ptr + offset add are
9199 consumed by the store). */
9200 inside_cost
9201 += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
9202 stmt_info, 0, vect_body);
9203 /* N scalar stores plus extracting the elements. */
9204 inside_cost
9205 += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
9206 stmt_info, 0, vect_body);
9207 inside_cost
9208 += record_stmt_cost (cost_vec, cnunits, scalar_store,
9209 stmt_info, 0, vect_body);
9210 continue;
9213 unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
9214 unsigned HOST_WIDE_INT const_offset_nunits
9215 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
9216 vec<constructor_elt, va_gc> *ctor_elts;
9217 vec_alloc (ctor_elts, const_nunits);
9218 gimple_seq stmts = NULL;
9219 tree elt_type = TREE_TYPE (vectype);
9220 unsigned HOST_WIDE_INT elt_size
9221 = tree_to_uhwi (TYPE_SIZE (elt_type));
9222 /* We support offset vectors with more elements
9223 than the data vector for now. */
9224 unsigned HOST_WIDE_INT factor
9225 = const_offset_nunits / const_nunits;
9226 vec_offset = vec_offsets[(vec_num * j + i) / factor];
9227 unsigned elt_offset
9228 = ((vec_num * j + i) % factor) * const_nunits;
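/* E.g. with CONST_NUNITS == 2 and CONST_OFFSET_NUNITS == 4, FACTOR is 2:
   data vectors 0 and 1 both use offset vector 0, reading its lanes 0-1
   resp. 2-3. */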
9229 tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
9230 tree scale = size_int (gs_info.scale);
9231 align = get_object_alignment (DR_REF (first_dr_info->dr));
9232 tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
9233 for (unsigned k = 0; k < const_nunits; ++k)
9235 /* Compute the offsetted pointer. */
9236 tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
9237 bitsize_int (k + elt_offset));
9238 tree idx
9239 = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
9240 vec_offset, TYPE_SIZE (idx_type), boff);
9241 idx = gimple_convert (&stmts, sizetype, idx);
9242 idx = gimple_build (&stmts, MULT_EXPR, sizetype,
9243 idx, scale);
9244 tree ptr
9245 = gimple_build (&stmts, PLUS_EXPR,
9246 TREE_TYPE (dataref_ptr),
9247 dataref_ptr, idx);
9248 ptr = gimple_convert (&stmts, ptr_type_node, ptr);
9249 /* Extract the element to be stored. */
9250 tree elt
9251 = gimple_build (&stmts, BIT_FIELD_REF,
9252 TREE_TYPE (vectype),
9253 vec_oprnd, TYPE_SIZE (elt_type),
9254 bitsize_int (k * elt_size));
9255 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
9256 stmts = NULL;
9257 tree ref
9258 = build2 (MEM_REF, ltype, ptr,
9259 build_int_cst (ref_type, 0));
9260 new_stmt = gimple_build_assign (ref, elt);
9261 vect_finish_stmt_generation (vinfo, stmt_info,
9262 new_stmt, gsi);
9264 if (slp)
9265 slp_node->push_vec_def (new_stmt);
9268 if (!slp && !costing_p)
9269 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9272 if (!slp && !costing_p)
9273 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9275 if (costing_p && dump_enabled_p ())
9276 dump_printf_loc (MSG_NOTE, vect_location,
9277 "vect_model_store_cost: inside_cost = %d, "
9278 "prologue_cost = %d .\n",
9279 inside_cost, prologue_cost);
9281 return true;
9284 gcc_assert (memory_access_type == VMAT_CONTIGUOUS
9285 || memory_access_type == VMAT_CONTIGUOUS_DOWN
9286 || memory_access_type == VMAT_CONTIGUOUS_PERMUTE
9287 || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
9289 unsigned inside_cost = 0, prologue_cost = 0;
9290 /* For costing some adjacent vector stores, we'd like to cost them
9291 once with their total number instead of costing each one by one. */
9292 unsigned int n_adjacent_stores = 0;
9293 auto_vec<tree> result_chain (group_size);
9294 auto_vec<tree, 1> vec_oprnds;
9295 for (j = 0; j < ncopies; j++)
9297 gimple *new_stmt;
9298 if (j == 0)
9300 if (slp && !costing_p)
9302 /* Get vectorized arguments for SLP_NODE. */
9303 vect_get_vec_defs (vinfo, stmt_info, slp_node, 1, op,
9304 &vec_oprnds, mask, &vec_masks);
9305 vec_oprnd = vec_oprnds[0];
9306 if (mask)
9307 vec_mask = vec_masks[0];
9309 else
9311 /* For interleaved stores we collect vectorized defs for all the
9312 stores in the group in DR_CHAIN. DR_CHAIN is then used as an
9313 input to vect_permute_store_chain().
9315 If the store is not grouped, DR_GROUP_SIZE is 1, and DR_CHAIN
9316 is of size 1. */
9317 stmt_vec_info next_stmt_info = first_stmt_info;
9318 for (i = 0; i < group_size; i++)
9320 /* Since gaps are not supported for interleaved stores,
9321 DR_GROUP_SIZE is the exact number of stmts in the chain.
9322 Therefore, NEXT_STMT_INFO can't be NULL. If there is
9323 no interleaving, DR_GROUP_SIZE is 1, and only one
9324 iteration of the loop will be executed. */
9325 op = vect_get_store_rhs (next_stmt_info);
9326 if (costing_p)
9327 update_prologue_cost (&prologue_cost, op);
9328 else
9330 vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
9331 ncopies, op,
9332 gvec_oprnds[i]);
9333 vec_oprnd = (*gvec_oprnds[i])[0];
9334 dr_chain.quick_push (vec_oprnd);
9336 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
9338 if (mask && !costing_p)
9340 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
9341 mask, &vec_masks,
9342 mask_vectype);
9343 vec_mask = vec_masks[0];
9347 /* We should have caught mismatched types earlier. */
9348 gcc_assert (costing_p
9349 || useless_type_conversion_p (vectype,
9350 TREE_TYPE (vec_oprnd)));
9351 bool simd_lane_access_p
9352 = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
9353 if (!costing_p
9354 && simd_lane_access_p
9355 && !loop_masks
9356 && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
9357 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
9358 && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info))
9359 && integer_zerop (DR_INIT (first_dr_info->dr))
9360 && alias_sets_conflict_p (get_alias_set (aggr_type),
9361 get_alias_set (TREE_TYPE (ref_type))))
9363 dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
9364 dataref_offset = build_int_cst (ref_type, 0);
9366 else if (!costing_p)
9367 dataref_ptr
9368 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
9369 simd_lane_access_p ? loop : NULL,
9370 offset, &dummy, gsi, &ptr_incr,
9371 simd_lane_access_p, bump);
9373 else if (!costing_p)
9375 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
9376 /* DR_CHAIN is then used as an input to vect_permute_store_chain().
9377 If the store is not grouped, DR_GROUP_SIZE is 1, and DR_CHAIN is
9378 of size 1. */
9379 for (i = 0; i < group_size; i++)
9381 vec_oprnd = (*gvec_oprnds[i])[j];
9382 dr_chain[i] = vec_oprnd;
9384 if (mask)
9385 vec_mask = vec_masks[j];
9386 if (dataref_offset)
9387 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
9388 else
9389 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
9390 stmt_info, bump);
9393 new_stmt = NULL;
9394 if (grouped_store)
9396 /* Permute. */
9397 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
9398 if (costing_p)
9400 int group_size = DR_GROUP_SIZE (first_stmt_info);
9401 int nstmts = ceil_log2 (group_size) * group_size;
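/* E.g. interleaving a group of 4 stores is costed as
   ceil_log2 (4) * 4 == 8 vector permutes. */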
9402 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
9403 stmt_info, 0, vect_body);
9404 if (dump_enabled_p ())
9405 dump_printf_loc (MSG_NOTE, vect_location,
9406 "vect_model_store_cost: "
9407 "strided group_size = %d .\n",
9408 group_size);
9410 else
9411 vect_permute_store_chain (vinfo, dr_chain, group_size, stmt_info,
9412 gsi, &result_chain);
9415 stmt_vec_info next_stmt_info = first_stmt_info;
9416 for (i = 0; i < vec_num; i++)
9418 if (!costing_p)
9420 if (slp)
9421 vec_oprnd = vec_oprnds[i];
9422 else if (grouped_store)
9423 /* For grouped stores vectorized defs are interleaved in
9424 vect_permute_store_chain(). */
9425 vec_oprnd = result_chain[i];
9428 if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
9430 if (costing_p)
9431 inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
9432 stmt_info, 0, vect_body);
9433 else
9435 tree perm_mask = perm_mask_for_reverse (vectype);
9436 tree perm_dest = vect_create_destination_var (
9437 vect_get_store_rhs (stmt_info), vectype);
9438 tree new_temp = make_ssa_name (perm_dest);
9440 /* Generate the permute statement. */
9441 gimple *perm_stmt
9442 = gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
9443 vec_oprnd, perm_mask);
9444 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
9445 gsi);
9447 perm_stmt = SSA_NAME_DEF_STMT (new_temp);
9448 vec_oprnd = new_temp;
9452 if (costing_p)
9454 n_adjacent_stores++;
9456 if (!slp)
9458 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
9459 if (!next_stmt_info)
9460 break;
9463 continue;
9466 tree final_mask = NULL_TREE;
9467 tree final_len = NULL_TREE;
9468 tree bias = NULL_TREE;
9469 if (loop_masks)
9470 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
9471 vec_num * ncopies, vectype,
9472 vec_num * j + i);
9473 if (slp && vec_mask)
9474 vec_mask = vec_masks[i];
9475 if (vec_mask)
9476 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
9477 vec_mask, gsi);
9479 if (i > 0)
9480 /* Bump the vector pointer. */
9481 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
9482 stmt_info, bump);
9484 unsigned misalign;
9485 unsigned HOST_WIDE_INT align;
9486 align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
9487 if (alignment_support_scheme == dr_aligned)
9488 misalign = 0;
9489 else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
9491 align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
9492 misalign = 0;
9494 else
9495 misalign = misalignment;
9496 if (dataref_offset == NULL_TREE
9497 && TREE_CODE (dataref_ptr) == SSA_NAME)
9498 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
9499 misalign);
9500 align = least_bit_hwi (misalign | align);
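/* Only the least set bit of MISALIGN | ALIGN is an alignment we can
   rely on; e.g. a target alignment of 16 with a known misalignment of 4
   yields 4. */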
9502 /* Compute IFN when LOOP_LENS or final_mask valid. */
9503 machine_mode vmode = TYPE_MODE (vectype);
9504 machine_mode new_vmode = vmode;
9505 internal_fn partial_ifn = IFN_LAST;
9506 if (loop_lens)
9508 opt_machine_mode new_ovmode
9509 = get_len_load_store_mode (vmode, false, &partial_ifn);
9510 new_vmode = new_ovmode.require ();
9511 unsigned factor
9512 = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
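/* If the store is carried out in a byte-element (VnQI) mode the 'len'
   operand counts bytes, so scale the loop length by the element size
   of VMODE. */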
9513 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
9514 vec_num * ncopies, vectype,
9515 vec_num * j + i, factor);
9517 else if (final_mask)
9519 if (!can_vec_mask_load_store_p (
9520 vmode, TYPE_MODE (TREE_TYPE (final_mask)), false,
9521 &partial_ifn))
9522 gcc_unreachable ();
9525 if (partial_ifn == IFN_MASK_LEN_STORE)
9527 if (!final_len)
9529 /* Pass VF value to 'len' argument of
9530 MASK_LEN_STORE if LOOP_LENS is invalid. */
9531 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
9533 if (!final_mask)
9535 /* Pass all ones value to 'mask' argument of
9536 MASK_LEN_STORE if final_mask is invalid. */
9537 mask_vectype = truth_type_for (vectype);
9538 final_mask = build_minus_one_cst (mask_vectype);
9541 if (final_len)
9543 signed char biasval
9544 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9546 bias = build_int_cst (intQI_type_node, biasval);
9549 /* Arguments are ready. Create the new vector stmt. */
9550 if (final_len)
9552 gcall *call;
9553 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
9554 /* Need conversion if it's wrapped with VnQI. */
9555 if (vmode != new_vmode)
9557 tree new_vtype
9558 = build_vector_type_for_mode (unsigned_intQI_type_node,
9559 new_vmode);
9560 tree var = vect_get_new_ssa_name (new_vtype, vect_simple_var);
9561 vec_oprnd = build1 (VIEW_CONVERT_EXPR, new_vtype, vec_oprnd);
9562 gassign *new_stmt
9563 = gimple_build_assign (var, VIEW_CONVERT_EXPR, vec_oprnd);
9564 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9565 vec_oprnd = var;
9568 if (partial_ifn == IFN_MASK_LEN_STORE)
9569 call = gimple_build_call_internal (IFN_MASK_LEN_STORE, 6,
9570 dataref_ptr, ptr, final_mask,
9571 final_len, bias, vec_oprnd);
9572 else
9573 call = gimple_build_call_internal (IFN_LEN_STORE, 5,
9574 dataref_ptr, ptr, final_len,
9575 bias, vec_oprnd);
9576 gimple_call_set_nothrow (call, true);
9577 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9578 new_stmt = call;
9580 else if (final_mask)
9582 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
9583 gcall *call
9584 = gimple_build_call_internal (IFN_MASK_STORE, 4, dataref_ptr,
9585 ptr, final_mask, vec_oprnd);
9586 gimple_call_set_nothrow (call, true);
9587 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9588 new_stmt = call;
9590 else
9592 data_ref
9593 = fold_build2 (MEM_REF, vectype, dataref_ptr,
9594 dataref_offset ? dataref_offset
9595 : build_int_cst (ref_type, 0));
9596 if (alignment_support_scheme == dr_aligned)
9598 else
9599 TREE_TYPE (data_ref)
9600 = build_aligned_type (TREE_TYPE (data_ref),
9601 align * BITS_PER_UNIT);
9602 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
9603 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
9604 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9607 if (slp)
9608 continue;
9610 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
9611 if (!next_stmt_info)
9612 break;
9614 if (!slp && !costing_p)
9616 if (j == 0)
9617 *vec_stmt = new_stmt;
9618 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9622 if (costing_p)
9624 if (n_adjacent_stores > 0)
9625 vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
9626 alignment_support_scheme, misalignment,
9627 &inside_cost, cost_vec);
9629 /* When vectorizing a store into the function result, assign
9630 a penalty if the function returns in a multi-register location.
9631 In this case we assume we'll end up having to spill the
9632 vector result and do piecewise loads as a conservative estimate. */
9633 tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
9634 if (base
9635 && (TREE_CODE (base) == RESULT_DECL
9636 || (DECL_P (base) && cfun_returns (base)))
9637 && !aggregate_value_p (base, cfun->decl))
9639 rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
9640 /* ??? Handle PARALLEL in some way. */
9641 if (REG_P (reg))
9643 int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
9644 /* Assume that a single reg-reg move is possible and cheap,
9645 do not account for vector to gp register move cost. */
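/* E.g. a result needing NREGS == 2 hard registers is charged one
   vector store (the spill) plus two scalar loads per vector copy. */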
9646 if (nregs > 1)
9648 /* Spill. */
9649 prologue_cost
9650 += record_stmt_cost (cost_vec, ncopies, vector_store,
9651 stmt_info, 0, vect_epilogue);
9652 /* Loads. */
9653 prologue_cost
9654 += record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
9655 stmt_info, 0, vect_epilogue);
9659 if (dump_enabled_p ())
9660 dump_printf_loc (MSG_NOTE, vect_location,
9661 "vect_model_store_cost: inside_cost = %d, "
9662 "prologue_cost = %d .\n",
9663 inside_cost, prologue_cost);
9666 return true;
9669 /* Given a vector type VECTYPE, turns permutation SEL into the equivalent
9670 VECTOR_CST mask. No checks are made that the target platform supports the
9671 mask, so callers may wish to test can_vec_perm_const_p separately, or use
9672 vect_gen_perm_mask_checked. */
9674 tree
9675 vect_gen_perm_mask_any (tree vectype, const vec_perm_indices &sel)
9677 tree mask_type;
9679 poly_uint64 nunits = sel.length ();
9680 gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype)));
9682 mask_type = build_vector_type (ssizetype, nunits);
9683 return vec_perm_indices_to_tree (mask_type, sel);
9686 /* Checked version of vect_gen_perm_mask_any. Asserts can_vec_perm_const_p,
9687 i.e. that the target supports the pattern _for arbitrary input vectors_. */
9689 tree
9690 vect_gen_perm_mask_checked (tree vectype, const vec_perm_indices &sel)
9692 machine_mode vmode = TYPE_MODE (vectype);
9693 gcc_assert (can_vec_perm_const_p (vmode, vmode, sel));
9694 return vect_gen_perm_mask_any (vectype, sel);
9697 /* Given vector variables X and Y that were generated for the scalar
9698 STMT_INFO, generate instructions to permute the vector elements of X
9699 and Y using the permutation mask MASK_VEC, insert them at *GSI and
9700 return the permuted vector variable. */
9702 static tree
9703 permute_vec_elements (vec_info *vinfo,
9704 tree x, tree y, tree mask_vec, stmt_vec_info stmt_info,
9705 gimple_stmt_iterator *gsi)
9707 tree vectype = TREE_TYPE (x);
9708 tree perm_dest, data_ref;
9709 gimple *perm_stmt;
9711 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
9712 if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME)
9713 perm_dest = vect_create_destination_var (scalar_dest, vectype);
9714 else
9715 perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
9716 data_ref = make_ssa_name (perm_dest);
9718 /* Generate the permute statement. */
9719 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, x, y, mask_vec);
9720 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
9722 return data_ref;
9725 /* Hoist the definitions of all SSA uses on STMT_INFO out of the loop LOOP,
9726 inserting them on the loop's preheader edge. Returns true if we
9727 were successful in doing so (and thus STMT_INFO can then be moved),
9728 otherwise returns false. HOIST_P indicates whether we actually want
9729 to hoist the definitions; it is false when we are only costing. */
9731 static bool
9732 hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop, bool hoist_p)
9734 ssa_op_iter i;
9735 tree op;
9736 bool any = false;
9738 FOR_EACH_SSA_TREE_OPERAND (op, stmt_info->stmt, i, SSA_OP_USE)
9740 gimple *def_stmt = SSA_NAME_DEF_STMT (op);
9741 if (!gimple_nop_p (def_stmt)
9742 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
9744 /* Make sure we don't need to recurse. While we could do
9745 so in simple cases, when there are more complex use webs
9746 we don't have an easy way to preserve stmt order to fulfil
9747 dependencies within them. */
9748 tree op2;
9749 ssa_op_iter i2;
9750 if (gimple_code (def_stmt) == GIMPLE_PHI)
9751 return false;
9752 FOR_EACH_SSA_TREE_OPERAND (op2, def_stmt, i2, SSA_OP_USE)
9754 gimple *def_stmt2 = SSA_NAME_DEF_STMT (op2);
9755 if (!gimple_nop_p (def_stmt2)
9756 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt2)))
9757 return false;
9759 any = true;
9763 if (!any)
9764 return true;
9766 if (!hoist_p)
9767 return true;
9769 FOR_EACH_SSA_TREE_OPERAND (op, stmt_info->stmt, i, SSA_OP_USE)
9771 gimple *def_stmt = SSA_NAME_DEF_STMT (op);
9772 if (!gimple_nop_p (def_stmt)
9773 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
9775 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
9776 gsi_remove (&gsi, false);
9777 gsi_insert_on_edge_immediate (loop_preheader_edge (loop), def_stmt);
9781 return true;
9784 /* vectorizable_load.
9786 Check if STMT_INFO reads a non-scalar data-ref (array/pointer/structure)
9787 that can be vectorized.
9788 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
9789 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
9790 Return true if STMT_INFO is vectorizable in this way. */
9792 static bool
9793 vectorizable_load (vec_info *vinfo,
9794 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
9795 gimple **vec_stmt, slp_tree slp_node,
9796 stmt_vector_for_cost *cost_vec)
9798 tree scalar_dest;
9799 tree vec_dest = NULL;
9800 tree data_ref = NULL;
9801 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9802 class loop *loop = NULL;
9803 class loop *containing_loop = gimple_bb (stmt_info->stmt)->loop_father;
9804 bool nested_in_vect_loop = false;
9805 tree elem_type;
9806 /* Avoid false positive uninitialized warning, see PR110652. */
9807 tree new_temp = NULL_TREE;
9808 machine_mode mode;
9809 tree dummy;
9810 tree dataref_ptr = NULL_TREE;
9811 tree dataref_offset = NULL_TREE;
9812 gimple *ptr_incr = NULL;
9813 int ncopies;
9814 int i, j;
9815 unsigned int group_size;
9816 poly_uint64 group_gap_adj;
9817 tree msq = NULL_TREE, lsq;
9818 tree realignment_token = NULL_TREE;
9819 gphi *phi = NULL;
9820 vec<tree> dr_chain = vNULL;
9821 bool grouped_load = false;
9822 stmt_vec_info first_stmt_info;
9823 stmt_vec_info first_stmt_info_for_drptr = NULL;
9824 bool compute_in_loop = false;
9825 class loop *at_loop;
9826 int vec_num;
9827 bool slp = (slp_node != NULL);
9828 bool slp_perm = false;
9829 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
9830 poly_uint64 vf;
9831 tree aggr_type;
9832 gather_scatter_info gs_info;
9833 tree ref_type;
9834 enum vect_def_type mask_dt = vect_unknown_def_type;
9836 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
9837 return false;
9839 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9840 && ! vec_stmt)
9841 return false;
9843 if (!STMT_VINFO_DATA_REF (stmt_info))
9844 return false;
9846 tree mask = NULL_TREE, mask_vectype = NULL_TREE;
9847 int mask_index = -1;
9848 slp_tree slp_op = NULL;
9849 if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
9851 scalar_dest = gimple_assign_lhs (assign);
9852 if (TREE_CODE (scalar_dest) != SSA_NAME)
9853 return false;
9855 tree_code code = gimple_assign_rhs_code (assign);
9856 if (code != ARRAY_REF
9857 && code != BIT_FIELD_REF
9858 && code != INDIRECT_REF
9859 && code != COMPONENT_REF
9860 && code != IMAGPART_EXPR
9861 && code != REALPART_EXPR
9862 && code != MEM_REF
9863 && TREE_CODE_CLASS (code) != tcc_declaration)
9864 return false;
9866 else
9868 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
9869 if (!call || !gimple_call_internal_p (call))
9870 return false;
9872 internal_fn ifn = gimple_call_internal_fn (call);
9873 if (!internal_load_fn_p (ifn))
9874 return false;
9876 scalar_dest = gimple_call_lhs (call);
9877 if (!scalar_dest)
9878 return false;
9880 mask_index = internal_fn_mask_index (ifn);
9881 if (mask_index >= 0 && slp_node)
9882 mask_index = vect_slp_child_index_for_operand
9883 (call, mask_index, STMT_VINFO_GATHER_SCATTER_P (stmt_info));
9884 if (mask_index >= 0
9885 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
9886 &mask, &slp_op, &mask_dt, &mask_vectype))
9887 return false;
9890 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9891 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9893 if (loop_vinfo)
9895 loop = LOOP_VINFO_LOOP (loop_vinfo);
9896 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
9897 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9899 else
9900 vf = 1;
9902 /* Multiple types in SLP are handled by creating the appropriate number of
9903 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
9904 case of SLP. */
9905 if (slp)
9906 ncopies = 1;
9907 else
9908 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9910 gcc_assert (ncopies >= 1);
9912 /* FORNOW. This restriction should be relaxed. */
9913 if (nested_in_vect_loop
9914 && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
9916 if (dump_enabled_p ())
9917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9918 "multiple types in nested loop.\n");
9919 return false;
9922 /* Invalidate assumptions made by dependence analysis when vectorization
9923 on the unrolled body effectively re-orders stmts. */
9924 if (ncopies > 1
9925 && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
9926 && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9927 STMT_VINFO_MIN_NEG_DIST (stmt_info)))
9929 if (dump_enabled_p ())
9930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9931 "cannot perform implicit CSE when unrolling "
9932 "with negative dependence distance\n");
9933 return false;
9936 elem_type = TREE_TYPE (vectype);
9937 mode = TYPE_MODE (vectype);
9939 /* FORNOW. In some cases can vectorize even if data-type not supported
9940 (e.g. - data copies). */
9941 if (optab_handler (mov_optab, mode) == CODE_FOR_nothing)
9943 if (dump_enabled_p ())
9944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9945 "Aligned load, but unsupported type.\n");
9946 return false;
9949 /* Check if the load is a part of an interleaving chain. */
9950 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
9952 grouped_load = true;
9953 /* FORNOW */
9954 gcc_assert (!nested_in_vect_loop);
9955 gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
9957 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9958 group_size = DR_GROUP_SIZE (first_stmt_info);
9960 /* Refuse non-SLP vectorization of SLP-only groups. */
9961 if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))
9963 if (dump_enabled_p ())
9964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9965 "cannot vectorize load in non-SLP mode.\n");
9966 return false;
9969 /* Invalidate assumptions made by dependence analysis when vectorization
9970 on the unrolled body effectively re-orders stmts. */
9971 if (STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
9972 && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9973 STMT_VINFO_MIN_NEG_DIST (stmt_info)))
9975 if (dump_enabled_p ())
9976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9977 "cannot perform implicit CSE when performing "
9978 "group loads with negative dependence distance\n");
9979 return false;
9982 else
9983 group_size = 1;
9985 if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
9987 slp_perm = true;
9989 if (!loop_vinfo)
9991 /* In BB vectorization we may not actually use a loaded vector
9992 accessing elements in excess of DR_GROUP_SIZE. */
9993 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
9994 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
9995 unsigned HOST_WIDE_INT nunits;
9996 unsigned j, k, maxk = 0;
9997 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
9998 if (k > maxk)
9999 maxk = k;
10000 tree vectype = SLP_TREE_VECTYPE (slp_node);
10001 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
10002 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
10004 if (dump_enabled_p ())
10005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10006 "BB vectorization with gaps at the end of "
10007 "a load is not supported\n");
10008 return false;
10012 auto_vec<tree> tem;
10013 unsigned n_perms;
10014 if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
10015 true, &n_perms))
10017 if (dump_enabled_p ())
10018 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10019 vect_location,
10020 "unsupported load permutation\n");
10021 return false;
10025 vect_memory_access_type memory_access_type;
10026 enum dr_alignment_support alignment_support_scheme;
10027 int misalignment;
10028 poly_int64 poffset;
10029 internal_fn lanes_ifn;
10030 if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, VLS_LOAD,
10031 ncopies, &memory_access_type, &poffset,
10032 &alignment_support_scheme, &misalignment, &gs_info,
10033 &lanes_ifn))
10034 return false;
10036 if (mask)
10038 if (memory_access_type == VMAT_CONTIGUOUS)
10040 machine_mode vec_mode = TYPE_MODE (vectype);
10041 if (!VECTOR_MODE_P (vec_mode)
10042 || !can_vec_mask_load_store_p (vec_mode,
10043 TYPE_MODE (mask_vectype), true))
10044 return false;
10046 else if (memory_access_type != VMAT_LOAD_STORE_LANES
10047 && memory_access_type != VMAT_GATHER_SCATTER)
10049 if (dump_enabled_p ())
10050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10051 "unsupported access type for masked load.\n");
10052 return false;
10054 else if (memory_access_type == VMAT_GATHER_SCATTER
10055 && gs_info.ifn == IFN_LAST
10056 && !gs_info.decl)
10058 if (dump_enabled_p ())
10059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10060 "unsupported masked emulated gather.\n");
10061 return false;
10063 else if (memory_access_type == VMAT_ELEMENTWISE
10064 || memory_access_type == VMAT_STRIDED_SLP)
10066 if (dump_enabled_p ())
10067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10068 "unsupported masked strided access.\n");
10069 return false;
10073 bool costing_p = !vec_stmt;
10075 if (costing_p) /* transformation not required. */
10077 if (slp_node
10078 && mask
10079 && !vect_maybe_update_slp_op_vectype (slp_op,
10080 mask_vectype))
10082 if (dump_enabled_p ())
10083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10084 "incompatible vector types for invariants\n");
10085 return false;
10088 if (!slp)
10089 STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
10091 if (loop_vinfo
10092 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10093 check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
10094 VLS_LOAD, group_size,
10095 memory_access_type, &gs_info,
10096 mask);
10098 if (dump_enabled_p ()
10099 && memory_access_type != VMAT_ELEMENTWISE
10100 && memory_access_type != VMAT_GATHER_SCATTER
10101 && alignment_support_scheme != dr_aligned)
10102 dump_printf_loc (MSG_NOTE, vect_location,
10103 "Vectorizing an unaligned access.\n");
10105 if (memory_access_type == VMAT_LOAD_STORE_LANES)
10106 vinfo->any_known_not_updated_vssa = true;
10108 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
10111 if (!slp)
10112 gcc_assert (memory_access_type
10113 == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
10115 if (dump_enabled_p () && !costing_p)
10116 dump_printf_loc (MSG_NOTE, vect_location,
10117 "transform load. ncopies = %d\n", ncopies);
10119 /* Transform. */
10121 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
10122 ensure_base_align (dr_info);
10124 if (memory_access_type == VMAT_INVARIANT)
10126 gcc_assert (!grouped_load && !mask && !bb_vinfo);
10127 /* If we have versioned for aliasing or the loop doesn't
10128 have any data dependencies that would preclude this,
10129 then we are sure this is a loop invariant load and
10130 thus we can insert it on the preheader edge.
10131 TODO: hoist_defs_of_uses should ideally be computed
10132 once at analysis time, remembered and used at
10133 transform time. */
10134 bool hoist_p = (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo)
10135 && !nested_in_vect_loop
10136 && hoist_defs_of_uses (stmt_info, loop, !costing_p));
10137 if (costing_p)
10139 enum vect_cost_model_location cost_loc
10140 = hoist_p ? vect_prologue : vect_body;
10141 unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
10142 stmt_info, 0, cost_loc);
10143 cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info, 0,
10144 cost_loc);
10145 unsigned int prologue_cost = hoist_p ? cost : 0;
10146 unsigned int inside_cost = hoist_p ? 0 : cost;
10147 if (dump_enabled_p ())
10148 dump_printf_loc (MSG_NOTE, vect_location,
10149 "vect_model_load_cost: inside_cost = %d, "
10150 "prologue_cost = %d .\n",
10151 inside_cost, prologue_cost);
10152 return true;
10154 if (hoist_p)
10156 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
10157 if (dump_enabled_p ())
10158 dump_printf_loc (MSG_NOTE, vect_location,
10159 "hoisting out of the vectorized loop: %G",
10160 (gimple *) stmt);
10161 scalar_dest = copy_ssa_name (scalar_dest);
10162 tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
10163 edge pe = loop_preheader_edge (loop);
10164 gphi *vphi = get_virtual_phi (loop->header);
10165 tree vuse;
10166 if (vphi)
10167 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
10168 else
10169 vuse = gimple_vuse (gsi_stmt (*gsi));
10170 gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
10171 gimple_set_vuse (new_stmt, vuse);
10172 gsi_insert_on_edge_immediate (pe, new_stmt);
10174 /* These copies are all equivalent. */
10175 if (hoist_p)
10176 new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10177 vectype, NULL);
10178 else
10180 gimple_stmt_iterator gsi2 = *gsi;
10181 gsi_next (&gsi2);
10182 new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10183 vectype, &gsi2);
10185 gimple *new_stmt = SSA_NAME_DEF_STMT (new_temp);
10186 if (slp)
10187 for (j = 0; j < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++j)
10188 slp_node->push_vec_def (new_stmt);
10189 else
10191 for (j = 0; j < ncopies; ++j)
10192 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10193 *vec_stmt = new_stmt;
10195 return true;
10198 if (memory_access_type == VMAT_ELEMENTWISE
10199 || memory_access_type == VMAT_STRIDED_SLP)
10201 gimple_stmt_iterator incr_gsi;
10202 bool insert_after;
10203 tree offvar;
10204 tree ivstep;
10205 tree running_off;
10206 vec<constructor_elt, va_gc> *v = NULL;
10207 tree stride_base, stride_step, alias_off;
10208 /* Checked by get_load_store_type. */
10209 unsigned int const_nunits = nunits.to_constant ();
10210 unsigned HOST_WIDE_INT cst_offset = 0;
10211 tree dr_offset;
10212 unsigned int inside_cost = 0;
10214 gcc_assert (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
10215 gcc_assert (!nested_in_vect_loop);
10217 if (grouped_load)
10219 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10220 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
10222 else
10224 first_stmt_info = stmt_info;
10225 first_dr_info = dr_info;
10228 if (slp && grouped_load)
10230 group_size = DR_GROUP_SIZE (first_stmt_info);
10231 ref_type = get_group_alias_ptr_type (first_stmt_info);
10233 else
10235 if (grouped_load)
10236 cst_offset
10237 = (tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))
10238 * vect_get_place_in_interleaving_chain (stmt_info,
10239 first_stmt_info));
10240 group_size = 1;
10241 ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
10244 if (!costing_p)
10246 dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
10247 stride_base = fold_build_pointer_plus (
10248 DR_BASE_ADDRESS (first_dr_info->dr),
10249 size_binop (PLUS_EXPR, convert_to_ptrofftype (dr_offset),
10250 convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
10251 stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
10253 /* For a load with loop-invariant (but other than power-of-2)
10254 stride (i.e. not a grouped access) like so:
10256 for (i = 0; i < n; i += stride)
10257 ... = array[i];
10259 we generate a new induction variable and new accesses to
10260 form a new vector (or vectors, depending on ncopies):
10262 for (j = 0; ; j += VF*stride)
10263 tmp1 = array[j];
10264 tmp2 = array[j + stride];
10266 vectemp = {tmp1, tmp2, ...}
10269 ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step), stride_step,
10270 build_int_cst (TREE_TYPE (stride_step), vf));
10272 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
10274 stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
10275 ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
10276 create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
10277 loop, &incr_gsi, insert_after,
10278 &offvar, NULL);
10280 stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
10283 running_off = offvar;
10284 alias_off = build_int_cst (ref_type, 0);
10285 int nloads = const_nunits;
10286 int lnel = 1;
10287 tree ltype = TREE_TYPE (vectype);
10288 tree lvectype = vectype;
10289 auto_vec<tree> dr_chain;
10290 if (memory_access_type == VMAT_STRIDED_SLP)
10292 HOST_WIDE_INT n = gcd (group_size, const_nunits);
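/* N is the largest piece size that divides both the group size and the
   vector lane count; e.g. a group of 6 with a 4-lane vectype gives
   N == 2, so each vector is composed from two 2-lane loads. */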
10293 /* Use the target vector type if the group size is a multiple
10294 of it. */
10295 if (n == const_nunits)
10297 nloads = 1;
10298 lnel = const_nunits;
10299 ltype = vectype;
10301 /* Else use the biggest vector with which we can load the group
10302 without accessing excess elements. */
10303 else if (n > 1)
10305 tree ptype;
10306 tree vtype
10307 = vector_vector_composition_type (vectype, const_nunits / n,
10308 &ptype);
10309 if (vtype != NULL_TREE)
10311 nloads = const_nunits / n;
10312 lnel = n;
10313 lvectype = vtype;
10314 ltype = ptype;
10317 /* Else fall back to the default element-wise access. */
10318 ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
10320 /* Load vector(1) scalar_type directly if the vectype has just one element. */
10321 else if (nloads == 1)
10322 ltype = vectype;
10324 if (slp)
10326 /* For SLP permutation support we need to load the whole group,
10327 not only the number of vector stmts the permutation result
10328 fits in. */
10329 if (slp_perm)
10331 /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
10332 variable VF. */
10333 unsigned int const_vf = vf.to_constant ();
10334 ncopies = CEIL (group_size * const_vf, const_nunits);
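/* E.g. GROUP_SIZE == 3, VF == 4 and CONST_NUNITS == 4 requires
   CEIL (12, 4) == 3 vectors to cover the whole group for the
   permutation. */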
10335 dr_chain.create (ncopies);
10337 else
10338 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10340 unsigned int group_el = 0;
10341 unsigned HOST_WIDE_INT
10342 elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
10343 unsigned int n_groups = 0;
10344 /* For costing some adjacent vector loads, we'd like to cost them
10345 once with their total number instead of costing each one by one. */
10346 unsigned int n_adjacent_loads = 0;
10347 for (j = 0; j < ncopies; j++)
10349 if (nloads > 1 && !costing_p)
10350 vec_alloc (v, nloads);
10351 gimple *new_stmt = NULL;
10352 for (i = 0; i < nloads; i++)
10354 if (costing_p)
10356 /* For VMAT_ELEMENTWISE, just cost it as scalar_load to
10357 avoid ICE, see PR110776. */
10358 if (VECTOR_TYPE_P (ltype)
10359 && memory_access_type != VMAT_ELEMENTWISE)
10360 n_adjacent_loads++;
10361 else
10362 inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
10363 stmt_info, 0, vect_body);
10364 continue;
10366 tree this_off = build_int_cst (TREE_TYPE (alias_off),
10367 group_el * elsz + cst_offset);
10368 tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
10369 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
10370 new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
10371 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
10372 if (nloads > 1)
10373 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
10374 gimple_assign_lhs (new_stmt));
10376 group_el += lnel;
10377 if (! slp
10378 || group_el == group_size)
10380 n_groups++;
10381 /* When doing SLP make sure to not load elements from
10382 the next vector iteration; those will not be accessed,
10383 so just use the last element again. See PR107451. */
10384 if (!slp || known_lt (n_groups, vf))
10386 tree newoff = copy_ssa_name (running_off);
10387 gimple *incr
10388 = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
10389 running_off, stride_step);
10390 vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
10391 running_off = newoff;
10393 group_el = 0;
10397 if (nloads > 1)
10399 if (costing_p)
10400 inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
10401 stmt_info, 0, vect_body);
10402 else
10404 tree vec_inv = build_constructor (lvectype, v);
10405 new_temp = vect_init_vector (vinfo, stmt_info, vec_inv,
10406 lvectype, gsi);
10407 new_stmt = SSA_NAME_DEF_STMT (new_temp);
10408 if (lvectype != vectype)
10410 new_stmt
10411 = gimple_build_assign (make_ssa_name (vectype),
10412 VIEW_CONVERT_EXPR,
10413 build1 (VIEW_CONVERT_EXPR,
10414 vectype, new_temp));
10415 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
10416 gsi);
10421 if (!costing_p)
10423 if (slp)
10425 if (slp_perm)
10426 dr_chain.quick_push (gimple_assign_lhs (new_stmt));
10427 else
10428 slp_node->push_vec_def (new_stmt);
10430 else
10432 if (j == 0)
10433 *vec_stmt = new_stmt;
10434 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10438 if (slp_perm)
10440 unsigned n_perms;
10441 if (costing_p)
10443 unsigned n_loads;
10444 vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
10445 true, &n_perms, &n_loads);
10446 inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
10447 first_stmt_info, 0, vect_body);
10449 else
10450 vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
10451 false, &n_perms);
10454 if (costing_p)
10456 if (n_adjacent_loads > 0)
10457 vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
10458 alignment_support_scheme, misalignment, false,
10459 &inside_cost, nullptr, cost_vec, cost_vec,
10460 true);
10461 if (dump_enabled_p ())
10462 dump_printf_loc (MSG_NOTE, vect_location,
10463 "vect_model_load_cost: inside_cost = %u, "
10464 "prologue_cost = 0 .\n",
10465 inside_cost);
10468 return true;
10471 if (memory_access_type == VMAT_GATHER_SCATTER
10472 || (!slp && memory_access_type == VMAT_CONTIGUOUS))
10473 grouped_load = false;
10475 if (grouped_load
10476 || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()))
10478 if (grouped_load)
10480 first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10481 group_size = DR_GROUP_SIZE (first_stmt_info);
10483 else
10485 first_stmt_info = stmt_info;
10486 group_size = 1;
10488 /* For SLP vectorization we directly vectorize a subchain
10489 without permutation. */
10490 if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
10491 first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
10492 /* For BB vectorization always use the first stmt to base
10493 the data ref pointer on. */
10494 if (bb_vinfo)
10495 first_stmt_info_for_drptr
10496 = vect_find_first_scalar_stmt_in_slp (slp_node);
10498 /* Check if the chain of loads is already vectorized. */
10499 if (STMT_VINFO_VEC_STMTS (first_stmt_info).exists ()
10500 /* For SLP we would need to copy over SLP_TREE_VEC_DEFS.
10501 ??? But we can only do so if there is exactly one
10502 as we have no way to get at the rest. Leave the CSE
10503 opportunity alone.
10504 ??? With the group load eventually participating
10505 in multiple different permutations (having multiple
10506 slp nodes which refer to the same group) the CSE
10507 is even wrong code. See PR56270. */
10508 && !slp)
10510 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10511 return true;
10513 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
10514 group_gap_adj = 0;
10516 /* VEC_NUM is the number of vect stmts to be created for this group. */
10517 if (slp)
10519 grouped_load = false;
10520 /* If an SLP permutation is from N elements to N elements,
10521 and if one vector holds a whole number of N, we can load
10522 the inputs to the permutation in the same way as an
10523 unpermuted sequence. In other cases we need to load the
10524 whole group, not only the number of vector stmts the
10525 permutation result fits in. */
10526 unsigned scalar_lanes = SLP_TREE_LANES (slp_node);
10527 if (nested_in_vect_loop)
10528 /* We do not support grouped accesses in a nested loop,
10529 instead the access is contiguous but it might be
10530 permuted. No gap adjustment is needed though. */
10531 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10532 else if (slp_perm
10533 && (group_size != scalar_lanes
10534 || !multiple_p (nunits, group_size)))
10536 /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
10537 variable VF; see vect_transform_slp_perm_load. */
10538 unsigned int const_vf = vf.to_constant ();
10539 unsigned int const_nunits = nunits.to_constant ();
10540 vec_num = CEIL (group_size * const_vf, const_nunits);
10541 group_gap_adj = vf * group_size - nunits * vec_num;
10543 else
10545 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10546 group_gap_adj
10547 = group_size - scalar_lanes;
10550 else
10551 vec_num = group_size;
10553 ref_type = get_group_alias_ptr_type (first_stmt_info);
10555 else
10557 first_stmt_info = stmt_info;
10558 first_dr_info = dr_info;
10559 group_size = vec_num = 1;
10560 group_gap_adj = 0;
10561 ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr));
10562 if (slp)
10563 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10566 gcc_assert (alignment_support_scheme);
10567 vec_loop_masks *loop_masks
10568 = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10569 ? &LOOP_VINFO_MASKS (loop_vinfo)
10570 : NULL);
10571 vec_loop_lens *loop_lens
10572 = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
10573 ? &LOOP_VINFO_LENS (loop_vinfo)
10574 : NULL);
10576 /* Both vect_analyze_stmt and vect_transform_stmt reach this point, but
10577 with one difference: we cannot enable both lens and masks during
10578 transform, although that is allowed during analysis.
10579 We shouldn't use a length-based approach if the loop is fully masked. */
10580 if (cost_vec == NULL)
10581 /* The cost_vec is NULL during transform. */
10582 gcc_assert ((!loop_lens || !loop_masks));
10584 /* Targets with load-lane instructions must not require explicit
10585 realignment. vect_supportable_dr_alignment always returns either
10586 dr_aligned or dr_unaligned_supported for masked operations. */
10587 gcc_assert ((memory_access_type != VMAT_LOAD_STORE_LANES
10588 && !mask
10589 && !loop_masks)
10590 || alignment_support_scheme == dr_aligned
10591 || alignment_support_scheme == dr_unaligned_supported);
10593 /* In case the vectorization factor (VF) is bigger than the number
10594 of elements that we can fit in a vectype (nunits), we have to generate
10595 more than one vector stmt - i.e - we need to "unroll" the
10596 vector stmt by a factor VF/nunits. In doing so, we record a pointer
10597 from one copy of the vector stmt to the next, in the field
10598 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
10599 stages to find the correct vector defs to be used when vectorizing
10600 stmts that use the defs of the current stmt. The example below
10601 illustrates the vectorization process when VF=16 and nunits=4 (i.e., we
10602 need to create 4 vectorized stmts):
10604 before vectorization:
10605 RELATED_STMT VEC_STMT
10606 S1: x = memref - -
10607 S2: z = x + 1 - -
10609 step 1: vectorize stmt S1:
10610 We first create the vector stmt VS1_0, and, as usual, record a
10611 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
10612 Next, we create the vector stmt VS1_1, and record a pointer to
10613 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
10614 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
10615 stmts and pointers:
10616 RELATED_STMT VEC_STMT
10617 VS1_0: vx0 = memref0 VS1_1 -
10618 VS1_1: vx1 = memref1 VS1_2 -
10619 VS1_2: vx2 = memref2 VS1_3 -
10620 VS1_3: vx3 = memref3 - -
10621 S1: x = load - VS1_0
10622 S2: z = x + 1 - -
10625 /* In case of interleaving (non-unit grouped access):
10627 S1: x2 = &base + 2
10628 S2: x0 = &base
10629 S3: x1 = &base + 1
10630 S4: x3 = &base + 3
10632 Vectorized loads are created in the order of memory accesses
10633 starting from the access of the first stmt of the chain:
10635 VS1: vx0 = &base
10636 VS2: vx1 = &base + vec_size*1
10637 VS3: vx3 = &base + vec_size*2
10638 VS4: vx4 = &base + vec_size*3
10640 Then permutation statements are generated:
10642 VS5: vx5 = VEC_PERM_EXPR < vx0, vx1, { 0, 2, ..., i*2 } >
10643 VS6: vx6 = VEC_PERM_EXPR < vx0, vx1, { 1, 3, ..., i*2+1 } >
10646 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
10647 (the order of the data-refs in the output of vect_permute_load_chain
10648 corresponds to the order of scalar stmts in the interleaving chain - see
10649 the documentation of vect_permute_load_chain()).
10650 The generation of permutation stmts and recording them in
10651 STMT_VINFO_VEC_STMT is done in vect_transform_grouped_load().
10653 In case of both multiple types and interleaving, the vector loads and
10654 permutation stmts above are created for every copy. The result vector
10655 stmts are put in STMT_VINFO_VEC_STMT for the first copy and in the
10656 corresponding STMT_VINFO_RELATED_STMT for the next copies. */
10658 /* If the data reference is aligned (dr_aligned) or potentially unaligned
10659 on a target that supports unaligned accesses (dr_unaligned_supported)
10660 we generate the following code:
10661 p = initial_addr;
10662 indx = 0;
10663 loop {
10664 p = p + indx * vectype_size;
10665 vec_dest = *(p);
10666 indx = indx + 1;
10669 Otherwise, the data reference is potentially unaligned on a target that
10670 does not support unaligned accesses (dr_explicit_realign_optimized) -
10671 then generate the following code, in which the data in each iteration is
10672 obtained by two vector loads, one from the previous iteration, and one
10673 from the current iteration:
10674 p1 = initial_addr;
10675 msq_init = *(floor(p1))
10676 p2 = initial_addr + VS - 1;
10677 realignment_token = call target_builtin;
10678 indx = 0;
10679 loop {
10680 p2 = p2 + indx * vectype_size
10681 lsq = *(floor(p2))
10682 vec_dest = realign_load (msq, lsq, realignment_token)
10683 indx = indx + 1;
10684 msq = lsq;
10685 } */
10687 /* If the misalignment remains the same throughout the execution of the
10688 loop, we can create the init_addr and permutation mask at the loop
10689 preheader. Otherwise, it needs to be created inside the loop.
10690 This can only occur when vectorizing memory accesses in the inner-loop
10691 nested within an outer-loop that is being vectorized. */
10693 if (nested_in_vect_loop
10694 && !multiple_p (DR_STEP_ALIGNMENT (dr_info->dr),
10695 GET_MODE_SIZE (TYPE_MODE (vectype))))
10697 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
10698 compute_in_loop = true;
10701 bool diff_first_stmt_info
10702 = first_stmt_info_for_drptr && first_stmt_info != first_stmt_info_for_drptr;
10704 tree offset = NULL_TREE;
10705 if ((alignment_support_scheme == dr_explicit_realign_optimized
10706 || alignment_support_scheme == dr_explicit_realign)
10707 && !compute_in_loop)
10709 /* If we have a different first_stmt_info, we can't set up realignment
10710 here, since we can't guarantee that first_stmt_info's DR has been
10711 initialized yet; instead use first_stmt_info_for_drptr's DR, bumping it
10712 by the distance from first_stmt_info's DR as done below. */
10713 if (!costing_p)
10715 if (!diff_first_stmt_info)
10716 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
10717 &realignment_token,
10718 alignment_support_scheme, NULL_TREE,
10719 &at_loop);
10720 if (alignment_support_scheme == dr_explicit_realign_optimized)
10722 phi = as_a<gphi *> (SSA_NAME_DEF_STMT (msq));
10723 offset = size_binop (MINUS_EXPR, TYPE_SIZE_UNIT (vectype),
10724 size_one_node);
10725 gcc_assert (!first_stmt_info_for_drptr);
10729 else
10730 at_loop = loop;
10732 if (!known_eq (poffset, 0))
10733 offset = (offset
10734 ? size_binop (PLUS_EXPR, offset, size_int (poffset))
10735 : size_int (poffset));
10737 tree bump;
10738 tree vec_offset = NULL_TREE;
10739 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10741 aggr_type = NULL_TREE;
10742 bump = NULL_TREE;
10744 else if (memory_access_type == VMAT_GATHER_SCATTER)
10746 aggr_type = elem_type;
10747 if (!costing_p)
10748 vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
10749 &bump, &vec_offset, loop_lens);
10751 else
10753 if (memory_access_type == VMAT_LOAD_STORE_LANES)
10754 aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
10755 else
10756 aggr_type = vectype;
10757 bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
10758 memory_access_type, loop_lens);
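/* Illustrative example: for a load-lanes access to a group of 4 int
   elements with V4SI vectors, AGGR_TYPE is int[16] and BUMP is roughly
   the size of that array (64 bytes), so each copy steps over one full
   group of vectors.  */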
10761 auto_vec<tree> vec_offsets;
10762 auto_vec<tree> vec_masks;
10763 if (mask && !costing_p)
10765 if (slp_node)
10766 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index],
10767 &vec_masks);
10768 else
10769 vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
10770 &vec_masks, mask_vectype);
10773 tree vec_mask = NULL_TREE;
10774 if (memory_access_type == VMAT_LOAD_STORE_LANES)
10776 gcc_assert (alignment_support_scheme == dr_aligned
10777 || alignment_support_scheme == dr_unaligned_supported);
10778 gcc_assert (grouped_load && !slp);
10780 unsigned int inside_cost = 0, prologue_cost = 0;
10781 /* For costing some adjacent vector loads, we'd like to cost them
10782 once using their total number instead of costing each one by one. */
10783 unsigned int n_adjacent_loads = 0;
10784 for (j = 0; j < ncopies; j++)
10786 if (costing_p)
10788 /* An IFN_LOAD_LANES will load all its vector results,
10789 regardless of which ones we actually need. Account
10790 for the cost of unused results. */
10791 if (first_stmt_info == stmt_info)
10793 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
10794 stmt_vec_info next_stmt_info = first_stmt_info;
10797 gaps -= 1;
10798 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
10800 while (next_stmt_info);
10801 if (gaps)
10803 if (dump_enabled_p ())
10804 dump_printf_loc (MSG_NOTE, vect_location,
10805 "vect_model_load_cost: %d "
10806 "unused vectors.\n",
10807 gaps);
10808 vect_get_load_cost (vinfo, stmt_info, gaps,
10809 alignment_support_scheme,
10810 misalignment, false, &inside_cost,
10811 &prologue_cost, cost_vec, cost_vec,
10812 true);
10815 n_adjacent_loads++;
10816 continue;
10819 /* 1. Create the vector or array pointer update chain. */
10820 if (j == 0)
10821 dataref_ptr
10822 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
10823 at_loop, offset, &dummy, gsi,
10824 &ptr_incr, false, bump);
10825 else
10827 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10828 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
10829 stmt_info, bump);
10831 if (mask)
10832 vec_mask = vec_masks[j];
10834 tree vec_array = create_vector_array (vectype, vec_num);
10836 tree final_mask = NULL_TREE;
10837 tree final_len = NULL_TREE;
10838 tree bias = NULL_TREE;
10839 if (loop_masks)
10840 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
10841 ncopies, vectype, j);
10842 if (vec_mask)
10843 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
10844 vec_mask, gsi);
10846 if (lanes_ifn == IFN_MASK_LEN_LOAD_LANES)
10848 if (loop_lens)
10849 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
10850 ncopies, vectype, j, 1);
10851 else
10852 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
10853 signed char biasval
10854 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10855 bias = build_int_cst (intQI_type_node, biasval);
10856 if (!final_mask)
10858 mask_vectype = truth_type_for (vectype);
10859 final_mask = build_minus_one_cst (mask_vectype);
10863 gcall *call;
10864 if (final_len && final_mask)
10866 /* Emit:
10867 VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
10868 VEC_MASK, LEN, BIAS). */
10869 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
10870 tree alias_ptr = build_int_cst (ref_type, align);
10871 call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
10872 dataref_ptr, alias_ptr,
10873 final_mask, final_len, bias);
10875 else if (final_mask)
10877 /* Emit:
10878 VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
10879 VEC_MASK). */
10880 unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
10881 tree alias_ptr = build_int_cst (ref_type, align);
10882 call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
10883 dataref_ptr, alias_ptr,
10884 final_mask);
10886 else
10888 /* Emit:
10889 VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
10890 data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
10891 call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
10893 gimple_call_set_lhs (call, vec_array);
10894 gimple_call_set_nothrow (call, true);
10895 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
10897 dr_chain.create (vec_num);
10898 /* Extract each vector into an SSA_NAME. */
10899 for (i = 0; i < vec_num; i++)
10901 new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
10902 vec_array, i);
10903 dr_chain.quick_push (new_temp);
10906 /* Record the mapping between SSA_NAMEs and statements. */
10907 vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
10909 /* Record that VEC_ARRAY is now dead. */
10910 vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
10912 dr_chain.release ();
10914 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10917 if (costing_p)
10919 if (n_adjacent_loads > 0)
10920 vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
10921 alignment_support_scheme, misalignment, false,
10922 &inside_cost, &prologue_cost, cost_vec,
10923 cost_vec, true);
10924 if (dump_enabled_p ())
10925 dump_printf_loc (MSG_NOTE, vect_location,
10926 "vect_model_load_cost: inside_cost = %u, "
10927 "prologue_cost = %u .\n",
10928 inside_cost, prologue_cost);
10931 return true;
10934 if (memory_access_type == VMAT_GATHER_SCATTER)
10936 gcc_assert (alignment_support_scheme == dr_aligned
10937 || alignment_support_scheme == dr_unaligned_supported);
10938 gcc_assert (!grouped_load && !slp_perm);
10940 unsigned int inside_cost = 0, prologue_cost = 0;
10941 for (j = 0; j < ncopies; j++)
10943 /* 1. Create the vector or array pointer update chain. */
10944 if (j == 0 && !costing_p)
10946 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10947 vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
10948 slp_node, &gs_info, &dataref_ptr,
10949 &vec_offsets);
10950 else
10951 dataref_ptr
10952 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
10953 at_loop, offset, &dummy, gsi,
10954 &ptr_incr, false, bump);
10956 else if (!costing_p)
10958 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10959 if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10960 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
10961 gsi, stmt_info, bump);
10964 gimple *new_stmt = NULL;
10965 for (i = 0; i < vec_num; i++)
10967 tree final_mask = NULL_TREE;
10968 tree final_len = NULL_TREE;
10969 tree bias = NULL_TREE;
10970 if (!costing_p)
10972 if (mask)
10973 vec_mask = vec_masks[vec_num * j + i];
10974 if (loop_masks)
10975 final_mask
10976 = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
10977 vec_num * ncopies, vectype,
10978 vec_num * j + i);
10979 if (vec_mask)
10980 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
10981 final_mask, vec_mask, gsi);
10983 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10984 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
10985 gsi, stmt_info, bump);
10988 /* 2. Create the vector-load in the loop. */
10989 unsigned HOST_WIDE_INT align;
10990 if (gs_info.ifn != IFN_LAST)
10992 if (costing_p)
10994 unsigned int cnunits = vect_nunits_for_cost (vectype);
10995 inside_cost
10996 = record_stmt_cost (cost_vec, cnunits, scalar_load,
10997 stmt_info, 0, vect_body);
10998 continue;
11000 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
11001 vec_offset = vec_offsets[vec_num * j + i];
11002 tree zero = build_zero_cst (vectype);
11003 tree scale = size_int (gs_info.scale);
11005 if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
11007 if (loop_lens)
11008 final_len
11009 = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
11010 vec_num * ncopies, vectype,
11011 vec_num * j + i, 1);
11012 else
11013 final_len
11014 = build_int_cst (sizetype,
11015 TYPE_VECTOR_SUBPARTS (vectype));
11016 signed char biasval
11017 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
11018 bias = build_int_cst (intQI_type_node, biasval);
11019 if (!final_mask)
11021 mask_vectype = truth_type_for (vectype);
11022 final_mask = build_minus_one_cst (mask_vectype);
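/* Emit one of:
     VEC = MASK_LEN_GATHER_LOAD (DATAREF_PTR, VEC_OFFSET, SCALE, ZERO,
				 FINAL_MASK, FINAL_LEN, BIAS)
     VEC = MASK_GATHER_LOAD (DATAREF_PTR, VEC_OFFSET, SCALE, ZERO,
			     FINAL_MASK)
     VEC = GATHER_LOAD (DATAREF_PTR, VEC_OFFSET, SCALE, ZERO)
   depending on whether a loop mask and/or length is in use.  */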
11026 gcall *call;
11027 if (final_len && final_mask)
11028 call
11029 = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
11030 dataref_ptr, vec_offset,
11031 scale, zero, final_mask,
11032 final_len, bias);
11033 else if (final_mask)
11034 call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
11035 dataref_ptr, vec_offset,
11036 scale, zero, final_mask);
11037 else
11038 call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
11039 dataref_ptr, vec_offset,
11040 scale, zero);
11041 gimple_call_set_nothrow (call, true);
11042 new_stmt = call;
11043 data_ref = NULL_TREE;
11045 else if (gs_info.decl)
11047 /* The builtin decls path for gather is legacy, x86 only. */
11048 gcc_assert (!final_len && nunits.is_constant ());
11049 if (costing_p)
11051 unsigned int cnunits = vect_nunits_for_cost (vectype);
11052 inside_cost
11053 = record_stmt_cost (cost_vec, cnunits, scalar_load,
11054 stmt_info, 0, vect_body);
11055 continue;
11057 poly_uint64 offset_nunits
11058 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
11059 if (known_eq (nunits, offset_nunits))
11061 new_stmt = vect_build_one_gather_load_call
11062 (vinfo, stmt_info, gsi, &gs_info,
11063 dataref_ptr, vec_offsets[vec_num * j + i],
11064 final_mask);
11065 data_ref = NULL_TREE;
11067 else if (known_eq (nunits, offset_nunits * 2))
11069 /* We have an offset vector with half the number of
11070 lanes but the builtins will produce full vectype
11071 data with just the lower lanes filled. */
11072 new_stmt = vect_build_one_gather_load_call
11073 (vinfo, stmt_info, gsi, &gs_info,
11074 dataref_ptr, vec_offsets[2 * vec_num * j + 2 * i],
11075 final_mask);
11076 tree low = make_ssa_name (vectype);
11077 gimple_set_lhs (new_stmt, low);
11078 vect_finish_stmt_generation (vinfo, stmt_info,
11079 new_stmt, gsi);
11081 /* Now put the upper half of FINAL_MASK into its lower half. */
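/* E.g. with 8 mask lanes the selector built below is
   { 4, 5, 6, 7, 4, 5, 6, 7 }, i.e. the upper half of FINAL_MASK
   duplicated into both halves of the result.  */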
11082 if (final_mask
11083 && !SCALAR_INT_MODE_P
11084 (TYPE_MODE (TREE_TYPE (final_mask))))
11086 int count = nunits.to_constant ();
11087 vec_perm_builder sel (count, count, 1);
11088 sel.quick_grow (count);
11089 for (int i = 0; i < count; ++i)
11090 sel[i] = i | (count / 2);
11091 vec_perm_indices indices (sel, 2, count);
11092 tree perm_mask = vect_gen_perm_mask_checked
11093 (TREE_TYPE (final_mask), indices);
11094 new_stmt = gimple_build_assign (NULL_TREE,
11095 VEC_PERM_EXPR,
11096 final_mask,
11097 final_mask,
11098 perm_mask);
11099 final_mask = make_ssa_name (TREE_TYPE (final_mask));
11100 gimple_set_lhs (new_stmt, final_mask);
11101 vect_finish_stmt_generation (vinfo, stmt_info,
11102 new_stmt, gsi);
11104 else if (final_mask)
11106 new_stmt = gimple_build_assign (NULL_TREE,
11107 VEC_UNPACK_HI_EXPR,
11108 final_mask);
11109 final_mask = make_ssa_name
11110 (truth_type_for (gs_info.offset_vectype));
11111 gimple_set_lhs (new_stmt, final_mask);
11112 vect_finish_stmt_generation (vinfo, stmt_info,
11113 new_stmt, gsi);
11116 new_stmt = vect_build_one_gather_load_call
11117 (vinfo, stmt_info, gsi, &gs_info,
11118 dataref_ptr,
11119 vec_offsets[2 * vec_num * j + 2 * i + 1],
11120 final_mask);
11121 tree high = make_ssa_name (vectype);
11122 gimple_set_lhs (new_stmt, high);
11123 vect_finish_stmt_generation (vinfo, stmt_info,
11124 new_stmt, gsi);
11126 /* Compose the low and high halves. */
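/* E.g. with 8 lanes the selector below is { 0, 1, 2, 3, 8, 9, 10, 11 },
   picking the filled lower halves of LOW and HIGH respectively.  */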
11127 int count = nunits.to_constant ();
11128 vec_perm_builder sel (count, count, 1);
11129 sel.quick_grow (count);
11130 for (int i = 0; i < count; ++i)
11131 sel[i] = i < count / 2 ? i : i + count / 2;
11132 vec_perm_indices indices (sel, 2, count);
11133 tree perm_mask
11134 = vect_gen_perm_mask_checked (vectype, indices);
11135 new_stmt = gimple_build_assign (NULL_TREE,
11136 VEC_PERM_EXPR,
11137 low, high, perm_mask);
11138 data_ref = NULL_TREE;
11140 else if (known_eq (nunits * 2, offset_nunits))
11142 /* We have an offset vector with double the number of
11143 lanes. Select the low/high part accordingly. */
11144 vec_offset = vec_offsets[(vec_num * j + i) / 2];
11145 if ((vec_num * j + i) & 1)
11147 int count = offset_nunits.to_constant ();
11148 vec_perm_builder sel (count, count, 1);
11149 sel.quick_grow (count);
11150 for (int i = 0; i < count; ++i)
11151 sel[i] = i | (count / 2);
11152 vec_perm_indices indices (sel, 2, count);
11153 tree perm_mask = vect_gen_perm_mask_checked
11154 (TREE_TYPE (vec_offset), indices);
11155 new_stmt = gimple_build_assign (NULL_TREE,
11156 VEC_PERM_EXPR,
11157 vec_offset,
11158 vec_offset,
11159 perm_mask);
11160 vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
11161 gimple_set_lhs (new_stmt, vec_offset);
11162 vect_finish_stmt_generation (vinfo, stmt_info,
11163 new_stmt, gsi);
11165 new_stmt = vect_build_one_gather_load_call
11166 (vinfo, stmt_info, gsi, &gs_info,
11167 dataref_ptr, vec_offset, final_mask);
11168 data_ref = NULL_TREE;
11170 else
11171 gcc_unreachable ();
11173 else
11175 /* Emulated gather-scatter. */
11176 gcc_assert (!final_mask);
11177 unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
11178 if (costing_p)
11180 /* For emulated gathers N offset vector element extracts (we assume
11181 the scalar scaling and ptr + offset add is consumed by the load). */
11182 inside_cost = record_stmt_cost (cost_vec, const_nunits,
11183 vec_to_scalar, stmt_info,
11184 0, vect_body);
11185 /* N scalar loads plus gathering them into a
11186 vector. */
11187 inside_cost
11188 = record_stmt_cost (cost_vec, const_nunits, scalar_load,
11189 stmt_info, 0, vect_body);
11190 inside_cost
11191 = record_stmt_cost (cost_vec, 1, vec_construct,
11192 stmt_info, 0, vect_body);
11193 continue;
11195 unsigned HOST_WIDE_INT const_offset_nunits
11196 = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
11197 .to_constant ();
11198 vec<constructor_elt, va_gc> *ctor_elts;
11199 vec_alloc (ctor_elts, const_nunits);
11200 gimple_seq stmts = NULL;
11201 /* We support offset vectors with more elements
11202 than the data vector for now. */
11203 unsigned HOST_WIDE_INT factor
11204 = const_offset_nunits / const_nunits;
11205 vec_offset = vec_offsets[(vec_num * j + i) / factor];
11206 unsigned elt_offset
11207 = ((vec_num * j + i) % factor) * const_nunits;
11208 tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
11209 tree scale = size_int (gs_info.scale);
11210 align = get_object_alignment (DR_REF (first_dr_info->dr));
11211 tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
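/* The loop below open-codes the gather as CONST_NUNITS scalar loads,
   roughly elt_k = *(dataref_ptr + (sizetype) offset[k] * scale) for each
   lane k, and then collects the loaded elements into a single vector
   CONSTRUCTOR.  */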
11212 for (unsigned k = 0; k < const_nunits; ++k)
11214 tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
11215 bitsize_int (k + elt_offset));
11216 tree idx
11217 = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
11218 vec_offset, TYPE_SIZE (idx_type), boff);
11219 idx = gimple_convert (&stmts, sizetype, idx);
11220 idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
11221 scale);
11222 tree ptr = gimple_build (&stmts, PLUS_EXPR,
11223 TREE_TYPE (dataref_ptr),
11224 dataref_ptr, idx);
11225 ptr = gimple_convert (&stmts, ptr_type_node, ptr);
11226 tree elt = make_ssa_name (TREE_TYPE (vectype));
11227 tree ref = build2 (MEM_REF, ltype, ptr,
11228 build_int_cst (ref_type, 0));
11229 new_stmt = gimple_build_assign (elt, ref);
11230 gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
11231 gimple_seq_add_stmt (&stmts, new_stmt);
11232 CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
11234 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
11235 new_stmt = gimple_build_assign (
11236 NULL_TREE, build_constructor (vectype, ctor_elts));
11237 data_ref = NULL_TREE;
11240 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11241 /* DATA_REF is null if we've already built the statement. */
11242 if (data_ref)
11244 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11245 new_stmt = gimple_build_assign (vec_dest, data_ref);
11247 new_temp = make_ssa_name (vec_dest, new_stmt);
11248 gimple_set_lhs (new_stmt, new_temp);
11249 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11251 /* Store vector loads in the corresponding SLP_NODE. */
11252 if (slp)
11253 slp_node->push_vec_def (new_stmt);
11256 if (!slp && !costing_p)
11257 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
11260 if (!slp && !costing_p)
11261 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11263 if (costing_p && dump_enabled_p ())
11264 dump_printf_loc (MSG_NOTE, vect_location,
11265 "vect_model_load_cost: inside_cost = %u, "
11266 "prologue_cost = %u .\n",
11267 inside_cost, prologue_cost);
11268 return true;
11271 poly_uint64 group_elt = 0;
11272 unsigned int inside_cost = 0, prologue_cost = 0;
11273 /* For costing some adjacent vector loads, we'd like to cost them
11274 once using their total number instead of costing each one by one. */
11275 unsigned int n_adjacent_loads = 0;
11276 for (j = 0; j < ncopies; j++)
11278 /* 1. Create the vector or array pointer update chain. */
11279 if (j == 0 && !costing_p)
11281 bool simd_lane_access_p
11282 = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
11283 if (simd_lane_access_p
11284 && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
11285 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
11286 && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info))
11287 && integer_zerop (DR_INIT (first_dr_info->dr))
11288 && alias_sets_conflict_p (get_alias_set (aggr_type),
11289 get_alias_set (TREE_TYPE (ref_type)))
11290 && (alignment_support_scheme == dr_aligned
11291 || alignment_support_scheme == dr_unaligned_supported))
11293 dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
11294 dataref_offset = build_int_cst (ref_type, 0);
11296 else if (diff_first_stmt_info)
11298 dataref_ptr
11299 = vect_create_data_ref_ptr (vinfo, first_stmt_info_for_drptr,
11300 aggr_type, at_loop, offset, &dummy,
11301 gsi, &ptr_incr, simd_lane_access_p,
11302 bump);
11303 /* Adjust the pointer by the difference to first_stmt. */
11304 data_reference_p ptrdr
11305 = STMT_VINFO_DATA_REF (first_stmt_info_for_drptr);
11306 tree diff
11307 = fold_convert (sizetype,
11308 size_binop (MINUS_EXPR,
11309 DR_INIT (first_dr_info->dr),
11310 DR_INIT (ptrdr)));
11311 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11312 stmt_info, diff);
11313 if (alignment_support_scheme == dr_explicit_realign)
11315 msq = vect_setup_realignment (vinfo,
11316 first_stmt_info_for_drptr, gsi,
11317 &realignment_token,
11318 alignment_support_scheme,
11319 dataref_ptr, &at_loop);
11320 gcc_assert (!compute_in_loop);
11323 else
11324 dataref_ptr
11325 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
11326 at_loop,
11327 offset, &dummy, gsi, &ptr_incr,
11328 simd_lane_access_p, bump);
11330 else if (!costing_p)
11332 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
11333 if (dataref_offset)
11334 dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
11335 bump);
11336 else
11337 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11338 stmt_info, bump);
11341 if (grouped_load || slp_perm)
11342 dr_chain.create (vec_num);
11344 gimple *new_stmt = NULL;
11345 for (i = 0; i < vec_num; i++)
11347 tree final_mask = NULL_TREE;
11348 tree final_len = NULL_TREE;
11349 tree bias = NULL_TREE;
11350 if (!costing_p)
11352 if (mask)
11353 vec_mask = vec_masks[vec_num * j + i];
11354 if (loop_masks)
11355 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
11356 vec_num * ncopies, vectype,
11357 vec_num * j + i);
11358 if (vec_mask)
11359 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
11360 final_mask, vec_mask, gsi);
11362 if (i > 0)
11363 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
11364 gsi, stmt_info, bump);
11367 /* 2. Create the vector-load in the loop. */
11368 switch (alignment_support_scheme)
11370 case dr_aligned:
11371 case dr_unaligned_supported:
11373 if (costing_p)
11374 break;
11376 unsigned int misalign;
11377 unsigned HOST_WIDE_INT align;
11378 align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
11379 if (alignment_support_scheme == dr_aligned)
11380 misalign = 0;
11381 else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
11383 align
11384 = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
11385 misalign = 0;
11387 else
11388 misalign = misalignment;
11389 if (dataref_offset == NULL_TREE
11390 && TREE_CODE (dataref_ptr) == SSA_NAME)
11391 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
11392 misalign);
11393 align = least_bit_hwi (misalign | align);
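/* E.g. a known 16-byte target alignment combined with a misalignment
   of 4 bytes yields an effective alignment of 4 here.  */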
11395 /* Compute the IFN to use when LOOP_LENS or FINAL_MASK is valid. */
11396 machine_mode vmode = TYPE_MODE (vectype);
11397 machine_mode new_vmode = vmode;
11398 internal_fn partial_ifn = IFN_LAST;
11399 if (loop_lens)
11401 opt_machine_mode new_ovmode
11402 = get_len_load_store_mode (vmode, true, &partial_ifn);
11403 new_vmode = new_ovmode.require ();
11404 unsigned factor
11405 = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
11406 final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
11407 vec_num * ncopies, vectype,
11408 vec_num * j + i, factor);
11410 else if (final_mask)
11412 if (!can_vec_mask_load_store_p (
11413 vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
11414 &partial_ifn))
11415 gcc_unreachable ();
11418 if (partial_ifn == IFN_MASK_LEN_LOAD)
11420 if (!final_len)
11422 /* Pass VF value to 'len' argument of
11423 MASK_LEN_LOAD if LOOP_LENS is invalid. */
11424 final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
11426 if (!final_mask)
11428 /* Pass all ones value to 'mask' argument of
11429 MASK_LEN_LOAD if final_mask is invalid. */
11430 mask_vectype = truth_type_for (vectype);
11431 final_mask = build_minus_one_cst (mask_vectype);
11434 if (final_len)
11436 signed char biasval
11437 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
11439 bias = build_int_cst (intQI_type_node, biasval);
11442 if (final_len)
11444 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
11445 gcall *call;
11446 if (partial_ifn == IFN_MASK_LEN_LOAD)
11447 call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
11448 dataref_ptr, ptr,
11449 final_mask, final_len,
11450 bias);
11451 else
11452 call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
11453 dataref_ptr, ptr,
11454 final_len, bias);
11455 gimple_call_set_nothrow (call, true);
11456 new_stmt = call;
11457 data_ref = NULL_TREE;
11459 /* Need conversion if it's wrapped with VnQI. */
11460 if (vmode != new_vmode)
11462 tree new_vtype = build_vector_type_for_mode (
11463 unsigned_intQI_type_node, new_vmode);
11464 tree var
11465 = vect_get_new_ssa_name (new_vtype, vect_simple_var);
11466 gimple_set_lhs (call, var);
11467 vect_finish_stmt_generation (vinfo, stmt_info, call,
11468 gsi);
11469 tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
11470 new_stmt = gimple_build_assign (vec_dest,
11471 VIEW_CONVERT_EXPR, op);
11474 else if (final_mask)
11476 tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
11477 gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
11478 dataref_ptr, ptr,
11479 final_mask);
11480 gimple_call_set_nothrow (call, true);
11481 new_stmt = call;
11482 data_ref = NULL_TREE;
11484 else
11486 tree ltype = vectype;
11487 tree new_vtype = NULL_TREE;
11488 unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
11489 unsigned int vect_align
11490 = vect_known_alignment_in_bytes (first_dr_info, vectype);
11491 /* Try to use a single smaller load when we are about
11492 to load excess elements compared to the unrolled
11493 scalar loop. */
11494 if (known_gt ((vec_num * j + i + 1) * nunits,
11495 (group_size * vf - gap)))
11497 poly_uint64 remain = ((group_size * vf - gap)
11498 - (vec_num * j + i) * nunits);
11499 if (known_ge ((vec_num * j + i + 1) * nunits
11500 - (group_size * vf - gap), nunits))
11501 /* DR will be unused. */
11502 ltype = NULL_TREE;
11503 else if (known_ge (vect_align,
11504 tree_to_poly_uint64
11505 (TYPE_SIZE_UNIT (vectype))))
11506 /* Aligned access to excess elements is OK if
11507 at least one element is accessed in the
11508 scalar loop. */
11510 else if (known_gt (vect_align,
11511 ((nunits - remain)
11512 * vect_get_scalar_dr_size
11513 (first_dr_info))))
11514 /* Aligned access to the gap area when there's
11515 at least one element in it is OK. */
11517 else
11519 /* remain should now be > 0 and < nunits. */
11520 unsigned num;
11521 if (constant_multiple_p (nunits, remain, &num))
11523 tree ptype;
11524 new_vtype
11525 = vector_vector_composition_type (vectype,
11526 num,
11527 &ptype);
11528 if (new_vtype)
11529 ltype = ptype;
11531 /* Else use multiple loads or a masked load? */
11532 /* For loop vectorization we now should have
11533 an alternate type or LOOP_VINFO_PEELING_FOR_GAPS
11534 set. */
11535 if (loop_vinfo)
11536 gcc_assert (new_vtype
11537 || LOOP_VINFO_PEELING_FOR_GAPS
11538 (loop_vinfo));
11539 /* But still reduce the access size to the next
11540 required power-of-two so peeling a single
11541 scalar iteration is sufficient. */
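/* E.g. a remain of 3 lanes is loaded as a 4-lane part.  */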
11542 unsigned HOST_WIDE_INT cremain;
11543 if (remain.is_constant (&cremain))
11545 unsigned HOST_WIDE_INT cpart_size
11546 = 1 << ceil_log2 (cremain);
11547 if (known_gt (nunits, cpart_size)
11548 && constant_multiple_p (nunits, cpart_size,
11549 &num))
11551 tree ptype;
11552 new_vtype
11553 = vector_vector_composition_type (vectype,
11554 num,
11555 &ptype);
11556 if (new_vtype)
11557 ltype = ptype;
11562 tree offset
11563 = (dataref_offset ? dataref_offset
11564 : build_int_cst (ref_type, 0));
11565 if (!ltype)
11567 else if (ltype != vectype
11568 && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11570 poly_uint64 gap_offset
11571 = (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype))
11572 - tree_to_poly_uint64 (TYPE_SIZE_UNIT (ltype)));
11573 tree gapcst = build_int_cstu (ref_type, gap_offset);
11574 offset = size_binop (PLUS_EXPR, offset, gapcst);
11576 if (ltype)
11578 data_ref
11579 = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
11580 if (alignment_support_scheme == dr_aligned)
11582 else
11583 TREE_TYPE (data_ref)
11584 = build_aligned_type (TREE_TYPE (data_ref),
11585 align * BITS_PER_UNIT);
11587 if (!ltype)
11588 data_ref = build_constructor (vectype, NULL);
11589 else if (ltype != vectype)
11591 vect_copy_ref_info (data_ref,
11592 DR_REF (first_dr_info->dr));
11593 tree tem = make_ssa_name (ltype);
11594 new_stmt = gimple_build_assign (tem, data_ref);
11595 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
11596 gsi);
11597 data_ref = NULL;
11598 vec<constructor_elt, va_gc> *v;
11599 /* We've computed 'num' above either statically as two
11600 or via constant_multiple_p. */
11601 unsigned num
11602 = (exact_div (tree_to_poly_uint64
11603 (TYPE_SIZE_UNIT (vectype)),
11604 tree_to_poly_uint64
11605 (TYPE_SIZE_UNIT (ltype)))
11606 .to_constant ());
11607 vec_alloc (v, num);
11608 if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11610 while (--num)
11611 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
11612 build_zero_cst (ltype));
11613 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
11615 else
11617 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
11618 while (--num)
11619 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
11620 build_zero_cst (ltype));
11622 gcc_assert (new_vtype != NULL_TREE);
11623 if (new_vtype == vectype)
11624 new_stmt = gimple_build_assign (
11625 vec_dest, build_constructor (vectype, v));
11626 else
11628 tree new_vname = make_ssa_name (new_vtype);
11629 new_stmt = gimple_build_assign (
11630 new_vname, build_constructor (new_vtype, v));
11631 vect_finish_stmt_generation (vinfo, stmt_info,
11632 new_stmt, gsi);
11633 new_stmt = gimple_build_assign (
11634 vec_dest,
11635 build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
11639 break;
11641 case dr_explicit_realign:
11643 if (costing_p)
11644 break;
11645 tree ptr, bump;
11647 tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
11649 if (compute_in_loop)
11650 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
11651 &realignment_token,
11652 dr_explicit_realign,
11653 dataref_ptr, NULL);
11655 if (TREE_CODE (dataref_ptr) == SSA_NAME)
11656 ptr = copy_ssa_name (dataref_ptr);
11657 else
11658 ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
11659 // For explicit realign the target alignment should be
11660 // known at compile time.
11661 unsigned HOST_WIDE_INT align
11662 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
11663 new_stmt = gimple_build_assign (
11664 ptr, BIT_AND_EXPR, dataref_ptr,
11665 build_int_cst (TREE_TYPE (dataref_ptr),
11666 -(HOST_WIDE_INT) align));
11667 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11668 data_ref
11669 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
11670 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11671 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11672 new_stmt = gimple_build_assign (vec_dest, data_ref);
11673 new_temp = make_ssa_name (vec_dest, new_stmt);
11674 gimple_assign_set_lhs (new_stmt, new_temp);
11675 gimple_move_vops (new_stmt, stmt_info->stmt);
11676 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11677 msq = new_temp;
11679 bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
11680 bump = size_binop (MINUS_EXPR, bump, size_one_node);
11681 ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
11682 bump);
11683 new_stmt = gimple_build_assign (
11684 NULL_TREE, BIT_AND_EXPR, ptr,
11685 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
11686 if (TREE_CODE (ptr) == SSA_NAME)
11687 ptr = copy_ssa_name (ptr, new_stmt);
11688 else
11689 ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
11690 gimple_assign_set_lhs (new_stmt, ptr);
11691 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11692 data_ref
11693 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
11694 break;
11696 case dr_explicit_realign_optimized:
11698 if (costing_p)
11699 break;
11700 if (TREE_CODE (dataref_ptr) == SSA_NAME)
11701 new_temp = copy_ssa_name (dataref_ptr);
11702 else
11703 new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
11704 // We should only be doing this if we know the target
11705 // alignment at compile time.
11706 unsigned HOST_WIDE_INT align
11707 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
11708 new_stmt = gimple_build_assign (
11709 new_temp, BIT_AND_EXPR, dataref_ptr,
11710 build_int_cst (TREE_TYPE (dataref_ptr),
11711 -(HOST_WIDE_INT) align));
11712 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11713 data_ref = build2 (MEM_REF, vectype, new_temp,
11714 build_int_cst (ref_type, 0));
11715 break;
11717 default:
11718 gcc_unreachable ();
11721 /* One common place to cost the above vect load for different
11722 alignment support schemes. */
11723 if (costing_p)
11725 /* For VMAT_CONTIGUOUS_PERMUTE with a grouped load, we
11726 only need to take care of the first stmt, whose
11727 stmt_info is first_stmt_info; iterating vec_num times on it
11728 covers the cost for the remaining copies, consistent
11729 with the transform. For the prologue cost for realign,
11730 we only need to count it once for the whole group. */
11731 bool first_stmt_info_p = first_stmt_info == stmt_info;
11732 bool add_realign_cost = first_stmt_info_p && i == 0;
11733 if (memory_access_type == VMAT_CONTIGUOUS
11734 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
11735 || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
11736 && (!grouped_load || first_stmt_info_p)))
11738 /* Leave realign cases alone to keep them simple. */
11739 if (alignment_support_scheme == dr_explicit_realign_optimized
11740 || alignment_support_scheme == dr_explicit_realign)
11741 vect_get_load_cost (vinfo, stmt_info, 1,
11742 alignment_support_scheme, misalignment,
11743 add_realign_cost, &inside_cost,
11744 &prologue_cost, cost_vec, cost_vec,
11745 true);
11746 else
11747 n_adjacent_loads++;
11750 else
11752 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11753 /* DATA_REF is null if we've already built the statement. */
11754 if (data_ref)
11756 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11757 new_stmt = gimple_build_assign (vec_dest, data_ref);
11759 new_temp = make_ssa_name (vec_dest, new_stmt);
11760 gimple_set_lhs (new_stmt, new_temp);
11761 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11764 /* 3. Handle explicit realignment if necessary/supported.
11765 Create in loop:
11766 vec_dest = realign_load (msq, lsq, realignment_token) */
11767 if (!costing_p
11768 && (alignment_support_scheme == dr_explicit_realign_optimized
11769 || alignment_support_scheme == dr_explicit_realign))
11771 lsq = gimple_assign_lhs (new_stmt);
11772 if (!realignment_token)
11773 realignment_token = dataref_ptr;
11774 vec_dest = vect_create_destination_var (scalar_dest, vectype);
11775 new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
11776 lsq, realignment_token);
11777 new_temp = make_ssa_name (vec_dest, new_stmt);
11778 gimple_assign_set_lhs (new_stmt, new_temp);
11779 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11781 if (alignment_support_scheme == dr_explicit_realign_optimized)
11783 gcc_assert (phi);
11784 if (i == vec_num - 1 && j == ncopies - 1)
11785 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
11786 UNKNOWN_LOCATION);
11787 msq = lsq;
11791 if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11793 if (costing_p)
11794 inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
11795 stmt_info, 0, vect_body);
11796 else
11798 tree perm_mask = perm_mask_for_reverse (vectype);
11799 new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
11800 perm_mask, stmt_info, gsi);
11801 new_stmt = SSA_NAME_DEF_STMT (new_temp);
11805 /* Collect vector loads and later create their permutation in
11806 vect_transform_grouped_load (). */
11807 if (!costing_p && (grouped_load || slp_perm))
11808 dr_chain.quick_push (new_temp);
11810 /* Store vector loads in the corresponding SLP_NODE. */
11811 if (!costing_p && slp && !slp_perm)
11812 slp_node->push_vec_def (new_stmt);
11814 /* With an SLP permutation we load the gaps as well; without
11815 one we need to skip the gaps after we manage to fully load
11816 all elements. group_gap_adj is DR_GROUP_SIZE here. */
11817 group_elt += nunits;
11818 if (!costing_p
11819 && maybe_ne (group_gap_adj, 0U)
11820 && !slp_perm
11821 && known_eq (group_elt, group_size - group_gap_adj))
11823 poly_wide_int bump_val
11824 = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
11825 if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
11826 == -1)
11827 bump_val = -bump_val;
11828 tree bump = wide_int_to_tree (sizetype, bump_val);
11829 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11830 stmt_info, bump);
11831 group_elt = 0;
11834 /* Bump the vector pointer to account for a gap or for excess
11835 elements loaded for a permuted SLP load. */
11836 if (!costing_p
11837 && maybe_ne (group_gap_adj, 0U)
11838 && slp_perm)
11840 poly_wide_int bump_val
11841 = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
11842 if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
11843 bump_val = -bump_val;
11844 tree bump = wide_int_to_tree (sizetype, bump_val);
11845 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11846 stmt_info, bump);
11849 if (slp && !slp_perm)
11850 continue;
11852 if (slp_perm)
11854 unsigned n_perms;
11855 /* For SLP we know we've seen all possible uses of dr_chain so
11856 direct vect_transform_slp_perm_load to DCE the unused parts.
11857 ??? This is a hack to prevent compile-time issues as seen
11858 in PR101120 and friends. */
11859 if (costing_p)
11861 vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
11862 true, &n_perms, nullptr);
11863 inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
11864 stmt_info, 0, vect_body);
11866 else
11868 bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
11869 gsi, vf, false, &n_perms,
11870 nullptr, true);
11871 gcc_assert (ok);
11874 else
11876 if (grouped_load)
11878 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
11879 /* We assume that the cost of a single load-lanes instruction
11880 is equivalent to the cost of DR_GROUP_SIZE separate loads.
11881 If a grouped access is instead being provided by a
11882 load-and-permute operation, include the cost of the
11883 permutes. */
11884 if (costing_p && first_stmt_info == stmt_info)
11886 /* Uses even and odd extract operations or shuffle
11887 operations for each needed permute. */
11888 int group_size = DR_GROUP_SIZE (first_stmt_info);
11889 int nstmts = ceil_log2 (group_size) * group_size;
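/* E.g. a group of size 4 is costed as 2 * 4 = 8 permute stmts.  */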
11890 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
11891 stmt_info, 0, vect_body);
11893 if (dump_enabled_p ())
11894 dump_printf_loc (MSG_NOTE, vect_location,
11895 "vect_model_load_cost:"
11896 "strided group_size = %d .\n",
11897 group_size);
11899 else if (!costing_p)
11901 vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
11902 group_size, gsi);
11903 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11906 else if (!costing_p)
11907 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
11909 dr_chain.release ();
11911 if (!slp && !costing_p)
11912 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11914 if (costing_p)
11916 gcc_assert (memory_access_type == VMAT_CONTIGUOUS
11917 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
11918 || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
11919 if (n_adjacent_loads > 0)
11920 vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
11921 alignment_support_scheme, misalignment, false,
11922 &inside_cost, &prologue_cost, cost_vec, cost_vec,
11923 true);
11924 if (dump_enabled_p ())
11925 dump_printf_loc (MSG_NOTE, vect_location,
11926 "vect_model_load_cost: inside_cost = %u, "
11927 "prologue_cost = %u .\n",
11928 inside_cost, prologue_cost);
11931 return true;
11934 /* Function vect_is_simple_cond.
11936 Input:
11937 LOOP - the loop that is being vectorized.
11938 COND - Condition that is checked for simple use.
11940 Output:
11941 *COMP_VECTYPE - the vector type for the comparison.
11942 *DTS - The def types for the arguments of the comparison.
11944 Returns whether a COND can be vectorized. Checks whether
11945 condition operands are supportable using vect_is_simple_use. */
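/* For example, COND may be a comparison such as a_1 < b_2 with SSA name
   or constant operands, or a scalar boolean SSA name that already acts
   as a mask; both forms are handled below.  */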
11947 static bool
11948 vect_is_simple_cond (tree cond, vec_info *vinfo, stmt_vec_info stmt_info,
11949 slp_tree slp_node, tree *comp_vectype,
11950 enum vect_def_type *dts, tree vectype)
11952 tree lhs, rhs;
11953 tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
11954 slp_tree slp_op;
11956 /* Mask case. */
11957 if (TREE_CODE (cond) == SSA_NAME
11958 && VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (cond)))
11960 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &cond,
11961 &slp_op, &dts[0], comp_vectype)
11962 || !*comp_vectype
11963 || !VECTOR_BOOLEAN_TYPE_P (*comp_vectype))
11964 return false;
11965 return true;
11968 if (!COMPARISON_CLASS_P (cond))
11969 return false;
11971 lhs = TREE_OPERAND (cond, 0);
11972 rhs = TREE_OPERAND (cond, 1);
11974 if (TREE_CODE (lhs) == SSA_NAME)
11976 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0,
11977 &lhs, &slp_op, &dts[0], &vectype1))
11978 return false;
11980 else if (TREE_CODE (lhs) == INTEGER_CST || TREE_CODE (lhs) == REAL_CST
11981 || TREE_CODE (lhs) == FIXED_CST)
11982 dts[0] = vect_constant_def;
11983 else
11984 return false;
11986 if (TREE_CODE (rhs) == SSA_NAME)
11988 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1,
11989 &rhs, &slp_op, &dts[1], &vectype2))
11990 return false;
11992 else if (TREE_CODE (rhs) == INTEGER_CST || TREE_CODE (rhs) == REAL_CST
11993 || TREE_CODE (rhs) == FIXED_CST)
11994 dts[1] = vect_constant_def;
11995 else
11996 return false;
11998 if (vectype1 && vectype2
11999 && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype1),
12000 TYPE_VECTOR_SUBPARTS (vectype2)))
12001 return false;
12003 *comp_vectype = vectype1 ? vectype1 : vectype2;
12004 /* Invariant comparison. */
12005 if (! *comp_vectype)
12007 tree scalar_type = TREE_TYPE (lhs);
12008 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type))
12009 *comp_vectype = truth_type_for (vectype);
12010 else
12012 /* If we can widen the comparison to match vectype do so. */
12013 if (INTEGRAL_TYPE_P (scalar_type)
12014 && !slp_node
12015 && tree_int_cst_lt (TYPE_SIZE (scalar_type),
12016 TYPE_SIZE (TREE_TYPE (vectype))))
12017 scalar_type = build_nonstandard_integer_type
12018 (vector_element_bits (vectype), TYPE_UNSIGNED (scalar_type));
12019 *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
12020 slp_node);
12024 return true;
12027 /* vectorizable_condition.
12029 Check if STMT_INFO is a conditional modify expression that can be vectorized.
12030 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
12031 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
12032 at GSI.
12034 When STMT_INFO is vectorized as a nested cycle, for_reduction is true.
12036 Return true if STMT_INFO is vectorizable in this way. */
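/* For instance, a scalar statement like x_1 = a_2 < b_3 ? c_4 : d_5 is
   conceptually turned into a vector comparison producing a mask followed
   by a VEC_COND_EXPR that selects between the vectorized THEN and ELSE
   values.  */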
12038 static bool
12039 vectorizable_condition (vec_info *vinfo,
12040 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
12041 gimple **vec_stmt,
12042 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12044 tree scalar_dest = NULL_TREE;
12045 tree vec_dest = NULL_TREE;
12046 tree cond_expr, cond_expr0 = NULL_TREE, cond_expr1 = NULL_TREE;
12047 tree then_clause, else_clause;
12048 tree comp_vectype = NULL_TREE;
12049 tree vec_cond_lhs = NULL_TREE, vec_cond_rhs = NULL_TREE;
12050 tree vec_then_clause = NULL_TREE, vec_else_clause = NULL_TREE;
12051 tree vec_compare;
12052 tree new_temp;
12053 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12054 enum vect_def_type dts[4]
12055 = {vect_unknown_def_type, vect_unknown_def_type,
12056 vect_unknown_def_type, vect_unknown_def_type};
12057 int ndts = 4;
12058 int ncopies;
12059 int vec_num;
12060 enum tree_code code, cond_code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
12061 int i;
12062 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12063 vec<tree> vec_oprnds0 = vNULL;
12064 vec<tree> vec_oprnds1 = vNULL;
12065 vec<tree> vec_oprnds2 = vNULL;
12066 vec<tree> vec_oprnds3 = vNULL;
12067 tree vec_cmp_type;
12068 bool masked = false;
12070 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12071 return false;
12073 /* Is this a vectorizable conditional operation? */
12074 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
12075 if (!stmt)
12076 return false;
12078 code = gimple_assign_rhs_code (stmt);
12079 if (code != COND_EXPR)
12080 return false;
12082 stmt_vec_info reduc_info = NULL;
12083 int reduc_index = -1;
12084 vect_reduction_type reduction_type = TREE_CODE_REDUCTION;
12085 bool for_reduction
12086 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL;
12087 if (for_reduction)
12089 if (slp_node && SLP_TREE_LANES (slp_node) > 1)
12090 return false;
12091 reduc_info = info_for_reduction (vinfo, stmt_info);
12092 reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
12093 reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
12094 gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION
12095 || reduc_index != -1);
12097 else
12099 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
12100 return false;
12103 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
12104 tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12106 if (slp_node)
12108 ncopies = 1;
12109 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
12111 else
12113 ncopies = vect_get_num_copies (loop_vinfo, vectype);
12114 vec_num = 1;
12117 gcc_assert (ncopies >= 1);
12118 if (for_reduction && ncopies > 1)
12119 return false; /* FORNOW */
12121 cond_expr = gimple_assign_rhs1 (stmt);
12123 if (!vect_is_simple_cond (cond_expr, vinfo, stmt_info, slp_node,
12124 &comp_vectype, &dts[0], vectype)
12125 || !comp_vectype)
12126 return false;
12128 unsigned op_adjust = COMPARISON_CLASS_P (cond_expr) ? 1 : 0;
12129 slp_tree then_slp_node, else_slp_node;
12130 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 1 + op_adjust,
12131 &then_clause, &then_slp_node, &dts[2], &vectype1))
12132 return false;
12133 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 2 + op_adjust,
12134 &else_clause, &else_slp_node, &dts[3], &vectype2))
12135 return false;
12137 if (vectype1 && !useless_type_conversion_p (vectype, vectype1))
12138 return false;
12140 if (vectype2 && !useless_type_conversion_p (vectype, vectype2))
12141 return false;
12143 masked = !COMPARISON_CLASS_P (cond_expr);
12144 vec_cmp_type = truth_type_for (comp_vectype);
12146 if (vec_cmp_type == NULL_TREE)
12147 return false;
12149 cond_code = TREE_CODE (cond_expr);
12150 if (!masked)
12152 cond_expr0 = TREE_OPERAND (cond_expr, 0);
12153 cond_expr1 = TREE_OPERAND (cond_expr, 1);
12156 /* For conditional reductions, the "then" value needs to be the candidate
12157 value calculated by this iteration while the "else" value needs to be
12158 the result carried over from previous iterations. If the COND_EXPR
12159 is the other way around, we need to swap it. */
12160 bool must_invert_cmp_result = false;
12161 if (reduction_type == EXTRACT_LAST_REDUCTION && reduc_index == 1)
12163 if (masked)
12164 must_invert_cmp_result = true;
12165 else
12167 bool honor_nans = HONOR_NANS (TREE_TYPE (cond_expr0));
12168 tree_code new_code = invert_tree_comparison (cond_code, honor_nans);
12169 if (new_code == ERROR_MARK)
12170 must_invert_cmp_result = true;
12171 else
12173 cond_code = new_code;
12174 /* Make sure we don't accidentally use the old condition. */
12175 cond_expr = NULL_TREE;
12178 /* ??? The vectorized operand query below doesn't allow swapping
12179 this way for SLP. */
12180 if (slp_node)
12181 return false;
12182 std::swap (then_clause, else_clause);
12185 if (!masked && VECTOR_BOOLEAN_TYPE_P (comp_vectype))
12187 /* Boolean values may have another representation in vectors
12188 and therefore we prefer bit operations over comparison for
12189 them (which also works for scalar masks). We store opcodes
12190 to use in bitop1 and bitop2. Statement is vectorized as
12191 BITOP2 (rhs1 BITOP1 rhs2) or rhs1 BITOP2 (BITOP1 rhs2)
12192 depending on bitop1 and bitop2 arity. */
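/* For instance, for boolean operands a > b is emitted as a & ~b and
   a != b as a ^ b.  */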
12193 switch (cond_code)
12195 case GT_EXPR:
12196 bitop1 = BIT_NOT_EXPR;
12197 bitop2 = BIT_AND_EXPR;
12198 break;
12199 case GE_EXPR:
12200 bitop1 = BIT_NOT_EXPR;
12201 bitop2 = BIT_IOR_EXPR;
12202 break;
12203 case LT_EXPR:
12204 bitop1 = BIT_NOT_EXPR;
12205 bitop2 = BIT_AND_EXPR;
12206 std::swap (cond_expr0, cond_expr1);
12207 break;
12208 case LE_EXPR:
12209 bitop1 = BIT_NOT_EXPR;
12210 bitop2 = BIT_IOR_EXPR;
12211 std::swap (cond_expr0, cond_expr1);
12212 break;
12213 case NE_EXPR:
12214 bitop1 = BIT_XOR_EXPR;
12215 break;
12216 case EQ_EXPR:
12217 bitop1 = BIT_XOR_EXPR;
12218 bitop2 = BIT_NOT_EXPR;
12219 break;
12220 default:
12221 return false;
12223 cond_code = SSA_NAME;
12226 if (TREE_CODE_CLASS (cond_code) == tcc_comparison
12227 && reduction_type == EXTRACT_LAST_REDUCTION
12228 && !expand_vec_cmp_expr_p (comp_vectype, vec_cmp_type, cond_code))
12230 if (dump_enabled_p ())
12231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12232 "reduction comparison operation not supported.\n");
12233 return false;
12236 if (!vec_stmt)
12238 if (bitop1 != NOP_EXPR)
12240 machine_mode mode = TYPE_MODE (comp_vectype);
12241 optab optab;
12243 optab = optab_for_tree_code (bitop1, comp_vectype, optab_default);
12244 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12245 return false;
12247 if (bitop2 != NOP_EXPR)
12249 optab = optab_for_tree_code (bitop2, comp_vectype,
12250 optab_default);
12251 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12252 return false;
12256 vect_cost_for_stmt kind = vector_stmt;
12257 if (reduction_type == EXTRACT_LAST_REDUCTION)
12258 /* Count one reduction-like operation per vector. */
12259 kind = vec_to_scalar;
12260 else if (!expand_vec_cond_expr_p (vectype, comp_vectype, cond_code)
12261 && (masked
12262 || (!expand_vec_cmp_expr_p (comp_vectype, vec_cmp_type,
12263 cond_code)
12264 || !expand_vec_cond_expr_p (vectype, vec_cmp_type,
12265 ERROR_MARK))))
12266 return false;
12268 if (slp_node
12269 && (!vect_maybe_update_slp_op_vectype
12270 (SLP_TREE_CHILDREN (slp_node)[0], comp_vectype)
12271 || (op_adjust == 1
12272 && !vect_maybe_update_slp_op_vectype
12273 (SLP_TREE_CHILDREN (slp_node)[1], comp_vectype))
12274 || !vect_maybe_update_slp_op_vectype (then_slp_node, vectype)
12275 || !vect_maybe_update_slp_op_vectype (else_slp_node, vectype)))
12277 if (dump_enabled_p ())
12278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12279 "incompatible vector types for invariants\n");
12280 return false;
12283 if (loop_vinfo && for_reduction
12284 && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
12286 if (reduction_type == EXTRACT_LAST_REDUCTION)
12288 if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
12289 vectype, OPTIMIZE_FOR_SPEED))
12290 vect_record_loop_len (loop_vinfo,
12291 &LOOP_VINFO_LENS (loop_vinfo),
12292 ncopies * vec_num, vectype, 1);
12293 else
12294 vect_record_loop_mask (loop_vinfo,
12295 &LOOP_VINFO_MASKS (loop_vinfo),
12296 ncopies * vec_num, vectype, NULL);
12298 /* Extra inactive lanes should be safe for vect_nested_cycle. */
12299 else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
12301 if (dump_enabled_p ())
12302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12303 "conditional reduction prevents the use"
12304 " of partial vectors.\n");
12305 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
12309 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
12310 vect_model_simple_cost (vinfo, stmt_info, ncopies, dts, ndts, slp_node,
12311 cost_vec, kind);
12312 return true;
12315 /* Transform. */
12317 /* Handle def. */
12318 scalar_dest = gimple_assign_lhs (stmt);
12319 if (reduction_type != EXTRACT_LAST_REDUCTION)
12320 vec_dest = vect_create_destination_var (scalar_dest, vectype);
12322 bool swap_cond_operands = false;
12324 /* See whether another part of the vectorized code applies a loop
12325 mask to the condition, or to its inverse. */
12327 vec_loop_masks *masks = NULL;
12328 vec_loop_lens *lens = NULL;
12329 if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
12331 if (reduction_type == EXTRACT_LAST_REDUCTION)
12332 lens = &LOOP_VINFO_LENS (loop_vinfo);
12334 else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
12336 if (reduction_type == EXTRACT_LAST_REDUCTION)
12337 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12338 else
12340 scalar_cond_masked_key cond (cond_expr, ncopies);
12341 if (loop_vinfo->scalar_cond_masked_set.contains (cond))
12342 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12343 else
12345 bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
12346 tree_code orig_code = cond.code;
12347 cond.code = invert_tree_comparison (cond.code, honor_nans);
12348 if (!masked && loop_vinfo->scalar_cond_masked_set.contains (cond))
12350 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12351 cond_code = cond.code;
12352 swap_cond_operands = true;
12354 else
12356 /* Try the inverse of the current mask. We check if the
12357 inverse mask is live and if so we generate a negate of
12358 the current mask such that we still honor NaNs. */
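 /* I.e. keep the original comparison code, invert its vector result
    afterwards (see must_invert_cmp_result below) and swap the then/else
    operands, which is equivalent to the original selection while matching
    the mask form that is already live elsewhere.  */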
12359 cond.inverted_p = true;
12360 cond.code = orig_code;
12361 if (loop_vinfo->scalar_cond_masked_set.contains (cond))
12363 masks = &LOOP_VINFO_MASKS (loop_vinfo);
12364 cond_code = cond.code;
12365 swap_cond_operands = true;
12366 must_invert_cmp_result = true;
12373 /* Handle cond expr. */
12374 if (masked)
12375 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
12376 cond_expr, comp_vectype, &vec_oprnds0,
12377 then_clause, vectype, &vec_oprnds2,
12378 reduction_type != EXTRACT_LAST_REDUCTION
12379 ? else_clause : NULL, vectype, &vec_oprnds3);
12380 else
12381 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
12382 cond_expr0, comp_vectype, &vec_oprnds0,
12383 cond_expr1, comp_vectype, &vec_oprnds1,
12384 then_clause, vectype, &vec_oprnds2,
12385 reduction_type != EXTRACT_LAST_REDUCTION
12386 ? else_clause : NULL, vectype, &vec_oprnds3);
12388 if (reduction_type == EXTRACT_LAST_REDUCTION)
12389 vec_else_clause = else_clause;
12391 /* Arguments are ready. Create the new vector stmt. */
12392 FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_cond_lhs)
12394 vec_then_clause = vec_oprnds2[i];
12395 if (reduction_type != EXTRACT_LAST_REDUCTION)
12396 vec_else_clause = vec_oprnds3[i];
12398 if (swap_cond_operands)
12399 std::swap (vec_then_clause, vec_else_clause);
12401 if (masked)
12402 vec_compare = vec_cond_lhs;
12403 else
12405 vec_cond_rhs = vec_oprnds1[i];
12406 if (bitop1 == NOP_EXPR)
12408 gimple_seq stmts = NULL;
12409 vec_compare = gimple_build (&stmts, cond_code, vec_cmp_type,
12410 vec_cond_lhs, vec_cond_rhs);
12411 gsi_insert_before (gsi, stmts, GSI_SAME_STMT);
12413 else
12415 new_temp = make_ssa_name (vec_cmp_type);
12416 gassign *new_stmt;
12417 if (bitop1 == BIT_NOT_EXPR)
12418 new_stmt = gimple_build_assign (new_temp, bitop1,
12419 vec_cond_rhs);
12420 else
12421 new_stmt
12422 = gimple_build_assign (new_temp, bitop1, vec_cond_lhs,
12423 vec_cond_rhs);
12424 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12425 if (bitop2 == NOP_EXPR)
12426 vec_compare = new_temp;
12427 else if (bitop2 == BIT_NOT_EXPR
12428 && reduction_type != EXTRACT_LAST_REDUCTION)
12430 /* Instead of doing ~x ? y : z do x ? z : y. */
12431 vec_compare = new_temp;
12432 std::swap (vec_then_clause, vec_else_clause);
12434 else
12436 vec_compare = make_ssa_name (vec_cmp_type);
12437 if (bitop2 == BIT_NOT_EXPR)
12438 new_stmt
12439 = gimple_build_assign (vec_compare, bitop2, new_temp);
12440 else
12441 new_stmt
12442 = gimple_build_assign (vec_compare, bitop2,
12443 vec_cond_lhs, new_temp);
12444 vect_finish_stmt_generation (vinfo, stmt_info,
12445 new_stmt, gsi);
12450 /* If we decided to apply a loop mask to the result of the vector
12451 comparison, AND the comparison with the mask now. Later passes
12452 should then be able to reuse the AND results between multiple
12453 vector statements.
12455 For example:
12456 for (int i = 0; i < 100; ++i)
12457 x[i] = y[i] ? z[i] : 10;
12459 results in following optimized GIMPLE:
12461 mask__35.8_43 = vect__4.7_41 != { 0, ... };
12462 vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
12463 _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
12464 vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
12465 vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
12466 vect_iftmp.11_47, { 10, ... }>;
12468 instead of using masked and unmasked forms of
12469 vec != { 0, ... } (masked in the MASK_LOAD,
12470 unmasked in the VEC_COND_EXPR). */
12472 /* Force vec_compare to be an SSA_NAME rather than a comparison,
12473 in cases where that's necessary. */
12475 tree len = NULL_TREE, bias = NULL_TREE;
12476 if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
12478 if (!is_gimple_val (vec_compare))
12480 tree vec_compare_name = make_ssa_name (vec_cmp_type);
12481 gassign *new_stmt = gimple_build_assign (vec_compare_name,
12482 vec_compare);
12483 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12484 vec_compare = vec_compare_name;
12487 if (must_invert_cmp_result)
12489 tree vec_compare_name = make_ssa_name (vec_cmp_type);
12490 gassign *new_stmt = gimple_build_assign (vec_compare_name,
12491 BIT_NOT_EXPR,
12492 vec_compare);
12493 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12494 vec_compare = vec_compare_name;
12497 if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
12498 vectype, OPTIMIZE_FOR_SPEED))
12500 if (lens)
12502 len = vect_get_loop_len (loop_vinfo, gsi, lens,
12503 vec_num * ncopies, vectype, i, 1);
12504 signed char biasval
12505 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
12506 bias = build_int_cst (intQI_type_node, biasval);
12508 else
12510 len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
12511 bias = build_int_cst (intQI_type_node, 0);
12514 if (masks)
12516 tree loop_mask
12517 = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num * ncopies,
12518 vectype, i);
12519 tree tmp2 = make_ssa_name (vec_cmp_type);
12520 gassign *g
12521 = gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare,
12522 loop_mask);
12523 vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
12524 vec_compare = tmp2;
12528 gimple *new_stmt;
12529 if (reduction_type == EXTRACT_LAST_REDUCTION)
12531 gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
12532 tree lhs = gimple_get_lhs (old_stmt);
12533 if ((unsigned)i != vec_oprnds0.length () - 1)
12534 lhs = copy_ssa_name (lhs);
12535 if (len)
12536 new_stmt = gimple_build_call_internal
12537 (IFN_LEN_FOLD_EXTRACT_LAST, 5, vec_else_clause, vec_compare,
12538 vec_then_clause, len, bias);
12539 else
12540 new_stmt = gimple_build_call_internal
12541 (IFN_FOLD_EXTRACT_LAST, 3, vec_else_clause, vec_compare,
12542 vec_then_clause);
12543 gimple_call_set_lhs (new_stmt, lhs);
12544 SSA_NAME_DEF_STMT (lhs) = new_stmt;
12545 if ((unsigned)i != vec_oprnds0.length () - 1)
12547 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12548 vec_else_clause = lhs;
12550 else if (old_stmt == gsi_stmt (*gsi))
12551 vect_finish_replace_stmt (vinfo, stmt_info, new_stmt);
12552 else
12554 /* In this case we're moving the definition to later in the
12555 block. That doesn't matter because the only uses of the
12556 lhs are in phi statements. */
12557 gimple_stmt_iterator old_gsi = gsi_for_stmt (old_stmt);
12558 gsi_remove (&old_gsi, true);
12559 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12562 else
12564 new_temp = make_ssa_name (vec_dest);
12565 new_stmt = gimple_build_assign (new_temp, VEC_COND_EXPR, vec_compare,
12566 vec_then_clause, vec_else_clause);
12567 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12569 if (slp_node)
12570 slp_node->push_vec_def (new_stmt);
12571 else
12572 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
12575 if (!slp_node)
12576 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
12578 vec_oprnds0.release ();
12579 vec_oprnds1.release ();
12580 vec_oprnds2.release ();
12581 vec_oprnds3.release ();
12583 return true;
12586 /* Helper of vectorizable_comparison.
12588 Check if STMT_INFO is comparison expression CODE that can be vectorized.
12589 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
12590 comparison, put it in VEC_STMT, and insert it at GSI.
12592 Return true if STMT_INFO is vectorizable in this way. */
12594 static bool
12595 vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
12596 stmt_vec_info stmt_info, tree_code code,
12597 gimple_stmt_iterator *gsi, gimple **vec_stmt,
12598 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12600 tree lhs, rhs1, rhs2;
12601 tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12602 tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
12603 tree new_temp;
12604 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12605 enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
12606 int ndts = 2;
12607 poly_uint64 nunits;
12608 int ncopies;
12609 enum tree_code bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
12610 int i;
12611 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12612 vec<tree> vec_oprnds0 = vNULL;
12613 vec<tree> vec_oprnds1 = vNULL;
12614 tree mask_type;
12615 tree mask = NULL_TREE;
12617 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12618 return false;
12620 if (!vectype || !VECTOR_BOOLEAN_TYPE_P (vectype))
12621 return false;
12623 mask_type = vectype;
12624 nunits = TYPE_VECTOR_SUBPARTS (vectype);
12626 if (slp_node)
12627 ncopies = 1;
12628 else
12629 ncopies = vect_get_num_copies (loop_vinfo, vectype);
12631 gcc_assert (ncopies >= 1);
12633 if (TREE_CODE_CLASS (code) != tcc_comparison)
12634 return false;
12636 slp_tree slp_rhs1, slp_rhs2;
12637 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
12638 0, &rhs1, &slp_rhs1, &dts[0], &vectype1))
12639 return false;
12641 if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
12642 1, &rhs2, &slp_rhs2, &dts[1], &vectype2))
12643 return false;
12645 if (vectype1 && vectype2
12646 && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype1),
12647 TYPE_VECTOR_SUBPARTS (vectype2)))
12648 return false;
12650 vectype = vectype1 ? vectype1 : vectype2;
12652 /* Invariant comparison. */
12653 if (!vectype)
12655 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (rhs1)))
12656 vectype = mask_type;
12657 else
12658 vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1),
12659 slp_node);
12660 if (!vectype || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits))
12661 return false;
12663 else if (maybe_ne (nunits, TYPE_VECTOR_SUBPARTS (vectype)))
12664 return false;
12666 /* Can't compare mask and non-mask types. */
12667 if (vectype1 && vectype2
12668 && (VECTOR_BOOLEAN_TYPE_P (vectype1) ^ VECTOR_BOOLEAN_TYPE_P (vectype2)))
12669 return false;
12671 /* Boolean values may have another representation in vectors
12672 and therefore we prefer bit operations over comparison for
12673 them (which also works for scalar masks). We store opcodes
12674 to use in bitop1 and bitop2. The statement is vectorized as
12675 BITOP2 (rhs1 BITOP1 rhs2) or
12676 rhs1 BITOP2 (BITOP1 rhs2)
12677 depending on bitop1 and bitop2 arity. */
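 /* For example, with boolean operands GT_EXPR is emitted as rhs1 & ~rhs2
    (bitop1 = BIT_NOT_EXPR on rhs2, bitop2 = BIT_AND_EXPR), EQ_EXPR as
    ~(rhs1 ^ rhs2) and NE_EXPR as just rhs1 ^ rhs2.  */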
12678 bool swap_p = false;
12679 if (VECTOR_BOOLEAN_TYPE_P (vectype))
12681 if (code == GT_EXPR)
12683 bitop1 = BIT_NOT_EXPR;
12684 bitop2 = BIT_AND_EXPR;
12686 else if (code == GE_EXPR)
12688 bitop1 = BIT_NOT_EXPR;
12689 bitop2 = BIT_IOR_EXPR;
12691 else if (code == LT_EXPR)
12693 bitop1 = BIT_NOT_EXPR;
12694 bitop2 = BIT_AND_EXPR;
12695 swap_p = true;
12697 else if (code == LE_EXPR)
12699 bitop1 = BIT_NOT_EXPR;
12700 bitop2 = BIT_IOR_EXPR;
12701 swap_p = true;
12703 else
12705 bitop1 = BIT_XOR_EXPR;
12706 if (code == EQ_EXPR)
12707 bitop2 = BIT_NOT_EXPR;
12711 if (!vec_stmt)
12713 if (bitop1 == NOP_EXPR)
12715 if (!expand_vec_cmp_expr_p (vectype, mask_type, code))
12716 return false;
12718 else
12720 machine_mode mode = TYPE_MODE (vectype);
12721 optab optab;
12723 optab = optab_for_tree_code (bitop1, vectype, optab_default);
12724 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12725 return false;
12727 if (bitop2 != NOP_EXPR)
12729 optab = optab_for_tree_code (bitop2, vectype, optab_default);
12730 if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
12731 return false;
12735 /* Put types on constant and invariant SLP children. */
12736 if (slp_node
12737 && (!vect_maybe_update_slp_op_vectype (slp_rhs1, vectype)
12738 || !vect_maybe_update_slp_op_vectype (slp_rhs2, vectype)))
12740 if (dump_enabled_p ())
12741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12742 "incompatible vector types for invariants\n");
12743 return false;
12746 vect_model_simple_cost (vinfo, stmt_info,
12747 ncopies * (1 + (bitop2 != NOP_EXPR)),
12748 dts, ndts, slp_node, cost_vec);
12749 return true;
12752 /* Transform. */
12754 /* Handle def. */
12755 lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info));
12756 if (lhs)
12757 mask = vect_create_destination_var (lhs, mask_type);
12759 vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
12760 rhs1, vectype, &vec_oprnds0,
12761 rhs2, vectype, &vec_oprnds1);
12762 if (swap_p)
12763 std::swap (vec_oprnds0, vec_oprnds1);
12765 /* Arguments are ready. Create the new vector stmt. */
12766 FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_rhs1)
12768 gimple *new_stmt;
12769 vec_rhs2 = vec_oprnds1[i];
12771 if (lhs)
12772 new_temp = make_ssa_name (mask);
12773 else
12774 new_temp = make_temp_ssa_name (mask_type, NULL, "cmp");
12775 if (bitop1 == NOP_EXPR)
12777 new_stmt = gimple_build_assign (new_temp, code,
12778 vec_rhs1, vec_rhs2);
12779 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12781 else
12783 if (bitop1 == BIT_NOT_EXPR)
12784 new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
12785 else
12786 new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
12787 vec_rhs2);
12788 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12789 if (bitop2 != NOP_EXPR)
12791 tree res = make_ssa_name (mask);
12792 if (bitop2 == BIT_NOT_EXPR)
12793 new_stmt = gimple_build_assign (res, bitop2, new_temp);
12794 else
12795 new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
12796 new_temp);
12797 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12800 if (slp_node)
12801 slp_node->push_vec_def (new_stmt);
12802 else
12803 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
12806 if (!slp_node)
12807 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
12809 vec_oprnds0.release ();
12810 vec_oprnds1.release ();
12812 return true;
12815 /* vectorizable_comparison.
12817 Check if STMT_INFO is comparison expression that can be vectorized.
12818 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
12819 comparison, put it in VEC_STMT, and insert it at GSI.
12821 Return true if STMT_INFO is vectorizable in this way. */
12823 static bool
12824 vectorizable_comparison (vec_info *vinfo,
12825 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
12826 gimple **vec_stmt,
12827 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12829 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12831 if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12832 return false;
12834 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
12835 return false;
12837 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
12838 if (!stmt)
12839 return false;
12841 enum tree_code code = gimple_assign_rhs_code (stmt);
12842 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
12843 if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12844 vec_stmt, slp_node, cost_vec))
12845 return false;
12847 if (!vec_stmt)
12848 STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
12850 return true;
12853 /* Check to see if the current early break given in STMT_INFO is valid for
12854 vectorization. */
12856 static bool
12857 vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
12858 gimple_stmt_iterator *gsi, gimple **vec_stmt,
12859 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12861 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12862 if (!loop_vinfo
12863 || !is_a <gcond *> (STMT_VINFO_STMT (stmt_info)))
12864 return false;
12866 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
12867 return false;
12869 if (!STMT_VINFO_RELEVANT_P (stmt_info))
12870 return false;
12872 DUMP_VECT_SCOPE ("vectorizable_early_exit");
12874 auto code = gimple_cond_code (STMT_VINFO_STMT (stmt_info));
12876 tree vectype = NULL_TREE;
12877 slp_tree slp_op0;
12878 tree op0;
12879 enum vect_def_type dt0;
12880 if (!vect_is_simple_use (vinfo, stmt_info, slp_node, 0, &op0, &slp_op0, &dt0,
12881 &vectype))
12883 if (dump_enabled_p ())
12884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12885 "use not simple.\n");
12886 return false;
12889 if (!vectype)
12890 return false;
12892 machine_mode mode = TYPE_MODE (vectype);
12893 int ncopies;
12895 if (slp_node)
12896 ncopies = 1;
12897 else
12898 ncopies = vect_get_num_copies (loop_vinfo, vectype);
12900 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
12901 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
12902 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
12903 bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
12905 /* Now build the new conditional. Pattern gimple_conds get dropped during
12906 codegen so we must replace the original insn. */
12907 gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
12908 gcond *cond_stmt = as_a <gcond *>(orig_stmt);
12909 /* When vectorizing we assume that if the branch edge is taken we're
12910 exiting the loop. However, this is not always the case, as the compiler will
12911 rewrite conditions to always be a comparison against 0. To do this it
12912 sometimes flips the edges. This is fine for scalar code, but for vector code we
12913 then have to flip the test, as we're still assuming that taking the
12914 branch edge means we found the exit condition, i.e. we need to know whether
12915 we are generating a `forall` or an `exist` condition. */
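 /* Concretely: with the default NE_EXPR / BIT_IOR_EXPR reduction below we
    branch out of the loop as soon as any lane of the comparison is true
    (an `exist' test), whereas with EQ_EXPR / BIT_AND_EXPR against an
    all-ones vector the in-loop edge is taken only when every lane is true
    (a `forall' test).  */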
12916 auto new_code = NE_EXPR;
12917 auto reduc_optab = ior_optab;
12918 auto reduc_op = BIT_IOR_EXPR;
12919 tree cst = build_zero_cst (vectype);
12920 edge exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 0);
12921 if (exit_true_edge->flags & EDGE_FALSE_VALUE)
12922 exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 1);
12923 gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
12924 if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
12925 exit_true_edge->dest))
12927 new_code = EQ_EXPR;
12928 reduc_optab = and_optab;
12929 reduc_op = BIT_AND_EXPR;
12930 cst = build_minus_one_cst (vectype);
12933 /* Analyze only. */
12934 if (!vec_stmt)
12936 if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing)
12938 if (dump_enabled_p ())
12939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12940 "can't vectorize early exit because the "
12941 "target doesn't support flag setting vector "
12942 "comparisons.\n");
12943 return false;
12946 if (ncopies > 1
12947 && direct_optab_handler (reduc_optab, mode) == CODE_FOR_nothing)
12949 if (dump_enabled_p ())
12950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12951 "can't vectorize early exit because the "
12952 "target does not support boolean vector %s "
12953 "for type %T.\n",
12954 reduc_optab == ior_optab ? "OR" : "AND",
12955 vectype);
12956 return false;
12959 if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12960 vec_stmt, slp_node, cost_vec))
12961 return false;
12963 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
12965 if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
12966 OPTIMIZE_FOR_SPEED))
12967 vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
12968 else
12969 vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
12972 return true;
12975 /* Transform. */
12977 tree new_temp = NULL_TREE;
12978 gimple *new_stmt = NULL;
12980 if (dump_enabled_p ())
12981 dump_printf_loc (MSG_NOTE, vect_location, "transform early-exit.\n");
12983 if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12984 vec_stmt, slp_node, cost_vec))
12985 gcc_unreachable ();
12987 gimple *stmt = STMT_VINFO_STMT (stmt_info);
12988 basic_block cond_bb = gimple_bb (stmt);
12989 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
12991 auto_vec<tree> stmts;
12993 if (slp_node)
12994 stmts.safe_splice (SLP_TREE_VEC_DEFS (slp_node));
12995 else
12997 auto vec_stmts = STMT_VINFO_VEC_STMTS (stmt_info);
12998 stmts.reserve_exact (vec_stmts.length ());
12999 for (auto stmt : vec_stmts)
13000 stmts.quick_push (gimple_assign_lhs (stmt));
13003 /* Determine if we need to reduce the final value. */
13004 if (stmts.length () > 1)
13006 /* We build the reductions in a way to maintain as much parallelism as
13007 possible. */
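 /* For instance, four partial masks m0..m3 are combined as
      (m3 | m2) | (m1 | m0)
    (or with & for the `forall' form) rather than as a serial chain, so the
    two inner operations are independent of each other.  */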
13008 auto_vec<tree> workset (stmts.length ());
13010 /* Mask the statements as we queue them up. Normally we would loop over
13011 vec_num, but since we inspect the exact results of vectorization
13012 we don't need to; we can just use the stmts themselves. */
13013 if (masked_loop_p)
13014 for (unsigned i = 0; i < stmts.length (); i++)
13016 tree stmt_mask
13017 = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype,
13019 stmt_mask
13020 = prepare_vec_mask (loop_vinfo, TREE_TYPE (stmt_mask), stmt_mask,
13021 stmts[i], &cond_gsi);
13022 workset.quick_push (stmt_mask);
13024 else if (len_loop_p)
13025 for (unsigned i = 0; i < stmts.length (); i++)
13027 tree len_mask = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi,
13028 lens, ncopies, vectype,
13029 stmts[i], i, 1);
13031 workset.quick_push (len_mask);
13033 else
13034 workset.splice (stmts);
13036 while (workset.length () > 1)
13038 new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
13039 tree arg0 = workset.pop ();
13040 tree arg1 = workset.pop ();
13041 new_stmt = gimple_build_assign (new_temp, reduc_op, arg0, arg1);
13042 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
13043 &cond_gsi);
13044 workset.quick_insert (0, new_temp);
13047 else
13049 new_temp = stmts[0];
13050 if (masked_loop_p)
13052 tree mask
13053 = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
13054 new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
13055 new_temp, &cond_gsi);
13057 else if (len_loop_p)
13058 new_temp = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
13059 ncopies, vectype, new_temp, 0, 1);
13062 gcc_assert (new_temp);
13064 gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
13065 update_stmt (orig_stmt);
13067 if (slp_node)
13068 SLP_TREE_VEC_DEFS (slp_node).truncate (0);
13069 else
13070 STMT_VINFO_VEC_STMTS (stmt_info).truncate (0);
13072 if (!slp_node)
13073 *vec_stmt = orig_stmt;
13075 return true;
13078 /* If SLP_NODE is nonnull, return true if vectorizable_live_operation
13079 can handle all live statements in the node. Otherwise return true
13080 if STMT_INFO is not live or if vectorizable_live_operation can handle it.
13081 VEC_STMT_P is as for vectorizable_live_operation. */
13083 static bool
13084 can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info stmt_info,
13085 slp_tree slp_node, slp_instance slp_node_instance,
13086 bool vec_stmt_p,
13087 stmt_vector_for_cost *cost_vec)
13089 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
13090 if (slp_node)
13092 stmt_vec_info slp_stmt_info;
13093 unsigned int i;
13094 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
13096 if (slp_stmt_info
13097 && (STMT_VINFO_LIVE_P (slp_stmt_info)
13098 || (loop_vinfo
13099 && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
13100 && STMT_VINFO_DEF_TYPE (slp_stmt_info)
13101 == vect_induction_def))
13102 && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
13103 slp_node_instance, i,
13104 vec_stmt_p, cost_vec))
13105 return false;
13108 else if ((STMT_VINFO_LIVE_P (stmt_info)
13109 || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
13110 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
13111 && !vectorizable_live_operation (vinfo, stmt_info,
13112 slp_node, slp_node_instance, -1,
13113 vec_stmt_p, cost_vec))
13114 return false;
13116 return true;
13119 /* Make sure the statement is vectorizable. */
13121 opt_result
13122 vect_analyze_stmt (vec_info *vinfo,
13123 stmt_vec_info stmt_info, bool *need_to_vectorize,
13124 slp_tree node, slp_instance node_instance,
13125 stmt_vector_for_cost *cost_vec)
13127 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
13128 enum vect_relevant relevance = STMT_VINFO_RELEVANT (stmt_info);
13129 bool ok;
13130 gimple_seq pattern_def_seq;
13132 if (dump_enabled_p ())
13133 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
13134 stmt_info->stmt);
13136 if (gimple_has_volatile_ops (stmt_info->stmt))
13137 return opt_result::failure_at (stmt_info->stmt,
13138 "not vectorized:"
13139 " stmt has volatile operands: %G\n",
13140 stmt_info->stmt);
13142 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
13143 && node == NULL
13144 && (pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)))
13146 gimple_stmt_iterator si;
13148 for (si = gsi_start (pattern_def_seq); !gsi_end_p (si); gsi_next (&si))
13150 stmt_vec_info pattern_def_stmt_info
13151 = vinfo->lookup_stmt (gsi_stmt (si));
13152 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
13153 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
13155 /* Analyze def stmt of STMT if it's a pattern stmt. */
13156 if (dump_enabled_p ())
13157 dump_printf_loc (MSG_NOTE, vect_location,
13158 "==> examining pattern def statement: %G",
13159 pattern_def_stmt_info->stmt);
13161 opt_result res
13162 = vect_analyze_stmt (vinfo, pattern_def_stmt_info,
13163 need_to_vectorize, node, node_instance,
13164 cost_vec);
13165 if (!res)
13166 return res;
13171 /* Skip stmts that do not need to be vectorized. In loops this is expected
13172 to include:
13173 - the COND_EXPR which is the loop exit condition
13174 - any LABEL_EXPRs in the loop
13175 - computations that are used only for array indexing or loop control.
13176 In basic blocks we only analyze statements that are a part of some SLP
13177 instance; therefore, all the statements are relevant.
13179 A pattern statement needs to be analyzed instead of the original statement
13180 if the original statement is not relevant. Otherwise, we analyze both
13181 statements. In basic blocks we are called from some SLP instance
13182 traversal, so we don't analyze pattern stmts instead; the pattern stmts
13183 will already be part of an SLP instance. */
13185 stmt_vec_info pattern_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
13186 if (!STMT_VINFO_RELEVANT_P (stmt_info)
13187 && !STMT_VINFO_LIVE_P (stmt_info))
13189 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
13190 && pattern_stmt_info
13191 && (STMT_VINFO_RELEVANT_P (pattern_stmt_info)
13192 || STMT_VINFO_LIVE_P (pattern_stmt_info)))
13194 /* Analyze PATTERN_STMT instead of the original stmt. */
13195 stmt_info = pattern_stmt_info;
13196 if (dump_enabled_p ())
13197 dump_printf_loc (MSG_NOTE, vect_location,
13198 "==> examining pattern statement: %G",
13199 stmt_info->stmt);
13201 else
13203 if (dump_enabled_p ())
13204 dump_printf_loc (MSG_NOTE, vect_location, "irrelevant.\n");
13206 return opt_result::success ();
13209 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
13210 && node == NULL
13211 && pattern_stmt_info
13212 && (STMT_VINFO_RELEVANT_P (pattern_stmt_info)
13213 || STMT_VINFO_LIVE_P (pattern_stmt_info)))
13215 /* Analyze PATTERN_STMT too. */
13216 if (dump_enabled_p ())
13217 dump_printf_loc (MSG_NOTE, vect_location,
13218 "==> examining pattern statement: %G",
13219 pattern_stmt_info->stmt);
13221 opt_result res
13222 = vect_analyze_stmt (vinfo, pattern_stmt_info, need_to_vectorize, node,
13223 node_instance, cost_vec);
13224 if (!res)
13225 return res;
13228 switch (STMT_VINFO_DEF_TYPE (stmt_info))
13230 case vect_internal_def:
13231 case vect_condition_def:
13232 break;
13234 case vect_reduction_def:
13235 case vect_nested_cycle:
13236 gcc_assert (!bb_vinfo
13237 && (relevance == vect_used_in_outer
13238 || relevance == vect_used_in_outer_by_reduction
13239 || relevance == vect_used_by_reduction
13240 || relevance == vect_unused_in_scope
13241 || relevance == vect_used_only_live));
13242 break;
13244 case vect_double_reduction_def:
13245 gcc_assert (!bb_vinfo && node);
13246 break;
13248 case vect_induction_def:
13249 case vect_first_order_recurrence:
13250 gcc_assert (!bb_vinfo);
13251 break;
13253 case vect_constant_def:
13254 case vect_external_def:
13255 case vect_unknown_def_type:
13256 default:
13257 gcc_unreachable ();
13260 tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info);
13261 if (node)
13262 STMT_VINFO_VECTYPE (stmt_info) = SLP_TREE_VECTYPE (node);
13264 if (STMT_VINFO_RELEVANT_P (stmt_info))
13266 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
13267 gcc_assert (STMT_VINFO_VECTYPE (stmt_info)
13268 || gimple_code (stmt_info->stmt) == GIMPLE_COND
13269 || (call && gimple_call_lhs (call) == NULL_TREE));
13270 *need_to_vectorize = true;
13273 if (PURE_SLP_STMT (stmt_info) && !node)
13275 if (dump_enabled_p ())
13276 dump_printf_loc (MSG_NOTE, vect_location,
13277 "handled only by SLP analysis\n");
13278 return opt_result::success ();
13281 ok = true;
13282 if (!bb_vinfo
13283 && (STMT_VINFO_RELEVANT_P (stmt_info)
13284 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
13285 /* Prefer vectorizable_call over vectorizable_simd_clone_call so
13286 -mveclibabi= takes preference over library functions with
13287 the simd attribute. */
13288 ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13289 || vectorizable_simd_clone_call (vinfo, stmt_info, NULL, NULL, node,
13290 cost_vec)
13291 || vectorizable_conversion (vinfo, stmt_info,
13292 NULL, NULL, node, cost_vec)
13293 || vectorizable_operation (vinfo, stmt_info,
13294 NULL, NULL, node, cost_vec)
13295 || vectorizable_assignment (vinfo, stmt_info,
13296 NULL, NULL, node, cost_vec)
13297 || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13298 || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13299 || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
13300 stmt_info, node, cost_vec)
13301 || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
13302 node, node_instance, cost_vec)
13303 || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
13304 NULL, node, cost_vec)
13305 || vectorizable_shift (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13306 || vectorizable_condition (vinfo, stmt_info,
13307 NULL, NULL, node, cost_vec)
13308 || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
13309 cost_vec)
13310 || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
13311 stmt_info, NULL, node)
13312 || vectorizable_recurr (as_a <loop_vec_info> (vinfo),
13313 stmt_info, NULL, node, cost_vec)
13314 || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
13315 cost_vec));
13316 else
13318 if (bb_vinfo)
13319 ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, cost_vec)
13320 || vectorizable_simd_clone_call (vinfo, stmt_info,
13321 NULL, NULL, node, cost_vec)
13322 || vectorizable_conversion (vinfo, stmt_info, NULL, NULL, node,
13323 cost_vec)
13324 || vectorizable_shift (vinfo, stmt_info,
13325 NULL, NULL, node, cost_vec)
13326 || vectorizable_operation (vinfo, stmt_info,
13327 NULL, NULL, node, cost_vec)
13328 || vectorizable_assignment (vinfo, stmt_info, NULL, NULL, node,
13329 cost_vec)
13330 || vectorizable_load (vinfo, stmt_info,
13331 NULL, NULL, node, cost_vec)
13332 || vectorizable_store (vinfo, stmt_info,
13333 NULL, NULL, node, cost_vec)
13334 || vectorizable_condition (vinfo, stmt_info,
13335 NULL, NULL, node, cost_vec)
13336 || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
13337 cost_vec)
13338 || vectorizable_phi (vinfo, stmt_info, NULL, node, cost_vec)
13339 || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
13340 cost_vec));
13344 if (node)
13345 STMT_VINFO_VECTYPE (stmt_info) = saved_vectype;
13347 if (!ok)
13348 return opt_result::failure_at (stmt_info->stmt,
13349 "not vectorized:"
13350 " relevant stmt not supported: %G",
13351 stmt_info->stmt);
13353 /* Stmts that are (also) "live" (i.e. - that are used out of the loop)
13354 need extra handling, except for vectorizable reductions. */
13355 if (!bb_vinfo
13356 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
13357 && STMT_VINFO_TYPE (stmt_info) != lc_phi_info_type
13358 && !can_vectorize_live_stmts (as_a <loop_vec_info> (vinfo),
13359 stmt_info, node, node_instance,
13360 false, cost_vec))
13361 return opt_result::failure_at (stmt_info->stmt,
13362 "not vectorized:"
13363 " live stmt not supported: %G",
13364 stmt_info->stmt);
13366 return opt_result::success ();
13370 /* Function vect_transform_stmt.
13372 Create a vectorized stmt to replace STMT_INFO, and insert it at GSI. */
13374 bool
13375 vect_transform_stmt (vec_info *vinfo,
13376 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
13377 slp_tree slp_node, slp_instance slp_node_instance)
13379 bool is_store = false;
13380 gimple *vec_stmt = NULL;
13381 bool done;
13383 gcc_assert (slp_node || !PURE_SLP_STMT (stmt_info));
13385 tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info);
13386 if (slp_node)
13387 STMT_VINFO_VECTYPE (stmt_info) = SLP_TREE_VECTYPE (slp_node);
13389 switch (STMT_VINFO_TYPE (stmt_info))
13391 case type_demotion_vec_info_type:
13392 case type_promotion_vec_info_type:
13393 case type_conversion_vec_info_type:
13394 done = vectorizable_conversion (vinfo, stmt_info,
13395 gsi, &vec_stmt, slp_node, NULL);
13396 gcc_assert (done);
13397 break;
13399 case induc_vec_info_type:
13400 done = vectorizable_induction (as_a <loop_vec_info> (vinfo),
13401 stmt_info, &vec_stmt, slp_node,
13402 NULL);
13403 gcc_assert (done);
13404 break;
13406 case shift_vec_info_type:
13407 done = vectorizable_shift (vinfo, stmt_info,
13408 gsi, &vec_stmt, slp_node, NULL);
13409 gcc_assert (done);
13410 break;
13412 case op_vec_info_type:
13413 done = vectorizable_operation (vinfo, stmt_info, gsi, &vec_stmt, slp_node,
13414 NULL);
13415 gcc_assert (done);
13416 break;
13418 case assignment_vec_info_type:
13419 done = vectorizable_assignment (vinfo, stmt_info,
13420 gsi, &vec_stmt, slp_node, NULL);
13421 gcc_assert (done);
13422 break;
13424 case load_vec_info_type:
13425 done = vectorizable_load (vinfo, stmt_info, gsi, &vec_stmt, slp_node,
13426 NULL);
13427 gcc_assert (done);
13428 break;
13430 case store_vec_info_type:
13431 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
13432 && !slp_node
13433 && (++DR_GROUP_STORE_COUNT (DR_GROUP_FIRST_ELEMENT (stmt_info))
13434 < DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info))))
13435 /* In case of interleaving, the whole chain is vectorized when the
13436 last store in the chain is reached. Store stmts before the last
13437 one are skipped, and their vec_stmt_info shouldn't be freed
13438 meanwhile. */
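 /* E.g. for a group of three interleaved stores the first two calls only
    bump DR_GROUP_STORE_COUNT; the vector code for the whole chain is
    emitted once the third store is reached.  */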
13440 else
13442 done = vectorizable_store (vinfo, stmt_info,
13443 gsi, &vec_stmt, slp_node, NULL);
13444 gcc_assert (done);
13445 is_store = true;
13447 break;
13449 case condition_vec_info_type:
13450 done = vectorizable_condition (vinfo, stmt_info,
13451 gsi, &vec_stmt, slp_node, NULL);
13452 gcc_assert (done);
13453 break;
13455 case comparison_vec_info_type:
13456 done = vectorizable_comparison (vinfo, stmt_info, gsi, &vec_stmt,
13457 slp_node, NULL);
13458 gcc_assert (done);
13459 break;
13461 case call_vec_info_type:
13462 done = vectorizable_call (vinfo, stmt_info,
13463 gsi, &vec_stmt, slp_node, NULL);
13464 break;
13466 case call_simd_clone_vec_info_type:
13467 done = vectorizable_simd_clone_call (vinfo, stmt_info, gsi, &vec_stmt,
13468 slp_node, NULL);
13469 break;
13471 case reduc_vec_info_type:
13472 done = vect_transform_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
13473 gsi, &vec_stmt, slp_node);
13474 gcc_assert (done);
13475 break;
13477 case cycle_phi_info_type:
13478 done = vect_transform_cycle_phi (as_a <loop_vec_info> (vinfo), stmt_info,
13479 &vec_stmt, slp_node, slp_node_instance);
13480 gcc_assert (done);
13481 break;
13483 case lc_phi_info_type:
13484 done = vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
13485 stmt_info, &vec_stmt, slp_node);
13486 gcc_assert (done);
13487 break;
13489 case recurr_info_type:
13490 done = vectorizable_recurr (as_a <loop_vec_info> (vinfo),
13491 stmt_info, &vec_stmt, slp_node, NULL);
13492 gcc_assert (done);
13493 break;
13495 case phi_info_type:
13496 done = vectorizable_phi (vinfo, stmt_info, &vec_stmt, slp_node, NULL);
13497 gcc_assert (done);
13498 break;
13500 case loop_exit_ctrl_vec_info_type:
13501 done = vectorizable_early_exit (vinfo, stmt_info, gsi, &vec_stmt,
13502 slp_node, NULL);
13503 gcc_assert (done);
13504 break;
13506 default:
13507 if (!STMT_VINFO_LIVE_P (stmt_info))
13509 if (dump_enabled_p ())
13510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13511 "stmt not supported.\n");
13512 gcc_unreachable ();
13514 done = true;
13517 if (!slp_node && vec_stmt)
13518 gcc_assert (STMT_VINFO_VEC_STMTS (stmt_info).exists ());
13520 if (STMT_VINFO_TYPE (stmt_info) != store_vec_info_type)
13522 /* Handle stmts whose DEF is used outside the loop-nest that is
13523 being vectorized. */
13524 done = can_vectorize_live_stmts (vinfo, stmt_info, slp_node,
13525 slp_node_instance, true, NULL);
13526 gcc_assert (done);
13529 if (slp_node)
13530 STMT_VINFO_VECTYPE (stmt_info) = saved_vectype;
13532 return is_store;
13536 /* Remove a group of stores (for SLP or interleaving), free their
13537 stmt_vec_info. */
13539 void
13540 vect_remove_stores (vec_info *vinfo, stmt_vec_info first_stmt_info)
13542 stmt_vec_info next_stmt_info = first_stmt_info;
13544 while (next_stmt_info)
13546 stmt_vec_info tmp = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
13547 next_stmt_info = vect_orig_stmt (next_stmt_info);
13548 /* Free the attached stmt_vec_info and remove the stmt. */
13549 vinfo->remove_stmt (next_stmt_info);
13550 next_stmt_info = tmp;
13554 /* If NUNITS is nonzero, return a vector type that contains NUNITS
13555 elements of type SCALAR_TYPE, or null if the target doesn't support
13556 such a type.
13558 If NUNITS is zero, return a vector type that contains elements of
13559 type SCALAR_TYPE, choosing whichever vector size the target prefers.
13561 If PREVAILING_MODE is VOIDmode, we have not yet chosen a vector mode
13562 for this vectorization region and want to "autodetect" the best choice.
13563 Otherwise, PREVAILING_MODE is a previously-chosen vector TYPE_MODE
13564 and we want the new type to be interoperable with it. PREVAILING_MODE
13565 in this case can be a scalar integer mode or a vector mode; when it
13566 is a vector mode, the function acts like a tree-level version of
13567 related_vector_mode. */
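 /* For example (assuming a target with 128-bit vectors), a prevailing mode
    of V8HImode combined with a 4-byte SCALAR_TYPE would typically yield a
    vector type whose mode is V4SImode.  */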
13569 tree
13570 get_related_vectype_for_scalar_type (machine_mode prevailing_mode,
13571 tree scalar_type, poly_uint64 nunits)
13573 tree orig_scalar_type = scalar_type;
13574 scalar_mode inner_mode;
13575 machine_mode simd_mode;
13576 tree vectype;
13578 if ((!INTEGRAL_TYPE_P (scalar_type)
13579 && !POINTER_TYPE_P (scalar_type)
13580 && !SCALAR_FLOAT_TYPE_P (scalar_type))
13581 || (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode)
13582 && !is_float_mode (TYPE_MODE (scalar_type), &inner_mode)))
13583 return NULL_TREE;
13585 unsigned int nbytes = GET_MODE_SIZE (inner_mode);
13587 /* Interoperability between modes requires one to be a constant multiple
13588 of the other, so that the number of vectors required for each operation
13589 is a compile-time constant. */
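 /* E.g. a 16-byte prevailing mode is compatible with candidates totalling
    8, 16 or 32 bytes, but a 12-byte candidate is rejected because neither
    size is a constant multiple of the other.  */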
13590 if (prevailing_mode != VOIDmode
13591 && !constant_multiple_p (nunits * nbytes,
13592 GET_MODE_SIZE (prevailing_mode))
13593 && !constant_multiple_p (GET_MODE_SIZE (prevailing_mode),
13594 nunits * nbytes))
13595 return NULL_TREE;
13597 /* For vector types of elements whose mode precision doesn't
13598 match their type's precision we use an element type of mode
13599 precision. The vectorization routines will have to make sure
13600 they support the proper result truncation/extension.
13601 We also make sure to build vector types with INTEGER_TYPE
13602 component type only. */
13603 if (INTEGRAL_TYPE_P (scalar_type)
13604 && (GET_MODE_BITSIZE (inner_mode) != TYPE_PRECISION (scalar_type)
13605 || TREE_CODE (scalar_type) != INTEGER_TYPE))
13606 scalar_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (inner_mode),
13607 TYPE_UNSIGNED (scalar_type));
13609 /* We shouldn't end up building VECTOR_TYPEs of non-scalar components.
13610 When the component mode passes the above test simply use a type
13611 corresponding to that mode. The theory is that any use that
13612 would cause problems with this will disable vectorization anyway. */
13613 else if (!SCALAR_FLOAT_TYPE_P (scalar_type)
13614 && !INTEGRAL_TYPE_P (scalar_type))
13615 scalar_type = lang_hooks.types.type_for_mode (inner_mode, 1);
13617 /* We can't build a vector type of elements with alignment bigger than
13618 their size. */
13619 else if (nbytes < TYPE_ALIGN_UNIT (scalar_type))
13620 scalar_type = lang_hooks.types.type_for_mode (inner_mode,
13621 TYPE_UNSIGNED (scalar_type));
13623 /* If we fell back to using the mode, fail if there was
13624 no scalar type for it. */
13625 if (scalar_type == NULL_TREE)
13626 return NULL_TREE;
13628 /* If no prevailing mode was supplied, use the mode the target prefers.
13629 Otherwise lookup a vector mode based on the prevailing mode. */
13630 if (prevailing_mode == VOIDmode)
13632 gcc_assert (known_eq (nunits, 0U));
13633 simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode);
13634 if (SCALAR_INT_MODE_P (simd_mode))
13636 /* Traditional behavior is not to take the integer mode
13637 literally, but simply to use it as a way of determining
13638 the vector size. It is up to mode_for_vector to decide
13639 what the TYPE_MODE should be.
13641 Note that nunits == 1 is allowed in order to support single
13642 element vector types. */
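 /* For example, if the preferred SIMD mode were the 8-byte integer mode
    DImode and the element size is 2 bytes, nunits becomes 4 and
    mode_for_vector is asked for a vector mode of 4 HImode elements.  */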
13643 if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits)
13644 || !mode_for_vector (inner_mode, nunits).exists (&simd_mode))
13645 return NULL_TREE;
13648 else if (SCALAR_INT_MODE_P (prevailing_mode)
13649 || !related_vector_mode (prevailing_mode,
13650 inner_mode, nunits).exists (&simd_mode))
13652 /* Fall back to using mode_for_vector, mostly in the hope of being
13653 able to use an integer mode. */
13654 if (known_eq (nunits, 0U)
13655 && !multiple_p (GET_MODE_SIZE (prevailing_mode), nbytes, &nunits))
13656 return NULL_TREE;
13658 if (!mode_for_vector (inner_mode, nunits).exists (&simd_mode))
13659 return NULL_TREE;
13662 vectype = build_vector_type_for_mode (scalar_type, simd_mode);
13664 /* In cases where the mode was chosen by mode_for_vector, check that
13665 the target actually supports the chosen mode, or that it at least
13666 allows the vector mode to be replaced by a like-sized integer. */
13667 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
13668 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
13669 return NULL_TREE;
13671 /* Re-attach the address-space qualifier if we canonicalized the scalar
13672 type. */
13673 if (TYPE_ADDR_SPACE (orig_scalar_type) != TYPE_ADDR_SPACE (vectype))
13674 return build_qualified_type
13675 (vectype, KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (orig_scalar_type)));
13677 return vectype;
13680 /* Function get_vectype_for_scalar_type.
13682 Returns the vector type corresponding to SCALAR_TYPE as supported
13683 by the target. If GROUP_SIZE is nonzero and we're performing BB
13684 vectorization, make sure that the number of elements in the vector
13685 is no bigger than GROUP_SIZE. */
13687 tree
13688 get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type,
13689 unsigned int group_size)
13691 /* For BB vectorization, we should always have a group size once we've
13692 constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
13693 are tentative requests during things like early data reference
13694 analysis and pattern recognition. */
13695 if (is_a <bb_vec_info> (vinfo))
13696 gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
13697 else
13698 group_size = 0;
13700 tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
13701 scalar_type);
13702 if (vectype && vinfo->vector_mode == VOIDmode)
13703 vinfo->vector_mode = TYPE_MODE (vectype);
13705 /* Register the natural choice of vector type, before the group size
13706 has been applied. */
13707 if (vectype)
13708 vinfo->used_vector_modes.add (TYPE_MODE (vectype));
13710 /* If the natural choice of vector type doesn't satisfy GROUP_SIZE,
13711 try again with an explicit number of elements. */
13712 if (vectype
13713 && group_size
13714 && maybe_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size))
13716 /* Start with the biggest number of units that fits within
13717 GROUP_SIZE and halve it until we find a valid vector type.
13718 Usually either the first attempt will succeed or all will
13719 fail (in the latter case because GROUP_SIZE is too small
13720 for the target), but it's possible that a target could have
13721 a hole between supported vector types.
13723 If GROUP_SIZE is not a power of 2, this has the effect of
13724 trying the largest power of 2 that fits within the group,
13725 even though the group is not a multiple of that vector size.
13726 The BB vectorizer will then try to carve up the group into
13727 smaller pieces. */
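 /* E.g. a GROUP_SIZE of 6 first tries a 4-element vector and, if the
    target does not provide one, a 2-element vector; the remaining lanes
    are left for the BB vectorizer to split off.  */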
13728 unsigned int nunits = 1 << floor_log2 (group_size);
13731 vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
13732 scalar_type, nunits);
13733 nunits /= 2;
13735 while (nunits > 1 && !vectype);
13738 return vectype;
13741 /* Return the vector type corresponding to SCALAR_TYPE as supported
13742 by the target. NODE, if nonnull, is the SLP tree node that will
13743 use the returned vector type. */
13745 tree
13746 get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type, slp_tree node)
13748 unsigned int group_size = 0;
13749 if (node)
13750 group_size = SLP_TREE_LANES (node);
13751 return get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
13754 /* Function get_mask_type_for_scalar_type.
13756 Returns the mask type corresponding to a result of comparison
13757 of vectors of specified SCALAR_TYPE as supported by target.
13758 If GROUP_SIZE is nonzero and we're performing BB vectorization,
13759 make sure that the number of elements in the vector is no bigger
13760 than GROUP_SIZE. */
13762 tree
13763 get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type,
13764 unsigned int group_size)
13766 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
13768 if (!vectype)
13769 return NULL;
13771 return truth_type_for (vectype);
13774 /* Function get_mask_type_for_scalar_type.
13776 Returns the mask type corresponding to a result of comparison
13777 of vectors of specified SCALAR_TYPE as supported by target.
13778 NODE, if nonnull, is the SLP tree node that will use the returned
13779 vector type. */
13781 tree
13782 get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type,
13783 slp_tree node)
13785 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, node);
13787 if (!vectype)
13788 return NULL;
13790 return truth_type_for (vectype);
13793 /* Function get_same_sized_vectype
13795 Returns a vector type corresponding to SCALAR_TYPE of size
13796 VECTOR_TYPE if supported by the target. */
13798 tree
13799 get_same_sized_vectype (tree scalar_type, tree vector_type)
13801 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type))
13802 return truth_type_for (vector_type);
13804 poly_uint64 nunits;
13805 if (!multiple_p (GET_MODE_SIZE (TYPE_MODE (vector_type)),
13806 GET_MODE_SIZE (TYPE_MODE (scalar_type)), &nunits))
13807 return NULL_TREE;
13809 return get_related_vectype_for_scalar_type (TYPE_MODE (vector_type),
13810 scalar_type, nunits);
13813 /* Return true if replacing LOOP_VINFO->vector_mode with VECTOR_MODE
13814 would not change the chosen vector modes. */
13816 bool
13817 vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode)
13819 for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin ();
13820 i != vinfo->used_vector_modes.end (); ++i)
13821 if (!VECTOR_MODE_P (*i)
13822 || related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i)
13823 return false;
13824 return true;
13827 /* Function vect_is_simple_use.
13829 Input:
13830 VINFO - the vect info of the loop or basic block that is being vectorized.
13831 OPERAND - operand in the loop or bb.
13832 Output:
13833 DEF_STMT_INFO_OUT (optional) - information about the defining stmt in
13834 case OPERAND is an SSA_NAME that is defined in the vectorizable region
13835 DEF_STMT_OUT (optional) - the defining stmt in case OPERAND is an SSA_NAME;
13836 the definition could be anywhere in the function
13837 DT - the type of definition
13839 Returns whether a stmt with OPERAND can be vectorized.
13840 For loops, supportable operands are constants, loop invariants, and operands
13841 that are defined by the current iteration of the loop. Unsupportable
13842 operands are those that are defined by a previous iteration of the loop (as
13843 is the case in reduction/induction computations).
13844 For basic blocks, supportable operands are constants and bb invariants.
13845 For now, operands defined outside the basic block are not supported. */
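 /* E.g. a constant operand is classified as vect_constant_def, a default
    definition or an SSA name defined outside the vectorizable region as
    vect_external_def, and a name defined by a statement inside the region
    inherits that statement's STMT_VINFO_DEF_TYPE.  */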
13847 bool
13848 vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
13849 stmt_vec_info *def_stmt_info_out, gimple **def_stmt_out)
13851 if (def_stmt_info_out)
13852 *def_stmt_info_out = NULL;
13853 if (def_stmt_out)
13854 *def_stmt_out = NULL;
13855 *dt = vect_unknown_def_type;
13857 if (dump_enabled_p ())
13859 dump_printf_loc (MSG_NOTE, vect_location,
13860 "vect_is_simple_use: operand ");
13861 if (TREE_CODE (operand) == SSA_NAME
13862 && !SSA_NAME_IS_DEFAULT_DEF (operand))
13863 dump_gimple_expr (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (operand), 0);
13864 else
13865 dump_generic_expr (MSG_NOTE, TDF_SLIM, operand);
13868 if (CONSTANT_CLASS_P (operand))
13869 *dt = vect_constant_def;
13870 else if (is_gimple_min_invariant (operand))
13871 *dt = vect_external_def;
13872 else if (TREE_CODE (operand) != SSA_NAME)
13873 *dt = vect_unknown_def_type;
13874 else if (SSA_NAME_IS_DEFAULT_DEF (operand))
13875 *dt = vect_external_def;
13876 else
13878 gimple *def_stmt = SSA_NAME_DEF_STMT (operand);
13879 stmt_vec_info stmt_vinfo = vinfo->lookup_def (operand);
13880 if (!stmt_vinfo)
13881 *dt = vect_external_def;
13882 else
13884 stmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
13885 def_stmt = stmt_vinfo->stmt;
13886 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
13887 if (def_stmt_info_out)
13888 *def_stmt_info_out = stmt_vinfo;
13890 if (def_stmt_out)
13891 *def_stmt_out = def_stmt;
13894 if (dump_enabled_p ())
13896 dump_printf (MSG_NOTE, ", type of def: ");
13897 switch (*dt)
13899 case vect_uninitialized_def:
13900 dump_printf (MSG_NOTE, "uninitialized\n");
13901 break;
13902 case vect_constant_def:
13903 dump_printf (MSG_NOTE, "constant\n");
13904 break;
13905 case vect_external_def:
13906 dump_printf (MSG_NOTE, "external\n");
13907 break;
13908 case vect_internal_def:
13909 dump_printf (MSG_NOTE, "internal\n");
13910 break;
13911 case vect_induction_def:
13912 dump_printf (MSG_NOTE, "induction\n");
13913 break;
13914 case vect_reduction_def:
13915 dump_printf (MSG_NOTE, "reduction\n");
13916 break;
13917 case vect_double_reduction_def:
13918 dump_printf (MSG_NOTE, "double reduction\n");
13919 break;
13920 case vect_nested_cycle:
13921 dump_printf (MSG_NOTE, "nested cycle\n");
13922 break;
13923 case vect_first_order_recurrence:
13924 dump_printf (MSG_NOTE, "first order recurrence\n");
13925 break;
13926 case vect_condition_def:
13927 dump_printf (MSG_NOTE, "control flow\n");
13928 break;
13929 case vect_unknown_def_type:
13930 dump_printf (MSG_NOTE, "unknown\n");
13931 break;
13935 if (*dt == vect_unknown_def_type)
13937 if (dump_enabled_p ())
13938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13939 "Unsupported pattern.\n");
13940 return false;
13943 return true;
13946 /* Function vect_is_simple_use.
13948 Same as vect_is_simple_use but also determines the vector operand
13949 type of OPERAND and stores it to *VECTYPE. If the definition of
13950 OPERAND is vect_uninitialized_def, vect_constant_def or
13951 vect_external_def *VECTYPE will be set to NULL_TREE and the caller
13952 is responsible for computing the best suited vector type for the
13953 scalar operand. */
13955 bool
13956 vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
13957 tree *vectype, stmt_vec_info *def_stmt_info_out,
13958 gimple **def_stmt_out)
13960 stmt_vec_info def_stmt_info;
13961 gimple *def_stmt;
13962 if (!vect_is_simple_use (operand, vinfo, dt, &def_stmt_info, &def_stmt))
13963 return false;
13965 if (def_stmt_out)
13966 *def_stmt_out = def_stmt;
13967 if (def_stmt_info_out)
13968 *def_stmt_info_out = def_stmt_info;
13970 /* Now get a vector type if the def is internal, otherwise supply
13971 NULL_TREE and leave it up to the caller to figure out a proper
13972 type for the use stmt. */
13973 if (*dt == vect_internal_def
13974 || *dt == vect_induction_def
13975 || *dt == vect_reduction_def
13976 || *dt == vect_double_reduction_def
13977 || *dt == vect_nested_cycle
13978 || *dt == vect_first_order_recurrence)
13980 *vectype = STMT_VINFO_VECTYPE (def_stmt_info);
13981 gcc_assert (*vectype != NULL_TREE);
13982 if (dump_enabled_p ())
13983 dump_printf_loc (MSG_NOTE, vect_location,
13984 "vect_is_simple_use: vectype %T\n", *vectype);
13986 else if (*dt == vect_uninitialized_def
13987 || *dt == vect_constant_def
13988 || *dt == vect_external_def)
13989 *vectype = NULL_TREE;
13990 else
13991 gcc_unreachable ();
13993 return true;
13996 /* Function vect_is_simple_use.
13998 Same as vect_is_simple_use but determines the operand by operand
13999 position OPERAND from either STMT or SLP_NODE, filling in *OP
14000 and *SLP_DEF (when SLP_NODE is not NULL). */
14002 bool
14003 vect_is_simple_use (vec_info *vinfo, stmt_vec_info stmt, slp_tree slp_node,
14004 unsigned operand, tree *op, slp_tree *slp_def,
14005 enum vect_def_type *dt,
14006 tree *vectype, stmt_vec_info *def_stmt_info_out)
14008 if (slp_node)
14010 slp_tree child = SLP_TREE_CHILDREN (slp_node)[operand];
14011 *slp_def = child;
14012 *vectype = SLP_TREE_VECTYPE (child);
14013 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
14015 /* ??? VEC_PERM nodes might be intermediate and their lane values
14016 have no representative (nor do we build a VEC_PERM stmt for
14017 the actual operation). Note for two-operator nodes we set
14018 a representative but leave scalar stmts empty as we'd only
14019 have one for a subset of lanes. Ideally no caller would
14020 require *op for internal defs. */
14021 if (SLP_TREE_REPRESENTATIVE (child))
14023 *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt);
14024 return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out);
14026 else
14028 gcc_assert (SLP_TREE_CODE (child) == VEC_PERM_EXPR);
14029 *op = error_mark_node;
14030 *dt = vect_internal_def;
14031 if (def_stmt_info_out)
14032 *def_stmt_info_out = NULL;
14033 return true;
14036 else
14038 if (def_stmt_info_out)
14039 *def_stmt_info_out = NULL;
14040 *op = SLP_TREE_SCALAR_OPS (child)[0];
14041 *dt = SLP_TREE_DEF_TYPE (child);
14042 return true;
14045 else
14047 *slp_def = NULL;
14048 if (gassign *ass = dyn_cast <gassign *> (stmt->stmt))
14050 if (gimple_assign_rhs_code (ass) == COND_EXPR
14051 && COMPARISON_CLASS_P (gimple_assign_rhs1 (ass)))
14053 if (operand < 2)
14054 *op = TREE_OPERAND (gimple_assign_rhs1 (ass), operand);
14055 else
14056 *op = gimple_op (ass, operand);
14058 else if (gimple_assign_rhs_code (ass) == VIEW_CONVERT_EXPR)
14059 *op = TREE_OPERAND (gimple_assign_rhs1 (ass), 0);
14060 else
14061 *op = gimple_op (ass, operand + 1);
14063 else if (gcond *cond = dyn_cast <gcond *> (stmt->stmt))
14064 *op = gimple_op (cond, operand);
14065 else if (gcall *call = dyn_cast <gcall *> (stmt->stmt))
14066 *op = gimple_call_arg (call, operand);
14067 else
14068 gcc_unreachable ();
14069 return vect_is_simple_use (*op, vinfo, dt, vectype, def_stmt_info_out);
14073 /* If OP is not NULL and is external or constant update its vector
14074 type with VECTYPE. Returns true if successful or false if not,
14075 for example when conflicting vector types are present. */
14077 bool
14078 vect_maybe_update_slp_op_vectype (slp_tree op, tree vectype)
14080 if (!op || SLP_TREE_DEF_TYPE (op) == vect_internal_def)
14081 return true;
14082 if (SLP_TREE_VECTYPE (op))
14083 return types_compatible_p (SLP_TREE_VECTYPE (op), vectype);
14084 /* For external defs refuse to produce VECTOR_BOOLEAN_TYPE_P, those
14085 should be handled by patterns. Allow vect_constant_def for now. */
14086 if (VECTOR_BOOLEAN_TYPE_P (vectype)
14087 && SLP_TREE_DEF_TYPE (op) == vect_external_def)
14088 return false;
14089 SLP_TREE_VECTYPE (op) = vectype;
14090 return true;
14093 /* Function supportable_widening_operation
14095 Check whether an operation represented by the code CODE is a
14096 widening operation that is supported by the target platform in
14097 vector form (i.e., when operating on arguments of type VECTYPE_IN
14098 producing a result of type VECTYPE_OUT).
14100 Widening operations we currently support are NOP (CONVERT), FLOAT,
14101 FIX_TRUNC and WIDEN_MULT. This function checks if these operations
14102 are supported by the target platform either directly (via vector
14103 tree-codes), or via target builtins.
14105 Output:
14106 - CODE1 and CODE2 are codes of vector operations to be used when
14107 vectorizing the operation, if available.
14108 - MULTI_STEP_CVT determines the number of required intermediate steps in
14109 case of multi-step conversion (like char->short->int - in that case
14110 MULTI_STEP_CVT will be 1).
14111 - INTERM_TYPES contains the intermediate type required to perform the
14112 widening operation (short in the above example). */
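/* Illustrative example (a sketch, not an additional contract): for the
   char->short->int conversion mentioned above, on a target that only
   provides QI->HI and HI->SI unpack patterns, a successful call would
   set *CODE1 = VEC_UNPACK_LO_EXPR and *CODE2 = VEC_UNPACK_HI_EXPR
   (swapped on big-endian targets), set *MULTI_STEP_CVT = 1, and push the
   short vector type onto INTERM_TYPES as the single intermediate step.  */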
14114 bool
14115 supportable_widening_operation (vec_info *vinfo,
14116 code_helper code,
14117 stmt_vec_info stmt_info,
14118 tree vectype_out, tree vectype_in,
14119 code_helper *code1,
14120 code_helper *code2,
14121 int *multi_step_cvt,
14122 vec<tree> *interm_types)
14124 loop_vec_info loop_info = dyn_cast <loop_vec_info> (vinfo);
14125 class loop *vect_loop = NULL;
14126 machine_mode vec_mode;
14127 enum insn_code icode1, icode2;
14128 optab optab1 = unknown_optab, optab2 = unknown_optab;
14129 tree vectype = vectype_in;
14130 tree wide_vectype = vectype_out;
14131 tree_code c1 = MAX_TREE_CODES, c2 = MAX_TREE_CODES;
14132 int i;
14133 tree prev_type, intermediate_type;
14134 machine_mode intermediate_mode, prev_mode;
14135 optab optab3, optab4;
14137 *multi_step_cvt = 0;
14138 if (loop_info)
14139 vect_loop = LOOP_VINFO_LOOP (loop_info);
14141 switch (code.safe_as_tree_code ())
14143 case MAX_TREE_CODES:
14144 /* Don't set c1 and c2 if code is not a tree_code. */
14145 break;
14147 case WIDEN_MULT_EXPR:
14148 /* The result of a vectorized widening operation usually requires
14149 two vectors (because the widened results do not fit into one vector).
14150 The generated vector results would normally be expected to be
14151 generated in the same order as in the original scalar computation,
14152 i.e. if 8 results are generated in each vector iteration, they are
14153 to be organized as follows:
14154 vect1: [res1,res2,res3,res4],
14155 vect2: [res5,res6,res7,res8].
14157 However, in the special case that the result of the widening
14158 operation is used in a reduction computation only, the order doesn't
14159 matter (because when vectorizing a reduction we change the order of
14160 the computation). Some targets can take advantage of this and
14161 generate more efficient code. For example, targets like Altivec,
14162 that support widen_mult using a sequence of {mult_even,mult_odd}
14163 generate the following vectors:
14164 vect1: [res1,res3,res5,res7],
14165 vect2: [res2,res4,res6,res8].
14167 When vectorizing outer-loops, we execute the inner-loop sequentially
14168 (each vectorized inner-loop iteration contributes to VF outer-loop
14169 iterations in parallel).  We therefore don't allow changing the
14170 order of the computation in the inner-loop during outer-loop
14171 vectorization. */
14172 /* TODO: Another case in which order doesn't *really* matter is when we
14173 widen and then contract again, e.g. (short)((int)x * y >> 8).
14174 Normally, pack_trunc performs an even/odd permute, whereas the
14175 repack from an even/odd expansion would be an interleave, which
14176 would be significantly simpler for e.g. AVX2. */
14177 /* In any case, in order to avoid duplicating the code below, recurse
14178 on VEC_WIDEN_MULT_EVEN_EXPR. If it succeeds, all the return values
14179 are properly set up for the caller. If we fail, we'll continue with
14180 a VEC_WIDEN_MULT_LO/HI_EXPR check. */
14181 if (vect_loop
14182 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
14183 && !nested_in_vect_loop_p (vect_loop, stmt_info)
14184 && supportable_widening_operation (vinfo, VEC_WIDEN_MULT_EVEN_EXPR,
14185 stmt_info, vectype_out,
14186 vectype_in, code1,
14187 code2, multi_step_cvt,
14188 interm_types))
14190 /* Elements in a vector with vect_used_by_reduction property cannot
14191 be reordered if the use chain with this property does not have the
14192 same operation.  One such example is s += a * b, where elements
14193 in a and b cannot be reordered. Here we check if the vector defined
14194 by STMT is only directly used in the reduction statement. */
14195 tree lhs = gimple_assign_lhs (stmt_info->stmt);
14196 stmt_vec_info use_stmt_info = loop_info->lookup_single_use (lhs);
14197 if (use_stmt_info
14198 && STMT_VINFO_DEF_TYPE (use_stmt_info) == vect_reduction_def)
14199 return true;
14201 c1 = VEC_WIDEN_MULT_LO_EXPR;
14202 c2 = VEC_WIDEN_MULT_HI_EXPR;
14203 break;
14205 case DOT_PROD_EXPR:
14206 c1 = DOT_PROD_EXPR;
14207 c2 = DOT_PROD_EXPR;
14208 break;
14210 case SAD_EXPR:
14211 c1 = SAD_EXPR;
14212 c2 = SAD_EXPR;
14213 break;
14215 case VEC_WIDEN_MULT_EVEN_EXPR:
14216 /* Support the recursion induced just above. */
14217 c1 = VEC_WIDEN_MULT_EVEN_EXPR;
14218 c2 = VEC_WIDEN_MULT_ODD_EXPR;
14219 break;
14221 case WIDEN_LSHIFT_EXPR:
14222 c1 = VEC_WIDEN_LSHIFT_LO_EXPR;
14223 c2 = VEC_WIDEN_LSHIFT_HI_EXPR;
14224 break;
14226 CASE_CONVERT:
14227 c1 = VEC_UNPACK_LO_EXPR;
14228 c2 = VEC_UNPACK_HI_EXPR;
14229 break;
14231 case FLOAT_EXPR:
14232 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
14233 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
14234 break;
14236 case FIX_TRUNC_EXPR:
14237 c1 = VEC_UNPACK_FIX_TRUNC_LO_EXPR;
14238 c2 = VEC_UNPACK_FIX_TRUNC_HI_EXPR;
14239 break;
14241 default:
14242 gcc_unreachable ();
14245 if (BYTES_BIG_ENDIAN && c1 != VEC_WIDEN_MULT_EVEN_EXPR)
14246 std::swap (c1, c2);
14248 if (code == FIX_TRUNC_EXPR)
14250 /* The signedness is determined from output operand. */
14251 optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14252 optab2 = optab_for_tree_code (c2, vectype_out, optab_default);
14254 else if (CONVERT_EXPR_CODE_P (code.safe_as_tree_code ())
14255 && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14256 && VECTOR_BOOLEAN_TYPE_P (vectype)
14257 && TYPE_MODE (wide_vectype) == TYPE_MODE (vectype)
14258 && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
14260 /* If the input and result modes are the same, a different optab
14261 is needed where we pass in the number of units in vectype. */
14262 optab1 = vec_unpacks_sbool_lo_optab;
14263 optab2 = vec_unpacks_sbool_hi_optab;
14266 vec_mode = TYPE_MODE (vectype);
14267 if (widening_fn_p (code))
14269 /* If this is an internal fn then we must check whether the target
14270 supports either a low-high split or an even-odd split. */
14271 internal_fn ifn = as_internal_fn ((combined_fn) code);
14273 internal_fn lo, hi, even, odd;
14274 lookup_hilo_internal_fn (ifn, &lo, &hi);
14275 *code1 = as_combined_fn (lo);
14276 *code2 = as_combined_fn (hi);
14277 optab1 = direct_internal_fn_optab (lo, {vectype, vectype});
14278 optab2 = direct_internal_fn_optab (hi, {vectype, vectype});
14280 /* If we don't support low-high, then check for even-odd. */
14281 if (!optab1
14282 || (icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing
14283 || !optab2
14284 || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
14286 lookup_evenodd_internal_fn (ifn, &even, &odd);
14287 *code1 = as_combined_fn (even);
14288 *code2 = as_combined_fn (odd);
14289 optab1 = direct_internal_fn_optab (even, {vectype, vectype});
14290 optab2 = direct_internal_fn_optab (odd, {vectype, vectype});
14293 else if (code.is_tree_code ())
14295 if (code == FIX_TRUNC_EXPR)
14297 /* The signedness is determined from output operand. */
14298 optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14299 optab2 = optab_for_tree_code (c2, vectype_out, optab_default);
14301 else if (CONVERT_EXPR_CODE_P ((tree_code) code.safe_as_tree_code ())
14302 && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14303 && VECTOR_BOOLEAN_TYPE_P (vectype)
14304 && TYPE_MODE (wide_vectype) == TYPE_MODE (vectype)
14305 && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
14307 /* If the input and result modes are the same, a different optab
14308 is needed where we pass in the number of units in vectype. */
14309 optab1 = vec_unpacks_sbool_lo_optab;
14310 optab2 = vec_unpacks_sbool_hi_optab;
14312 else
14314 optab1 = optab_for_tree_code (c1, vectype, optab_default);
14315 optab2 = optab_for_tree_code (c2, vectype, optab_default);
14317 *code1 = c1;
14318 *code2 = c2;
14321 if (!optab1 || !optab2)
14322 return false;
14324 if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing
14325 || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
14326 return false;
14329 if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
14330 && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
14332 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14333 return true;
14334 /* For scalar masks we may have different boolean
14335 vector types having the same QImode. Thus we
14336 additionally check the number of elements.  */
14337 if (known_eq (TYPE_VECTOR_SUBPARTS (vectype),
14338 TYPE_VECTOR_SUBPARTS (wide_vectype) * 2))
14339 return true;
14342 /* Check if it's a multi-step conversion that can be done using intermediate
14343 types. */
14345 prev_type = vectype;
14346 prev_mode = vec_mode;
14348 if (!CONVERT_EXPR_CODE_P (code.safe_as_tree_code ()))
14349 return false;
14351 /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
14352 intermediate steps in the promotion sequence.  We try
14353 MAX_INTERM_CVT_STEPS to get to WIDE_VECTYPE, and fail if we do
14354 not. */
14355 interm_types->create (MAX_INTERM_CVT_STEPS);
14356 for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
14358 intermediate_mode = insn_data[icode1].operand[0].mode;
14359 if (VECTOR_BOOLEAN_TYPE_P (prev_type))
14360 intermediate_type
14361 = vect_halve_mask_nunits (prev_type, intermediate_mode);
14362 else if (VECTOR_MODE_P (intermediate_mode))
14364 tree intermediate_element_type
14365 = lang_hooks.types.type_for_mode (GET_MODE_INNER (intermediate_mode),
14366 TYPE_UNSIGNED (prev_type));
14367 intermediate_type
14368 = build_vector_type_for_mode (intermediate_element_type,
14369 intermediate_mode);
14371 else
14372 intermediate_type
14373 = lang_hooks.types.type_for_mode (intermediate_mode,
14374 TYPE_UNSIGNED (prev_type));
14376 if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
14377 && VECTOR_BOOLEAN_TYPE_P (prev_type)
14378 && intermediate_mode == prev_mode
14379 && SCALAR_INT_MODE_P (prev_mode))
14381 /* If the input and result modes are the same, a different optab
14382 is needed where we pass in the number of units in vectype. */
14383 optab3 = vec_unpacks_sbool_lo_optab;
14384 optab4 = vec_unpacks_sbool_hi_optab;
14386 else
14388 optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
14389 optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
14392 if (!optab3 || !optab4
14393 || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
14394 || insn_data[icode1].operand[0].mode != intermediate_mode
14395 || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
14396 || insn_data[icode2].operand[0].mode != intermediate_mode
14397 || ((icode1 = optab_handler (optab3, intermediate_mode))
14398 == CODE_FOR_nothing)
14399 || ((icode2 = optab_handler (optab4, intermediate_mode))
14400 == CODE_FOR_nothing))
14401 break;
14403 interm_types->quick_push (intermediate_type);
14404 (*multi_step_cvt)++;
14406 if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
14407 && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
14409 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14410 return true;
14411 if (known_eq (TYPE_VECTOR_SUBPARTS (intermediate_type),
14412 TYPE_VECTOR_SUBPARTS (wide_vectype) * 2))
14413 return true;
14416 prev_type = intermediate_type;
14417 prev_mode = intermediate_mode;
14420 interm_types->release ();
14421 return false;
14425 /* Function supportable_narrowing_operation
14427 Check whether an operation represented by the code CODE is a
14428 narrowing operation that is supported by the target platform in
14429 vector form (i.e., when operating on arguments of type VECTYPE_IN
14430 and producing a result of type VECTYPE_OUT).
14432 Narrowing operations we currently support are NOP (CONVERT), FIX_TRUNC
14433 and FLOAT. This function checks if these operations are supported by
14434 the target platform directly via vector tree-codes.
14436 Output:
14437 - CODE1 is the code of a vector operation to be used when
14438 vectorizing the operation, if available.
14439 - MULTI_STEP_CVT determines the number of required intermediate steps in
14440 case of multi-step conversion (like int->short->char - in that case
14441 MULTI_STEP_CVT will be 1).
14442 - INTERM_TYPES contains the intermediate type required to perform the
14443 narrowing operation (short in the above example). */
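/* Illustrative example (a sketch, not an additional contract): for the
   int->short->char conversion mentioned above, on a target that only
   provides SI->HI and HI->QI pack patterns, a successful call would set
   *CODE1 = VEC_PACK_TRUNC_EXPR, set *MULTI_STEP_CVT = 1, and push the
   short vector type onto INTERM_TYPES as the single intermediate step.  */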
14445 bool
14446 supportable_narrowing_operation (code_helper code,
14447 tree vectype_out, tree vectype_in,
14448 code_helper *code1, int *multi_step_cvt,
14449 vec<tree> *interm_types)
14451 machine_mode vec_mode;
14452 enum insn_code icode1;
14453 optab optab1, interm_optab;
14454 tree vectype = vectype_in;
14455 tree narrow_vectype = vectype_out;
14456 enum tree_code c1;
14457 tree intermediate_type, prev_type;
14458 machine_mode intermediate_mode, prev_mode;
14459 int i;
14460 unsigned HOST_WIDE_INT n_elts;
14461 bool uns;
14463 if (!code.is_tree_code ())
14464 return false;
14466 *multi_step_cvt = 0;
14467 switch ((tree_code) code)
14469 CASE_CONVERT:
14470 c1 = VEC_PACK_TRUNC_EXPR;
14471 if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
14472 && VECTOR_BOOLEAN_TYPE_P (vectype)
14473 && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
14474 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
14475 && n_elts < BITS_PER_UNIT)
14476 optab1 = vec_pack_sbool_trunc_optab;
14477 else
14478 optab1 = optab_for_tree_code (c1, vectype, optab_default);
14479 break;
14481 case FIX_TRUNC_EXPR:
14482 c1 = VEC_PACK_FIX_TRUNC_EXPR;
14483 /* The signedness is determined from output operand. */
14484 optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14485 break;
14487 case FLOAT_EXPR:
14488 c1 = VEC_PACK_FLOAT_EXPR;
14489 optab1 = optab_for_tree_code (c1, vectype, optab_default);
14490 break;
14492 default:
14493 gcc_unreachable ();
14496 if (!optab1)
14497 return false;
14499 vec_mode = TYPE_MODE (vectype);
14500 if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing)
14501 return false;
14503 *code1 = c1;
14505 if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
14507 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14508 return true;
14509 /* For scalar masks we may have different boolean
14510 vector types having the same QImode. Thus we
14511 additionally check the number of elements.  */
14512 if (known_eq (TYPE_VECTOR_SUBPARTS (vectype) * 2,
14513 TYPE_VECTOR_SUBPARTS (narrow_vectype)))
14514 return true;
14517 if (code == FLOAT_EXPR)
14518 return false;
14520 /* Check if it's a multi-step conversion that can be done using intermediate
14521 types. */
14522 prev_mode = vec_mode;
14523 prev_type = vectype;
14524 if (code == FIX_TRUNC_EXPR)
14525 uns = TYPE_UNSIGNED (vectype_out);
14526 else
14527 uns = TYPE_UNSIGNED (vectype);
14529 /* For multi-step FIX_TRUNC_EXPR prefer signed floating to integer
14530 conversion over unsigned, as unsigned FIX_TRUNC_EXPR is often more
14531 costly than signed. */
14532 if (code == FIX_TRUNC_EXPR && uns)
14534 enum insn_code icode2;
14536 intermediate_type
14537 = lang_hooks.types.type_for_mode (TYPE_MODE (vectype_out), 0);
14538 interm_optab
14539 = optab_for_tree_code (c1, intermediate_type, optab_default);
14540 if (interm_optab != unknown_optab
14541 && (icode2 = optab_handler (optab1, vec_mode)) != CODE_FOR_nothing
14542 && insn_data[icode1].operand[0].mode
14543 == insn_data[icode2].operand[0].mode)
14545 uns = false;
14546 optab1 = interm_optab;
14547 icode1 = icode2;
14551 /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
14552 intermediate steps in the narrowing sequence.  We try
14553 MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do not. */
14554 interm_types->create (MAX_INTERM_CVT_STEPS);
14555 for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
14557 intermediate_mode = insn_data[icode1].operand[0].mode;
14558 if (VECTOR_BOOLEAN_TYPE_P (prev_type))
14559 intermediate_type
14560 = vect_double_mask_nunits (prev_type, intermediate_mode);
14561 else
14562 intermediate_type
14563 = lang_hooks.types.type_for_mode (intermediate_mode, uns);
14564 if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
14565 && VECTOR_BOOLEAN_TYPE_P (prev_type)
14566 && SCALAR_INT_MODE_P (prev_mode)
14567 && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
14568 && n_elts < BITS_PER_UNIT)
14569 interm_optab = vec_pack_sbool_trunc_optab;
14570 else
14571 interm_optab
14572 = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, intermediate_type,
14573 optab_default);
14574 if (!interm_optab
14575 || ((icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing)
14576 || insn_data[icode1].operand[0].mode != intermediate_mode
14577 || ((icode1 = optab_handler (interm_optab, intermediate_mode))
14578 == CODE_FOR_nothing))
14579 break;
14581 interm_types->quick_push (intermediate_type);
14582 (*multi_step_cvt)++;
14584 if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
14586 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14587 return true;
14588 if (known_eq (TYPE_VECTOR_SUBPARTS (intermediate_type) * 2,
14589 TYPE_VECTOR_SUBPARTS (narrow_vectype)))
14590 return true;
14593 prev_mode = intermediate_mode;
14594 prev_type = intermediate_type;
14595 optab1 = interm_optab;
14598 interm_types->release ();
14599 return false;
14602 /* Function supportable_indirect_convert_operation
14604 Check whether an operation represented by the code CODE can be performed as a
14605 single operation or as a sequence of operations supported by the target platform in
14606 vector form (i.e., when operating on arguments of type VECTYPE_IN
14607 producing a result of type VECTYPE_OUT).
14609 Convert operations we currently support directly are FIX_TRUNC and FLOAT.
14610 This function checks if these operations are supported
14611 by the target platform directly (via vector tree-codes).
14613 Output:
14614 - CONVERTS contains the pairs needed to perform the conversion;
14615 each pair's first element is the intermediate type, and its second is the code of
14616 the vector operation to be used when converting from the
14617 previous type to that intermediate type.  */
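/* Illustrative example (hypothetical target support): a FIX_TRUNC_EXPR
   from a vector of doubles to a same-lane-count vector of 32-bit ints
   with no direct pattern may be handled via a signed 64-bit integer
   intermediate (and only when -fno-trapping-math allows it); CONVERTS
   would then hold the pair (64-bit int vector type, code chosen for the
   FIX_TRUNC_EXPR step) followed by (VECTYPE_OUT, code chosen for the
   NOP_EXPR step).  */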
14618 bool
14619 supportable_indirect_convert_operation (code_helper code,
14620 tree vectype_out,
14621 tree vectype_in,
14622 vec<std::pair<tree, tree_code> > *converts,
14623 tree op0)
14625 bool found_mode = false;
14626 scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out));
14627 scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in));
14628 opt_scalar_mode mode_iter;
14629 tree_code tc1, tc2, code1, code2;
14631 tree cvt_type = NULL_TREE;
14632 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in);
14634 if (supportable_convert_operation ((tree_code) code,
14635 vectype_out,
14636 vectype_in,
14637 &tc1))
14639 converts->safe_push (std::make_pair (vectype_out, tc1));
14640 return true;
14643 /* For conversions between float and integer types try whether
14644 we can use intermediate signed integer types to support the
14645 conversion. */
14646 if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
14647 && (code == FLOAT_EXPR
14648 || (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
14650 bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
14651 bool float_expr_p = code == FLOAT_EXPR;
14652 unsigned short target_size;
14653 scalar_mode intermediate_mode;
14654 if (demotion)
14656 intermediate_mode = lhs_mode;
14657 target_size = GET_MODE_SIZE (rhs_mode);
14659 else
14661 target_size = GET_MODE_SIZE (lhs_mode);
14662 if (!int_mode_for_size
14663 (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
14664 return false;
14666 code1 = float_expr_p ? (tree_code) code : NOP_EXPR;
14667 code2 = float_expr_p ? NOP_EXPR : (tree_code) code;
14668 opt_scalar_mode mode_iter;
14669 FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
14671 intermediate_mode = mode_iter.require ();
14673 if (GET_MODE_SIZE (intermediate_mode) > target_size)
14674 break;
14676 scalar_mode cvt_mode;
14677 if (!int_mode_for_size
14678 (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
14679 break;
14681 cvt_type = build_nonstandard_integer_type
14682 (GET_MODE_BITSIZE (cvt_mode), 0);
14684 /* Check if the intermediate type can hold OP0's range.
14685 When converting from float to integer this is not necessary
14686 because values that do not fit the (smaller) target type are
14687 unspecified anyway. */
14688 if (demotion && float_expr_p)
14690 wide_int op_min_value, op_max_value;
14691 /* In its vector form, op0 does not currently have RANGE_INFO.
14692 If that is supported in the future, this part may need changes,
14693 such as checking the range of each element
14694 in the vector.  */
14695 if ((TREE_CODE (op0) == SSA_NAME && !SSA_NAME_RANGE_INFO (op0))
14696 || !vect_get_range_info (op0, &op_min_value, &op_max_value))
14697 break;
14699 if (cvt_type == NULL_TREE
14700 || (wi::min_precision (op_max_value, SIGNED)
14701 > TYPE_PRECISION (cvt_type))
14702 || (wi::min_precision (op_min_value, SIGNED)
14703 > TYPE_PRECISION (cvt_type)))
14704 continue;
14707 cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in),
14708 cvt_type,
14709 nelts);
14710 /* This should only happen for SLP as long as the loop vectorizer
14711 only supports same-sized vectors.  */
14712 if (cvt_type == NULL_TREE
14713 || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts)
14714 || !supportable_convert_operation ((tree_code) code1,
14715 vectype_out,
14716 cvt_type, &tc1)
14717 || !supportable_convert_operation ((tree_code) code2,
14718 cvt_type,
14719 vectype_in, &tc2))
14720 continue;
14722 found_mode = true;
14723 break;
14726 if (found_mode)
14728 converts->safe_push (std::make_pair (cvt_type, tc2));
14729 if (TYPE_MODE (cvt_type) != TYPE_MODE (vectype_out))
14730 converts->safe_push (std::make_pair (vectype_out, tc1));
14731 return true;
14734 return false;
14737 /* Generate and return a vector mask of MASK_TYPE such that
14738 mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
14739 Add the statements to SEQ. */
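/* For example (a sketch of the semantics above): with a 4-lane MASK_TYPE,
   START_INDEX = i and END_INDEX = n, the IFN_WHILE_ULT call built below
   yields the mask { i < n, i + 1 < n, i + 2 < n, i + 3 < n }.  */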
14741 tree
14742 vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
14743 tree end_index, const char *name)
14745 tree cmp_type = TREE_TYPE (start_index);
14746 gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
14747 cmp_type, mask_type,
14748 OPTIMIZE_FOR_SPEED));
14749 gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
14750 start_index, end_index,
14751 build_zero_cst (mask_type));
14752 tree tmp;
14753 if (name)
14754 tmp = make_temp_ssa_name (mask_type, NULL, name);
14755 else
14756 tmp = make_ssa_name (mask_type);
14757 gimple_call_set_lhs (call, tmp);
14758 gimple_seq_add_stmt (seq, call);
14759 return tmp;
14762 /* Generate a vector mask of type MASK_TYPE for which index I is false iff
14763 J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */
14765 tree
14766 vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
14767 tree end_index)
14769 tree tmp = vect_gen_while (seq, mask_type, start_index, end_index);
14770 return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp);
14773 /* Try to compute the vector types required to vectorize STMT_INFO,
14774 returning true on success and false if vectorization isn't possible.
14775 If GROUP_SIZE is nonzero and we're performing BB vectorization,
14776 make sure that the number of elements in the vectors is no bigger
14777 than GROUP_SIZE.
14779 On success:
14781 - Set *STMT_VECTYPE_OUT to:
14782 - NULL_TREE if the statement doesn't need to be vectorized;
14783 - the equivalent of STMT_VINFO_VECTYPE otherwise.
14785 - Set *NUNITS_VECTYPE_OUT to the vector type that contains the maximum
14786 number of units needed to vectorize STMT_INFO, or NULL_TREE if the
14787 statement does not help to determine the overall number of units. */
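/* Illustrative example (a sketch): for a widening statement such as
   int_x = (int) short_y, *STMT_VECTYPE_OUT would be the int vector type
   chosen for the statement, while *NUNITS_VECTYPE_OUT would be the
   vector type of the smallest scalar type involved (here short), since
   that type determines the number of units needed.  */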
14789 opt_result
14790 vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
14791 tree *stmt_vectype_out,
14792 tree *nunits_vectype_out,
14793 unsigned int group_size)
14795 gimple *stmt = stmt_info->stmt;
14797 /* For BB vectorization, we should always have a group size once we've
14798 constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
14799 are tentative requests during things like early data reference
14800 analysis and pattern recognition. */
14801 if (is_a <bb_vec_info> (vinfo))
14802 gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
14803 else
14804 group_size = 0;
14806 *stmt_vectype_out = NULL_TREE;
14807 *nunits_vectype_out = NULL_TREE;
14809 if (gimple_get_lhs (stmt) == NULL_TREE
14810 /* Allow vector conditionals through here. */
14811 && !is_a <gcond *> (stmt)
14812 /* MASK_STORE has no lhs, but is ok. */
14813 && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
14815 if (is_a <gcall *> (stmt))
14817 /* Ignore calls with no lhs. These must be calls to
14818 #pragma omp simd functions, and the vectorization factor
14819 they really need can't be determined until
14820 vectorizable_simd_clone_call. */
14821 if (dump_enabled_p ())
14822 dump_printf_loc (MSG_NOTE, vect_location,
14823 "defer to SIMD clone analysis.\n");
14824 return opt_result::success ();
14827 return opt_result::failure_at (stmt,
14828 "not vectorized: irregular stmt: %G", stmt);
14831 tree vectype;
14832 tree scalar_type = NULL_TREE;
14833 if (group_size == 0 && STMT_VINFO_VECTYPE (stmt_info))
14835 vectype = STMT_VINFO_VECTYPE (stmt_info);
14836 if (dump_enabled_p ())
14837 dump_printf_loc (MSG_NOTE, vect_location,
14838 "precomputed vectype: %T\n", vectype);
14840 else if (vect_use_mask_type_p (stmt_info))
14842 unsigned int precision = stmt_info->mask_precision;
14843 scalar_type = build_nonstandard_integer_type (precision, 1);
14844 vectype = get_mask_type_for_scalar_type (vinfo, scalar_type, group_size);
14845 if (!vectype)
14846 return opt_result::failure_at (stmt, "not vectorized: unsupported"
14847 " data-type %T\n", scalar_type);
14848 if (dump_enabled_p ())
14849 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
14851 else
14853 /* If we got here with a gcond, it means that the target had no available vector
14854 mode for the scalar type.  We can't vectorize, so abort.  */
14855 if (is_a <gcond *> (stmt))
14856 return opt_result::failure_at (stmt,
14857 "not vectorized:"
14858 " unsupported data-type for gcond %T\n",
14859 scalar_type);
14861 if (data_reference *dr = STMT_VINFO_DATA_REF (stmt_info))
14862 scalar_type = TREE_TYPE (DR_REF (dr));
14863 else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
14864 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
14865 else
14866 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
14868 if (dump_enabled_p ())
14870 if (group_size)
14871 dump_printf_loc (MSG_NOTE, vect_location,
14872 "get vectype for scalar type (group size %d):"
14873 " %T\n", group_size, scalar_type);
14874 else
14875 dump_printf_loc (MSG_NOTE, vect_location,
14876 "get vectype for scalar type: %T\n", scalar_type);
14878 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
14879 if (!vectype)
14880 return opt_result::failure_at (stmt,
14881 "not vectorized:"
14882 " unsupported data-type %T\n",
14883 scalar_type);
14885 if (dump_enabled_p ())
14886 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
14889 if (scalar_type && VECTOR_MODE_P (TYPE_MODE (scalar_type)))
14890 return opt_result::failure_at (stmt,
14891 "not vectorized: vector stmt in loop:%G",
14892 stmt);
14894 *stmt_vectype_out = vectype;
14896 /* Don't try to compute scalar types if the stmt produces a boolean
14897 vector; use the existing vector type instead. */
14898 tree nunits_vectype = vectype;
14899 if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14901 /* The number of units is set according to the smallest scalar
14902 type (or the largest vector size, but we only support one
14903 vector size per vectorization). */
14904 scalar_type = vect_get_smallest_scalar_type (stmt_info,
14905 TREE_TYPE (vectype));
14906 if (!types_compatible_p (scalar_type, TREE_TYPE (vectype)))
14908 if (dump_enabled_p ())
14909 dump_printf_loc (MSG_NOTE, vect_location,
14910 "get vectype for smallest scalar type: %T\n",
14911 scalar_type);
14912 nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
14913 group_size);
14914 if (!nunits_vectype)
14915 return opt_result::failure_at
14916 (stmt, "not vectorized: unsupported data-type %T\n",
14917 scalar_type);
14918 if (dump_enabled_p ())
14919 dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
14920 nunits_vectype);
14924 if (!multiple_p (TYPE_VECTOR_SUBPARTS (nunits_vectype),
14925 TYPE_VECTOR_SUBPARTS (*stmt_vectype_out)))
14926 return opt_result::failure_at (stmt,
14927 "Not vectorized: Incompatible number "
14928 "of vector subparts between %T and %T\n",
14929 nunits_vectype, *stmt_vectype_out);
14931 if (dump_enabled_p ())
14933 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
14934 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (nunits_vectype));
14935 dump_printf (MSG_NOTE, "\n");
14938 *nunits_vectype_out = nunits_vectype;
14939 return opt_result::success ();
14942 /* Generate and return statement sequence that sets vector length LEN that is:
14944 min_of_start_and_end = min (START_INDEX, END_INDEX);
14945 left_len = END_INDEX - min_of_start_and_end;
14946 rhs = min (left_len, LEN_LIMIT);
14947 LEN = rhs;
14949 Note: the cost of the code generated by this function is modeled
14950 by vect_estimate_min_profitable_iters, so changes here may need
14951 corresponding changes there. */
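/* Worked example (a sketch): with START_INDEX = i, END_INDEX = n and
   LEN_LIMIT = 16, the generated sequence computes
   LEN = MIN (n - MIN (i, n), 16), i.e. the number of remaining elements
   capped at the limit.  */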
14953 gimple_seq
14954 vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
14956 gimple_seq stmts = NULL;
14957 tree len_type = TREE_TYPE (len);
14958 gcc_assert (TREE_TYPE (start_index) == len_type);
14960 tree min = gimple_build (&stmts, MIN_EXPR, len_type, start_index, end_index);
14961 tree left_len = gimple_build (&stmts, MINUS_EXPR, len_type, end_index, min);
14962 tree rhs = gimple_build (&stmts, MIN_EXPR, len_type, left_len, len_limit);
14963 gimple* stmt = gimple_build_assign (len, rhs);
14964 gimple_seq_add_stmt (&stmts, stmt);
14966 return stmts;