gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP-only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
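/* A minimal sketch of the allocation and lifetime scheme above
   (hypothetical driver code, for illustration only): every _slp_tree is
   carved out of slp_tree_pool and threaded onto the slp_first_node list
   by the constructor, so vect_slp_fini can reclaim whatever is still live:

     vect_slp_init ();
     slp_tree n = new _slp_tree;   // allocate_raw + link into the list
     vect_free_slp_tree (n);       // recursively drops children, deletes N
                                   // once SLP_TREE_REF_COUNT reaches zero
     vect_slp_fini ();             // deletes any remaining nodes, then
                                   // releases the pool itself.  */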
194 /* Return a location suitable for dumps related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
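/* Illustration of the structure above (hypothetical two-lane group, not
   from the source): for the scalar stmts { x0 = a0 + b0, x1 = a1 + b1 }
   vect_create_oprnd_info (2, 2) yields two slots; during matching, slot 0
   accumulates the defs and ops of { a0, a1 } and slot 1 those of
   { b0, b1 }, with first_dt/first_op_type recorded from lane 0 and then
   checked against the remaining lanes.  */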
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
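/* Worked example for the search above (assuming a target with 128-bit
   vectors such as V16QI/V4SI): for count == 4 and a 1-byte elt_type,
   elt_bytes is 4, so the first iteration tries SImode.  vector_type
   becomes V4SI, half_nelts is 2, and sel1/sel2 describe the low/high
   interleaving permutes { 0, 4, 1, 5, ... } and { 2, 6, 3, 7, ... }.
   If the target supports both permutes the function succeeds with
   *nvectors_out == 1 and *vector_type_out == V4SI; otherwise elt_bytes
   is halved and the search retries with twice as many intermediate
   vectors.  */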
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg0_map[] = { 1, 0 };
509 static const int arg1_map[] = { 1, 1 };
510 static const int arg2_map[] = { 1, 2 };
511 static const int arg1_arg4_map[] = { 2, 1, 4 };
512 static const int arg3_arg2_map[] = { 2, 3, 2 };
513 static const int op1_op0_map[] = { 2, 1, 0 };
514 static const int off_map[] = { 1, -3 };
515 static const int off_op0_map[] = { 2, -3, 0 };
516 static const int off_arg2_map[] = { 2, -3, 2 };
517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518 static const int mask_call_maps[6][7] = {
519 { 1, 1, },
520 { 2, 1, 2, },
521 { 3, 1, 2, 3, },
522 { 4, 1, 2, 3, 4, },
523 { 5, 1, 2, 3, 4, 5, },
524 { 6, 1, 2, 3, 4, 5, 6 },
527 /* For most SLP statements, there is a one-to-one mapping between
528 gimple arguments and child nodes. If that is not true for STMT,
529 return an array that contains:
531 - the number of child nodes, followed by
532 - for each child node, the index of the argument associated with that node.
533 The special index -1 is the first operand of an embedded comparison and
534 the special index -2 is the second operand of an embedded comparison.
535 The special index -3 is the offset of a gather as analyzed by
536 vect_check_gather_scatter.
538 SWAP is as for vect_get_and_check_slp_defs. */
540 static const int *
541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 unsigned char swap = 0)
544 if (auto assign = dyn_cast<const gassign *> (stmt))
546 if (gimple_assign_rhs_code (assign) == COND_EXPR
547 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 return cond_expr_maps[swap];
549 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 && swap)
551 return op1_op0_map;
552 if (gather_scatter_p)
553 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 ? off_op0_map : off_map);
556 gcc_assert (!swap);
557 if (auto call = dyn_cast<const gcall *> (stmt))
559 if (gimple_call_internal_p (call))
560 switch (gimple_call_internal_fn (call))
562 case IFN_MASK_LOAD:
563 return gather_scatter_p ? off_arg2_map : arg2_map;
565 case IFN_GATHER_LOAD:
566 return arg1_map;
568 case IFN_MASK_GATHER_LOAD:
569 case IFN_MASK_LEN_GATHER_LOAD:
570 return arg1_arg4_map;
572 case IFN_MASK_STORE:
573 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
575 case IFN_MASK_CALL:
577 unsigned nargs = gimple_call_num_args (call);
578 if (nargs >= 2 && nargs <= 7)
579 return mask_call_maps[nargs-2];
580 else
581 return nullptr;
584 case IFN_CLZ:
585 case IFN_CTZ:
586 return arg0_map;
588 default:
589 break;
592 return nullptr;
595 /* Return the SLP node child index for operand OP of STMT. */
598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 bool gather_scatter_p)
601 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 if (!opmap)
603 return op;
604 for (int i = 1; i < 1 + opmap[0]; ++i)
605 if (opmap[i] == op)
606 return i - 1;
607 gcc_unreachable ();
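/* Example of the operand-map encoding used above: arg3_arg2_map is
   { 2, 3, 2 }, so an IFN_MASK_STORE gets two SLP children, child 0 built
   from call argument 3 (the stored value) and child 1 from argument 2
   (the mask); vect_slp_child_index_for_operand (stmt, 2, false) therefore
   returns 1 for it.  Likewise off_map { 1, -3 } requests a single child
   built from the gather/scatter offset analyzed by
   vect_check_gather_scatter.  */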
610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 they are of a valid type and that they match the defs of the first stmt of
612 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 indicates swap is required for cond_expr stmts. Specifically, SWAP
615 is 1 if STMT is cond and operands of comparison need to be swapped;
616 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
618 If there was a fatal error return -1; if the error could be corrected by
619 swapping operands of father node of this one, return 1; if everything is
620 ok return 0. */
621 static int
622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 bool *skip_args,
624 vec<stmt_vec_info> stmts, unsigned stmt_num,
625 vec<slp_oprnd_info> *oprnds_info)
627 stmt_vec_info stmt_info = stmts[stmt_num];
628 tree oprnd;
629 unsigned int i, number_of_oprnds;
630 enum vect_def_type dt = vect_uninitialized_def;
631 slp_oprnd_info oprnd_info;
632 gather_scatter_info gs_info;
633 unsigned int gs_op = -1u;
634 unsigned int commutative_op = -1U;
635 bool first = stmt_num == 0;
637 if (!is_a<gcall *> (stmt_info->stmt)
638 && !is_a<gassign *> (stmt_info->stmt)
639 && !is_a<gphi *> (stmt_info->stmt))
640 return -1;
642 number_of_oprnds = gimple_num_args (stmt_info->stmt);
643 const int *map
644 = vect_get_operand_map (stmt_info->stmt,
645 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 if (map)
647 number_of_oprnds = *map++;
648 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
650 if (gimple_call_internal_p (stmt))
652 internal_fn ifn = gimple_call_internal_fn (stmt);
653 commutative_op = first_commutative_argument (ifn);
656 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
658 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
659 commutative_op = 0;
662 bool swapped = (swap != 0);
663 bool backedge = false;
664 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 for (i = 0; i < number_of_oprnds; i++)
667 oprnd_info = (*oprnds_info)[i];
668 int opno = map ? map[i] : int (i);
669 if (opno == -3)
671 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 if (!is_a <loop_vec_info> (vinfo)
673 || !vect_check_gather_scatter (stmt_info,
674 as_a <loop_vec_info> (vinfo),
675 first ? &oprnd_info->first_gs_info
676 : &gs_info))
677 return -1;
679 if (first)
681 oprnd_info->first_gs_p = true;
682 oprnd = oprnd_info->first_gs_info.offset;
684 else
686 gs_op = i;
687 oprnd = gs_info.offset;
690 else if (opno < 0)
691 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 else
694 oprnd = gimple_arg (stmt_info->stmt, opno);
695 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
697 edge e = gimple_phi_arg_edge (stmt, opno);
698 backedge = (is_a <bb_vec_info> (vinfo)
699 ? e->flags & EDGE_DFS_BACK
700 : dominated_by_p (CDI_DOMINATORS, e->src,
701 gimple_bb (stmt_info->stmt)));
704 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 oprnd = TREE_OPERAND (oprnd, 0);
707 stmt_vec_info def_stmt_info;
708 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: can't analyze def for %T\n",
713 oprnd);
715 return -1;
718 if (skip_args[i])
720 oprnd_info->def_stmts.quick_push (NULL);
721 oprnd_info->ops.quick_push (NULL_TREE);
722 oprnd_info->first_dt = vect_uninitialized_def;
723 continue;
726 oprnd_info->def_stmts.quick_push (def_stmt_info);
727 oprnd_info->ops.quick_push (oprnd);
729 if (def_stmt_info
730 && is_pattern_stmt_p (def_stmt_info))
732 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 != def_stmt_info)
734 oprnd_info->any_pattern = true;
735 else
736 /* If we promote this to external, use the original stmt def. */
737 oprnd_info->ops.last ()
738 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
741 /* If there's an extern def on a backedge make sure we can
742 code-generate at the region start.
743 ??? This is another case that could be fixed by adjusting
744 how we split the function but at the moment we'd have conflicting
745 goals there. */
746 if (backedge
747 && dts[i] == vect_external_def
748 && is_a <bb_vec_info> (vinfo)
749 && TREE_CODE (oprnd) == SSA_NAME
750 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 && !dominated_by_p (CDI_DOMINATORS,
752 as_a <bb_vec_info> (vinfo)->bbs[0],
753 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "Build SLP failed: extern def %T only defined "
758 "on backedge\n", oprnd);
759 return -1;
762 if (first)
764 tree type = TREE_TYPE (oprnd);
765 dt = dts[i];
767 /* For the swapping logic below force vect_reduction_def
768 for the reduction op in a SLP reduction group. */
769 if (!STMT_VINFO_DATA_REF (stmt_info)
770 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 && def_stmt_info)
773 dts[i] = dt = vect_reduction_def;
775 /* Check the types of the definition. */
776 switch (dt)
778 case vect_external_def:
779 case vect_constant_def:
780 case vect_internal_def:
781 case vect_reduction_def:
782 case vect_induction_def:
783 case vect_nested_cycle:
784 case vect_first_order_recurrence:
785 break;
787 default:
788 /* FORNOW: Not supported. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Build SLP failed: illegal type of def %T\n",
792 oprnd);
793 return -1;
796 oprnd_info->first_dt = dt;
797 oprnd_info->first_op_type = type;
800 if (first)
801 return 0;
803 /* Now match the operand definition types to that of the first stmt. */
804 for (i = 0; i < number_of_oprnds;)
806 if (skip_args[i])
808 ++i;
809 continue;
812 oprnd_info = (*oprnds_info)[i];
813 dt = dts[i];
814 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 oprnd = oprnd_info->ops[stmt_num];
816 tree type = TREE_TYPE (oprnd);
818 if (!types_compatible_p (oprnd_info->first_op_type, type))
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: different operand types\n");
823 return 1;
826 if ((gs_op == i) != oprnd_info->first_gs_p)
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 "Build SLP failed: mixed gather and non-gather\n");
831 return 1;
833 else if (gs_op == i)
835 if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 gs_info.base))
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "Build SLP failed: different gather base\n");
841 return 1;
843 if (oprnd_info->first_gs_info.scale != gs_info.scale)
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather scale\n");
848 return 1;
852 /* Not first stmt of the group, check that the def-stmt/s match
853 the def-stmt/s of the first stmt. Allow different definition
854 types for reduction chains: the first stmt must be a
855 vect_reduction_def (a phi node), and the rest
856 end in the reduction chain. */
857 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
858 && !(oprnd_info->first_dt == vect_reduction_def
859 && !STMT_VINFO_DATA_REF (stmt_info)
860 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 && def_stmt_info
862 && !STMT_VINFO_DATA_REF (def_stmt_info)
863 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 || (!STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && ((!def_stmt_info
868 || STMT_VINFO_DATA_REF (def_stmt_info)
869 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 != (oprnd_info->first_dt != vect_reduction_def))))
873 /* Try swapping operands if we got a mismatch. For BB
874 vectorization only in case it will clearly improve things. */
875 if (i == commutative_op && !swapped
876 && (!is_a <bb_vec_info> (vinfo)
877 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
878 dts[i+1])
879 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
880 || vect_def_types_match
881 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "trying swapped operands\n");
886 std::swap (dts[i], dts[i+1]);
887 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
888 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 std::swap ((*oprnds_info)[i]->ops[stmt_num],
890 (*oprnds_info)[i+1]->ops[stmt_num]);
891 swapped = true;
892 continue;
895 if (is_a <bb_vec_info> (vinfo)
896 && !oprnd_info->any_pattern)
898 /* Now for commutative ops we should see whether we can
899 make the other operand matching. */
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "treating operand as external\n");
903 oprnd_info->first_dt = dt = vect_external_def;
905 else
907 if (dump_enabled_p ())
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "Build SLP failed: different types\n");
910 return 1;
914 /* Make sure to demote the overall operand to external. */
915 if (dt == vect_external_def)
916 oprnd_info->first_dt = vect_external_def;
917 /* For a SLP reduction chain we want to duplicate the reduction to
918 each of the chain members. That gets us a sane SLP graph (still
919 the stmts are not 100% correct wrt the initial values). */
920 else if ((dt == vect_internal_def
921 || dt == vect_reduction_def)
922 && oprnd_info->first_dt == vect_reduction_def
923 && !STMT_VINFO_DATA_REF (stmt_info)
924 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
925 && !STMT_VINFO_DATA_REF (def_stmt_info)
926 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
927 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
929 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
930 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
933 ++i;
936 /* Swap operands. */
937 if (swapped)
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "swapped operands to match def types in %G",
942 stmt_info->stmt);
945 return 0;
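/* Example of the swapping logic above (hypothetical two-lane group): for
   { x0 = a0 + inv, x1 = inv + a1 } with INV loop-invariant, lane 1 would
   report (external, internal) def types against lane 0's
   (internal, external); since PLUS_EXPR is commutative the operands of
   lane 1 are swapped in OPRNDS_INFO so both lanes agree (for BB SLP the
   swap is only done when it clearly helps), and the swap is reported in
   the dump as "swapped operands to match def types".  */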
948 /* Return true if call statements CALL1 and CALL2 are similar enough
949 to be combined into the same SLP group. */
951 bool
952 compatible_calls_p (gcall *call1, gcall *call2)
954 unsigned int nargs = gimple_call_num_args (call1);
955 if (nargs != gimple_call_num_args (call2))
956 return false;
958 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
959 return false;
961 if (gimple_call_internal_p (call1))
963 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
964 TREE_TYPE (gimple_call_lhs (call2))))
965 return false;
966 for (unsigned int i = 0; i < nargs; ++i)
967 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
968 TREE_TYPE (gimple_call_arg (call2, i))))
969 return false;
971 else
973 if (!operand_equal_p (gimple_call_fn (call1),
974 gimple_call_fn (call2), 0))
975 return false;
977 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
978 return false;
981 /* Check that any unvectorized arguments are equal. */
982 if (const int *map = vect_get_operand_map (call1))
984 unsigned int nkept = *map++;
985 unsigned int mapi = 0;
986 for (unsigned int i = 0; i < nargs; ++i)
987 if (mapi < nkept && map[mapi] == int (i))
988 mapi += 1;
989 else if (!operand_equal_p (gimple_call_arg (call1, i),
990 gimple_call_arg (call2, i)))
991 return false;
994 return true;
997 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
998 caller's attempt to find the vector type in STMT_INFO with the narrowest
999 element type. Return true if VECTYPE is nonnull and if it is valid
1000 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1001 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1002 vect_build_slp_tree. */
1004 static bool
1005 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006 unsigned int group_size,
1007 tree vectype, poly_uint64 *max_nunits)
1009 if (!vectype)
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: unsupported data-type in %G\n",
1014 stmt_info->stmt);
1015 /* Fatal mismatch. */
1016 return false;
1019 /* If populating the vector type requires unrolling then fail
1020 before adjusting *max_nunits for basic-block vectorization. */
1021 if (is_a <bb_vec_info> (vinfo)
1022 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Build SLP failed: unrolling required "
1027 "in basic block SLP\n");
1028 /* Fatal mismatch. */
1029 return false;
1032 /* In case of multiple types we need to detect the smallest type. */
1033 vect_update_max_nunits (max_nunits, vectype);
1034 return true;
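/* Example of the unrolling check above: for basic-block SLP a group of
   4 stmts cannot use a vector type with 8 lanes, since group_size (4) is
   not a multiple of TYPE_VECTOR_SUBPARTS (8) and unrolling is not an
   option outside of loops; the same group with a 4- or 2-lane vector type
   passes and only updates *MAX_NUNITS.  */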
1037 /* Verify if the scalar stmts STMTS are isomorphic, require data
1038 permutation or are of unsupported types of operation. Return
1039 true if they are, otherwise return false and indicate in *MATCHES
1040 which stmts are not isomorphic to the first one. If MATCHES[0]
1041 is false then this indicates the comparison could not be
1042 carried out or the stmts will never be vectorized by SLP.
1044 Note COND_EXPR is possibly isomorphic to another one after swapping its
1045 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1046 the first stmt by swapping the two operands of comparison; set SWAP[i]
1047 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1048 of comparison. Taking A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1049 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1051 static bool
1052 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1053 vec<stmt_vec_info> stmts, unsigned int group_size,
1054 poly_uint64 *max_nunits, bool *matches,
1055 bool *two_operators, tree *node_vectype)
1057 unsigned int i;
1058 stmt_vec_info first_stmt_info = stmts[0];
1059 code_helper first_stmt_code = ERROR_MARK;
1060 code_helper alt_stmt_code = ERROR_MARK;
1061 code_helper rhs_code = ERROR_MARK;
1062 code_helper first_cond_code = ERROR_MARK;
1063 tree lhs;
1064 bool need_same_oprnds = false;
1065 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1066 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1067 bool first_stmt_ldst_p = false, ldst_p = false;
1068 bool first_stmt_phi_p = false, phi_p = false;
1069 bool maybe_soft_fail = false;
1070 tree soft_fail_nunits_vectype = NULL_TREE;
1072 /* For every stmt in NODE find its def stmt/s. */
1073 stmt_vec_info stmt_info;
1074 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1076 gimple *stmt = stmt_info->stmt;
1077 swap[i] = 0;
1078 matches[i] = false;
1080 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1083 /* Fail to vectorize statements marked as unvectorizable, throw
1084 or are volatile. */
1085 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1086 || stmt_can_throw_internal (cfun, stmt)
1087 || gimple_has_volatile_ops (stmt))
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: unvectorizable statement %G",
1092 stmt);
1093 /* ??? For BB vectorization we want to commutate operands in a way
1094 to shuffle all unvectorizable defs into one operand and have
1095 the other still vectorized. The following doesn't reliably
1096 work for this though but it's the easiest we can do here. */
1097 if (is_a <bb_vec_info> (vinfo) && i != 0)
1098 continue;
1099 /* Fatal mismatch. */
1100 matches[0] = false;
1101 return false;
1104 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1105 lhs = gimple_get_lhs (stmt);
1106 if (lhs == NULL_TREE
1107 && (!call_stmt
1108 || !gimple_call_internal_p (stmt)
1109 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "Build SLP failed: not GIMPLE_ASSIGN nor "
1114 "GIMPLE_CALL %G", stmt);
1115 if (is_a <bb_vec_info> (vinfo) && i != 0)
1116 continue;
1117 /* Fatal mismatch. */
1118 matches[0] = false;
1119 return false;
1122 tree nunits_vectype;
1123 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1124 &nunits_vectype, group_size))
1126 if (is_a <bb_vec_info> (vinfo) && i != 0)
1127 continue;
1128 /* Fatal mismatch. */
1129 matches[0] = false;
1130 return false;
1132 /* Record nunits required but continue analysis, producing matches[]
1133 as if nunits was not an issue. This allows splitting of groups
1134 to happen. */
1135 if (nunits_vectype
1136 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1137 nunits_vectype, max_nunits))
1139 gcc_assert (is_a <bb_vec_info> (vinfo));
1140 maybe_soft_fail = true;
1141 soft_fail_nunits_vectype = nunits_vectype;
1144 gcc_assert (vectype);
1146 if (call_stmt)
1148 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1149 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1150 rhs_code = cfn;
1151 else
1152 rhs_code = CALL_EXPR;
1154 if (cfn == CFN_MASK_LOAD
1155 || cfn == CFN_GATHER_LOAD
1156 || cfn == CFN_MASK_GATHER_LOAD
1157 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1158 ldst_p = true;
1159 else if (cfn == CFN_MASK_STORE)
1161 ldst_p = true;
1162 rhs_code = CFN_MASK_STORE;
1164 else if ((cfn != CFN_LAST
1165 && cfn != CFN_MASK_CALL
1166 && internal_fn_p (cfn)
1167 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1168 || gimple_call_tail_p (call_stmt)
1169 || gimple_call_noreturn_p (call_stmt)
1170 || gimple_call_chain (call_stmt))
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "Build SLP failed: unsupported call type %G",
1175 (gimple *) call_stmt);
1176 if (is_a <bb_vec_info> (vinfo) && i != 0)
1177 continue;
1178 /* Fatal mismatch. */
1179 matches[0] = false;
1180 return false;
1183 else if (gimple_code (stmt) == GIMPLE_PHI)
1185 rhs_code = ERROR_MARK;
1186 phi_p = true;
1188 else
1190 rhs_code = gimple_assign_rhs_code (stmt);
1191 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1194 /* Check the operation. */
1195 if (i == 0)
1197 *node_vectype = vectype;
1198 first_stmt_code = rhs_code;
1199 first_stmt_ldst_p = ldst_p;
1200 first_stmt_phi_p = phi_p;
1202 /* Shift arguments should be equal in all the packed stmts for a
1203 vector shift with scalar shift operand. */
1204 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1205 || rhs_code == LROTATE_EXPR
1206 || rhs_code == RROTATE_EXPR)
1208 /* First see if we have a vector/vector shift. */
1209 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1211 /* No vector/vector shift, try for a vector/scalar shift. */
1212 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216 "Build SLP failed: "
1217 "op not supported by target.\n");
1218 if (is_a <bb_vec_info> (vinfo) && i != 0)
1219 continue;
1220 /* Fatal mismatch. */
1221 matches[0] = false;
1222 return false;
1224 need_same_oprnds = true;
1225 first_op1 = gimple_assign_rhs2 (stmt);
1228 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1233 else if (!ldst_p
1234 && rhs_code == BIT_FIELD_REF)
1236 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1237 if (!is_a <bb_vec_info> (vinfo)
1238 || TREE_CODE (vec) != SSA_NAME
1239 /* When the element types are not compatible we pun the
1240 source to the target vectype which requires equal size. */
1241 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1242 || !types_compatible_p (TREE_TYPE (vectype),
1243 TREE_TYPE (TREE_TYPE (vec))))
1244 && !operand_equal_p (TYPE_SIZE (vectype),
1245 TYPE_SIZE (TREE_TYPE (vec)))))
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "Build SLP failed: "
1250 "BIT_FIELD_REF not supported\n");
1251 /* Fatal mismatch. */
1252 matches[0] = false;
1253 return false;
1256 else if (rhs_code == CFN_DIV_POW2)
1258 need_same_oprnds = true;
1259 first_op1 = gimple_call_arg (call_stmt, 1);
1262 else
1264 if (first_stmt_code != rhs_code
1265 && alt_stmt_code == ERROR_MARK)
1266 alt_stmt_code = rhs_code;
1267 if ((first_stmt_code != rhs_code
1268 && (first_stmt_code != IMAGPART_EXPR
1269 || rhs_code != REALPART_EXPR)
1270 && (first_stmt_code != REALPART_EXPR
1271 || rhs_code != IMAGPART_EXPR)
1272 /* Handle mismatches in plus/minus by computing both
1273 and merging the results. */
1274 && !((first_stmt_code == PLUS_EXPR
1275 || first_stmt_code == MINUS_EXPR)
1276 && (alt_stmt_code == PLUS_EXPR
1277 || alt_stmt_code == MINUS_EXPR)
1278 && rhs_code == alt_stmt_code)
1279 && !(first_stmt_code.is_tree_code ()
1280 && rhs_code.is_tree_code ()
1281 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1282 == tcc_comparison)
1283 && (swap_tree_comparison (tree_code (first_stmt_code))
1284 == tree_code (rhs_code)))
1285 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1286 && (first_stmt_code == ARRAY_REF
1287 || first_stmt_code == BIT_FIELD_REF
1288 || first_stmt_code == INDIRECT_REF
1289 || first_stmt_code == COMPONENT_REF
1290 || first_stmt_code == MEM_REF)
1291 && (rhs_code == ARRAY_REF
1292 || rhs_code == BIT_FIELD_REF
1293 || rhs_code == INDIRECT_REF
1294 || rhs_code == COMPONENT_REF
1295 || rhs_code == MEM_REF)))
1296 || (ldst_p
1297 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1299 || (ldst_p
1300 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1302 || first_stmt_ldst_p != ldst_p
1303 || first_stmt_phi_p != phi_p)
1305 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308 "Build SLP failed: different operation "
1309 "in stmt %G", stmt);
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "original stmt %G", first_stmt_info->stmt);
1313 /* Mismatch. */
1314 continue;
1317 if (!ldst_p
1318 && first_stmt_code == BIT_FIELD_REF
1319 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1320 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1322 if (dump_enabled_p ())
1323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324 "Build SLP failed: different BIT_FIELD_REF "
1325 "arguments in %G", stmt);
1326 /* Mismatch. */
1327 continue;
1330 if (call_stmt
1331 && first_stmt_code != CFN_MASK_LOAD
1332 && first_stmt_code != CFN_MASK_STORE)
1334 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1335 call_stmt))
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "Build SLP failed: different calls in %G",
1340 stmt);
1341 /* Mismatch. */
1342 continue;
1346 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1347 && (gimple_bb (first_stmt_info->stmt)
1348 != gimple_bb (stmt_info->stmt)))
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352 "Build SLP failed: different BB for PHI "
1353 "or possibly trapping operation in %G", stmt);
1354 /* Mismatch. */
1355 continue;
1358 if (need_same_oprnds)
1360 tree other_op1 = gimple_arg (stmt, 1);
1361 if (!operand_equal_p (first_op1, other_op1, 0))
1363 if (dump_enabled_p ())
1364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365 "Build SLP failed: different shift "
1366 "arguments in %G", stmt);
1367 /* Mismatch. */
1368 continue;
1372 if (!types_compatible_p (vectype, *node_vectype))
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 "Build SLP failed: different vector type "
1377 "in %G", stmt);
1378 /* Mismatch. */
1379 continue;
1383 /* Grouped store or load. */
1384 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1386 gcc_assert (ldst_p);
1387 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1389 /* Store. */
1390 gcc_assert (rhs_code == CFN_MASK_STORE
1391 || REFERENCE_CLASS_P (lhs)
1392 || DECL_P (lhs));
1394 else
1396 /* Load. */
1397 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1398 if (prev_first_load)
1400 /* Check that there are no loads from different interleaving
1401 chains in the same node. */
1402 if (prev_first_load != first_load)
1404 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406 vect_location,
1407 "Build SLP failed: different "
1408 "interleaving chains in one node %G",
1409 stmt);
1410 /* Mismatch. */
1411 continue;
1414 else
1415 prev_first_load = first_load;
1418 /* Non-grouped store or load. */
1419 else if (ldst_p)
1421 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1422 && rhs_code != CFN_GATHER_LOAD
1423 && rhs_code != CFN_MASK_GATHER_LOAD
1424 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1425 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426 /* Non-grouped loads are handled as externals for BB
1427 vectorization. For loop vectorization we can handle
1428 splats the same way we handle single-element interleaving. */
1429 && (is_a <bb_vec_info> (vinfo)
1430 || stmt_info != first_stmt_info))
1432 /* Not grouped load. */
1433 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "Build SLP failed: not grouped load %G", stmt);
1437 if (i != 0)
1438 continue;
1439 /* Fatal mismatch. */
1440 matches[0] = false;
1441 return false;
1444 /* Not memory operation. */
1445 else
1447 if (!phi_p
1448 && rhs_code.is_tree_code ()
1449 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453 && rhs_code != VIEW_CONVERT_EXPR
1454 && rhs_code != CALL_EXPR
1455 && rhs_code != BIT_FIELD_REF)
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459 "Build SLP failed: operation unsupported %G",
1460 stmt);
1461 if (is_a <bb_vec_info> (vinfo) && i != 0)
1462 continue;
1463 /* Fatal mismatch. */
1464 matches[0] = false;
1465 return false;
1468 if (rhs_code == COND_EXPR)
1470 tree cond_expr = gimple_assign_rhs1 (stmt);
1471 enum tree_code cond_code = TREE_CODE (cond_expr);
1472 enum tree_code swap_code = ERROR_MARK;
1473 enum tree_code invert_code = ERROR_MARK;
1475 if (i == 0)
1476 first_cond_code = TREE_CODE (cond_expr);
1477 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1479 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480 swap_code = swap_tree_comparison (cond_code);
1481 invert_code = invert_tree_comparison (cond_code, honor_nans);
1484 if (first_cond_code == cond_code)
1486 /* Isomorphic can be achieved by swapping. */
1487 else if (first_cond_code == swap_code)
1488 swap[i] = 1;
1489 /* Isomorphic can be achieved by inverting. */
1490 else if (first_cond_code == invert_code)
1491 swap[i] = 2;
1492 else
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496 "Build SLP failed: different"
1497 " operation %G", stmt);
1498 /* Mismatch. */
1499 continue;
1503 if (rhs_code.is_tree_code ()
1504 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505 && (swap_tree_comparison ((tree_code)first_stmt_code)
1506 == (tree_code)rhs_code))
1507 swap[i] = 1;
1510 matches[i] = true;
1513 for (i = 0; i < group_size; ++i)
1514 if (!matches[i])
1515 return false;
1517 /* If we allowed a two-operation SLP node verify the target can cope
1518 with the permute we are going to use. */
1519 if (alt_stmt_code != ERROR_MARK
1520 && (!alt_stmt_code.is_tree_code ()
1521 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1524 *two_operators = true;
1527 if (maybe_soft_fail)
1529 unsigned HOST_WIDE_INT const_nunits;
1530 if (!TYPE_VECTOR_SUBPARTS
1531 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532 || const_nunits > group_size)
1533 matches[0] = false;
1534 else
1536 /* With constant vector elements simulate a mismatch at the
1537 point we need to split. */
1538 unsigned tail = group_size & (const_nunits - 1);
1539 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1541 return false;
1544 return true;
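/* Example of how MATCHES drives group splitting: with group_size == 4
   and lanes 2 and 3 failing to match lane 0, vect_build_slp_tree_1
   returns false with matches == { true, true, false, false }, letting the
   callers split the group after the second stmt and retry discovery on
   the halves.  The maybe_soft_fail path above simulates such a mismatch
   at a vector-size boundary when the vector type would otherwise force
   unrolling in a basic block.  */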
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548 Note we never remove apart from at destruction time so we do not
1549 need a special value for deleted that differs from empty. */
1550 struct bst_traits
1552 typedef vec <stmt_vec_info> value_type;
1553 typedef vec <stmt_vec_info> compare_type;
1554 static inline hashval_t hash (value_type);
1555 static inline bool equal (value_type existing, value_type candidate);
1556 static inline bool is_empty (value_type x) { return !x.exists (); }
1557 static inline bool is_deleted (value_type x) { return !x.exists (); }
1558 static const bool empty_zero_p = true;
1559 static inline void mark_empty (value_type &x) { x.release (); }
1560 static inline void mark_deleted (value_type &x) { x.release (); }
1561 static inline void remove (value_type &x) { x.release (); }
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1566 inchash::hash h;
1567 for (unsigned i = 0; i < x.length (); ++i)
1568 h.add_int (gimple_uid (x[i]->stmt));
1569 return h.end ();
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1574 if (existing.length () != candidate.length ())
1575 return false;
1576 for (unsigned i = 0; i < existing.length (); ++i)
1577 if (existing[i] != candidate[i])
1578 return false;
1579 return true;
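/* The map keyed by these traits memoizes SLP discovery per ordered stmt
   sequence: the hash mixes the gimple UIDs lane by lane and equality is
   elementwise, so e.g. { S1, S2 } and { S2, S1 } are distinct keys and a
   permuted lane order is rediscovered rather than reusing (or being
   blocked by) the cached result for the other order.  */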
1582 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583 but then vec::insert does memmove and that's not compatible with
1584 std::pair. */
1585 struct chain_op_t
1587 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588 : code (code_), dt (dt_), op (op_) {}
1589 tree_code code;
1590 vect_def_type dt;
1591 tree op;
1594 /* Comparator for sorting associatable chains. */
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1599 auto *op1 = (const chain_op_t *) op1_;
1600 auto *op2 = (const chain_op_t *) op2_;
1601 if (op1->dt != op2->dt)
1602 return (int)op1->dt - (int)op2->dt;
1603 return (int)op1->code - (int)op2->code;
1606 /* Linearize the associatable expression chain at START with the
1607 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608 filling CHAIN with the result and using WORKLIST as intermediate storage.
1609 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1611 stmts, starting with START. */
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615 vec<std::pair<tree_code, gimple *> > &worklist,
1616 vec<chain_op_t> &chain,
1617 enum tree_code code, gimple *start,
1618 gimple *&code_stmt, gimple *&alt_code_stmt,
1619 vec<gimple *> *chain_stmts)
1621 /* For each lane linearize the addition/subtraction (or other
1622 uniform associatable operation) expression tree. */
1623 worklist.safe_push (std::make_pair (code, start));
1624 while (!worklist.is_empty ())
1626 auto entry = worklist.pop ();
1627 gassign *stmt = as_a <gassign *> (entry.second);
1628 enum tree_code in_code = entry.first;
1629 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1631 if (!code_stmt
1632 && gimple_assign_rhs_code (stmt) == code)
1633 code_stmt = stmt;
1634 else if (!alt_code_stmt
1635 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636 alt_code_stmt = stmt;
1637 if (chain_stmts)
1638 chain_stmts->safe_push (stmt);
1639 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1641 tree op = gimple_op (stmt, opnum);
1642 vect_def_type dt;
1643 stmt_vec_info def_stmt_info;
1644 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645 gcc_assert (res);
1646 if (dt == vect_internal_def
1647 && is_pattern_stmt_p (def_stmt_info))
1648 op = gimple_get_lhs (def_stmt_info->stmt);
1649 gimple *use_stmt;
1650 use_operand_p use_p;
1651 if (dt == vect_internal_def
1652 && single_imm_use (op, &use_p, &use_stmt)
1653 && is_gimple_assign (def_stmt_info->stmt)
1654 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655 || (code == PLUS_EXPR
1656 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657 == MINUS_EXPR))))
1659 tree_code op_def_code = this_code;
1660 if (op_def_code == MINUS_EXPR && opnum == 1)
1661 op_def_code = PLUS_EXPR;
1662 if (in_code == MINUS_EXPR)
1663 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664 worklist.safe_push (std::make_pair (op_def_code,
1665 def_stmt_info->stmt));
1667 else
1669 tree_code op_def_code = this_code;
1670 if (op_def_code == MINUS_EXPR && opnum == 1)
1671 op_def_code = PLUS_EXPR;
1672 if (in_code == MINUS_EXPR)
1673 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674 chain.safe_push (chain_op_t (op_def_code, dt, op));
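/* Worked example for the linearization above (hypothetical lane, assuming
   the intermediate a - b result has a single use and a, b, c are not
   themselves part of the chain): for x = (a - b) + c, starting from the
   PLUS_EXPR stmt the chain is filled with
   { (PLUS, c), (PLUS, a), (MINUS, b) }, i.e. the expression reassociated
   as a + c - b; CODE_STMT points at the PLUS_EXPR stmt and ALT_CODE_STMT
   at the MINUS_EXPR stmt.  */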
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681 simple_hashmap_traits <bst_traits, slp_tree> >
1682 scalar_stmts_to_slp_tree_map_t;
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 vec<stmt_vec_info> stmts, unsigned int group_size,
1687 poly_uint64 *max_nunits,
1688 bool *matches, unsigned *limit, unsigned *tree_size,
1689 scalar_stmts_to_slp_tree_map_t *bst_map);
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map)
1698 if (slp_tree *leader = bst_map->get (stmts))
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702 !(*leader)->failed ? "" : "failed ",
1703 (void *) *leader);
1704 if (!(*leader)->failed)
1706 SLP_TREE_REF_COUNT (*leader)++;
1707 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708 stmts.release ();
1709 return *leader;
1711 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712 return NULL;
1715 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716 so we can pick up backedge destinations during discovery. */
1717 slp_tree res = new _slp_tree;
1718 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719 SLP_TREE_SCALAR_STMTS (res) = stmts;
1720 bst_map->put (stmts.copy (), res);
1722 if (*limit == 0)
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "SLP discovery limit exceeded\n");
1727 /* Mark the node invalid so we can detect those when still in use
1728 as backedge destinations. */
1729 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731 res->failed = XNEWVEC (bool, group_size);
1732 memset (res->failed, 0, sizeof (bool) * group_size);
1733 memset (matches, 0, sizeof (bool) * group_size);
1734 return NULL;
1736 --*limit;
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "starting SLP discovery for node %p\n", (void *) res);
1742 poly_uint64 this_max_nunits = 1;
1743 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744 &this_max_nunits,
1745 matches, limit, tree_size, bst_map);
1746 if (!res_)
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "SLP discovery for node %p failed\n", (void *) res);
1751 /* Mark the node invalid so we can detect those when still in use
1752 as backedge destinations. */
1753 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755 res->failed = XNEWVEC (bool, group_size);
1756 if (flag_checking)
1758 unsigned i;
1759 for (i = 0; i < group_size; ++i)
1760 if (!matches[i])
1761 break;
1762 gcc_assert (i < group_size);
1764 memcpy (res->failed, matches, sizeof (bool) * group_size);
1766 else
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "SLP discovery for node %p succeeded\n",
1771 (void *) res);
1772 gcc_assert (res_ == res);
1773 res->max_nunits = this_max_nunits;
1774 vect_update_max_nunits (max_nunits, this_max_nunits);
1775 /* Keep a reference for the bst_map use. */
1776 SLP_TREE_REF_COUNT (res)++;
1778 return res_;
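/* Note on the caching protocol above: the stub node is entered into
   BST_MAP before recursing, so a back-edge reaching the same stmt set
   during discovery finds and reuses the in-construction node instead of
   recursing forever; on failure the per-lane MATCHES vector is stashed in
   res->failed, so later attempts on the same stmt set fail fast with the
   same lane information.  */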
1781 /* Helper for building an associated SLP node chain. */
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785 slp_tree op0, slp_tree op1,
1786 stmt_vec_info oper1, stmt_vec_info oper2,
1787 vec<std::pair<unsigned, unsigned> > lperm)
1789 unsigned group_size = SLP_TREE_LANES (op1);
1791 slp_tree child1 = new _slp_tree;
1792 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793 SLP_TREE_VECTYPE (child1) = vectype;
1794 SLP_TREE_LANES (child1) = group_size;
1795 SLP_TREE_CHILDREN (child1).create (2);
1796 SLP_TREE_CHILDREN (child1).quick_push (op0);
1797 SLP_TREE_CHILDREN (child1).quick_push (op1);
1798 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1800 slp_tree child2 = new _slp_tree;
1801 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802 SLP_TREE_VECTYPE (child2) = vectype;
1803 SLP_TREE_LANES (child2) = group_size;
1804 SLP_TREE_CHILDREN (child2).create (2);
1805 SLP_TREE_CHILDREN (child2).quick_push (op0);
1806 SLP_TREE_REF_COUNT (op0)++;
1807 SLP_TREE_CHILDREN (child2).quick_push (op1);
1808 SLP_TREE_REF_COUNT (op1)++;
1809 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1811 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813 SLP_TREE_VECTYPE (perm) = vectype;
1814 SLP_TREE_LANES (perm) = group_size;
1815 /* ??? We should set this NULL but that's not expected. */
1816 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818 SLP_TREE_CHILDREN (perm).quick_push (child1);
1819 SLP_TREE_CHILDREN (perm).quick_push (child2);
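/* Example of the two-operator expansion above (hypothetical two-lane
   node): for scalar stmts { a0 + b0, a1 - b1 } the helper builds CHILD1
   and CHILD2 both reading OP0 and OP1, with OPER1 and OPER2 (here a
   PLUS_EXPR and a MINUS_EXPR stmt) as their representatives, and makes
   PERM a VEC_PERM_EXPR node whose lane permutation { (0, 0), (1, 1) }
   selects lane 0 from CHILD1 and lane 1 from CHILD2.  */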
1822 /* Recursively build an SLP tree starting from NODE.
1823 Fail (and return NULL) if def-stmts are not
1824 isomorphic, require data permutation or are of unsupported types of
1825 operation. Otherwise, return the built SLP node.
1826 MATCHES records for each scalar stmt whether it matched
1827 the first stmt of the group. */
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831 vec<stmt_vec_info> stmts, unsigned int group_size,
1832 poly_uint64 *max_nunits,
1833 bool *matches, unsigned *limit, unsigned *tree_size,
1834 scalar_stmts_to_slp_tree_map_t *bst_map)
1836 unsigned nops, i, this_tree_size = 0;
1837 poly_uint64 this_max_nunits = *max_nunits;
1839 matches[0] = false;
1841 stmt_vec_info stmt_info = stmts[0];
1842 if (!is_a<gcall *> (stmt_info->stmt)
1843 && !is_a<gassign *> (stmt_info->stmt)
1844 && !is_a<gphi *> (stmt_info->stmt))
1845 return NULL;
1847 nops = gimple_num_args (stmt_info->stmt);
1848 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849 STMT_VINFO_GATHER_SCATTER_P
1850 (stmt_info)))
1851 nops = map[0];
1853 /* If the SLP node is a PHI (induction or reduction), terminate
1854 the recursion. */
1855 bool *skip_args = XALLOCAVEC (bool, nops);
1856 memset (skip_args, 0, sizeof (bool) * nops);
1857 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1860 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862 group_size);
1863 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864 max_nunits))
1865 return NULL;
1867 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868 if (def_type == vect_induction_def)
1870 /* Induction PHIs are not cycles but walk the initial
1871 value. Only for inner loops though, for outer loops
1872 we need to pick up the value from the actual PHIs
1873 to more easily support peeling and epilogue vectorization. */
1874 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875 if (!nested_in_vect_loop_p (loop, stmt_info))
1876 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877 else
1878 loop = loop->inner;
1879 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1881 else if (def_type == vect_reduction_def
1882 || def_type == vect_double_reduction_def
1883 || def_type == vect_nested_cycle
1884 || def_type == vect_first_order_recurrence)
1886 /* Else def types have to match. */
1887 stmt_vec_info other_info;
1888 bool all_same = true;
1889 FOR_EACH_VEC_ELT (stmts, i, other_info)
1891 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892 return NULL;
1893 if (other_info != stmt_info)
1894 all_same = false;
1896 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897 /* Reduction initial values are not explicitly represented. */
1898 if (def_type != vect_first_order_recurrence
1899 && !nested_in_vect_loop_p (loop, stmt_info))
1900 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901 /* Reduction chain backedge defs are filled manually.
1902 ??? Need a better way to identify a SLP reduction chain PHI.
1903 Or a better overall way to SLP match those. */
1904 if (all_same && def_type == vect_reduction_def)
1905 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1907 else if (def_type != vect_internal_def)
1908 return NULL;
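/* Verify the statements are isomorphic, recording the common vector type
   and whether the group mixes two operation codes (e.g. PLUS and MINUS). */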
1912 bool two_operators = false;
1913 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914 tree vectype = NULL_TREE;
1915 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916 &this_max_nunits, matches, &two_operators,
1917 &vectype))
1918 return NULL;
1920 /* If the SLP node is a load, terminate the recursion unless masked. */
1921 if (STMT_VINFO_DATA_REF (stmt_info)
1922 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1924 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1925 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1926 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1927 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1928 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1929 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1930 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1931 else
1933 *max_nunits = this_max_nunits;
1934 (*tree_size)++;
1935 node = vect_create_new_slp_node (node, stmts, 0);
1936 SLP_TREE_VECTYPE (node) = vectype;
1937 /* And compute the load permutation. Whether it is actually
1938 a permutation depends on the unrolling factor which is
1939 decided later. */
1940 vec<unsigned> load_permutation;
1941 int j;
1942 stmt_vec_info load_info;
1943 load_permutation.create (group_size);
1944 stmt_vec_info first_stmt_info
1945 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1946 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1948 int load_place;
1949 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1950 load_place = vect_get_place_in_interleaving_chain
1951 (load_info, first_stmt_info);
1952 else
1953 load_place = 0;
1954 gcc_assert (load_place != -1);
1955 load_permutation.safe_push (load_place);
1957 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1958 return node;
1961 else if (gimple_assign_single_p (stmt_info->stmt)
1962 && !gimple_vuse (stmt_info->stmt)
1963 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1965 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1966 the same SSA name vector of a type compatible with vectype. */
1967 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1968 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1969 stmt_vec_info estmt_info;
1970 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1972 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1973 tree bfref = gimple_assign_rhs1 (estmt);
1974 HOST_WIDE_INT lane;
1975 if (!known_eq (bit_field_size (bfref),
1976 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1977 || !constant_multiple_p (bit_field_offset (bfref),
1978 bit_field_size (bfref), &lane))
1980 lperm.release ();
1981 matches[0] = false;
1982 return NULL;
1984 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1986 slp_tree vnode = vect_create_new_slp_node (vNULL);
1987 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1988 /* ??? We record vectype here but we hide eventually necessary
1989 punning and instead rely on code generation to materialize
1990 VIEW_CONVERT_EXPRs as necessary. We instead should make
1991 this explicit somehow. */
1992 SLP_TREE_VECTYPE (vnode) = vectype;
1993 else
1995 /* For different size but compatible elements we can still
1996 use VEC_PERM_EXPR without punning. */
1997 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1998 && types_compatible_p (TREE_TYPE (vectype),
1999 TREE_TYPE (TREE_TYPE (vec))));
2000 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2002 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2003 unsigned HOST_WIDE_INT const_nunits;
2004 if (nunits.is_constant (&const_nunits))
2005 SLP_TREE_LANES (vnode) = const_nunits;
2006 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2007 /* We are always building a permutation node even if it is an identity
2008 permute to shield the rest of the vectorizer from the odd node
2009 representing an actual vector without any scalar ops.
2010 ??? We could hide it completely by making the permute node
2011 external? */
2012 node = vect_create_new_slp_node (node, stmts, 1);
2013 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2014 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2015 SLP_TREE_VECTYPE (node) = vectype;
2016 SLP_TREE_CHILDREN (node).quick_push (vnode);
2017 return node;
2019 /* When discovery reaches an associatable operation see whether we can
2020 improve that to match up lanes in a way superior to the operand
2021 swapping code which at most looks at two defs.
2022 ??? For BB vectorization we cannot do the brute-force search
2023 for matching as we can succeed by means of builds from scalars
2024 and have no good way to "cost" one build against another. */
2025 else if (is_a <loop_vec_info> (vinfo)
2026 /* ??? We don't handle !vect_internal_def defs below. */
2027 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2028 && is_gimple_assign (stmt_info->stmt)
2029 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2030 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2031 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2032 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2033 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2035 /* See if we have a chain of (mixed) adds or subtracts or other
2036 associatable ops. */
2037 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2038 if (code == MINUS_EXPR)
2039 code = PLUS_EXPR;
2040 stmt_vec_info other_op_stmt_info = NULL;
2041 stmt_vec_info op_stmt_info = NULL;
2042 unsigned chain_len = 0;
2043 auto_vec<chain_op_t> chain;
2044 auto_vec<std::pair<tree_code, gimple *> > worklist;
2045 auto_vec<vec<chain_op_t> > chains (group_size);
2046 auto_vec<slp_tree, 4> children;
2047 bool hard_fail = true;
2048 for (unsigned lane = 0; lane < group_size; ++lane)
2050 /* For each lane linearize the addition/subtraction (or other
2051 uniform associatable operation) expression tree. */
2052 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2053 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2054 stmts[lane]->stmt, op_stmt, other_op_stmt,
2055 NULL);
2056 if (!op_stmt_info && op_stmt)
2057 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2058 if (!other_op_stmt_info && other_op_stmt)
2059 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2060 if (chain.length () == 2)
2062 /* In a chain of just two elements resort to the regular
2063 operand swapping scheme. If we run into a length
2064 mismatch still hard-FAIL. */
2065 if (chain_len == 0)
2066 hard_fail = false;
2067 else
2069 matches[lane] = false;
2070 /* ??? We might want to process the other lanes, but
2071 make sure to not give false matching hints to the
2072 caller for lanes we did not process. */
2073 if (lane != group_size - 1)
2074 matches[0] = false;
2076 break;
2078 else if (chain_len == 0)
2079 chain_len = chain.length ();
2080 else if (chain.length () != chain_len)
2082 /* ??? Here we could slip in magic to compensate with
2083 neutral operands. */
2084 matches[lane] = false;
2085 if (lane != group_size - 1)
2086 matches[0] = false;
2087 break;
2089 chains.quick_push (chain.copy ());
2090 chain.truncate (0);
2092 if (chains.length () == group_size)
2094 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2095 if (!op_stmt_info)
2097 hard_fail = false;
2098 goto out;
2100 /* Now we have a set of chains with the same length. */
2101 /* 1. pre-sort according to def_type and operation. */
2102 for (unsigned lane = 0; lane < group_size; ++lane)
2103 chains[lane].stablesort (dt_sort_cmp, vinfo);
2104 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_NOTE, vect_location,
2107 "pre-sorted chains of %s\n",
2108 get_tree_code_name (code));
2109 for (unsigned lane = 0; lane < group_size; ++lane)
2111 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2112 dump_printf (MSG_NOTE, "%s %T ",
2113 get_tree_code_name (chains[lane][opnum].code),
2114 chains[lane][opnum].op);
2115 dump_printf (MSG_NOTE, "\n");
2118 /* 2. try to build children nodes, associating as necessary. */
2119 for (unsigned n = 0; n < chain_len; ++n)
2121 vect_def_type dt = chains[0][n].dt;
2122 unsigned lane;
2123 for (lane = 0; lane < group_size; ++lane)
2124 if (chains[lane][n].dt != dt)
2126 if (dt == vect_constant_def
2127 && chains[lane][n].dt == vect_external_def)
2128 dt = vect_external_def;
2129 else if (dt == vect_external_def
2130 && chains[lane][n].dt == vect_constant_def)
2132 else
2133 break;
2135 if (lane != group_size)
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "giving up on chain due to mismatched "
2140 "def types\n");
2141 matches[lane] = false;
2142 if (lane != group_size - 1)
2143 matches[0] = false;
2144 goto out;
2146 if (dt == vect_constant_def
2147 || dt == vect_external_def)
2149 /* Check whether we can build the invariant. If we can't
2150 we never will be able to. */
2151 tree type = TREE_TYPE (chains[0][n].op);
2152 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2153 && (TREE_CODE (type) == BOOLEAN_TYPE
2154 || !can_duplicate_and_interleave_p (vinfo, group_size,
2155 type)))
2157 matches[0] = false;
2158 goto out;
2160 vec<tree> ops;
2161 ops.create (group_size);
2162 for (lane = 0; lane < group_size; ++lane)
2163 ops.quick_push (chains[lane][n].op);
2164 slp_tree child = vect_create_new_slp_node (ops);
2165 SLP_TREE_DEF_TYPE (child) = dt;
2166 children.safe_push (child);
2168 else if (dt != vect_internal_def)
2170 /* Not sure, we might need something special.
2171 gcc.dg/vect/pr96854.c,
2172 gfortran.dg/vect/fast-math-pr37021.f90
2173 and gfortran.dg/vect/pr61171.f trigger. */
2174 /* Soft-fail for now. */
2175 hard_fail = false;
2176 goto out;
2178 else
2180 vec<stmt_vec_info> op_stmts;
2181 op_stmts.create (group_size);
2182 slp_tree child = NULL;
2183 /* Brute-force our way. We have to consider a lane
2184 failing after fixing an earlier fail up in the
2185 SLP discovery recursion. So track the current
2186 permute per lane. */
2187 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2188 memset (perms, 0, sizeof (unsigned) * group_size);
2191 op_stmts.truncate (0);
2192 for (lane = 0; lane < group_size; ++lane)
2193 op_stmts.quick_push
2194 (vinfo->lookup_def (chains[lane][n].op));
2195 child = vect_build_slp_tree (vinfo, op_stmts,
2196 group_size, &this_max_nunits,
2197 matches, limit,
2198 &this_tree_size, bst_map);
2199 /* ??? We're likely getting too many fatal mismatches
2200 here so maybe we want to ignore them (but then we
2201 have no idea which lanes fatally mismatched). */
2202 if (child || !matches[0])
2203 break;
2204 /* Swap another lane we have not yet matched up into
2205 lanes that did not match. If we run out of
2206 permute possibilities for a lane terminate the
2207 search. */
2208 bool term = false;
2209 for (lane = 1; lane < group_size; ++lane)
2210 if (!matches[lane])
2212 if (n + perms[lane] + 1 == chain_len)
2214 term = true;
2215 break;
2217 std::swap (chains[lane][n],
2218 chains[lane][n + perms[lane] + 1]);
2219 perms[lane]++;
2221 if (term)
2222 break;
2224 while (1);
2225 if (!child)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location,
2229 "failed to match up op %d\n", n);
2230 op_stmts.release ();
2231 if (lane != group_size - 1)
2232 matches[0] = false;
2233 else
2234 matches[lane] = false;
2235 goto out;
2237 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "matched up op %d to\n", n);
2241 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2243 children.safe_push (child);
2246 /* 3. build SLP nodes to combine the chain. */
2247 for (unsigned lane = 0; lane < group_size; ++lane)
2248 if (chains[lane][0].code != code)
2250 /* See if there's any alternate all-PLUS entry. */
2251 unsigned n;
2252 for (n = 1; n < chain_len; ++n)
2254 for (lane = 0; lane < group_size; ++lane)
2255 if (chains[lane][n].code != code)
2256 break;
2257 if (lane == group_size)
2258 break;
2260 if (n != chain_len)
2262 /* Swap that in at first position. */
2263 std::swap (children[0], children[n]);
2264 for (lane = 0; lane < group_size; ++lane)
2265 std::swap (chains[lane][0], chains[lane][n]);
2267 else
2269 /* ??? When this triggers and we end up with two
2270 vect_constant/external_def up-front things break (ICE)
2271 spectacularly finding an insertion place for the
2272 all-constant op. We should have a fully
2273 vect_internal_def operand though(?) so we can swap
2274 that into first place and then prepend the all-zero
2275 constant. */
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_NOTE, vect_location,
2278 "inserting constant zero to compensate "
2279 "for (partially) negated first "
2280 "operand\n");
2281 chain_len++;
2282 for (lane = 0; lane < group_size; ++lane)
2283 chains[lane].safe_insert
2284 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2285 vec<tree> zero_ops;
2286 zero_ops.create (group_size);
2287 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2288 for (lane = 1; lane < group_size; ++lane)
2289 zero_ops.quick_push (zero_ops[0]);
2290 slp_tree zero = vect_create_new_slp_node (zero_ops);
2291 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2292 children.safe_insert (0, zero);
2294 break;
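/* Combine the children pairwise, left to right, into a chain of
   two-operand SLP nodes; the last one becomes NODE itself. */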
2296 for (unsigned i = 1; i < children.length (); ++i)
2298 slp_tree op0 = children[i - 1];
2299 slp_tree op1 = children[i];
2300 bool this_two_op = false;
2301 for (unsigned lane = 0; lane < group_size; ++lane)
2302 if (chains[lane][i].code != chains[0][i].code)
2304 this_two_op = true;
2305 break;
2307 slp_tree child;
2308 if (i == children.length () - 1)
2309 child = vect_create_new_slp_node (node, stmts, 2);
2310 else
2311 child = vect_create_new_slp_node (2, ERROR_MARK);
2312 if (this_two_op)
2314 vec<std::pair<unsigned, unsigned> > lperm;
2315 lperm.create (group_size);
2316 for (unsigned lane = 0; lane < group_size; ++lane)
2317 lperm.quick_push (std::make_pair
2318 (chains[lane][i].code != chains[0][i].code, lane));
2319 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2320 (chains[0][i].code == code
2321 ? op_stmt_info
2322 : other_op_stmt_info),
2323 (chains[0][i].code == code
2324 ? other_op_stmt_info
2325 : op_stmt_info),
2326 lperm);
2328 else
2330 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2331 SLP_TREE_VECTYPE (child) = vectype;
2332 SLP_TREE_LANES (child) = group_size;
2333 SLP_TREE_CHILDREN (child).quick_push (op0);
2334 SLP_TREE_CHILDREN (child).quick_push (op1);
2335 SLP_TREE_REPRESENTATIVE (child)
2336 = (chains[0][i].code == code
2337 ? op_stmt_info : other_op_stmt_info);
2339 children[i] = child;
2341 *tree_size += this_tree_size + 1;
2342 *max_nunits = this_max_nunits;
2343 while (!chains.is_empty ())
2344 chains.pop ().release ();
2345 return node;
2347 out:
2348 while (!children.is_empty ())
2349 vect_free_slp_tree (children.pop ());
2350 while (!chains.is_empty ())
2351 chains.pop ().release ();
2352 /* Hard-fail, otherwise we might run into quadratic processing of the
2353 chains starting one stmt into the chain again. */
2354 if (hard_fail)
2355 return NULL;
2356 /* Fall thru to normal processing. */
2359 /* Get at the operands, verifying they are compatible. */
2360 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2361 slp_oprnd_info oprnd_info;
2362 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2364 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2365 stmts, i, &oprnds_info);
2366 if (res != 0)
2367 matches[(res == -1) ? 0 : i] = false;
2368 if (!matches[0])
2369 break;
2371 for (i = 0; i < group_size; ++i)
2372 if (!matches[i])
2374 vect_free_oprnd_info (oprnds_info);
2375 return NULL;
2377 swap = NULL;
2379 auto_vec<slp_tree, 4> children;
2381 stmt_info = stmts[0];
2383 /* Create SLP_TREE nodes for the definition node/s. */
2384 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2386 slp_tree child = nullptr;
2387 unsigned int j;
2389 /* We're skipping certain operands from processing, for example
2390 outer loop reduction initial defs. */
2391 if (skip_args[i])
2393 children.safe_push (NULL);
2394 continue;
2397 if (oprnd_info->first_dt == vect_uninitialized_def)
2399 /* COND_EXPRs may have one operand too many when the condition
2400 is an SSA name. */
2401 gcc_assert (i == 3 && nops == 4);
2402 continue;
2405 if (is_a <bb_vec_info> (vinfo)
2406 && oprnd_info->first_dt == vect_internal_def
2407 && !oprnd_info->any_pattern)
2409 /* For BB vectorization, if all defs are the same do not
2410 bother to continue the build along the single-lane
2411 graph but use a splat of the scalar value. */
2412 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2413 for (j = 1; j < group_size; ++j)
2414 if (oprnd_info->def_stmts[j] != first_def)
2415 break;
2416 if (j == group_size
2417 /* But avoid doing this for loads where we may be
2418 able to CSE things, unless the stmt is not
2419 vectorizable. */
2420 && (!STMT_VINFO_VECTORIZABLE (first_def)
2421 || !gimple_vuse (first_def->stmt)))
2423 if (dump_enabled_p ())
2424 dump_printf_loc (MSG_NOTE, vect_location,
2425 "Using a splat of the uniform operand %G",
2426 first_def->stmt);
2427 oprnd_info->first_dt = vect_external_def;
2431 if (oprnd_info->first_dt == vect_external_def
2432 || oprnd_info->first_dt == vect_constant_def)
2434 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2436 tree op0;
2437 tree uniform_val = op0 = oprnd_info->ops[0];
2438 for (j = 1; j < oprnd_info->ops.length (); ++j)
2439 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2441 uniform_val = NULL_TREE;
2442 break;
2444 if (!uniform_val
2445 && !can_duplicate_and_interleave_p (vinfo,
2446 oprnd_info->ops.length (),
2447 TREE_TYPE (op0)))
2449 matches[j] = false;
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "Build SLP failed: invalid type of def "
2453 "for variable-length SLP %T\n", op0);
2454 goto fail;
2457 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2458 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2459 oprnd_info->ops = vNULL;
2460 children.safe_push (invnode);
2461 continue;
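/* Otherwise recurse to build an SLP subtree for the operand's
   definition statements. */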
2464 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2465 group_size, &this_max_nunits,
2466 matches, limit,
2467 &this_tree_size, bst_map)) != NULL)
2469 oprnd_info->def_stmts = vNULL;
2470 children.safe_push (child);
2471 continue;
2474 /* If the SLP build for operand zero failed and operand zero
2475 and one can be commuted try that for the scalar stmts
2476 that failed the match. */
2477 if (i == 0
2478 /* A first scalar stmt mismatch signals a fatal mismatch. */
2479 && matches[0]
2480 /* ??? For COND_EXPRs we can swap the comparison operands
2481 as well as the arms under some constraints. */
2482 && nops == 2
2483 && oprnds_info[1]->first_dt == vect_internal_def
2484 && is_gimple_assign (stmt_info->stmt)
2485 /* Swapping operands for reductions breaks assumptions later on. */
2486 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2487 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2489 /* See whether we can swap the matching or the non-matching
2490 stmt operands. */
2491 bool swap_not_matching = true;
2494 for (j = 0; j < group_size; ++j)
2496 if (matches[j] != !swap_not_matching)
2497 continue;
2498 stmt_vec_info stmt_info = stmts[j];
2499 /* Verify if we can swap operands of this stmt. */
2500 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2501 if (!stmt
2502 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2504 if (!swap_not_matching)
2505 goto fail;
2506 swap_not_matching = false;
2507 break;
2511 while (j != group_size);
2513 /* Swap mismatched definition stmts. */
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Re-trying with swapped operands of stmts ");
2517 for (j = 0; j < group_size; ++j)
2518 if (matches[j] == !swap_not_matching)
2520 std::swap (oprnds_info[0]->def_stmts[j],
2521 oprnds_info[1]->def_stmts[j]);
2522 std::swap (oprnds_info[0]->ops[j],
2523 oprnds_info[1]->ops[j]);
2524 if (dump_enabled_p ())
2525 dump_printf (MSG_NOTE, "%d ", j);
2527 if (dump_enabled_p ())
2528 dump_printf (MSG_NOTE, "\n");
2529 /* After swapping some operands we lost track whether an
2530 operand has any pattern defs so be conservative here. */
2531 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2532 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2533 /* And try again with scratch 'matches' ... */
2534 bool *tem = XALLOCAVEC (bool, group_size);
2535 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2536 group_size, &this_max_nunits,
2537 tem, limit,
2538 &this_tree_size, bst_map)) != NULL)
2540 oprnd_info->def_stmts = vNULL;
2541 children.safe_push (child);
2542 continue;
2545 fail:
2547 /* If the SLP build failed and we analyze a basic-block
2548 simply treat nodes we fail to build as externally defined
2549 (and thus build vectors from the scalar defs).
2550 The cost model will reject outright expensive cases.
2551 ??? This doesn't treat cases where permutation ultimately
2552 fails (or we don't try permutation below). Ideally we'd
2553 even compute a permutation that will end up with the maximum
2554 SLP tree size... */
2555 if (is_a <bb_vec_info> (vinfo)
2556 /* ??? Rejecting patterns this way doesn't work. We'd have to
2557 do extra work to cancel the pattern so the uses see the
2558 scalar version. */
2559 && !is_pattern_stmt_p (stmt_info)
2560 && !oprnd_info->any_pattern)
2562 /* But if there's a leading vector sized set of matching stmts
2563 fail here so we can split the group. This matches the condition
2564 vect_analyze_slp_instance uses. */
2565 /* ??? We might want to split here and combine the results to support
2566 multiple vector sizes better. */
2567 for (j = 0; j < group_size; ++j)
2568 if (!matches[j])
2569 break;
2570 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_NOTE, vect_location,
2574 "Building vector operands from scalars\n");
2575 this_tree_size++;
2576 child = vect_create_new_slp_node (oprnd_info->ops);
2577 children.safe_push (child);
2578 oprnd_info->ops = vNULL;
2579 continue;
2583 gcc_assert (child == NULL);
2584 FOR_EACH_VEC_ELT (children, j, child)
2585 if (child)
2586 vect_free_slp_tree (child);
2587 vect_free_oprnd_info (oprnds_info);
2588 return NULL;
2591 vect_free_oprnd_info (oprnds_info);
2593 /* If all children of this node are built up from uniform scalars, or
2594 more than one child requires a possibly expensive vector construction,
2595 then just throw the node away, causing it to be built up from scalars
2596 instead. The exception is the SLP node for the vector store. */
2597 if (is_a <bb_vec_info> (vinfo)
2598 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2599 /* ??? Rejecting patterns this way doesn't work. We'd have to
2600 do extra work to cancel the pattern so the uses see the
2601 scalar version. */
2602 && !is_pattern_stmt_p (stmt_info))
2604 slp_tree child;
2605 unsigned j;
2606 bool all_uniform_p = true;
2607 unsigned n_vector_builds = 0;
2608 FOR_EACH_VEC_ELT (children, j, child)
2610 if (!child)
2612 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2613 all_uniform_p = false;
2614 else if (!vect_slp_tree_uniform_p (child))
2616 all_uniform_p = false;
2617 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2618 n_vector_builds++;
2621 if (all_uniform_p
2622 || n_vector_builds > 1
2623 || (n_vector_builds == children.length ()
2624 && is_a <gphi *> (stmt_info->stmt)))
2626 /* Roll back. */
2627 matches[0] = false;
2628 FOR_EACH_VEC_ELT (children, j, child)
2629 if (child)
2630 vect_free_slp_tree (child);
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "Building parent vector operands from "
2635 "scalars instead\n");
2636 return NULL;
2640 *tree_size += this_tree_size + 1;
2641 *max_nunits = this_max_nunits;
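/* For a group mixing two operation codes (e.g. { a0+b0, a1-b1, a2+b2, a3-b3 })
   build one node for each operation over the same children and blend their
   lanes with a VEC_PERM_EXPR node. */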
2643 if (two_operators)
2645 /* ??? We'd likely want to either cache in bst_map something like
2646 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2647 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2648 explicit stmts to put in so the keying on 'stmts' doesn't
2649 work (but we have the same issue with nodes that use 'ops'). */
2650 slp_tree one = new _slp_tree;
2651 slp_tree two = new _slp_tree;
2652 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2653 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2654 SLP_TREE_VECTYPE (one) = vectype;
2655 SLP_TREE_VECTYPE (two) = vectype;
2656 SLP_TREE_CHILDREN (one).safe_splice (children);
2657 SLP_TREE_CHILDREN (two).safe_splice (children);
2658 slp_tree child;
2659 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2660 SLP_TREE_REF_COUNT (child)++;
2662 /* Here we record the original defs since this
2663 node represents the final lane configuration. */
2664 node = vect_create_new_slp_node (node, stmts, 2);
2665 SLP_TREE_VECTYPE (node) = vectype;
2666 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2667 SLP_TREE_CHILDREN (node).quick_push (one);
2668 SLP_TREE_CHILDREN (node).quick_push (two);
2669 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2670 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2671 enum tree_code ocode = ERROR_MARK;
2672 stmt_vec_info ostmt_info;
2673 unsigned j = 0;
2674 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2676 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2677 if (gimple_assign_rhs_code (ostmt) != code0)
2679 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2680 ocode = gimple_assign_rhs_code (ostmt);
2681 j = i;
2683 else
2684 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2686 SLP_TREE_CODE (one) = code0;
2687 SLP_TREE_CODE (two) = ocode;
2688 SLP_TREE_LANES (one) = stmts.length ();
2689 SLP_TREE_LANES (two) = stmts.length ();
2690 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2691 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2692 return node;
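/* In the common case build a regular internal SLP node from the group's
   statements and the discovered children. */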
2695 node = vect_create_new_slp_node (node, stmts, nops);
2696 SLP_TREE_VECTYPE (node) = vectype;
2697 SLP_TREE_CHILDREN (node).splice (children);
2698 return node;
2701 /* Dump a single SLP tree NODE. */
2703 static void
2704 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2705 slp_tree node)
2707 unsigned i, j;
2708 slp_tree child;
2709 stmt_vec_info stmt_info;
2710 tree op;
2712 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2713 dump_user_location_t user_loc = loc.get_user_location ();
2714 dump_printf_loc (metadata, user_loc,
2715 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2716 ", refcnt=%u)",
2717 SLP_TREE_DEF_TYPE (node) == vect_external_def
2718 ? " (external)"
2719 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2720 ? " (constant)"
2721 : ""), (void *) node,
2722 estimated_poly_value (node->max_nunits),
2723 SLP_TREE_REF_COUNT (node));
2724 if (SLP_TREE_VECTYPE (node))
2725 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2726 dump_printf (metadata, "\n");
2727 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2729 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2730 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2731 else
2732 dump_printf_loc (metadata, user_loc, "op template: %G",
2733 SLP_TREE_REPRESENTATIVE (node)->stmt);
2735 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2736 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2737 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2738 else
2740 dump_printf_loc (metadata, user_loc, "\t{ ");
2741 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2742 dump_printf (metadata, "%T%s ", op,
2743 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2744 dump_printf (metadata, "}\n");
2746 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2748 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2749 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2750 dump_printf (dump_kind, " %u", j);
2751 dump_printf (dump_kind, " }\n");
2753 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2755 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2756 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2757 dump_printf (dump_kind, " %u[%u]",
2758 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2759 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2760 dump_printf (dump_kind, " }\n");
2762 if (SLP_TREE_CHILDREN (node).is_empty ())
2763 return;
2764 dump_printf_loc (metadata, user_loc, "\tchildren");
2765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2766 dump_printf (dump_kind, " %p", (void *)child);
2767 dump_printf (dump_kind, "\n");
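/* Debug helper to dump a single SLP tree NODE, usable from the debugger. */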
2770 DEBUG_FUNCTION void
2771 debug (slp_tree node)
2773 debug_dump_context ctx;
2774 vect_print_slp_tree (MSG_NOTE,
2775 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2776 node);
2779 /* Recursive helper for the dot producer below. */
2781 static void
2782 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2784 if (visited.add (node))
2785 return;
2787 fprintf (f, "\"%p\" [label=\"", (void *)node);
2788 vect_print_slp_tree (MSG_NOTE,
2789 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2790 node);
2791 fprintf (f, "\"];\n");
2794 for (slp_tree child : SLP_TREE_CHILDREN (node))
2795 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2797 for (slp_tree child : SLP_TREE_CHILDREN (node))
2798 if (child)
2799 dot_slp_tree (f, child, visited);
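/* Write a dot graph of the SLP graph rooted in NODE to the file FNAME. */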
2802 DEBUG_FUNCTION void
2803 dot_slp_tree (const char *fname, slp_tree node)
2805 FILE *f = fopen (fname, "w");
2806 fprintf (f, "digraph {\n");
2807 fflush (f);
2809 debug_dump_context ctx (f);
2810 hash_set<slp_tree> visited;
2811 dot_slp_tree (f, node, visited);
2813 fflush (f);
2814 fprintf (f, "}\n");
2815 fclose (f);
2818 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2820 static void
2821 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2822 slp_tree node, hash_set<slp_tree> &visited)
2824 unsigned i;
2825 slp_tree child;
2827 if (visited.add (node))
2828 return;
2830 vect_print_slp_tree (dump_kind, loc, node);
2832 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2833 if (child)
2834 vect_print_slp_graph (dump_kind, loc, child, visited);
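/* Dump the SLP graph rooted in ENTRY using DUMP_KIND. */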
2837 static void
2838 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2839 slp_tree entry)
2841 hash_set<slp_tree> visited;
2842 vect_print_slp_graph (dump_kind, loc, entry, visited);
2845 /* Mark the tree rooted at NODE with PURE_SLP. */
2847 static void
2848 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2850 int i;
2851 stmt_vec_info stmt_info;
2852 slp_tree child;
2854 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2855 return;
2857 if (visited.add (node))
2858 return;
2860 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2861 STMT_SLP_TYPE (stmt_info) = pure_slp;
2863 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2864 if (child)
2865 vect_mark_slp_stmts (child, visited);
2868 static void
2869 vect_mark_slp_stmts (slp_tree node)
2871 hash_set<slp_tree> visited;
2872 vect_mark_slp_stmts (node, visited);
2875 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2877 static void
2878 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2880 int i;
2881 stmt_vec_info stmt_info;
2882 slp_tree child;
2884 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2885 return;
2887 if (visited.add (node))
2888 return;
2890 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2892 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2893 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2894 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2898 if (child)
2899 vect_mark_slp_stmts_relevant (child, visited);
2902 static void
2903 vect_mark_slp_stmts_relevant (slp_tree node)
2905 hash_set<slp_tree> visited;
2906 vect_mark_slp_stmts_relevant (node, visited);
2910 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2912 static void
2913 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2914 hash_set<slp_tree> &visited)
2916 if (!node || visited.add (node))
2917 return;
2919 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2920 return;
2922 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2924 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2925 if (STMT_VINFO_DATA_REF (stmt_info)
2926 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2927 loads.safe_push (node);
2930 unsigned i;
2931 slp_tree child;
2932 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2933 vect_gather_slp_loads (loads, child, visited);
2937 /* Find the last scalar stmt in NODE. */
2939 stmt_vec_info
2940 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2942 stmt_vec_info last = NULL;
2943 stmt_vec_info stmt_vinfo;
2945 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2947 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2948 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2951 return last;
2954 /* Find the first stmt in NODE. */
2956 stmt_vec_info
2957 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2959 stmt_vec_info first = NULL;
2960 stmt_vec_info stmt_vinfo;
2962 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2964 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2965 if (!first
2966 || get_later_stmt (stmt_vinfo, first) == first)
2967 first = stmt_vinfo;
2970 return first;
2973 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2974 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2975 (also containing the first GROUP1_SIZE stmts, since stores are
2976 consecutive), the second containing the remainder.
2977 Return the first stmt in the second group. */
2979 static stmt_vec_info
2980 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2982 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2983 gcc_assert (group1_size > 0);
2984 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2985 gcc_assert (group2_size > 0);
2986 DR_GROUP_SIZE (first_vinfo) = group1_size;
2988 stmt_vec_info stmt_info = first_vinfo;
2989 for (unsigned i = group1_size; i > 1; i--)
2991 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2992 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2994 /* STMT is now the last element of the first group. */
2995 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2996 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2998 DR_GROUP_SIZE (group2) = group2_size;
2999 for (stmt_info = group2; stmt_info;
3000 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3002 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3003 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3006 /* For the second group, the DR_GROUP_GAP is that before the original group,
3007 plus skipping over the first vector. */
3008 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3010 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3011 DR_GROUP_GAP (first_vinfo) += group2_size;
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3015 group1_size, group2_size);
3017 return group2;
3020 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3021 statements and a vector of NUNITS elements. */
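/* For example, with NUNITS = 4 and GROUP_SIZE = 6 the common multiple is 12
   and the unrolling factor is 2. */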
3023 static poly_uint64
3024 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3026 return exact_div (common_multiple (nunits, group_size), group_size);
3029 /* Helper that checks to see if a node is a load node. */
3031 static inline bool
3032 vect_is_slp_load_node (slp_tree root)
3034 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3035 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3036 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3040 /* Helper function of optimize_load_redistribution that performs the operation
3041 recursively. */
3043 static slp_tree
3044 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3045 vec_info *vinfo, unsigned int group_size,
3046 hash_map<slp_tree, slp_tree> *load_map,
3047 slp_tree root)
3049 if (slp_tree *leader = load_map->get (root))
3050 return *leader;
3052 slp_tree node;
3053 unsigned i;
3055 /* For now, we don't know anything about externals so do not do anything. */
3056 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3057 return NULL;
3058 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3060 /* First convert this node into a load node and add it to the leaves
3061 list, turning the lane permute into a load permute. If it's
3062 unneeded it will be elided later. */
3063 vec<stmt_vec_info> stmts;
3064 stmts.create (SLP_TREE_LANES (root));
3065 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3066 for (unsigned j = 0; j < lane_perm.length (); j++)
3068 std::pair<unsigned, unsigned> perm = lane_perm[j];
3069 node = SLP_TREE_CHILDREN (root)[perm.first];
3071 if (!vect_is_slp_load_node (node)
3072 || SLP_TREE_CHILDREN (node).exists ())
3074 stmts.release ();
3075 goto next;
3078 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3081 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "converting stmts on permute node %p\n",
3084 (void *) root);
3086 bool *matches = XALLOCAVEC (bool, group_size);
3087 poly_uint64 max_nunits = 1;
3088 unsigned tree_size = 0, limit = 1;
3089 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3090 matches, &limit, &tree_size, bst_map);
3091 if (!node)
3092 stmts.release ();
3094 load_map->put (root, node);
3095 return node;
3098 next:
3099 load_map->put (root, NULL);
3101 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3103 slp_tree value
3104 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3105 node);
3106 if (value)
3108 SLP_TREE_REF_COUNT (value)++;
3109 SLP_TREE_CHILDREN (root)[i] = value;
3110 /* ??? We know the original leaves of the replaced nodes will
3111 be referenced by bst_map, only the permutes created by
3112 pattern matching are not. */
3113 if (SLP_TREE_REF_COUNT (node) == 1)
3114 load_map->remove (node);
3115 vect_free_slp_tree (node);
3119 return NULL;
3122 /* Temporary workaround for loads not being CSEd during SLP build. This
3123 function will traverse the SLP tree rooted in ROOT and find
3124 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3125 same DR such that the final operation is equal to a permuted load. Such
3126 NODES are then directly converted into LOADS themselves. The nodes are
3127 CSEd using BST_MAP. */
3129 static void
3130 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3131 vec_info *vinfo, unsigned int group_size,
3132 hash_map<slp_tree, slp_tree> *load_map,
3133 slp_tree root)
3135 slp_tree node;
3136 unsigned i;
3138 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3140 slp_tree value
3141 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3142 node);
3143 if (value)
3145 SLP_TREE_REF_COUNT (value)++;
3146 SLP_TREE_CHILDREN (root)[i] = value;
3147 /* ??? We know the original leaves of the replaced nodes will
3148 be referenced by bst_map, only the permutes created by
3149 pattern matching are not. */
3150 if (SLP_TREE_REF_COUNT (node) == 1)
3151 load_map->remove (node);
3152 vect_free_slp_tree (node);
3157 /* Helper function of vect_match_slp_patterns.
3159 Attempts to match patterns against the slp tree rooted in REF_NODE using
3160 VINFO. Patterns are matched in post-order traversal.
3162 If matching is successful the value in REF_NODE is updated in place
3163 and true is returned, otherwise false is returned. */
3165 static bool
3166 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3167 slp_tree_to_load_perm_map_t *perm_cache,
3168 slp_compat_nodes_map_t *compat_cache,
3169 hash_set<slp_tree> *visited)
3171 unsigned i;
3172 slp_tree node = *ref_node;
3173 bool found_p = false;
3174 if (!node || visited->add (node))
3175 return false;
3177 slp_tree child;
3178 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3179 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3180 vinfo, perm_cache, compat_cache,
3181 visited);
3183 for (unsigned x = 0; x < num__slp_patterns; x++)
3185 vect_pattern *pattern
3186 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3187 if (pattern)
3189 pattern->build (vinfo);
3190 delete pattern;
3191 found_p = true;
3195 return found_p;
3198 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
3199 VINFO. The tree is modified in place.
3201 Return true if any pattern matched; patterns are tried in order and
3202 multiple patterns may match. */
3204 static bool
3205 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3206 hash_set<slp_tree> *visited,
3207 slp_tree_to_load_perm_map_t *perm_cache,
3208 slp_compat_nodes_map_t *compat_cache)
3210 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3211 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3213 if (dump_enabled_p ())
3214 dump_printf_loc (MSG_NOTE, vect_location,
3215 "Analyzing SLP tree %p for patterns\n",
3216 (void *) SLP_INSTANCE_TREE (instance));
3218 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3219 visited);
3222 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3223 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3224 Return true if we could use IFN_STORE_LANES instead and if that appears
3225 to be the better approach. */
3227 static bool
3228 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3229 unsigned int group_size,
3230 unsigned int new_group_size)
3232 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3233 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3234 if (!vectype)
3235 return false;
3236 /* Allow the split if one of the two new groups would operate on full
3237 vectors *within* rather than across one scalar loop iteration.
3238 This is purely a heuristic, but it should work well for group
3239 sizes of 3 and 4, where the possible splits are:
3241 3->2+1: OK if the vector has exactly two elements
3242 4->2+2: Likewise
3243 4->3+1: Less clear-cut. */
3244 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3245 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3246 return false;
3247 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3250 /* Analyze an SLP instance starting from a group of grouped stores. Call
3251 vect_build_slp_tree to build a tree of packed stmts if possible.
3252 Return FALSE if it's impossible to SLP any stmt in the loop. */
3254 static bool
3255 vect_analyze_slp_instance (vec_info *vinfo,
3256 scalar_stmts_to_slp_tree_map_t *bst_map,
3257 stmt_vec_info stmt_info, slp_instance_kind kind,
3258 unsigned max_tree_size, unsigned *limit);
3260 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3261 of KIND. Return true if successful. */
3263 static bool
3264 vect_build_slp_instance (vec_info *vinfo,
3265 slp_instance_kind kind,
3266 vec<stmt_vec_info> &scalar_stmts,
3267 vec<stmt_vec_info> &root_stmt_infos,
3268 vec<tree> &remain,
3269 unsigned max_tree_size, unsigned *limit,
3270 scalar_stmts_to_slp_tree_map_t *bst_map,
3271 /* ??? We need stmt_info for group splitting. */
3272 stmt_vec_info stmt_info_)
3274 if (kind == slp_inst_kind_ctor)
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location,
3278 "Analyzing vectorizable constructor: %G\n",
3279 root_stmt_infos[0]->stmt);
3282 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_NOTE, vect_location,
3285 "Starting SLP discovery for\n");
3286 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 " %G", scalar_stmts[i]->stmt);
3291 /* When a BB reduction doesn't have an even number of lanes
3292 strip it down, treating the remaining lane as scalar.
3293 ??? Selecting the optimal set of lanes to vectorize would be nice
3294 but SLP build for all lanes will fail quickly because we think
3295 we're going to need unrolling. */
3296 if (kind == slp_inst_kind_bb_reduc
3297 && (scalar_stmts.length () & 1))
3298 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3300 /* Build the tree for the SLP instance. */
3301 unsigned int group_size = scalar_stmts.length ();
3302 bool *matches = XALLOCAVEC (bool, group_size);
3303 poly_uint64 max_nunits = 1;
3304 unsigned tree_size = 0;
3305 unsigned i;
3306 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3307 &max_nunits, matches, limit,
3308 &tree_size, bst_map);
3309 if (node != NULL)
3311 /* Calculate the unrolling factor based on the smallest type. */
3312 poly_uint64 unrolling_factor
3313 = calculate_unrolling_factor (max_nunits, group_size);
3315 if (maybe_ne (unrolling_factor, 1U)
3316 && is_a <bb_vec_info> (vinfo))
3318 unsigned HOST_WIDE_INT const_max_nunits;
3319 if (!max_nunits.is_constant (&const_max_nunits)
3320 || const_max_nunits > group_size)
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3324 "Build SLP failed: store group "
3325 "size not a multiple of the vector size "
3326 "in basic block SLP\n");
3327 vect_free_slp_tree (node);
3328 return false;
3330 /* Fatal mismatch. */
3331 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE, vect_location,
3333 "SLP discovery succeeded but node needs "
3334 "splitting\n");
3335 memset (matches, true, group_size);
3336 matches[group_size / const_max_nunits * const_max_nunits] = false;
3337 vect_free_slp_tree (node);
3339 else
3341 /* Create a new SLP instance. */
3342 slp_instance new_instance = XNEW (class _slp_instance);
3343 SLP_INSTANCE_TREE (new_instance) = node;
3344 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3345 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3346 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3347 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3348 SLP_INSTANCE_KIND (new_instance) = kind;
3349 new_instance->reduc_phis = NULL;
3350 new_instance->cost_vec = vNULL;
3351 new_instance->subgraph_entries = vNULL;
3353 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE, vect_location,
3355 "SLP size %u vs. limit %u.\n",
3356 tree_size, max_tree_size);
3358 /* Fixup SLP reduction chains. */
3359 if (kind == slp_inst_kind_reduc_chain)
3361 /* If this is a reduction chain with a conversion in front
3362 amend the SLP tree with a node for that. */
3363 gimple *scalar_def
3364 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3365 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3367 /* Get at the conversion stmt - we know it's the single use
3368 of the last stmt of the reduction chain. */
3369 use_operand_p use_p;
3370 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3371 &use_p, &scalar_def);
3372 gcc_assert (r);
3373 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3374 next_info = vect_stmt_to_vectorize (next_info);
3375 scalar_stmts = vNULL;
3376 scalar_stmts.create (group_size);
3377 for (unsigned i = 0; i < group_size; ++i)
3378 scalar_stmts.quick_push (next_info);
3379 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3380 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3381 SLP_TREE_CHILDREN (conv).quick_push (node);
3382 SLP_INSTANCE_TREE (new_instance) = conv;
3383 /* We also have to fake this conversion stmt as SLP reduction
3384 group so we don't have to mess with too much code
3385 elsewhere. */
3386 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3387 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3389 /* Fill the backedge child of the PHI SLP node. The
3390 general matching code cannot find it because the
3391 scalar code does not reflect how we vectorize the
3392 reduction. */
3393 use_operand_p use_p;
3394 imm_use_iterator imm_iter;
3395 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3396 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3397 gimple_get_lhs (scalar_def))
3398 /* There are exactly two non-debug uses, the reduction
3399 PHI and the loop-closed PHI node. */
3400 if (!is_gimple_debug (USE_STMT (use_p))
3401 && gimple_bb (USE_STMT (use_p)) == loop->header)
3403 auto_vec<stmt_vec_info, 64> phis (group_size);
3404 stmt_vec_info phi_info
3405 = vinfo->lookup_stmt (USE_STMT (use_p));
3406 for (unsigned i = 0; i < group_size; ++i)
3407 phis.quick_push (phi_info);
3408 slp_tree *phi_node = bst_map->get (phis);
3409 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3410 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3411 = SLP_INSTANCE_TREE (new_instance);
3412 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3416 vinfo->slp_instances.safe_push (new_instance);
3418 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3419 the number of scalar stmts in the root in a few places.
3420 Verify that assumption holds. */
3421 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3422 .length () == group_size);
3424 if (dump_enabled_p ())
3426 dump_printf_loc (MSG_NOTE, vect_location,
3427 "Final SLP tree for instance %p:\n",
3428 (void *) new_instance);
3429 vect_print_slp_graph (MSG_NOTE, vect_location,
3430 SLP_INSTANCE_TREE (new_instance));
3433 return true;
3436 else
3438 /* Failed to SLP. */
3439 /* Free the allocated memory. */
3440 scalar_stmts.release ();
3443 stmt_vec_info stmt_info = stmt_info_;
3444 /* Try to break the group up into pieces. */
3445 if (kind == slp_inst_kind_store)
3447 /* ??? We could delay all the actual splitting of store-groups
3448 until after SLP discovery of the original group completed.
3449 Then we can recurse to vect_build_slp_instance directly. */
3450 for (i = 0; i < group_size; i++)
3451 if (!matches[i])
3452 break;
3454 /* For basic block SLP, try to break the group up into multiples of
3455 a vector size. */
3456 if (is_a <bb_vec_info> (vinfo)
3457 && (i > 1 && i < group_size))
3459 tree scalar_type
3460 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3461 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3462 1 << floor_log2 (i));
3463 unsigned HOST_WIDE_INT const_nunits;
3464 if (vectype
3465 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3467 /* Split into two groups at the first vector boundary. */
3468 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3469 unsigned group1_size = i & ~(const_nunits - 1);
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "Splitting SLP group at stmt %u\n", i);
3474 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3475 group1_size);
3476 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3477 kind, max_tree_size,
3478 limit);
3479 /* Split the rest at the failure point and possibly
3480 re-analyze the remaining matching part if it has
3481 at least two lanes. */
3482 if (group1_size < i
3483 && (i + 1 < group_size
3484 || i - group1_size > 1))
3486 stmt_vec_info rest2 = rest;
3487 rest = vect_split_slp_store_group (rest, i - group1_size);
3488 if (i - group1_size > 1)
3489 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3490 kind, max_tree_size,
3491 limit);
3493 /* Re-analyze the non-matching tail if it has at least
3494 two lanes. */
3495 if (i + 1 < group_size)
3496 res |= vect_analyze_slp_instance (vinfo, bst_map,
3497 rest, kind, max_tree_size,
3498 limit);
3499 return res;
3503 /* For loop vectorization split into arbitrary pieces of size > 1. */
3504 if (is_a <loop_vec_info> (vinfo)
3505 && (i > 1 && i < group_size)
3506 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3508 unsigned group1_size = i;
3510 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_NOTE, vect_location,
3512 "Splitting SLP group at stmt %u\n", i);
3514 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3515 group1_size);
3516 /* Loop vectorization cannot handle gaps in stores, make sure
3517 the split group appears as strided. */
3518 STMT_VINFO_STRIDED_P (rest) = 1;
3519 DR_GROUP_GAP (rest) = 0;
3520 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3521 DR_GROUP_GAP (stmt_info) = 0;
3523 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3524 kind, max_tree_size, limit);
3525 if (i + 1 < group_size)
3526 res |= vect_analyze_slp_instance (vinfo, bst_map,
3527 rest, kind, max_tree_size, limit);
3529 return res;
3532 /* Even though the first vector did not all match, we might be able to SLP
3533 (some) of the remainder. FORNOW ignore this possibility. */
3536 /* Failed to SLP. */
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3539 return false;
3543 /* Analyze an SLP instance starting from a group of grouped stores. Call
3544 vect_build_slp_tree to build a tree of packed stmts if possible.
3545 Return FALSE if it's impossible to SLP any stmt in the loop. */
3547 static bool
3548 vect_analyze_slp_instance (vec_info *vinfo,
3549 scalar_stmts_to_slp_tree_map_t *bst_map,
3550 stmt_vec_info stmt_info,
3551 slp_instance_kind kind,
3552 unsigned max_tree_size, unsigned *limit)
3554 unsigned int i;
3555 vec<stmt_vec_info> scalar_stmts;
3557 if (is_a <bb_vec_info> (vinfo))
3558 vect_location = stmt_info->stmt;
3560 stmt_vec_info next_info = stmt_info;
3561 if (kind == slp_inst_kind_store)
3563 /* Collect the stores and store them in scalar_stmts. */
3564 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3565 while (next_info)
3567 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3568 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3571 else if (kind == slp_inst_kind_reduc_chain)
3573 /* Collect the reduction stmts and store them in scalar_stmts. */
3574 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3575 while (next_info)
3577 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3580 /* Mark the first element of the reduction chain as reduction to properly
3581 transform the node. In the reduction analysis phase only the last
3582 element of the chain is marked as reduction. */
3583 STMT_VINFO_DEF_TYPE (stmt_info)
3584 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3585 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3586 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3588 else if (kind == slp_inst_kind_reduc_group)
3590 /* Collect reduction statements. */
3591 const vec<stmt_vec_info> &reductions
3592 = as_a <loop_vec_info> (vinfo)->reductions;
3593 scalar_stmts.create (reductions.length ());
3594 for (i = 0; reductions.iterate (i, &next_info); i++)
3595 if ((STMT_VINFO_RELEVANT_P (next_info)
3596 || STMT_VINFO_LIVE_P (next_info))
3597 /* ??? Make sure we didn't skip a conversion around a reduction
3598 path. In that case we'd have to reverse engineer that conversion
3599 stmt following the chain using reduc_idx and from the PHI
3600 using reduc_def. */
3601 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3602 scalar_stmts.quick_push (next_info);
3603 /* If less than two were relevant/live there's nothing to SLP. */
3604 if (scalar_stmts.length () < 2)
3605 return false;
3607 else
3608 gcc_unreachable ();
3610 vec<stmt_vec_info> roots = vNULL;
3611 vec<tree> remain = vNULL;
3612 /* Build the tree for the SLP instance. */
3613 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3614 roots, remain,
3615 max_tree_size, limit, bst_map,
3616 kind == slp_inst_kind_store
3617 ? stmt_info : NULL);
3619 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3620 where we should do store group splitting. */
3622 return res;
3625 /* Check if there are stmts in the loop that can be vectorized using SLP.  Build SLP
3626 trees of packed scalar stmts if SLP is possible. */
3628 opt_result
3629 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3631 unsigned int i;
3632 stmt_vec_info first_element;
3633 slp_instance instance;
3635 DUMP_VECT_SCOPE ("vect_analyze_slp");
3637 unsigned limit = max_tree_size;
3639 scalar_stmts_to_slp_tree_map_t *bst_map
3640 = new scalar_stmts_to_slp_tree_map_t ();
3642 /* Find SLP sequences starting from groups of grouped stores. */
3643 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3644 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3645 slp_inst_kind_store, max_tree_size, &limit);
3647 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3649 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3651 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3652 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3653 bb_vinfo->roots[i].stmts,
3654 bb_vinfo->roots[i].roots,
3655 bb_vinfo->roots[i].remain,
3656 max_tree_size, &limit, bst_map, NULL))
3658 bb_vinfo->roots[i].stmts = vNULL;
3659 bb_vinfo->roots[i].roots = vNULL;
3660 bb_vinfo->roots[i].remain = vNULL;
3665 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3667 /* Find SLP sequences starting from reduction chains. */
3668 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3669 if (! STMT_VINFO_RELEVANT_P (first_element)
3670 && ! STMT_VINFO_LIVE_P (first_element))
3672 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3673 slp_inst_kind_reduc_chain,
3674 max_tree_size, &limit))
3676 /* Dissolve reduction chain group. */
3677 stmt_vec_info vinfo = first_element;
3678 stmt_vec_info last = NULL;
3679 while (vinfo)
3681 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3682 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3683 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3684 last = vinfo;
3685 vinfo = next;
3687 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3688 /* It can be still vectorized as part of an SLP reduction. */
3689 loop_vinfo->reductions.safe_push (last);
3692 /* Find SLP sequences starting from groups of reductions. */
3693 if (loop_vinfo->reductions.length () > 1)
3694 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3695 slp_inst_kind_reduc_group, max_tree_size,
3696 &limit);
3699 hash_set<slp_tree> visited_patterns;
3700 slp_tree_to_load_perm_map_t perm_cache;
3701 slp_compat_nodes_map_t compat_cache;
3703 /* See if any patterns can be found in the SLP tree. */
3704 bool pattern_found = false;
3705 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3706 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3707 &visited_patterns, &perm_cache,
3708 &compat_cache);
3710 /* If any were found optimize permutations of loads. */
3711 if (pattern_found)
3713 hash_map<slp_tree, slp_tree> load_map;
3714 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3716 slp_tree root = SLP_INSTANCE_TREE (instance);
3717 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3718 &load_map, root);
3724 /* The map keeps a reference on SLP nodes built, release that. */
3725 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3726 it != bst_map->end (); ++it)
3727 if ((*it).second)
3728 vect_free_slp_tree ((*it).second);
3729 delete bst_map;
3731 if (pattern_found && dump_enabled_p ())
3733 dump_printf_loc (MSG_NOTE, vect_location,
3734 "Pattern matched SLP tree\n");
3735 hash_set<slp_tree> visited;
3736 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3737 vect_print_slp_graph (MSG_NOTE, vect_location,
3738 SLP_INSTANCE_TREE (instance), visited);
3741 return opt_result::success ();
3744 /* Estimates the cost of inserting layout changes into the SLP graph.
3745 It can also say that the insertion is impossible. */
3747 struct slpg_layout_cost
3749 slpg_layout_cost () = default;
3750 slpg_layout_cost (sreal, bool);
3752 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3753 bool is_possible () const { return depth != sreal::max (); }
3755 bool operator== (const slpg_layout_cost &) const;
3756 bool operator!= (const slpg_layout_cost &) const;
3758 bool is_better_than (const slpg_layout_cost &, bool) const;
3760 void add_parallel_cost (const slpg_layout_cost &);
3761 void add_serial_cost (const slpg_layout_cost &);
3762 void split (unsigned int);
3764 /* The longest sequence of layout changes needed during any traversal
3765 of the partition dag, weighted by execution frequency.
3767 This is the most important metric when optimizing for speed, since
3768 it helps to ensure that we keep the number of operations on
3769 critical paths to a minimum. */
3770 sreal depth = 0;
3772 /* An estimate of the total number of operations needed. It is weighted by
3773 execution frequency when optimizing for speed but not when optimizing for
3774 size. In order to avoid double-counting, a node with a fanout of N will
3775 distribute 1/N of its total cost to each successor.
3777 This is the most important metric when optimizing for size, since
3778 it helps to keep the total number of operations to a minimum.  */
3779 sreal total = 0;
3782 /* Construct costs for a node with weight WEIGHT. A higher weight
3783 indicates more frequent execution. IS_FOR_SIZE is true if we are
3784 optimizing for size rather than speed. */
3786 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3787 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
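/* Purely illustrative example (not taken from the code): a node with
   weight 4 is given { depth = 4, total = 4 } when optimizing for speed,
   but { depth = 4, total = 1 } when optimizing for size, since for size
   the total counts operations without weighting them by frequency.  */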
3791 bool
3792 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3794 return depth == other.depth && total == other.total;
3797 bool
3798 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3800 return !operator== (other);
3803 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3804 true if we are optimizing for size rather than speed. */
3806 bool
3807 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3808 bool is_for_size) const
3810 if (is_for_size)
3812 if (total != other.total)
3813 return total < other.total;
3814 return depth < other.depth;
3816 else
3818 if (depth != other.depth)
3819 return depth < other.depth;
3820 return total < other.total;
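/* Illustrative example: { depth = 2, total = 5 } is better than
   { depth = 3, total = 2 } when optimizing for speed (lower depth wins)
   but not when optimizing for size (lower total wins).  */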
3824 /* Increase the costs to account for something with cost INPUT_COST
3825 happening in parallel with the current costs. */
3827 void
3828 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3830 depth = std::max (depth, input_cost.depth);
3831 total += input_cost.total;
3834 /* Increase the costs to account for something with cost INPUT_COST
3835 happening in series with the current costs. */
3837 void
3838 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3840 depth += other.depth;
3841 total += other.total;
3844 /* Split the total cost among TIMES successors or predecessors. */
3846 void
3847 slpg_layout_cost::split (unsigned int times)
3849 if (times > 1)
3850 total /= times;
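/* Worked example (for illustration only): combining inputs
   { depth = 2, total = 3 } and { depth = 1, total = 1 } in parallel gives
   { depth = 2, total = 4 }; adding { depth = 1, total = 1 } serially on
   top of that gives { depth = 3, total = 5 }; split (2) then halves the
   total to 2.5 while leaving the depth at 3, so that each of two
   consumers accounts for half of the operations but still sees the full
   critical-path length.  */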
3853 /* Information about one node in the SLP graph, for use during
3854 vect_optimize_slp_pass. */
3856 struct slpg_vertex
3858 slpg_vertex (slp_tree node_) : node (node_) {}
3860 /* The node itself. */
3861 slp_tree node;
3863 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3864 partitions are flexible; they can have whichever layout consumers
3865 want them to have. */
3866 int partition = -1;
3868 /* The number of nodes that directly use the result of this one
3869 (i.e. the number of nodes that count this one as a child). */
3870 unsigned int out_degree = 0;
3872 /* The execution frequency of the node. */
3873 sreal weight = 0;
3875 /* The total execution frequency of all nodes that directly use the
3876 result of this one. */
3877 sreal out_weight = 0;
3880 /* Information about one partition of the SLP graph, for use during
3881 vect_optimize_slp_pass. */
3883 struct slpg_partition_info
3885 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3886 of m_partitioned_nodes. */
3887 unsigned int node_begin = 0;
3888 unsigned int node_end = 0;
3890 /* Which layout we've chosen to use for this partition, or -1 if
3891 we haven't picked one yet. */
3892 int layout = -1;
3894 /* The number of predecessors and successors in the partition dag.
3895 The predecessors always have lower partition numbers and the
3896 successors always have higher partition numbers.
3898 Note that the directions of these edges are not necessarily the
3899 same as in the data flow graph. For example, if an SCC has separate
3900 partitions for an inner loop and an outer loop, the inner loop's
3901 partition will have at least two incoming edges from the outer loop's
3902 partition: one for a live-in value and one for a live-out value.
3903 In data flow terms, one of these edges would also be from the outer loop
3904 to the inner loop, but the other would be in the opposite direction. */
3905 unsigned int in_degree = 0;
3906 unsigned int out_degree = 0;
3909 /* Information about the costs of using a particular layout for a
3910 particular partition. It can also say that the combination is
3911 impossible. */
3913 struct slpg_partition_layout_costs
3915 bool is_possible () const { return internal_cost.is_possible (); }
3916 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3918 /* The costs inherited from predecessor partitions. */
3919 slpg_layout_cost in_cost;
3921 /* The inherent cost of the layout within the node itself. For example,
3922 this is nonzero for a load if choosing a particular layout would require
3923 the load to permute the loaded elements. It is nonzero for a
3924 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3925 to full-vector moves. */
3926 slpg_layout_cost internal_cost;
3928 /* The costs inherited from successor partitions. */
3929 slpg_layout_cost out_cost;
3932 /* This class tries to optimize the layout of vectors in order to avoid
3933 unnecessary shuffling.  At the moment, the set of possible layouts is
3934 restricted to bijective permutations.
3936 The goal of the pass depends on whether we're optimizing for size or
3937 for speed. When optimizing for size, the goal is to reduce the overall
3938 number of layout changes (including layout changes implied by things
3939 like load permutations). When optimizing for speed, the goal is to
3940 reduce the maximum latency attributable to layout changes on any
3941 non-cyclical path through the data flow graph.
3943 For example, when optimizing a loop nest for speed, we will prefer
3944 to make layout changes outside of a loop rather than inside of a loop,
3945 and will prefer to make layout changes in parallel rather than serially,
3946 even if that increases the overall number of layout changes.
3948 The high-level procedure is:
3950 (1) Build a graph in which edges go from uses (parents) to definitions
3951 (children).
3953 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3955 (3) When optimizing for speed, partition the nodes in each SCC based
3956 on their containing cfg loop. When optimizing for size, treat
3957 each SCC as a single partition.
3959 This gives us a dag of partitions. The goal is now to assign a
3960 layout to each partition.
3962 (4) Construct a set of vector layouts that are worth considering.
3963 Record which nodes must keep their current layout.
3965 (5) Perform a forward walk over the partition dag (from loads to stores)
3966 accumulating the "forward" cost of using each layout. When visiting
3967 each partition, assign a tentative choice of layout to the partition
3968 and use that choice when calculating the cost of using a different
3969 layout in successor partitions.
3971 (6) Perform a backward walk over the partition dag (from stores to loads),
3972 accumulating the "backward" cost of using each layout. When visiting
3973 each partition, make a final choice of layout for that partition based
3974 on the accumulated forward costs (from (5)) and backward costs
3975 (from (6)).
3977 (7) Apply the chosen layouts to the SLP graph.
3979 For example, consider the SLP statements:
3981 S1: a_1 = load
3982 loop:
3983 S2: a_2 = PHI<a_1, a_3>
3984 S3: b_1 = load
3985 S4: a_3 = a_2 + b_1
3986 exit:
3987 S5: a_4 = PHI<a_3>
3988 S6: store a_4
3990 S2 and S4 form an SCC and are part of the same loop. Every other
3991 statement is in a singleton SCC. In this example there is a one-to-one
3992 mapping between SCCs and partitions and the partition dag looks like this:
3994           S1     S3
                \     /
3996             S2+S4
                   |
                  S5
                   |
                  S6
4002 S2, S3 and S4 will have a higher execution frequency than the other
4003 statements, so when optimizing for speed, the goal is to avoid any
4004 layout changes:
4006 - within S3
4007 - within S2+S4
4008 - on the S3->S2+S4 edge
4010 For example, if S3 was originally a reversing load, the goal of the
4011 pass is to make it an unreversed load and change the layout on the
4012 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4013 on S1->S2+S4 and S5->S6 would also be acceptable.)
4015 The difference between SCCs and partitions becomes important if we
4016 add an outer loop:
4018 S1: a_1 = ...
4019 loop1:
4020 S2: a_2 = PHI<a_1, a_6>
4021 S3: b_1 = load
4022 S4: a_3 = a_2 + b_1
4023 loop2:
4024 S5: a_4 = PHI<a_3, a_5>
4025 S6: c_1 = load
4026 S7: a_5 = a_4 + c_1
4027 exit2:
4028 S8: a_6 = PHI<a_5>
4029 S9: store a_6
4030 exit1:
4032 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4033 for speed, we usually do not want restrictions in the outer loop to "infect"
4034 the decision for the inner loop. For example, if an outer-loop node
4035 in the SCC contains a statement with a fixed layout, that should not
4036 prevent the inner loop from using a different layout. Conversely,
4037 the inner loop should not dictate a layout to the outer loop: if the
4038 outer loop does a lot of computation, then it may not be efficient to
4039 do all of that computation in the inner loop's preferred layout.
4041 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4042 and S5+S7 (inner). We also try to arrange partitions so that:
4044 - the partition for an outer loop comes before the partition for
4045 an inner loop
4047 - if a sibling loop A dominates a sibling loop B, A's partition
4048 comes before B's
4050 This gives the following partition dag for the example above:
4052           S1          S3
                |           |
4054         S2+S4+S8      S6
4055            |    \\    /
4056            |    S5+S7
                |
               S9
4060 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4061 one for a reversal of the edge S7->S8.
4063 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4064 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4065 preferred layout against the cost of changing the layout on entry to the
4066 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4068 Although this works well when optimizing for speed, it has the downside
4069 when optimizing for size that the choice of layout for S5+S7 is completely
4070 independent of S9, which lessens the chance of reducing the overall number
4071 of permutations. We therefore do not partition SCCs when optimizing
4072 for size.
4074 To give a concrete example of the difference between optimizing
4075 for size and speed, consider:
4077 a[0] = (b[1] << c[3]) - d[1];
4078 a[1] = (b[0] << c[2]) - d[0];
4079 a[2] = (b[3] << c[1]) - d[3];
4080 a[3] = (b[2] << c[0]) - d[2];
4082 There are three different layouts here: one for a, one for b and d,
4083 and one for c. When optimizing for speed it is better to permute each
4084 of b, c and d into the order required by a, since those permutations
4085 happen in parallel. But when optimizing for size, it is better to:
4087 - permute c into the same order as b
4088 - do the arithmetic
4089 - permute the result into the order required by a
4091 This gives 2 permutations rather than 3. */
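/* To put illustrative numbers on the trade-off above (ignoring weights):
   with the speed strategy the three permutations of b, c and d happen in
   parallel, giving a layout cost of roughly { depth = 1, total = 3 },
   whereas with the size strategy the permutation of c and the permutation
   of the result are serial, giving roughly { depth = 2, total = 2 }.
   slpg_layout_cost::is_better_than prefers the former when optimizing for
   speed and the latter when optimizing for size.  */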
4093 class vect_optimize_slp_pass
4095 public:
4096 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4097 void run ();
4099 private:
4100 /* Graph building. */
4101 struct loop *containing_loop (slp_tree);
4102 bool is_cfg_latch_edge (graph_edge *);
4103 void build_vertices (hash_set<slp_tree> &, slp_tree);
4104 void build_vertices ();
4105 void build_graph ();
4107 /* Partitioning. */
4108 void create_partitions ();
4109 template<typename T> void for_each_partition_edge (unsigned int, T);
4111 /* Layout selection. */
4112 bool is_compatible_layout (slp_tree, unsigned int);
4113 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4114 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4115 unsigned int);
4116 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4117 int, unsigned int);
4118 int internal_node_cost (slp_tree, int, unsigned int);
4119 void start_choosing_layouts ();
4121 /* Cost propagation. */
4122 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4123 unsigned int, unsigned int);
4124 slpg_layout_cost total_in_cost (unsigned int);
4125 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4126 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4127 void forward_pass ();
4128 void backward_pass ();
4130 /* Rematerialization. */
4131 slp_tree get_result_with_layout (slp_tree, unsigned int);
4132 void materialize ();
4134 /* Clean-up. */
4135 void remove_redundant_permutations ();
4137 void dump ();
4139 vec_info *m_vinfo;
4141 /* True if we should optimize the graph for size, false if we should
4142 optimize it for speed. (It wouldn't be easy to make this decision
4143 more locally.) */
4144 bool m_optimize_size;
4146 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4147 In other words, a node's predecessors are its slp_tree parents and
4148 a node's successors are its slp_tree children. */
4149 graph *m_slpg = nullptr;
4151 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4152 auto_vec<slpg_vertex> m_vertices;
4154 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4155 and loads. */
4156 auto_vec<int> m_leafs;
4158 /* This array has one entry for every vector layout that we're considering.
4159 Element 0 is null and indicates "no change". Other entries describe
4160 permutations that are inherent in the current graph and that we would
4161 like to reverse if possible.
4163 For example, a permutation { 1, 2, 3, 0 } means that something has
4164 effectively been permuted in that way, such as a load group
4165 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4166 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4167 in order to put things "back" in order. */
4168 auto_vec<vec<unsigned> > m_perms;
4170 /* A partitioning of the nodes for which a layout must be chosen.
4171 Each partition represents an <SCC, cfg loop> pair; that is,
4172 nodes in different SCCs belong to different partitions, and nodes
4173 within an SCC can be further partitioned according to a containing
4174 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4176 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4177 from leaves (such as loads) to roots (such as stores).
4179 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4180 auto_vec<slpg_partition_info> m_partitions;
4182 /* The list of all nodes for which a layout must be chosen. Nodes for
4183 partition P come before the nodes for partition P+1. Nodes within a
4184 partition are in reverse postorder. */
4185 auto_vec<unsigned int> m_partitioned_nodes;
4187 /* Index P * num-layouts + L contains the cost of using layout L
4188 for partition P. */
4189 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4191 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4192 original output of node N adjusted to have layout L. */
4193 auto_vec<slp_tree> m_node_layouts;
4196 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4197 Also record whether we should optimize anything for speed rather
4198 than size. */
4200 void
4201 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4202 slp_tree node)
4204 unsigned i;
4205 slp_tree child;
4207 if (visited.add (node))
4208 return;
4210 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4212 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4213 if (optimize_bb_for_speed_p (bb))
4214 m_optimize_size = false;
4217 node->vertex = m_vertices.length ();
4218 m_vertices.safe_push (slpg_vertex (node));
4220 bool leaf = true;
4221 bool force_leaf = false;
4222 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4223 if (child)
4225 leaf = false;
4226 build_vertices (visited, child);
4228 else
4229 force_leaf = true;
4230 /* Since SLP discovery works along use-def edges, all cycles have an
4231 entry - but there's the exception of cycles where we do not handle
4232 the entry explicitly (but with a NULL SLP node), like some reductions
4233 and inductions. Force those SLP PHIs to act as leafs to make them
4234 backwards reachable. */
4235 if (leaf || force_leaf)
4236 m_leafs.safe_push (node->vertex);
4239 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4241 void
4242 vect_optimize_slp_pass::build_vertices ()
4244 hash_set<slp_tree> visited;
4245 unsigned i;
4246 slp_instance instance;
4247 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4248 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4251 /* Apply (reverse) bijective PERM to VEC. */
4253 template <class T>
4254 static void
4255 vect_slp_permute (vec<unsigned> perm,
4256 vec<T> &vec, bool reverse)
4258 auto_vec<T, 64> saved;
4259 saved.create (vec.length ());
4260 for (unsigned i = 0; i < vec.length (); ++i)
4261 saved.quick_push (vec[i]);
4263 if (reverse)
4265 for (unsigned i = 0; i < vec.length (); ++i)
4266 vec[perm[i]] = saved[i];
4267 for (unsigned i = 0; i < vec.length (); ++i)
4268 gcc_assert (vec[perm[i]] == saved[i]);
4270 else
4272 for (unsigned i = 0; i < vec.length (); ++i)
4273 vec[i] = saved[perm[i]];
4274 for (unsigned i = 0; i < vec.length (); ++i)
4275 gcc_assert (vec[i] == saved[perm[i]]);
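/* Illustrative example: with PERM = { 1, 2, 3, 0 } and
   VEC = { a, b, c, d }, the forward application (REVERSE == false)
   produces { b, c, d, a }, while the reverse application (REVERSE == true)
   produces { d, a, b, c }, i.e. it undoes the forward application.  */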
4279 /* Return the cfg loop that contains NODE. */
4281 struct loop *
4282 vect_optimize_slp_pass::containing_loop (slp_tree node)
4284 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4285 if (!rep)
4286 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4287 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4290 /* Return true if UD (an edge from a use to a definition) is associated
4291 with a loop latch edge in the cfg. */
4293 bool
4294 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4296 slp_tree use = m_vertices[ud->src].node;
4297 slp_tree def = m_vertices[ud->dest].node;
4298 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4299 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4300 return false;
4302 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4303 return (is_a<gphi *> (use_rep->stmt)
4304 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4305 && containing_loop (def) == containing_loop (use));
4308 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4309 a nonnull data field. */
4311 void
4312 vect_optimize_slp_pass::build_graph ()
4314 m_optimize_size = true;
4315 build_vertices ();
4317 m_slpg = new_graph (m_vertices.length ());
4318 for (slpg_vertex &v : m_vertices)
4319 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4320 if (child)
4322 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4323 if (is_cfg_latch_edge (ud))
4324 ud->data = this;
4328 /* Return true if E corresponds to a loop latch edge in the cfg. */
4330 static bool
4331 skip_cfg_latch_edges (graph_edge *e)
4333 return e->data;
4336 /* Create the node partitions. */
4338 void
4339 vect_optimize_slp_pass::create_partitions ()
4341 /* Calculate a postorder of the graph, ignoring edges that correspond
4342 to natural latch edges in the cfg. Reading the vector from the end
4343 to the beginning gives the reverse postorder. */
4344 auto_vec<int> initial_rpo;
4345 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4346 false, NULL, skip_cfg_latch_edges);
4347 gcc_assert (initial_rpo.length () == m_vertices.length ());
4349 /* Calculate the strongly connected components of the graph. */
4350 auto_vec<int> scc_grouping;
4351 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4353 /* Create a new index order in which all nodes from the same SCC are
4354 consecutive. Use scc_pos to record the index of the first node in
4355 each SCC. */
4356 auto_vec<unsigned int> scc_pos (num_sccs);
4357 int last_component = -1;
4358 unsigned int node_count = 0;
4359 for (unsigned int node_i : scc_grouping)
4361 if (last_component != m_slpg->vertices[node_i].component)
4363 last_component = m_slpg->vertices[node_i].component;
4364 gcc_assert (last_component == int (scc_pos.length ()));
4365 scc_pos.quick_push (node_count);
4367 node_count += 1;
4369 gcc_assert (node_count == initial_rpo.length ()
4370 && last_component + 1 == int (num_sccs));
4372 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4373 inside each SCC following the RPO we calculated above. The fact that
4374 we ignored natural latch edges when calculating the RPO should ensure
4375 that, for natural loop nests:
4377 - the first node that we encounter in a cfg loop is the loop header phi
4378 - the loop header phis are in dominance order
4380 Arranging for this is an optimization (see below) rather than a
4381 correctness issue. Unnatural loops with a tangled mess of backedges
4382 will still work correctly, but might give poorer results.
4384 Also update scc_pos so that it gives 1 + the index of the last node
4385 in the SCC. */
4386 m_partitioned_nodes.safe_grow (node_count);
4387 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4389 unsigned int node_i = initial_rpo[old_i];
4390 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4391 m_partitioned_nodes[new_i] = node_i;
4394 /* When optimizing for speed, partition each SCC based on the containing
4395 cfg loop. The order we constructed above should ensure that, for natural
4396 cfg loops, we'll create sub-SCC partitions for outer loops before
4397 the corresponding sub-SCC partitions for inner loops. Similarly,
4398 when one sibling loop A dominates another sibling loop B, we should
4399 create a sub-SCC partition for A before a sub-SCC partition for B.
4401 As above, nothing depends for correctness on whether this achieves
4402 a natural nesting, but we should get better results when it does. */
4403 m_partitions.reserve (m_vertices.length ());
4404 unsigned int next_partition_i = 0;
4405 hash_map<struct loop *, int> loop_partitions;
4406 unsigned int rpo_begin = 0;
4407 unsigned int num_partitioned_nodes = 0;
4408 for (unsigned int rpo_end : scc_pos)
4410 loop_partitions.empty ();
4411 unsigned int partition_i = next_partition_i;
4412 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4414 /* Handle externals and constants optimistically throughout.
4415 But treat existing vectors as fixed since we do not handle
4416 permuting them. */
4417 unsigned int node_i = m_partitioned_nodes[rpo_i];
4418 auto &vertex = m_vertices[node_i];
4419 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4420 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4421 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4422 vertex.partition = -1;
4423 else
4425 bool existed;
4426 if (m_optimize_size)
4427 existed = next_partition_i > partition_i;
4428 else
4430 struct loop *loop = containing_loop (vertex.node);
4431 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4432 if (!existed)
4433 entry = next_partition_i;
4434 partition_i = entry;
4436 if (!existed)
4438 m_partitions.quick_push (slpg_partition_info ());
4439 next_partition_i += 1;
4441 vertex.partition = partition_i;
4442 num_partitioned_nodes += 1;
4443 m_partitions[partition_i].node_end += 1;
4446 rpo_begin = rpo_end;
4449 /* Assign ranges of consecutive node indices to each partition,
4450 in partition order. Start with node_end being the same as
4451 node_begin so that the next loop can use it as a counter. */
4452 unsigned int node_begin = 0;
4453 for (auto &partition : m_partitions)
4455 partition.node_begin = node_begin;
4456 node_begin += partition.node_end;
4457 partition.node_end = partition.node_begin;
4459 gcc_assert (node_begin == num_partitioned_nodes);
4461 /* Finally build the list of nodes in partition order. */
4462 m_partitioned_nodes.truncate (num_partitioned_nodes);
4463 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4465 int partition_i = m_vertices[node_i].partition;
4466 if (partition_i >= 0)
4468 unsigned int order_i = m_partitions[partition_i].node_end++;
4469 m_partitioned_nodes[order_i] = node_i;
4474 /* Look for edges from earlier partitions into node NODE_I and edges from
4475 node NODE_I into later partitions. Call:
4477 FN (ud, other_node_i)
4479 for each such use-to-def edge ud, where other_node_i is the node at the
4480 other end of the edge. */
4482 template<typename T>
4483 void
4484 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4486 int partition_i = m_vertices[node_i].partition;
4487 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4488 pred; pred = pred->pred_next)
4490 int src_partition_i = m_vertices[pred->src].partition;
4491 if (src_partition_i >= 0 && src_partition_i != partition_i)
4492 fn (pred, pred->src);
4494 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4495 succ; succ = succ->succ_next)
4497 int dest_partition_i = m_vertices[succ->dest].partition;
4498 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4499 fn (succ, succ->dest);
4503 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4504 that NODE would operate on. This test is independent of NODE's actual
4505 operation. */
4507 bool
4508 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4509 unsigned int layout_i)
4511 if (layout_i == 0)
4512 return true;
4514 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4515 return false;
4517 return true;
4520 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4521 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4522 layouts is incompatible with NODE or if the change is not possible for
4523 some other reason.
4525 The properties taken from NODE include the number of lanes and the
4526 vector type. The actual operation doesn't matter. */
4528 int
4529 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4530 unsigned int from_layout_i,
4531 unsigned int to_layout_i)
4533 if (!is_compatible_layout (node, from_layout_i)
4534 || !is_compatible_layout (node, to_layout_i))
4535 return -1;
4537 if (from_layout_i == to_layout_i)
4538 return 0;
4540 auto_vec<slp_tree, 1> children (1);
4541 children.quick_push (node);
4542 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4543 if (from_layout_i > 0)
4544 for (unsigned int i : m_perms[from_layout_i])
4545 perm.quick_push ({ 0, i });
4546 else
4547 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4548 perm.quick_push ({ 0, i });
4549 if (to_layout_i > 0)
4550 vect_slp_permute (m_perms[to_layout_i], perm, true);
4551 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4552 children, false);
4553 if (count >= 0)
4554 return MAX (count, 1);
4556 /* ??? In principle we could try changing via layout 0, giving two
4557 layout changes rather than 1. Doing that would require
4558 corresponding support in get_result_with_layout. */
4559 return -1;
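/* A hypothetical sketch of the mechanism above: if m_perms[1] were
   { 1, 0, 3, 2 }, the cost of going from layout 1 to layout 0 on a
   4-lane node would be computed by building the lane permutation
   { {0,1}, {0,0}, {0,3}, {0,2} } over a single child and asking
   vectorizable_slp_permutation_1 how many permutation instructions that
   needs; a supported result is clamped to at least 1, while an
   unsupported one yields -1.  */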
4562 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4564 inline slpg_partition_layout_costs &
4565 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4566 unsigned int layout_i)
4568 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4571 /* Change PERM in one of two ways:
4573 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4574 chosen for child I of NODE.
4576 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4578 In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
4580 void
4581 vect_optimize_slp_pass::
4582 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4583 int in_layout_i, unsigned int out_layout_i)
4585 for (auto &entry : perm)
4587 int this_in_layout_i = in_layout_i;
4588 if (this_in_layout_i < 0)
4590 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4591 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4592 this_in_layout_i = m_partitions[in_partition_i].layout;
4594 if (this_in_layout_i > 0)
4595 entry.second = m_perms[this_in_layout_i][entry.second];
4597 if (out_layout_i > 0)
4598 vect_slp_permute (m_perms[out_layout_i], perm, true);
4601 /* Check whether the target allows NODE to be rearranged so that the node's
4602 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4603 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4605 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4606 NODE can adapt to the layout changes that have (perhaps provisionally)
4607 been chosen for NODE's children, so that no extra permutations are
4608 needed on either the input or the output of NODE.
4610 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4611 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4613 IN_LAYOUT_I has no meaning for other types of node.
4615 Keeping the node as-is is always valid. If the target doesn't appear
4616 to support the node as-is, but might realistically support other layouts,
4617 then layout 0 instead has the cost of a worst-case permutation. On the
4618 one hand, this ensures that every node has at least one valid layout,
4619 avoiding what would otherwise be an awkward special case. On the other,
4620 it still encourages the pass to change an invalid pre-existing layout
4621 choice into a valid one. */
4623 int
4624 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4625 unsigned int out_layout_i)
4627 const int fallback_cost = 1;
4629 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4631 auto_lane_permutation_t tmp_perm;
4632 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4634 /* Check that the child nodes support the chosen layout. Checking
4635 the first child is enough, since any second child would have the
4636 same shape. */
4637 auto first_child = SLP_TREE_CHILDREN (node)[0];
4638 if (in_layout_i > 0
4639 && !is_compatible_layout (first_child, in_layout_i))
4640 return -1;
4642 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4643 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4644 node, tmp_perm,
4645 SLP_TREE_CHILDREN (node),
4646 false);
4647 if (count < 0)
4649 if (in_layout_i == 0 && out_layout_i == 0)
4651 /* Use the fallback cost if the node could in principle support
4652 some nonzero layout for both the inputs and the outputs.
4653 Otherwise assume that the node will be rejected later
4654 and rebuilt from scalars. */
4655 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4656 return fallback_cost;
4657 return 0;
4659 return -1;
4662 /* We currently have no way of telling whether the new layout is cheaper
4663 or more expensive than the old one. But at least in principle,
4664 it should be worth making zero permutations (whole-vector shuffles)
4665 cheaper than real permutations, in case the pass is able to remove
4666 the latter. */
4667 return count == 0 ? 0 : 1;
4670 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4671 if (rep
4672 && STMT_VINFO_DATA_REF (rep)
4673 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4674 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4676 auto_load_permutation_t tmp_perm;
4677 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4678 if (out_layout_i > 0)
4679 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4681 poly_uint64 vf = 1;
4682 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4683 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4684 unsigned int n_perms;
4685 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4686 nullptr, vf, true, false, &n_perms))
4688 auto rep = SLP_TREE_REPRESENTATIVE (node);
4689 if (out_layout_i == 0)
4691 /* Use the fallback cost if the load is an N-to-N permutation.
4692 Otherwise assume that the node will be rejected later
4693 and rebuilt from scalars. */
4694 if (STMT_VINFO_GROUPED_ACCESS (rep)
4695 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4696 == SLP_TREE_LANES (node)))
4697 return fallback_cost;
4698 return 0;
4700 return -1;
4703 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4704 return n_perms == 0 ? 0 : 1;
4707 return 0;
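/* A hypothetical illustration of the load case above: for a grouped load
   whose SLP_TREE_LOAD_PERMUTATION is { 1, 0, 3, 2 } and whose group size
   equals the number of lanes, an output layout whose m_perms entry is also
   { 1, 0, 3, 2 } reverses the load permutation back to { 0, 1, 2, 3 }; if
   vect_transform_slp_perm_load_1 then reports that no permutations are
   needed, that layout costs 0, whereas layout 0 keeps the real permutation
   and costs 1.  */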
4710 /* Decide which element layouts we should consider using. Calculate the
4711 weights associated with inserting layout changes on partition edges.
4712 Also mark partitions that cannot change layout, by setting their
4713 layout to zero. */
4715 void
4716 vect_optimize_slp_pass::start_choosing_layouts ()
4718 /* Used to assign unique permutation indices. */
4719 using perm_hash = unbounded_hashmap_traits<
4720 vec_free_hash_base<int_hash_base<unsigned>>,
4721 int_hash<int, -1, -2>
4723 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4725 /* Layout 0 is "no change". */
4726 m_perms.safe_push (vNULL);
4728 /* Create layouts from existing permutations. */
4729 auto_load_permutation_t tmp_perm;
4730 for (unsigned int node_i : m_partitioned_nodes)
4732 /* Leafs also double as entries to the reverse graph. Allow the
4733 layout of those to be changed. */
4734 auto &vertex = m_vertices[node_i];
4735 auto &partition = m_partitions[vertex.partition];
4736 if (!m_slpg->vertices[node_i].succ)
4737 partition.layout = 0;
4739 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4740 slp_tree node = vertex.node;
4741 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4742 slp_tree child;
4743 unsigned HOST_WIDE_INT imin, imax = 0;
4744 bool any_permute = false;
4745 tmp_perm.truncate (0);
4746 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4748 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4749 unpermuted, record a layout that reverses this permutation.
4751 We would need more work to cope with loads that are internally
4752 permuted and also have inputs (such as masks for
4753 IFN_MASK_LOADs). */
4754 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4755 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4757 partition.layout = -1;
4758 continue;
4760 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4761 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4762 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4764 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4765 && SLP_TREE_CHILDREN (node).length () == 1
4766 && (child = SLP_TREE_CHILDREN (node)[0])
4767 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4768 .is_constant (&imin)))
4770 /* If the child has the same vector size as this node,
4771 reversing the permutation can make the permutation a no-op.
4772 In other cases it can change a true permutation into a
4773 full-vector extract. */
4774 tmp_perm.reserve (SLP_TREE_LANES (node));
4775 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4776 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4778 else
4779 continue;
4781 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4783 unsigned idx = tmp_perm[j];
4784 imin = MIN (imin, idx);
4785 imax = MAX (imax, idx);
4786 if (idx - tmp_perm[0] != j)
4787 any_permute = true;
4789 /* If the span doesn't match we'd disrupt VF computation, avoid
4790 that for now. */
4791 if (imax - imin + 1 != SLP_TREE_LANES (node))
4792 continue;
4793 /* If there's no permute, there's no need to split one out.  In this case
4794 we can consider turning a load into a permuted load, if that
4795 turns out to be cheaper than alternatives. */
4796 if (!any_permute)
4798 partition.layout = -1;
4799 continue;
4802 /* For now only handle true permutes, like
4803 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4804 when permuting constants and invariants, keeping the permute
4805 bijective. */
4806 auto_sbitmap load_index (SLP_TREE_LANES (node));
4807 bitmap_clear (load_index);
4808 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4809 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4810 unsigned j;
4811 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4812 if (!bitmap_bit_p (load_index, j))
4813 break;
4814 if (j != SLP_TREE_LANES (node))
4815 continue;
4817 vec<unsigned> perm = vNULL;
4818 perm.safe_grow (SLP_TREE_LANES (node), true);
4819 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4820 perm[j] = tmp_perm[j] - imin;
4822 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4824 /* Continue to use existing layouts, but don't add any more. */
4825 int *entry = layout_ids.get (perm);
4826 partition.layout = entry ? *entry : 0;
4827 perm.release ();
4829 else
4831 bool existed;
4832 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4833 if (existed)
4834 perm.release ();
4835 else
4837 layout_i = m_perms.length ();
4838 m_perms.safe_push (perm);
4840 partition.layout = layout_i;
4844 /* Initially assume that every layout is possible and has zero cost
4845 in every partition. */
4846 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4847 * m_perms.length ());
4849 /* We have to mark outgoing permutations facing non-associating-reduction
4850 graph entries that are not represented as to be materialized.
4851 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4852 for (slp_instance instance : m_vinfo->slp_instances)
4853 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4855 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4856 m_partitions[m_vertices[node_i].partition].layout = 0;
4858 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4860 stmt_vec_info stmt_info
4861 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4862 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4863 if (needs_fold_left_reduction_p (TREE_TYPE
4864 (gimple_get_lhs (stmt_info->stmt)),
4865 STMT_VINFO_REDUC_CODE (reduc_info)))
4867 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4868 m_partitions[m_vertices[node_i].partition].layout = 0;
4872 /* Check which layouts each node and partition can handle. Calculate the
4873 weights associated with inserting layout changes on edges. */
4874 for (unsigned int node_i : m_partitioned_nodes)
4876 auto &vertex = m_vertices[node_i];
4877 auto &partition = m_partitions[vertex.partition];
4878 slp_tree node = vertex.node;
4880 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4882 vertex.weight = vect_slp_node_weight (node);
4884 /* We do not handle stores with a permutation, so all
4885 incoming permutations must have been materialized.
4887 We also don't handle masked grouped loads, which lack a
4888 permutation vector. In this case the memory locations
4889 form an implicit second input to the loads, on top of the
4890 explicit mask input, and the memory input's layout cannot
4891 be changed.
4893 On the other hand, we do support permuting gather loads and
4894 masked gather loads, where each scalar load is independent
4895 of the others. This can be useful if the address/index input
4896 benefits from permutation. */
4897 if (STMT_VINFO_DATA_REF (rep)
4898 && STMT_VINFO_GROUPED_ACCESS (rep)
4899 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4900 partition.layout = 0;
4902 /* We cannot change the layout of an operation that is
4903 not lane-independent.  Note this is an explicit
4904 negative list since that's much shorter than the respective
4905 positive one but it's critical to keep maintaining it. */
4906 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4907 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4909 case CFN_COMPLEX_ADD_ROT90:
4910 case CFN_COMPLEX_ADD_ROT270:
4911 case CFN_COMPLEX_MUL:
4912 case CFN_COMPLEX_MUL_CONJ:
4913 case CFN_VEC_ADDSUB:
4914 case CFN_VEC_FMADDSUB:
4915 case CFN_VEC_FMSUBADD:
4916 partition.layout = 0;
4917 default:;
4921 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4923 auto &other_vertex = m_vertices[other_node_i];
4925 /* Count the number of edges from earlier partitions and the number
4926 of edges to later partitions. */
4927 if (other_vertex.partition < vertex.partition)
4928 partition.in_degree += 1;
4929 else
4930 partition.out_degree += 1;
4932 /* If the current node uses the result of OTHER_NODE_I, accumulate
4933 the effects of that. */
4934 if (ud->src == int (node_i))
4936 other_vertex.out_weight += vertex.weight;
4937 other_vertex.out_degree += 1;
4940 for_each_partition_edge (node_i, process_edge);
4944 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4945 its current (provisional) choice of layout. The inputs do not necessarily
4946 have the same layout as each other. */
4948 slpg_layout_cost
4949 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4951 auto &vertex = m_vertices[node_i];
4952 slpg_layout_cost cost;
4953 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4955 auto &other_vertex = m_vertices[other_node_i];
4956 if (other_vertex.partition < vertex.partition)
4958 auto &other_partition = m_partitions[other_vertex.partition];
4959 auto &other_costs = partition_layout_costs (other_vertex.partition,
4960 other_partition.layout);
4961 slpg_layout_cost this_cost = other_costs.in_cost;
4962 this_cost.add_serial_cost (other_costs.internal_cost);
4963 this_cost.split (other_partition.out_degree);
4964 cost.add_parallel_cost (this_cost);
4967 for_each_partition_edge (node_i, add_cost);
4968 return cost;
4971 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4972 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4973 slpg_layout_cost::impossible () if the change isn't possible. */
4975 slpg_layout_cost
4976 vect_optimize_slp_pass::
4977 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4978 unsigned int layout2_i)
4980 auto &def_vertex = m_vertices[ud->dest];
4981 auto &use_vertex = m_vertices[ud->src];
4982 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4983 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4984 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4985 use_layout_i);
4986 if (factor < 0)
4987 return slpg_layout_cost::impossible ();
4989 /* We have a choice of putting the layout change at the site of the
4990 definition or at the site of the use. Prefer the former when
4991 optimizing for size or when the execution frequency of the
4992 definition is no greater than the combined execution frequencies of
4993 the uses. When putting the layout change at the site of the definition,
4994 divvy up the cost among all consumers. */
4995 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4997 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4998 cost.split (def_vertex.out_degree);
4999 return cost;
5001 return { use_vertex.weight * factor, m_optimize_size };
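/* For instance (numbers purely illustrative): if the definition has
   weight 1, out_degree 2 and out_weight 20, and the change factor is 1,
   the layout change is placed at the definition (1 <= 20) and its cost is
   split between the two consumers; if instead the definition sat in a hot
   loop with weight 100 and its single use had weight 1, placing the change
   at the use would be cheaper, so the cost returned is based on the use's
   weight.  */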
5004 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5005 partition; FROM_NODE_I could be the definition node or the use node.
5006 The node at the other end of the link wants to use layout TO_LAYOUT_I.
5007 Return the cost of any necessary fix-ups on edge UD, or return
5008 slpg_layout_cost::impossible () if the change isn't possible.
5010 At this point, FROM_NODE_I's partition has chosen the cheapest
5011 layout based on the information available so far, but this choice
5012 is only provisional. */
5014 slpg_layout_cost
5015 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5016 unsigned int to_layout_i)
5018 auto &from_vertex = m_vertices[from_node_i];
5019 unsigned int from_partition_i = from_vertex.partition;
5020 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5021 gcc_assert (from_partition.layout >= 0);
5023 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5024 with its current layout preference. */
5025 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5026 auto edge_cost = edge_layout_cost (ud, from_node_i,
5027 from_partition.layout, to_layout_i);
5028 if (edge_cost.is_possible ())
5030 auto &from_costs = partition_layout_costs (from_partition_i,
5031 from_partition.layout);
5032 cost = from_costs.in_cost;
5033 cost.add_serial_cost (from_costs.internal_cost);
5034 cost.split (from_partition.out_degree);
5035 cost.add_serial_cost (edge_cost);
5038 /* Take the minimum of that cost and the cost that applies if
5039 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5040 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5041 to_layout_i);
5042 if (direct_layout_costs.is_possible ())
5044 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5045 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5046 direct_cost.split (from_partition.out_degree);
5047 if (!cost.is_possible ()
5048 || direct_cost.is_better_than (cost, m_optimize_size))
5049 cost = direct_cost;
5052 return cost;
5055 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5056 partition; TO_NODE_I could be the definition node or the use node.
5057 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5058 return the cost of any necessary fix-ups on edge UD, or
5059 slpg_layout_cost::impossible () if the choice cannot be made.
5061 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5063 slpg_layout_cost
5064 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5065 unsigned int from_layout_i)
5067 auto &to_vertex = m_vertices[to_node_i];
5068 unsigned int to_partition_i = to_vertex.partition;
5069 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5070 gcc_assert (to_partition.layout >= 0);
5072 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5073 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5074 any other inputs keep their current choice of layout. */
5075 auto &to_costs = partition_layout_costs (to_partition_i,
5076 to_partition.layout);
5077 if (ud->src == int (to_node_i)
5078 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5080 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5081 auto old_layout = from_partition.layout;
5082 from_partition.layout = from_layout_i;
5083 int factor = internal_node_cost (to_vertex.node, -1,
5084 to_partition.layout);
5085 from_partition.layout = old_layout;
5086 if (factor >= 0)
5088 slpg_layout_cost cost = to_costs.out_cost;
5089 cost.add_serial_cost ({ to_vertex.weight * factor,
5090 m_optimize_size });
5091 cost.split (to_partition.in_degree);
5092 return cost;
5096 /* Compute the cost if we insert any necessary layout change on edge UD. */
5097 auto edge_cost = edge_layout_cost (ud, to_node_i,
5098 to_partition.layout, from_layout_i);
5099 if (edge_cost.is_possible ())
5101 slpg_layout_cost cost = to_costs.out_cost;
5102 cost.add_serial_cost (to_costs.internal_cost);
5103 cost.split (to_partition.in_degree);
5104 cost.add_serial_cost (edge_cost);
5105 return cost;
5108 return slpg_layout_cost::impossible ();
5111 /* Make a forward pass through the partitions, accumulating input costs.
5112 Make a tentative (provisional) choice of layout for each partition,
5113 ensuring that this choice still allows later partitions to keep
5114 their original layout. */
5116 void
5117 vect_optimize_slp_pass::forward_pass ()
5119 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5120 ++partition_i)
5122 auto &partition = m_partitions[partition_i];
5124 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5125 the incoming cost that would apply if every predecessor partition
5126 keeps its current layout. This is used within the loop below. */
5127 slpg_layout_cost in_cost;
5128 slp_tree single_node = nullptr;
5129 if (partition.node_end == partition.node_begin + 1)
5131 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5132 single_node = m_vertices[node_i].node;
5133 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5134 in_cost = total_in_cost (node_i);
5137 /* Go through the possible layouts. Decide which ones are valid
5138 for this partition and record which of the valid layouts has
5139 the lowest cost. */
5140 unsigned int min_layout_i = 0;
5141 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5142 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5144 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5145 if (!layout_costs.is_possible ())
5146 continue;
5148 /* If the recorded layout is already 0 then the layout cannot
5149 change. */
5150 if (partition.layout == 0 && layout_i != 0)
5152 layout_costs.mark_impossible ();
5153 continue;
5156 bool is_possible = true;
5157 for (unsigned int order_i = partition.node_begin;
5158 order_i < partition.node_end; ++order_i)
5160 unsigned int node_i = m_partitioned_nodes[order_i];
5161 auto &vertex = m_vertices[node_i];
5163 /* Reject the layout if it is individually incompatible
5164 with any node in the partition. */
5165 if (!is_compatible_layout (vertex.node, layout_i))
5167 is_possible = false;
5168 break;
5171 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5173 auto &other_vertex = m_vertices[other_node_i];
5174 if (other_vertex.partition < vertex.partition)
5176 /* Accumulate the incoming costs from earlier
5177 partitions, plus the cost of any layout changes
5178 on UD itself. */
5179 auto cost = forward_cost (ud, other_node_i, layout_i);
5180 if (!cost.is_possible ())
5181 is_possible = false;
5182 else
5183 layout_costs.in_cost.add_parallel_cost (cost);
5185 else
5186 /* Reject the layout if it would make layout 0 impossible
5187 for later partitions. This amounts to testing that the
5188 target supports reversing the layout change on edges
5189 to later partitions.
5191 In principle, it might be possible to push a layout
5192 change all the way down a graph, so that it never
5193 needs to be reversed and so that the target doesn't
5194 need to support the reverse operation. But it would
5195 be awkward to bail out if we hit a partition that
5196 does not support the new layout, especially since
5197 we are not dealing with a lattice. */
5198 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5199 layout_i).is_possible ();
5201 for_each_partition_edge (node_i, add_cost);
5203 /* Accumulate the cost of using LAYOUT_I within NODE,
5204 both for the inputs and the outputs. */
5205 int factor = internal_node_cost (vertex.node, layout_i,
5206 layout_i);
5207 if (factor < 0)
5209 is_possible = false;
5210 break;
5212 else if (factor)
5213 layout_costs.internal_cost.add_serial_cost
5214 ({ vertex.weight * factor, m_optimize_size });
5216 if (!is_possible)
5218 layout_costs.mark_impossible ();
5219 continue;
5222 /* Combine the incoming and partition-internal costs. */
5223 slpg_layout_cost combined_cost = layout_costs.in_cost;
5224 combined_cost.add_serial_cost (layout_costs.internal_cost);
5226 /* If this partition consists of a single VEC_PERM_EXPR, see
5227 if the VEC_PERM_EXPR can be changed to support output layout
5228 LAYOUT_I while keeping all the provisional choices of input
5229 layout. */
5230 if (single_node
5231 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5233 int factor = internal_node_cost (single_node, -1, layout_i);
5234 if (factor >= 0)
5236 auto weight = m_vertices[single_node->vertex].weight;
5237 slpg_layout_cost internal_cost
5238 = { weight * factor, m_optimize_size };
5240 slpg_layout_cost alt_cost = in_cost;
5241 alt_cost.add_serial_cost (internal_cost);
5242 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5244 combined_cost = alt_cost;
5245 layout_costs.in_cost = in_cost;
5246 layout_costs.internal_cost = internal_cost;
5251 /* Record the layout with the lowest cost. Prefer layout 0 in
5252 the event of a tie between it and another layout. */
5253 if (!min_layout_cost.is_possible ()
5254 || combined_cost.is_better_than (min_layout_cost,
5255 m_optimize_size))
5257 min_layout_i = layout_i;
5258 min_layout_cost = combined_cost;
5262 /* This loop's handling of earlier partitions should ensure that
5263 choosing the original layout for the current partition is no
5264 less valid than it was in the original graph, even with the
5265 provisional layout choices for those earlier partitions. */
5266 gcc_assert (min_layout_cost.is_possible ());
5267 partition.layout = min_layout_i;
5271 /* Make a backward pass through the partitions, accumulating output costs.
5272 Make a final choice of layout for each partition. */
5274 void
5275 vect_optimize_slp_pass::backward_pass ()
5277 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5279 auto &partition = m_partitions[partition_i];
5281 unsigned int min_layout_i = 0;
5282 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5283 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5285 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5286 if (!layout_costs.is_possible ())
5287 continue;
5289 /* Accumulate the costs from successor partitions. */
5290 bool is_possible = true;
5291 for (unsigned int order_i = partition.node_begin;
5292 order_i < partition.node_end; ++order_i)
5294 unsigned int node_i = m_partitioned_nodes[order_i];
5295 auto &vertex = m_vertices[node_i];
5296 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5298 auto &other_vertex = m_vertices[other_node_i];
5299 auto &other_partition = m_partitions[other_vertex.partition];
5300 if (other_vertex.partition > vertex.partition)
5302 /* Accumulate the incoming costs from later
5303 partitions, plus the cost of any layout changes
5304 on UD itself. */
5305 auto cost = backward_cost (ud, other_node_i, layout_i);
5306 if (!cost.is_possible ())
5307 is_possible = false;
5308 else
5309 layout_costs.out_cost.add_parallel_cost (cost);
5311 else
5312 /* Make sure that earlier partitions can (if necessary
5313 or beneficial) keep the layout that they chose in
5314 the forward pass. This ensures that there is at
5315 least one valid choice of layout. */
5316 is_possible &= edge_layout_cost (ud, other_node_i,
5317 other_partition.layout,
5318 layout_i).is_possible ();
5320 for_each_partition_edge (node_i, add_cost);
5322 if (!is_possible)
5324 layout_costs.mark_impossible ();
5325 continue;
5328 /* Locally combine the costs from the forward and backward passes.
5329 (This combined cost is not passed on, since that would lead
5330 to double counting.) */
5331 slpg_layout_cost combined_cost = layout_costs.in_cost;
5332 combined_cost.add_serial_cost (layout_costs.internal_cost);
5333 combined_cost.add_serial_cost (layout_costs.out_cost);
5335 /* Record the layout with the lowest cost. Prefer layout 0 in
5336 the event of a tie between it and another layout. */
5337 if (!min_layout_cost.is_possible ()
5338 || combined_cost.is_better_than (min_layout_cost,
5339 m_optimize_size))
5341 min_layout_i = layout_i;
5342 min_layout_cost = combined_cost;
5346 gcc_assert (min_layout_cost.is_possible ());
5347 partition.layout = min_layout_i;
5351 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5352 NODE already has the layout that was selected for its partition. */
5354 slp_tree
5355 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5356 unsigned int to_layout_i)
5358 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5359 slp_tree result = m_node_layouts[result_i];
5360 if (result)
5361 return result;
5363 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5364 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5365 /* We can't permute vector defs in place. */
5366 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5368 /* If the vector is uniform or unchanged, there's nothing to do. */
5369 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5370 result = node;
5371 else
5373 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5374 result = vect_create_new_slp_node (scalar_ops);
5375 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5378 else
5380 unsigned int partition_i = m_vertices[node->vertex].partition;
5381 unsigned int from_layout_i = m_partitions[partition_i].layout;
5382 if (from_layout_i == to_layout_i)
5383 return node;
5385 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5386 permutation instead of a serial one. Leave the new permutation
5387 in TMP_PERM on success. */
5388 auto_lane_permutation_t tmp_perm;
5389 unsigned int num_inputs = 1;
5390 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5392 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5393 if (from_layout_i != 0)
5394 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5395 if (to_layout_i != 0)
5396 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5397 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5398 tmp_perm,
5399 SLP_TREE_CHILDREN (node),
5400 false) >= 0)
5401 num_inputs = SLP_TREE_CHILDREN (node).length ();
5402 else
5403 tmp_perm.truncate (0);
5406 if (dump_enabled_p ())
5408 if (tmp_perm.length () > 0)
5409 dump_printf_loc (MSG_NOTE, vect_location,
5410 "duplicating permutation node %p with"
5411 " layout %d\n",
5412 (void *) node, to_layout_i);
5413 else
5414 dump_printf_loc (MSG_NOTE, vect_location,
5415 "inserting permutation node in place of %p\n",
5416 (void *) node);
5419 unsigned int num_lanes = SLP_TREE_LANES (node);
5420 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5421 if (SLP_TREE_SCALAR_STMTS (node).length ())
5423 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5424 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5425 if (from_layout_i != 0)
5426 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5427 if (to_layout_i != 0)
5428 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5430 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5431 SLP_TREE_LANES (result) = num_lanes;
5432 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5433 result->vertex = -1;
5435 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5436 if (tmp_perm.length ())
5438 lane_perm.safe_splice (tmp_perm);
5439 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5441 else
5443 lane_perm.create (num_lanes);
5444 for (unsigned j = 0; j < num_lanes; ++j)
5445 lane_perm.quick_push ({ 0, j });
5446 if (from_layout_i != 0)
5447 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5448 if (to_layout_i != 0)
5449 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5450 SLP_TREE_CHILDREN (result).safe_push (node);
5452 for (slp_tree child : SLP_TREE_CHILDREN (result))
5453 child->refcnt++;
5455 m_node_layouts[result_i] = result;
5456 return result;
5459 /* Apply the chosen vector layouts to the SLP graph. */
5461 void
5462 vect_optimize_slp_pass::materialize ()
5464 /* We no longer need the costs, so avoid having two O(N * P) arrays
5465 live at the same time. */
5466 m_partition_layout_costs.release ();
5467 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5469 auto_sbitmap fully_folded (m_vertices.length ());
5470 bitmap_clear (fully_folded);
5471 for (unsigned int node_i : m_partitioned_nodes)
5473 auto &vertex = m_vertices[node_i];
5474 slp_tree node = vertex.node;
5475 int layout_i = m_partitions[vertex.partition].layout;
5476 gcc_assert (layout_i >= 0);
5478 /* Rearrange the scalar statements to match the chosen layout. */
5479 if (layout_i > 0)
5480 vect_slp_permute (m_perms[layout_i],
5481 SLP_TREE_SCALAR_STMTS (node), true);
5483 /* Update load and lane permutations. */
5484 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5486 /* First try to absorb the input vector layouts. If that fails,
5487 force the inputs to have layout LAYOUT_I too. We checked that
5488 that was possible before deciding to use nonzero output layouts.
5489 (Note that at this stage we don't really have any guarantee that
5490 the target supports the original VEC_PERM_EXPR.) */
5491 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5492 auto_lane_permutation_t tmp_perm;
5493 tmp_perm.safe_splice (perm);
5494 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5495 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5496 tmp_perm,
5497 SLP_TREE_CHILDREN (node),
5498 false) >= 0)
5500 if (dump_enabled_p ()
5501 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5502 perm.begin ()))
5503 dump_printf_loc (MSG_NOTE, vect_location,
5504 "absorbing input layouts into %p\n",
5505 (void *) node);
5506 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5507 bitmap_set_bit (fully_folded, node_i);
5509 else
5511 /* Not MSG_MISSED because it would make no sense to users. */
5512 if (dump_enabled_p ())
5513 dump_printf_loc (MSG_NOTE, vect_location,
5514 "failed to absorb input layouts into %p\n",
5515 (void *) node);
5516 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5519 else
5521 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5522 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5523 if (layout_i > 0)
5524 /* ??? When we handle non-bijective permutes the idea
5525 is that we can force the load-permutation to be
5526 { min, min + 1, min + 2, ... max }. But then the
5527 scalar defs might no longer match the lane content
5528 which means wrong-code with live lane vectorization.
5529 So we possibly have to have NULL entries for those. */
5530 vect_slp_permute (m_perms[layout_i], load_perm, true);
5534 /* Do this before any nodes disappear, since it involves a walk
5535 over the leaves. */
5536 remove_redundant_permutations ();
5538 /* Replace each child with a correctly laid-out version. */
5539 for (unsigned int node_i : m_partitioned_nodes)
5541 /* Skip nodes that have already been handled above. */
5542 if (bitmap_bit_p (fully_folded, node_i))
5543 continue;
5545 auto &vertex = m_vertices[node_i];
5546 int in_layout_i = m_partitions[vertex.partition].layout;
5547 gcc_assert (in_layout_i >= 0);
5549 unsigned j;
5550 slp_tree child;
5551 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5553 if (!child)
5554 continue;
5556 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5557 if (new_child != child)
5559 vect_free_slp_tree (child);
5560 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5561 new_child->refcnt += 1;
5567 /* Elide load permutations that are not necessary. Such permutations might
5568 be pre-existing, rather than created by the layout optimizations. */
5570 void
5571 vect_optimize_slp_pass::remove_redundant_permutations ()
5573 for (unsigned int node_i : m_leafs)
5575 slp_tree node = m_vertices[node_i].node;
5576 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5577 continue;
5579 /* In basic block vectorization we allow any subchain of an interleaving
5580 chain.
5581 FORNOW: not in loop SLP because of realignment complications. */
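   /* For instance, if the interleaving chain is a[0], a[1], a[2], a[3] and
      the node loads a[1], a[2] in that order, the loads form a subchain and
      the load permutation can be dropped.  */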
5582 if (is_a <bb_vec_info> (m_vinfo))
5584 bool subchain_p = true;
5585 stmt_vec_info next_load_info = NULL;
5586 stmt_vec_info load_info;
5587 unsigned j;
5588 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5590 if (j != 0
5591 && (next_load_info != load_info
5592 || DR_GROUP_GAP (load_info) != 1))
5594 subchain_p = false;
5595 break;
5597 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5599 if (subchain_p)
5601 SLP_TREE_LOAD_PERMUTATION (node).release ();
5602 continue;
5605 else
5607 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5608 stmt_vec_info load_info;
5609 bool this_load_permuted = false;
5610 unsigned j;
5611 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5612 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5614 this_load_permuted = true;
5615 break;
5617 /* When this isn't a grouped access we know it's a single element
5618 and contiguous. */
5619 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5621 if (!this_load_permuted
5622 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5623 || SLP_TREE_LANES (node) == 1))
5624 SLP_TREE_LOAD_PERMUTATION (node).release ();
5625 continue;
5627 stmt_vec_info first_stmt_info
5628 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5629 if (!this_load_permuted
5630 /* The load requires permutation when unrolling exposes
5631 a gap either because the group is larger than the SLP
5632 group-size or because there is a gap between the groups. */
5633 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5634 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5635 && DR_GROUP_GAP (first_stmt_info) == 0)))
5637 SLP_TREE_LOAD_PERMUTATION (node).release ();
5638 continue;
5644 /* Print the partition graph and layout information to the dump file. */
5646 void
5647 vect_optimize_slp_pass::dump ()
5649 dump_printf_loc (MSG_NOTE, vect_location,
5650 "SLP optimize permutations:\n");
5651 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5653 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5654 const char *sep = "";
5655 for (unsigned int idx : m_perms[layout_i])
5657 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5658 sep = ", ";
5660 dump_printf (MSG_NOTE, " }\n");
5662 dump_printf_loc (MSG_NOTE, vect_location,
5663 "SLP optimize partitions:\n");
5664 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5665 ++partition_i)
5667 auto &partition = m_partitions[partition_i];
5668 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5669 dump_printf_loc (MSG_NOTE, vect_location,
5670 " partition %d (layout %d):\n",
5671 partition_i, partition.layout);
5672 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5673 for (unsigned int order_i = partition.node_begin;
5674 order_i < partition.node_end; ++order_i)
5676 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5677 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5678 (void *) vertex.node);
5679 dump_printf_loc (MSG_NOTE, vect_location,
5680 " weight: %f\n",
5681 vertex.weight.to_double ());
5682 if (vertex.out_degree)
5683 dump_printf_loc (MSG_NOTE, vect_location,
5684 " out weight: %f (degree %d)\n",
5685 vertex.out_weight.to_double (),
5686 vertex.out_degree);
5687 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " op: VEC_PERM_EXPR\n");
5690 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5691 dump_printf_loc (MSG_NOTE, vect_location,
5692 " op template: %G", rep->stmt);
5694 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5695 for (unsigned int order_i = partition.node_begin;
5696 order_i < partition.node_end; ++order_i)
5698 unsigned int node_i = m_partitioned_nodes[order_i];
5699 auto &vertex = m_vertices[node_i];
5700 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5702 auto &other_vertex = m_vertices[other_node_i];
5703 if (other_vertex.partition < vertex.partition)
5704 dump_printf_loc (MSG_NOTE, vect_location,
5705 " - %p [%d] --> %p\n",
5706 (void *) other_vertex.node,
5707 other_vertex.partition,
5708 (void *) vertex.node);
5709 else
5710 dump_printf_loc (MSG_NOTE, vect_location,
5711 " - %p --> [%d] %p\n",
5712 (void *) vertex.node,
5713 other_vertex.partition,
5714 (void *) other_vertex.node);
5716 for_each_partition_edge (node_i, print_edge);
5719 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5721 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5722 if (layout_costs.is_possible ())
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " layout %d:%s\n", layout_i,
5726 partition.layout == int (layout_i)
5727 ? " (*)" : "");
5728 slpg_layout_cost combined_cost = layout_costs.in_cost;
5729 combined_cost.add_serial_cost (layout_costs.internal_cost);
5730 combined_cost.add_serial_cost (layout_costs.out_cost);
5731 #define TEMPLATE "{depth: %f, total: %f}"
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 " " TEMPLATE "\n",
5734 layout_costs.in_cost.depth.to_double (),
5735 layout_costs.in_cost.total.to_double ());
5736 dump_printf_loc (MSG_NOTE, vect_location,
5737 " + " TEMPLATE "\n",
5738 layout_costs.internal_cost.depth.to_double (),
5739 layout_costs.internal_cost.total.to_double ());
5740 dump_printf_loc (MSG_NOTE, vect_location,
5741 " + " TEMPLATE "\n",
5742 layout_costs.out_cost.depth.to_double (),
5743 layout_costs.out_cost.total.to_double ());
5744 dump_printf_loc (MSG_NOTE, vect_location,
5745 " = " TEMPLATE "\n",
5746 combined_cost.depth.to_double (),
5747 combined_cost.total.to_double ());
5748 #undef TEMPLATE
5750 else
5751 dump_printf_loc (MSG_NOTE, vect_location,
5752 " layout %d: rejected\n", layout_i);
5757 /* Main entry point for the SLP graph optimization pass. */
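/* The pass proceeds in phases: build the SLP graph and partition it, collect
   the candidate layouts, and, when more than the identity layout exists,
   choose a layout per partition in a forward and a backward pass and
   materialize the choices; otherwise just remove redundant load
   permutations.  */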
5759 void
5760 vect_optimize_slp_pass::run ()
5762 build_graph ();
5763 create_partitions ();
5764 start_choosing_layouts ();
5765 if (m_perms.length () > 1)
5767 forward_pass ();
5768 backward_pass ();
5769 if (dump_enabled_p ())
5770 dump ();
5771 materialize ();
5772 while (!m_perms.is_empty ())
5773 m_perms.pop ().release ();
5775 else
5776 remove_redundant_permutations ();
5777 free_graph (m_slpg);
5780 /* Optimize the SLP graph of VINFO. */
5782 void
5783 vect_optimize_slp (vec_info *vinfo)
5785 if (vinfo->slp_instances.is_empty ())
5786 return;
5787 vect_optimize_slp_pass (vinfo).run ();
5790 /* Gather loads reachable from the individual SLP graph entries. */
5792 void
5793 vect_gather_slp_loads (vec_info *vinfo)
5795 unsigned i;
5796 slp_instance instance;
5797 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5799 hash_set<slp_tree> visited;
5800 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5801 SLP_INSTANCE_TREE (instance), visited);
5806 /* For each possible SLP instance decide whether to SLP it and calculate the
5807 overall unrolling factor needed to SLP the loop. Return TRUE if we decided to SLP at
5808 least one instance. */
5810 bool
5811 vect_make_slp_decision (loop_vec_info loop_vinfo)
5813 unsigned int i;
5814 poly_uint64 unrolling_factor = 1;
5815 const vec<slp_instance> &slp_instances
5816 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5817 slp_instance instance;
5818 int decided_to_slp = 0;
5820 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5822 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5824 /* FORNOW: SLP if you can. */
5825 /* All unroll factors have the form:
5827 GET_MODE_SIZE (vinfo->vector_mode) * X
5829 for some rational X, so they must have a common multiple. */
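      /* For instance, instances with unrolling factors 2 and 8 combine to an
         overall unrolling factor of 8.  */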
5830 unrolling_factor
5831 = force_common_multiple (unrolling_factor,
5832 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5834 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5835 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5836 loop-based vectorization. Such stmts will be marked as HYBRID. */
5837 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5838 decided_to_slp++;
5841 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5843 if (decided_to_slp && dump_enabled_p ())
5845 dump_printf_loc (MSG_NOTE, vect_location,
5846 "Decided to SLP %d instances. Unrolling factor ",
5847 decided_to_slp);
5848 dump_dec (MSG_NOTE, unrolling_factor);
5849 dump_printf (MSG_NOTE, "\n");
5852 return (decided_to_slp > 0);
5855 /* Private data for vect_detect_hybrid_slp. */
5856 struct vdhs_data
5858 loop_vec_info loop_vinfo;
5859 vec<stmt_vec_info> *worklist;
5862 /* Walker for walk_gimple_op. */
5864 static tree
5865 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5867 walk_stmt_info *wi = (walk_stmt_info *)data;
5868 vdhs_data *dat = (vdhs_data *)wi->info;
5870 if (wi->is_lhs)
5871 return NULL_TREE;
5873 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5874 if (!def_stmt_info)
5875 return NULL_TREE;
5876 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5877 if (PURE_SLP_STMT (def_stmt_info))
5879 if (dump_enabled_p ())
5880 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5881 def_stmt_info->stmt);
5882 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5883 dat->worklist->safe_push (def_stmt_info);
5886 return NULL_TREE;
5889 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5890 if so; otherwise push it to WORKLIST. */
5892 static void
5893 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5894 vec<stmt_vec_info> &worklist,
5895 stmt_vec_info stmt_info)
5897 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location,
5899 "Processing hybrid candidate : %G", stmt_info->stmt);
5900 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5901 imm_use_iterator iter2;
5902 ssa_op_iter iter1;
5903 use_operand_p use_p;
5904 def_operand_p def_p;
5905 bool any_def = false;
5906 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5908 any_def = true;
5909 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5911 if (is_gimple_debug (USE_STMT (use_p)))
5912 continue;
5913 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5914 /* An out-of-loop use means this is a loop_vect sink. */
5915 if (!use_info)
5917 if (dump_enabled_p ())
5918 dump_printf_loc (MSG_NOTE, vect_location,
5919 "Found loop_vect sink: %G", stmt_info->stmt);
5920 worklist.safe_push (stmt_info);
5921 return;
5923 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5925 if (dump_enabled_p ())
5926 dump_printf_loc (MSG_NOTE, vect_location,
5927 "Found loop_vect use: %G", use_info->stmt);
5928 worklist.safe_push (stmt_info);
5929 return;
5933 /* No def means this is a loop_vect sink. */
5934 if (!any_def)
5936 if (dump_enabled_p ())
5937 dump_printf_loc (MSG_NOTE, vect_location,
5938 "Found loop_vect sink: %G", stmt_info->stmt);
5939 worklist.safe_push (stmt_info);
5940 return;
5942 if (dump_enabled_p ())
5943 dump_printf_loc (MSG_NOTE, vect_location,
5944 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5945 STMT_SLP_TYPE (stmt_info) = pure_slp;
5948 /* Find stmts that must be both vectorized and SLPed. */
5950 void
5951 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5953 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5955 /* All stmts participating in SLP are marked pure_slp, all other
5956 stmts are loop_vect.
5957 First collect all loop_vect stmts into a worklist.
5958 SLP patterns cause not all original scalar stmts to appear in
5959 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5960 Rectify this here and do a backward walk over the IL only considering
5961 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5962 mark them as pure_slp. */
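   /* For example, a stmt whose only uses are in pure-SLP stmts is marked
      pure_slp below, whereas a stmt with an out-of-loop or loop_vect use is
      kept loop_vect and later causes the pure-SLP defs it consumes to be
      marked hybrid.  */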
5963 auto_vec<stmt_vec_info> worklist;
5964 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5966 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5967 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5968 gsi_next (&gsi))
5970 gphi *phi = gsi.phi ();
5971 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5972 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5973 maybe_push_to_hybrid_worklist (loop_vinfo,
5974 worklist, stmt_info);
5976 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5977 gsi_prev (&gsi))
5979 gimple *stmt = gsi_stmt (gsi);
5980 if (is_gimple_debug (stmt))
5981 continue;
5982 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5983 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5985 for (gimple_stmt_iterator gsi2
5986 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5987 !gsi_end_p (gsi2); gsi_next (&gsi2))
5989 stmt_vec_info patt_info
5990 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5991 if (!STMT_SLP_TYPE (patt_info)
5992 && STMT_VINFO_RELEVANT (patt_info))
5993 maybe_push_to_hybrid_worklist (loop_vinfo,
5994 worklist, patt_info);
5996 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5998 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5999 maybe_push_to_hybrid_worklist (loop_vinfo,
6000 worklist, stmt_info);
6004 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6005 mark any SLP vectorized stmt as hybrid.
6006 ??? We're visiting def stmts N times (once for each non-SLP and
6007 once for each hybrid-SLP use). */
6008 walk_stmt_info wi;
6009 vdhs_data dat;
6010 dat.worklist = &worklist;
6011 dat.loop_vinfo = loop_vinfo;
6012 memset (&wi, 0, sizeof (wi));
6013 wi.info = (void *)&dat;
6014 while (!worklist.is_empty ())
6016 stmt_vec_info stmt_info = worklist.pop ();
6017 /* Since SSA operands are not set up for pattern stmts we need
6018 to use walk_gimple_op. */
6019 wi.is_lhs = 0;
6020 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6021 /* For gather/scatter make sure to walk the offset operand, which
6022 can be a scaling and conversion away. */
6023 gather_scatter_info gs_info;
6024 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6025 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6027 int dummy;
6028 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6034 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6036 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6037 : vec_info (vec_info::bb, shared),
6038 bbs (_bbs),
6039 roots (vNULL)
6041 for (unsigned i = 0; i < bbs.length (); ++i)
6043 if (i != 0)
6044 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6045 gsi_next (&si))
6047 gphi *phi = si.phi ();
6048 gimple_set_uid (phi, 0);
6049 add_stmt (phi);
6051 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6052 !gsi_end_p (gsi); gsi_next (&gsi))
6054 gimple *stmt = gsi_stmt (gsi);
6055 gimple_set_uid (stmt, 0);
6056 if (is_gimple_debug (stmt))
6057 continue;
6058 add_stmt (stmt);
6064 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6065 stmts in the basic block. */
6067 _bb_vec_info::~_bb_vec_info ()
6069 /* Reset region marker. */
6070 for (unsigned i = 0; i < bbs.length (); ++i)
6072 if (i != 0)
6073 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6074 gsi_next (&si))
6076 gphi *phi = si.phi ();
6077 gimple_set_uid (phi, -1);
6079 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6080 !gsi_end_p (gsi); gsi_next (&gsi))
6082 gimple *stmt = gsi_stmt (gsi);
6083 gimple_set_uid (stmt, -1);
6087 for (unsigned i = 0; i < roots.length (); ++i)
6089 roots[i].stmts.release ();
6090 roots[i].roots.release ();
6091 roots[i].remain.release ();
6093 roots.release ();
6096 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6097 given that child nodes have already been processed, and that
6098 their def types currently match their SLP node's def type. */
6100 static bool
6101 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6102 slp_instance node_instance,
6103 stmt_vector_for_cost *cost_vec)
6105 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6107 /* Calculate the number of vector statements to be created for the
6108 scalar stmts in this node. For SLP reductions it is equal to the
6109 number of vector statements in the children (which has already been
6110 calculated by the recursive call). Otherwise it is the number of
6111 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6112 VF divided by the number of elements in a vector. */
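  /* For instance, with a vectorization factor of 8, 4 lanes and a V4SI
     vector type this yields 8 * 4 / 4 = 8 vector statements.  */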
6113 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6114 && !STMT_VINFO_DATA_REF (stmt_info)
6115 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6117 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6118 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6120 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6121 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6122 break;
6125 else
6127 poly_uint64 vf;
6128 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6129 vf = loop_vinfo->vectorization_factor;
6130 else
6131 vf = 1;
6132 unsigned int group_size = SLP_TREE_LANES (node);
6133 tree vectype = SLP_TREE_VECTYPE (node);
6134 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6135 = vect_get_num_vectors (vf * group_size, vectype);
6138 /* Handle purely internal nodes. */
6139 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6141 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6142 return false;
6144 stmt_vec_info slp_stmt_info;
6145 unsigned int i;
6146 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6148 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6149 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6150 node_instance, i,
6151 false, cost_vec))
6152 return false;
6154 return true;
6157 bool dummy;
6158 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6159 node, node_instance, cost_vec);
6162 /* Try to build NODE from scalars, returning true on success.
6163 NODE_INSTANCE is the SLP instance that contains NODE. */
6165 static bool
6166 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6167 slp_instance node_instance)
6169 stmt_vec_info stmt_info;
6170 unsigned int i;
6172 if (!is_a <bb_vec_info> (vinfo)
6173 || node == SLP_INSTANCE_TREE (node_instance)
6174 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6175 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6176 /* Force the mask use to be built from scalars instead. */
6177 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6178 return false;
6180 if (dump_enabled_p ())
6181 dump_printf_loc (MSG_NOTE, vect_location,
6182 "Building vector operands of %p from scalars instead\n",
6183 (void *) node);
6185 /* Don't remove and free the child nodes here, since they could be
6186 referenced by other structures. The analysis and scheduling phases
6187 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6188 unsigned int group_size = SLP_TREE_LANES (node);
6189 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6190 /* Invariants get their vector type from the uses. */
6191 SLP_TREE_VECTYPE (node) = NULL_TREE;
6192 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6193 SLP_TREE_LOAD_PERMUTATION (node).release ();
6194 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6196 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6197 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6199 return true;
6202 /* Return true if all elements of the slice are the same. */
6203 bool
6204 vect_scalar_ops_slice::all_same_p () const
6206 for (unsigned int i = 1; i < length; ++i)
6207 if (!operand_equal_p (op (0), op (i)))
6208 return false;
6209 return true;
6212 hashval_t
6213 vect_scalar_ops_slice_hash::hash (const value_type &s)
6215 hashval_t hash = 0;
6216 for (unsigned i = 0; i < s.length; ++i)
6217 hash = iterative_hash_expr (s.op (i), hash);
6218 return hash;
6221 bool
6222 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6223 const compare_type &s2)
6225 if (s1.length != s2.length)
6226 return false;
6227 for (unsigned i = 0; i < s1.length; ++i)
6228 if (!operand_equal_p (s1.op (i), s2.op (i)))
6229 return false;
6230 return true;
6233 /* Compute the prologue cost for invariant or constant operands represented
6234 by NODE. */
6236 static void
6237 vect_prologue_cost_for_slp (slp_tree node,
6238 stmt_vector_for_cost *cost_vec)
6240 /* There's a special case of an existing vector, which costs nothing. */
6241 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6242 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6243 return;
6244 /* Without looking at the actual initializer a vector of
6245 constants can be implemented as a load from the constant pool.
6246 When all elements are the same we can use a splat. */
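   /* For instance, a constant node { 1, 2, 3, 4 } is costed below as a
      vector_load from the constant pool, an external node { x, x, x, x }
      as a splat (scalar_to_vec), and a mixed external node { a, b, c, d }
      as a vec_construct.  */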
6247 tree vectype = SLP_TREE_VECTYPE (node);
6248 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6249 unsigned HOST_WIDE_INT const_nunits;
6250 unsigned nelt_limit;
6251 auto ops = &SLP_TREE_SCALAR_OPS (node);
6252 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6253 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6254 && ! multiple_p (const_nunits, group_size))
6256 nelt_limit = const_nunits;
6257 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6258 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6259 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6260 starts.quick_push (i * const_nunits);
6262 else
6264 /* If either the vector has variable length or the vectors
6265 are composed of repeated whole groups we only need to
6266 cost construction once. All vectors will be the same. */
6267 nelt_limit = group_size;
6268 starts.quick_push (0);
6270 /* ??? We're just tracking whether vectors in a single node are the same.
6271 Ideally we'd do something more global. */
6272 bool passed = false;
6273 for (unsigned int start : starts)
6275 vect_cost_for_stmt kind;
6276 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6277 kind = vector_load;
6278 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6279 kind = scalar_to_vec;
6280 else
6281 kind = vec_construct;
6282 /* The target cost hook has no idea which part of the SLP node
6283 we are costing so avoid passing it down more than once. Pass
6284 it to the first vec_construct or scalar_to_vec part since for those
6285 the x86 backend tries to account for GPR to XMM register moves. */
6286 record_stmt_cost (cost_vec, 1, kind,
6287 (kind != vector_load && !passed) ? node : nullptr,
6288 vectype, 0, vect_prologue);
6289 if (kind != vector_load)
6290 passed = true;
6294 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6295 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6297 Return true if the operations are supported. */
6299 static bool
6300 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6301 slp_instance node_instance,
6302 hash_set<slp_tree> &visited_set,
6303 vec<slp_tree> &visited_vec,
6304 stmt_vector_for_cost *cost_vec)
6306 int i, j;
6307 slp_tree child;
6309 /* Assume we can code-generate all invariants. */
6310 if (!node
6311 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6312 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6313 return true;
6315 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6317 if (dump_enabled_p ())
6318 dump_printf_loc (MSG_NOTE, vect_location,
6319 "Failed cyclic SLP reference in %p\n", (void *) node);
6320 return false;
6322 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6324 /* If we already analyzed the exact same set of scalar stmts we're done.
6325 We share the generated vector stmts for those. */
6326 if (visited_set.add (node))
6327 return true;
6328 visited_vec.safe_push (node);
6330 bool res = true;
6331 unsigned visited_rec_start = visited_vec.length ();
6332 unsigned cost_vec_rec_start = cost_vec->length ();
6333 bool seen_non_constant_child = false;
6334 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6336 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6337 visited_set, visited_vec,
6338 cost_vec);
6339 if (!res)
6340 break;
6341 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6342 seen_non_constant_child = true;
6344 /* We're having difficulties scheduling nodes with just constant
6345 operands and no scalar stmts since we then cannot compute a stmt
6346 insertion place. */
6347 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6349 if (dump_enabled_p ())
6350 dump_printf_loc (MSG_NOTE, vect_location,
6351 "Cannot vectorize all-constant op node %p\n",
6352 (void *) node);
6353 res = false;
6356 if (res)
6357 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6358 cost_vec);
6359 /* If analysis failed we have to pop all recursive visited nodes
6360 plus ourselves. */
6361 if (!res)
6363 while (visited_vec.length () >= visited_rec_start)
6364 visited_set.remove (visited_vec.pop ());
6365 cost_vec->truncate (cost_vec_rec_start);
6368 /* When the node can be vectorized, cost the invariant nodes it references.
6369 This is not done in DFS order, to allow the referring node's
6370 vectorizable_* calls to nail down the invariant node's vector type
6371 and possibly unshare it if it needs a different vector type than
6372 other referrers. */
6373 if (res)
6374 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6375 if (child
6376 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6377 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6378 /* Perform the usual caching; note that code generation still
6379 code-gens these nodes multiple times, but we expect
6380 to CSE them later. */
6381 && !visited_set.add (child))
6383 visited_vec.safe_push (child);
6384 /* ??? After auditing more code paths make a "default"
6385 and push the vector type from NODE to all children
6386 if it is not already set. */
6387 /* Compute the number of vectors to be generated. */
6388 tree vector_type = SLP_TREE_VECTYPE (child);
6389 if (!vector_type)
6391 /* For shifts with a scalar argument we don't need
6392 to cost or code-generate anything.
6393 ??? Represent this more explicitly. */
6394 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6395 == shift_vec_info_type)
6396 && j == 1);
6397 continue;
6399 unsigned group_size = SLP_TREE_LANES (child);
6400 poly_uint64 vf = 1;
6401 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6402 vf = loop_vinfo->vectorization_factor;
6403 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6404 = vect_get_num_vectors (vf * group_size, vector_type);
6405 /* And cost them. */
6406 vect_prologue_cost_for_slp (child, cost_vec);
6409 /* If this node or any of its children can't be vectorized, try pruning
6410 the tree here rather than felling the whole thing. */
6411 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6413 /* We'll need to revisit this for invariant costing and number
6414 of vectorized stmt setting. */
6415 res = true;
6418 return res;
6421 /* Given a definition DEF, analyze if it will have any live scalar use after
6422 performing SLP vectorization whose information is represented by BB_VINFO,
6423 and record the result in the hash map SCALAR_USE_MAP as a cache for later
6424 fast checks. If recursion DEPTH exceeds a limit, stop the analysis and make
6425 a conservative assumption. Return 0 if there is no scalar use, 1 if there
6426 is, and -1 if recursion was limited. */
6428 static int
6429 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6430 hash_map<tree, int> &scalar_use_map,
6431 int depth = 0)
6433 const int depth_limit = 2;
6434 imm_use_iterator use_iter;
6435 gimple *use_stmt;
6437 if (int *res = scalar_use_map.get (def))
6438 return *res;
6440 int scalar_use = 1;
6442 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6444 if (is_gimple_debug (use_stmt))
6445 continue;
6447 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6449 if (!use_stmt_info)
6450 break;
6452 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6453 continue;
6455 /* Do not step forward when encountering a PHI statement, since it may
6456 involve a cyclic reference and cause infinite recursion. */
6457 if (gimple_code (use_stmt) == GIMPLE_PHI)
6458 break;
6460 /* When pattern recognition is involved, a statement whose definition is
6461 consumed in some pattern may not be included in the final replacement
6462 pattern statements, so it would be skipped when building the SLP graph.
6464 * Original
6465 char a_c = *(char *) a;
6466 char b_c = *(char *) b;
6467 unsigned short a_s = (unsigned short) a_c;
6468 int a_i = (int) a_s;
6469 int b_i = (int) b_c;
6470 int r_i = a_i - b_i;
6472 * After pattern replacement
6473 a_s = (unsigned short) a_c;
6474 a_i = (int) a_s;
6476 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6477 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6479 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6480 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6482 The definitions of a_i(original statement) and b_i(pattern statement)
6483 are related to, but actually not part of widen_minus pattern.
6484 Vectorizing the pattern does not cause these definition statements to
6485 be marked as PURE_SLP. For this case, we need to recursively check
6486 whether their uses are all absorbed into vectorized code. But there
6487 is an exception that some use may participate in a vectorized
6488 operation via an external SLP node containing that use as an element.
6489 The parameter "scalar_use_map" tags such SSA names as having a scalar
6490 use in advance. */
6491 tree lhs = gimple_get_lhs (use_stmt);
6493 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6494 break;
6496 if (depth_limit && depth >= depth_limit)
6497 return -1;
6499 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6500 depth + 1)))
6501 break;
6504 if (end_imm_use_stmt_p (&use_iter))
6505 scalar_use = 0;
6507 /* If recursion is limited, do not cache result for non-root defs. */
6508 if (!depth || scalar_use >= 0)
6510 bool added = scalar_use_map.put (def, scalar_use);
6511 gcc_assert (!added);
6514 return scalar_use;
6517 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6518 region and that can be vectorized using vectorizable_live_operation
6519 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6520 the scalar code computing them to be retained. */
6522 static void
6523 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6524 slp_instance instance,
6525 stmt_vector_for_cost *cost_vec,
6526 hash_map<tree, int> &scalar_use_map,
6527 hash_set<stmt_vec_info> &svisited,
6528 hash_set<slp_tree> &visited)
6530 if (visited.add (node))
6531 return;
6533 unsigned i;
6534 stmt_vec_info stmt_info;
6535 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6536 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6538 if (svisited.contains (stmt_info))
6539 continue;
6540 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6541 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6542 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6543 /* Only the pattern root stmt computes the original scalar value. */
6544 continue;
6545 bool mark_visited = true;
6546 gimple *orig_stmt = orig_stmt_info->stmt;
6547 ssa_op_iter op_iter;
6548 def_operand_p def_p;
6549 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6551 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6552 scalar_use_map))
6554 STMT_VINFO_LIVE_P (stmt_info) = true;
6555 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6556 instance, i, false, cost_vec))
6557 /* ??? So we know we can vectorize the live stmt from one SLP
6558 node. If we cannot do so from all or none consistently
6559 we'd have to record which SLP node (and lane) we want to
6560 use for the live operation. So make sure we can
6561 code-generate from all nodes. */
6562 mark_visited = false;
6563 else
6564 STMT_VINFO_LIVE_P (stmt_info) = false;
6567 /* We have to verify whether we can insert the lane extract
6568 before all uses. The following is a conservative approximation.
6569 We cannot put this into vectorizable_live_operation because
6570 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6571 doesn't work.
6572 Note that while emitting code for loads at the first load should
6573 make this a non-problem, leafs we construct from scalars are
6574 vectorized after the last scalar def.
6575 ??? If we'd actually compute the insert location during
6576 analysis we could use sth less conservative than the last
6577 scalar stmt in the node for the dominance check. */
6578 /* ??? What remains is "live" uses in vector CTORs in the same
6579 SLP graph which is where those uses can end up code-generated
6580 right after their definition instead of close to their original
6581 use. But that would restrict us to code-generate lane-extracts
6582 from the latest stmt in a node. So we compensate for this
6583 during code-generation, simply not replacing uses for those
6584 hopefully rare cases. */
6585 imm_use_iterator use_iter;
6586 gimple *use_stmt;
6587 stmt_vec_info use_stmt_info;
6589 if (STMT_VINFO_LIVE_P (stmt_info))
6590 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6591 if (!is_gimple_debug (use_stmt)
6592 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6593 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6594 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6598 "Cannot determine insertion place for "
6599 "lane extract\n");
6600 STMT_VINFO_LIVE_P (stmt_info) = false;
6601 mark_visited = true;
6604 if (mark_visited)
6605 svisited.add (stmt_info);
6608 slp_tree child;
6609 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6610 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6611 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6612 scalar_use_map, svisited, visited);
6615 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6616 are live outside of the basic-block vectorized region and that can be
6617 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6619 static void
6620 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6622 if (bb_vinfo->slp_instances.is_empty ())
6623 return;
6625 hash_set<stmt_vec_info> svisited;
6626 hash_set<slp_tree> visited;
6627 hash_map<tree, int> scalar_use_map;
6628 auto_vec<slp_tree> worklist;
6630 for (slp_instance instance : bb_vinfo->slp_instances)
6631 if (!visited.add (SLP_INSTANCE_TREE (instance)))
6632 worklist.safe_push (SLP_INSTANCE_TREE (instance));
6636 slp_tree node = worklist.pop ();
6638 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6640 for (tree op : SLP_TREE_SCALAR_OPS (node))
6641 if (TREE_CODE (op) == SSA_NAME)
6642 scalar_use_map.put (op, 1);
6644 else
6646 for (slp_tree child : SLP_TREE_CHILDREN (node))
6647 if (child && !visited.add (child))
6648 worklist.safe_push (child);
6650 } while (!worklist.is_empty ());
6652 visited.empty ();
6654 for (slp_instance instance : bb_vinfo->slp_instances)
6656 vect_location = instance->location ();
6657 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6658 instance, &instance->cost_vec,
6659 scalar_use_map, svisited, visited);
6663 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6665 static bool
6666 vectorizable_bb_reduc_epilogue (slp_instance instance,
6667 stmt_vector_for_cost *cost_vec)
6669 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6670 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6671 if (reduc_code == MINUS_EXPR)
6672 reduc_code = PLUS_EXPR;
6673 internal_fn reduc_fn;
6674 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6675 if (!vectype
6676 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6677 || reduc_fn == IFN_LAST
6678 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6679 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6680 TREE_TYPE (vectype)))
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "not vectorized: basic block reduction epilogue "
6685 "operation unsupported.\n");
6686 return false;
6689 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6690 cost log2 vector operations plus shuffles and one extraction. */
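   /* For instance, for a V8SI reduction this records counts of 3 vector_stmt,
      3 vec_perm and 1 vec_to_scalar operations in the body cost.  */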
6691 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6692 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6693 vectype, 0, vect_body);
6694 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6695 vectype, 0, vect_body);
6696 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6697 vectype, 0, vect_body);
6699 /* Since we replace all stmts of a possibly longer scalar reduction
6700 chain, account for the extra scalar stmts for that. */
6701 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6702 instance->root_stmts[0], 0, vect_body);
6703 return true;
6706 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6707 and recurse to children. */
6709 static void
6710 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6711 hash_set<slp_tree> &visited)
6713 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6714 || visited.add (node))
6715 return;
6717 stmt_vec_info stmt;
6718 unsigned i;
6719 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6720 roots.remove (vect_orig_stmt (stmt));
6722 slp_tree child;
6723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6724 if (child)
6725 vect_slp_prune_covered_roots (child, roots, visited);
6728 /* Analyze statements in SLP instances of VINFO. Return true if the
6729 operations are supported. */
6731 bool
6732 vect_slp_analyze_operations (vec_info *vinfo)
6734 slp_instance instance;
6735 int i;
6737 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6739 hash_set<slp_tree> visited;
6740 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6742 auto_vec<slp_tree> visited_vec;
6743 stmt_vector_for_cost cost_vec;
6744 cost_vec.create (2);
6745 if (is_a <bb_vec_info> (vinfo))
6746 vect_location = instance->location ();
6747 if (!vect_slp_analyze_node_operations (vinfo,
6748 SLP_INSTANCE_TREE (instance),
6749 instance, visited, visited_vec,
6750 &cost_vec)
6751 /* CTOR instances require vectorized defs for the SLP tree root. */
6752 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6753 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6754 != vect_internal_def
6755 /* Make sure we vectorized with the expected type. */
6756 || !useless_type_conversion_p
6757 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6758 (instance->root_stmts[0]->stmt))),
6759 TREE_TYPE (SLP_TREE_VECTYPE
6760 (SLP_INSTANCE_TREE (instance))))))
6761 /* Check we can vectorize the reduction. */
6762 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6763 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6765 slp_tree node = SLP_INSTANCE_TREE (instance);
6766 stmt_vec_info stmt_info;
6767 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6768 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6769 else
6770 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6771 if (dump_enabled_p ())
6772 dump_printf_loc (MSG_NOTE, vect_location,
6773 "removing SLP instance operations starting from: %G",
6774 stmt_info->stmt);
6775 vect_free_slp_instance (instance);
6776 vinfo->slp_instances.ordered_remove (i);
6777 cost_vec.release ();
6778 while (!visited_vec.is_empty ())
6779 visited.remove (visited_vec.pop ());
6781 else
6783 i++;
6784 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6786 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6787 cost_vec.release ();
6789 else
6790 /* For BB vectorization remember the SLP graph entry
6791 cost for later. */
6792 instance->cost_vec = cost_vec;
6796 /* Now look for SLP instances with a root that are covered by other
6797 instances and remove them. */
6798 hash_set<stmt_vec_info> roots;
6799 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6800 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6801 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6802 if (!roots.is_empty ())
6804 visited.empty ();
6805 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6806 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6807 visited);
6808 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6809 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6810 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6812 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6813 if (dump_enabled_p ())
6814 dump_printf_loc (MSG_NOTE, vect_location,
6815 "removing SLP instance operations starting "
6816 "from: %G", root->stmt);
6817 vect_free_slp_instance (instance);
6818 vinfo->slp_instances.ordered_remove (i);
6820 else
6821 ++i;
6824 /* Compute vectorizable live stmts. */
6825 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6826 vect_bb_slp_mark_live_stmts (bb_vinfo);
6828 return !vinfo->slp_instances.is_empty ();
6831 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6832 closing the eventual chain. */
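/* For instance, with a leader map A -> B, B -> C, C -> C, the call for A
   returns C and rewrites the map entries along the chain to A -> C and
   B -> C, much like path compression in a union-find structure.  */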
6834 static slp_instance
6835 get_ultimate_leader (slp_instance instance,
6836 hash_map<slp_instance, slp_instance> &instance_leader)
6838 auto_vec<slp_instance *, 8> chain;
6839 slp_instance *tem;
6840 while (*(tem = instance_leader.get (instance)) != instance)
6842 chain.safe_push (tem);
6843 instance = *tem;
6845 while (!chain.is_empty ())
6846 *chain.pop () = instance;
6847 return instance;
6850 namespace {
6851 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6852 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6853 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6855 INSTANCE_LEADER is as for get_ultimate_leader. */
6857 template<typename T>
6858 bool
6859 vect_map_to_instance (slp_instance instance, T key,
6860 hash_map<T, slp_instance> &key_to_instance,
6861 hash_map<slp_instance, slp_instance> &instance_leader)
6863 bool existed_p;
6864 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6865 if (!existed_p)
6867 else if (key_instance != instance)
6869 /* If we're running into a previously marked key make us the
6870 leader of the current ultimate leader. This keeps the
6871 leader chain acyclic and works even when the current instance
6872 connects two previously independent graph parts. */
6873 slp_instance key_leader
6874 = get_ultimate_leader (key_instance, instance_leader);
6875 if (key_leader != instance)
6876 instance_leader.put (key_leader, instance);
6878 key_instance = instance;
6879 return existed_p;
6883 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6885 static void
6886 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6887 slp_instance instance, slp_tree node,
6888 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6889 hash_map<slp_tree, slp_instance> &node_to_instance,
6890 hash_map<slp_instance, slp_instance> &instance_leader)
6892 stmt_vec_info stmt_info;
6893 unsigned i;
6895 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6896 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6897 instance_leader);
6899 if (vect_map_to_instance (instance, node, node_to_instance,
6900 instance_leader))
6901 return;
6903 slp_tree child;
6904 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6905 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6906 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6907 node_to_instance, instance_leader);
6910 /* Partition the SLP graph into pieces that can be costed independently. */
6912 static void
6913 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6915 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6917 /* First walk the SLP graph assigning each involved scalar stmt a
6918 corresponding SLP graph entry and upon visiting a previously
6919 marked stmt, make the stmt's leader the current SLP graph entry. */
6920 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6921 hash_map<slp_tree, slp_instance> node_to_instance;
6922 hash_map<slp_instance, slp_instance> instance_leader;
6923 slp_instance instance;
6924 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6926 instance_leader.put (instance, instance);
6927 vect_bb_partition_graph_r (bb_vinfo,
6928 instance, SLP_INSTANCE_TREE (instance),
6929 stmt_to_instance, node_to_instance,
6930 instance_leader);
6933 /* Then collect entries to each independent subgraph. */
6934 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6936 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6937 leader->subgraph_entries.safe_push (instance);
6938 if (dump_enabled_p ()
6939 && leader != instance)
6940 dump_printf_loc (MSG_NOTE, vect_location,
6941 "instance %p is leader of %p\n",
6942 (void *) leader, (void *) instance);
6946 /* Compute the set of scalar stmts participating in internal and external
6947 nodes. */
6949 static void
6950 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6951 hash_set<slp_tree> &visited,
6952 hash_set<stmt_vec_info> &vstmts,
6953 hash_set<stmt_vec_info> &estmts)
6955 int i;
6956 stmt_vec_info stmt_info;
6957 slp_tree child;
6959 if (visited.add (node))
6960 return;
6962 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6964 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6965 vstmts.add (stmt_info);
6967 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6968 if (child)
6969 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6970 vstmts, estmts);
6972 else
6973 for (tree def : SLP_TREE_SCALAR_OPS (node))
6975 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6976 if (def_stmt)
6977 estmts.add (def_stmt);
6982 /* Compute the scalar cost of the SLP node NODE and its children
6983 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6984 update LIFE according to uses of NODE. */
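/* LIFE has one entry per SLP lane; a lane marked true means the scalar
   stmt for that lane is kept live because of non-vectorized uses, so
   neither it nor the defs it requires are counted.  */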
6986 static void
6987 vect_bb_slp_scalar_cost (vec_info *vinfo,
6988 slp_tree node, vec<bool, va_heap> *life,
6989 stmt_vector_for_cost *cost_vec,
6990 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6991 hash_set<slp_tree> &visited)
6993 unsigned i;
6994 stmt_vec_info stmt_info;
6995 slp_tree child;
6997 if (visited.add (node))
6998 return;
7000 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7002 ssa_op_iter op_iter;
7003 def_operand_p def_p;
7005 if ((*life)[i])
7006 continue;
7008 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7009 gimple *orig_stmt = orig_stmt_info->stmt;
7011 /* If there is a non-vectorized use of the defs then the scalar
7012 stmt is kept live in which case we do not account it or any
7013 required defs in the SLP children in the scalar cost. This
7014 way we make the vectorization more costly when compared to
7015 the scalar cost. */
7016 if (!STMT_VINFO_LIVE_P (stmt_info))
7018 auto_vec<gimple *, 8> worklist;
7019 hash_set<gimple *> *worklist_visited = NULL;
7020 worklist.quick_push (orig_stmt);
7023 gimple *work_stmt = worklist.pop ();
7024 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7026 imm_use_iterator use_iter;
7027 gimple *use_stmt;
7028 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7029 DEF_FROM_PTR (def_p))
7030 if (!is_gimple_debug (use_stmt))
7032 stmt_vec_info use_stmt_info
7033 = vinfo->lookup_stmt (use_stmt);
7034 if (!use_stmt_info
7035 || !vectorized_scalar_stmts.contains (use_stmt_info))
7037 if (use_stmt_info
7038 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7040 /* For stmts participating in patterns we have
7041 to check its uses recursively. */
7042 if (!worklist_visited)
7043 worklist_visited = new hash_set<gimple *> ();
7044 if (!worklist_visited->add (use_stmt))
7045 worklist.safe_push (use_stmt);
7046 continue;
7048 (*life)[i] = true;
7049 goto next_lane;
7054 while (!worklist.is_empty ());
7055 next_lane:
7056 if (worklist_visited)
7057 delete worklist_visited;
7058 if ((*life)[i])
7059 continue;
7062 /* Count scalar stmts only once. */
7063 if (gimple_visited_p (orig_stmt))
7064 continue;
7065 gimple_set_visited (orig_stmt, true);
7067 vect_cost_for_stmt kind;
7068 if (STMT_VINFO_DATA_REF (orig_stmt_info))
7070 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7071 kind = scalar_load;
7072 else
7073 kind = scalar_store;
7075 else if (vect_nop_conversion_p (orig_stmt_info))
7076 continue;
7077 /* For single-argument PHIs assume coalescing which means zero cost
7078 for the scalar and the vector PHIs. This avoids artificially
7079 favoring the vector path (but may pessimize it in some cases). */
7080 else if (is_a <gphi *> (orig_stmt_info->stmt)
7081 && gimple_phi_num_args
7082 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7083 continue;
7084 else
7085 kind = scalar_stmt;
7086 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7087 SLP_TREE_VECTYPE (node), 0, vect_body);
7090 auto_vec<bool, 20> subtree_life;
7091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7093 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7095 /* Do not directly pass LIFE to the recursive call, copy it to
7096 confine changes in the callee to the current child/subtree. */
7097 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7099 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7100 for (unsigned j = 0;
7101 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7103 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7104 if (perm.first == i)
7105 subtree_life[perm.second] = (*life)[j];
7108 else
7110 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7111 subtree_life.safe_splice (*life);
7113 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7114 vectorized_scalar_stmts, visited);
7115 subtree_life.truncate (0);
7120 /* Comparator for the loop-index sorted cost vectors. */
7122 static int
7123 li_cost_vec_cmp (const void *a_, const void *b_)
7125 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7126 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7127 if (a->first < b->first)
7128 return -1;
7129 else if (a->first == b->first)
7130 return 0;
7131 return 1;
7134 /* Check if vectorization of the basic block is profitable for the
7135 subgraph denoted by SLP_INSTANCES. */
7137 static bool
7138 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7139 vec<slp_instance> slp_instances,
7140 loop_p orig_loop)
7142 slp_instance instance;
7143 int i;
7144 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7145 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7147 if (dump_enabled_p ())
7149 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7150 hash_set<slp_tree> visited;
7151 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7152 vect_print_slp_graph (MSG_NOTE, vect_location,
7153 SLP_INSTANCE_TREE (instance), visited);
7156 /* Compute the set of scalar stmts we know will go away 'locally' when
7157 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7158 not accurate for nodes promoted extern late or for scalar stmts that
7159 are used both in extern defs and in vectorized defs. */
7160 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7161 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7162 hash_set<slp_tree> visited;
7163 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7165 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7166 SLP_INSTANCE_TREE (instance),
7167 visited,
7168 vectorized_scalar_stmts,
7169 scalar_stmts_in_externs);
7170 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7171 vectorized_scalar_stmts.add (rstmt);
7173   /* Scalar stmts used as defs in external nodes need to be preserved, so
7174 remove them from vectorized_scalar_stmts. */
7175 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7176 vectorized_scalar_stmts.remove (stmt);
7178 /* Calculate scalar cost and sum the cost for the vector stmts
7179 previously collected. */
7180 stmt_vector_for_cost scalar_costs = vNULL;
7181 stmt_vector_for_cost vector_costs = vNULL;
7182 visited.empty ();
7183 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7185 auto_vec<bool, 20> life;
7186 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7187 true);
7188 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7189 record_stmt_cost (&scalar_costs,
7190 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7191 scalar_stmt,
7192 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7193 vect_bb_slp_scalar_cost (bb_vinfo,
7194 SLP_INSTANCE_TREE (instance),
7195 &life, &scalar_costs, vectorized_scalar_stmts,
7196 visited);
7197 vector_costs.safe_splice (instance->cost_vec);
7198 instance->cost_vec.release ();
7201 if (dump_enabled_p ())
7202 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7204 /* When costing non-loop vectorization we need to consider each covered
7205 loop independently and make sure vectorization is profitable. For
7206      now we assume a loop may not be entered or may be executed an arbitrary
7207 number of iterations (??? static information can provide more
7208 precise info here) which means we can simply cost each containing
7209      loop's stmts separately.  */
7211 /* First produce cost vectors sorted by loop index. */
7212 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7213 li_scalar_costs (scalar_costs.length ());
7214 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7215 li_vector_costs (vector_costs.length ());
7216 stmt_info_for_cost *cost;
7217 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7219 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7220 li_scalar_costs.quick_push (std::make_pair (l, cost));
7222   /* Use an arbitrary loop from the scalar costs as the fallback in case the
7223      first vector_costs entry does not have a stmt_info associated with it.  */
7224 unsigned l = li_scalar_costs[0].first;
7225 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7227 /* We inherit from the previous COST, invariants, externals and
7228 extracts immediately follow the cost for the related stmt. */
7229 if (cost->stmt_info)
7230 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7231 li_vector_costs.quick_push (std::make_pair (l, cost));
7233 li_scalar_costs.qsort (li_cost_vec_cmp);
7234 li_vector_costs.qsort (li_cost_vec_cmp);
7236 /* Now cost the portions individually. */
7237 unsigned vi = 0;
7238 unsigned si = 0;
7239 bool profitable = true;
7240 while (si < li_scalar_costs.length ()
7241 && vi < li_vector_costs.length ())
7243 unsigned sl = li_scalar_costs[si].first;
7244 unsigned vl = li_vector_costs[vi].first;
7245 if (sl != vl)
7247 if (dump_enabled_p ())
7248 dump_printf_loc (MSG_NOTE, vect_location,
7249 "Scalar %d and vector %d loop part do not "
7250 "match up, skipping scalar part\n", sl, vl);
7251 /* Skip the scalar part, assuming zero cost on the vector side. */
7254 si++;
7256 while (si < li_scalar_costs.length ()
7257 && li_scalar_costs[si].first == sl);
7258 continue;
7261 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7264 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7265 si++;
7267 while (si < li_scalar_costs.length ()
7268 && li_scalar_costs[si].first == sl);
7269 unsigned dummy;
7270 finish_cost (scalar_target_cost_data, nullptr,
7271 &dummy, &scalar_cost, &dummy);
7273 /* Complete the target-specific vector cost calculation. */
7274 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7277 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7278 vi++;
7280 while (vi < li_vector_costs.length ()
7281 && li_vector_costs[vi].first == vl);
7282 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7283 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7284 delete scalar_target_cost_data;
7285 delete vect_target_cost_data;
7287 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7289 if (dump_enabled_p ())
7291 dump_printf_loc (MSG_NOTE, vect_location,
7292 "Cost model analysis for part in loop %d:\n", sl);
7293 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7294 vec_inside_cost + vec_outside_cost);
7295 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7298       /* Vectorization is profitable if its cost is no more than the cost of the
7299	  scalar version.  Note that we err on the vector side for equal cost because
7300 the cost estimate is otherwise quite pessimistic (constant uses are
7301 free on the scalar side but cost a load on the vector side for
7302 example). */
7303 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7305 profitable = false;
7306 break;
7309 if (profitable && vi < li_vector_costs.length ())
7311 if (dump_enabled_p ())
7312 dump_printf_loc (MSG_NOTE, vect_location,
7313 "Excess vector cost for part in loop %d:\n",
7314 li_vector_costs[vi].first);
7315 profitable = false;
7318 /* Unset visited flag. This is delayed when the subgraph is profitable
7319 and we process the loop for remaining unvectorized if-converted code. */
7320 if (!orig_loop || !profitable)
7321 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7322 gimple_set_visited (cost->stmt_info->stmt, false);
7324 scalar_costs.release ();
7325 vector_costs.release ();
7327 return profitable;
7330 /* qsort comparator for lane defs. */
7332 static int
7333 vld_cmp (const void *a_, const void *b_)
7335 auto *a = (const std::pair<unsigned, tree> *)a_;
7336 auto *b = (const std::pair<unsigned, tree> *)b_;
7337 return a->first - b->first;
7340 /* Return true if USE_STMT is a vector lane insert into VEC (into any vector
7341    if VEC is NULL_TREE) and set *THIS_LANE to the lane number that is set.  */
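/* A minimal sketch with hypothetical SSA names: for a V4SI vector the stmt
     v_2 = BIT_INSERT_EXPR <v_1, s_5, 64>;
   inserts at bit position 64, and with a 32-bit element size *THIS_LANE
   becomes 64 / 32 == 2.  */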
7343 static bool
7344 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7346 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7347 if (!use_ass
7348 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7349 || (vec
7350 ? gimple_assign_rhs1 (use_ass) != vec
7351 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7352 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7353 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7354 || !constant_multiple_p
7355 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7356 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7357 this_lane))
7358 return false;
7359 return true;
7362 /* Find vectorizable constructors, lane-insert chains and reduction chains
7363    and record them as SLP roots in the BB_VINFO->roots array.  */
7365 static void
7366 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7368 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7369 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7370 !gsi_end_p (gsi); gsi_next (&gsi))
7372 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7373 if (!assign)
7374 continue;
7376 tree rhs = gimple_assign_rhs1 (assign);
7377 enum tree_code code = gimple_assign_rhs_code (assign);
7378 use_operand_p use_p;
7379 gimple *use_stmt;
7380 if (code == CONSTRUCTOR)
7382 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7383 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7384 CONSTRUCTOR_NELTS (rhs))
7385 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7386 || uniform_vector_p (rhs))
7387 continue;
7389 unsigned j;
7390 tree val;
7391 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7392 if (TREE_CODE (val) != SSA_NAME
7393 || !bb_vinfo->lookup_def (val))
7394 break;
7395 if (j != CONSTRUCTOR_NELTS (rhs))
7396 continue;
7398 vec<stmt_vec_info> roots = vNULL;
7399 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7400 vec<stmt_vec_info> stmts;
7401 stmts.create (CONSTRUCTOR_NELTS (rhs));
7402 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7403 stmts.quick_push
7404 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7405 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7406 stmts, roots));
7408 else if (code == BIT_INSERT_EXPR
7409 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7410 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7411 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7412 && integer_zerop (gimple_assign_rhs3 (assign))
7413 && useless_type_conversion_p
7414 (TREE_TYPE (TREE_TYPE (rhs)),
7415 TREE_TYPE (gimple_assign_rhs2 (assign)))
7416 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7418 	  /* We start matching at the insert to lane zero, but since the
7419	     inserts need not be ordered we have to search both
7420	     the def and the use chains.  */
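	  /* For example (a sketch with hypothetical SSA names and a V4SI
	     vector), the insert chain may look like
	       x_1 = BIT_INSERT_EXPR <x_0, a_1, 32>;   lane 1
	       x_2 = BIT_INSERT_EXPR <x_1, b_1, 0>;    lane 0  (ASSIGN)
	       x_3 = BIT_INSERT_EXPR <x_2, c_1, 96>;   lane 3
	       x_4 = BIT_INSERT_EXPR <x_3, d_1, 64>;   lane 2
	     where lanes 3 and 2 are found by following single uses of
	     ASSIGN's LHS and lane 1 by walking its RHS def chain.  */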
7421 tree vectype = TREE_TYPE (rhs);
7422 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7423 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7424 auto_sbitmap lanes (nlanes);
7425 bitmap_clear (lanes);
7426 bitmap_set_bit (lanes, 0);
7427 tree def = gimple_assign_lhs (assign);
7428 lane_defs.quick_push
7429 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7430 unsigned lanes_found = 1;
7431 /* Start with the use chains, the last stmt will be the root. */
7432 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7433 vec<stmt_vec_info> roots = vNULL;
7434 roots.safe_push (last);
7437 use_operand_p use_p;
7438 gimple *use_stmt;
7439 if (!single_imm_use (def, &use_p, &use_stmt))
7440 break;
7441 unsigned this_lane;
7442 if (!bb_vinfo->lookup_stmt (use_stmt)
7443 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7444 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7445 break;
7446 if (bitmap_bit_p (lanes, this_lane))
7447 break;
7448 lanes_found++;
7449 bitmap_set_bit (lanes, this_lane);
7450 gassign *use_ass = as_a <gassign *> (use_stmt);
7451 lane_defs.quick_push (std::make_pair
7452 (this_lane, gimple_assign_rhs2 (use_ass)));
7453 last = bb_vinfo->lookup_stmt (use_ass);
7454 roots.safe_push (last);
7455 def = gimple_assign_lhs (use_ass);
7457 while (lanes_found < nlanes);
7458 if (roots.length () > 1)
7459 std::swap(roots[0], roots[roots.length () - 1]);
7460 if (lanes_found < nlanes)
7462 /* Now search the def chain. */
7463 def = gimple_assign_rhs1 (assign);
7466 if (TREE_CODE (def) != SSA_NAME
7467 || !has_single_use (def))
7468 break;
7469 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7470 unsigned this_lane;
7471 if (!bb_vinfo->lookup_stmt (def_stmt)
7472 || !vect_slp_is_lane_insert (def_stmt,
7473 NULL_TREE, &this_lane)
7474 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7475 break;
7476 if (bitmap_bit_p (lanes, this_lane))
7477 break;
7478 lanes_found++;
7479 bitmap_set_bit (lanes, this_lane);
7480 lane_defs.quick_push (std::make_pair
7481 (this_lane,
7482 gimple_assign_rhs2 (def_stmt)));
7483 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7484 def = gimple_assign_rhs1 (def_stmt);
7486 while (lanes_found < nlanes);
7488 if (lanes_found == nlanes)
7490 	      /* Sort lane_defs by the lane index and register the root.  */
7491 lane_defs.qsort (vld_cmp);
7492 vec<stmt_vec_info> stmts;
7493 stmts.create (nlanes);
7494 for (unsigned i = 0; i < nlanes; ++i)
7495 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7496 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7497 stmts, roots));
7499 else
7500 roots.release ();
7502 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7503 && (associative_tree_code (code) || code == MINUS_EXPR)
7504 /* ??? This pessimizes a two-element reduction. PR54400.
7505 ??? In-order reduction could be handled if we only
7506 traverse one operand chain in vect_slp_linearize_chain. */
7507 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7508 /* Ops with constants at the tail can be stripped here. */
7509 && TREE_CODE (rhs) == SSA_NAME
7510 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7511 /* Should be the chain end. */
7512 && (!single_imm_use (gimple_assign_lhs (assign),
7513 &use_p, &use_stmt)
7514 || !is_gimple_assign (use_stmt)
7515 || (gimple_assign_rhs_code (use_stmt) != code
7516 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7517 || (gimple_assign_rhs_code (use_stmt)
7518 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7520 /* We start the match at the end of a possible association
7521 chain. */
7522 auto_vec<chain_op_t> chain;
7523 auto_vec<std::pair<tree_code, gimple *> > worklist;
7524 auto_vec<gimple *> chain_stmts;
7525 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7526 if (code == MINUS_EXPR)
7527 code = PLUS_EXPR;
7528 internal_fn reduc_fn;
7529 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7530 || reduc_fn == IFN_LAST)
7531 continue;
7532 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7533 /* ??? */
7534 code_stmt, alt_code_stmt, &chain_stmts);
7535 if (chain.length () > 1)
7537 /* Sort the chain according to def_type and operation. */
7538 chain.sort (dt_sort_cmp, bb_vinfo);
7539 /* ??? Now we'd want to strip externals and constants
7540 but record those to be handled in the epilogue. */
7541 /* ??? For now do not allow mixing ops or externs/constants. */
7542 bool invalid = false;
7543 unsigned remain_cnt = 0;
7544 for (unsigned i = 0; i < chain.length (); ++i)
7546 if (chain[i].code != code)
7548 invalid = true;
7549 break;
7551 if (chain[i].dt != vect_internal_def
7552 /* Avoid stmts where the def is not the LHS, like
7553 ASMs. */
7554 || (gimple_get_lhs (bb_vinfo->lookup_def
7555 (chain[i].op)->stmt)
7556 != chain[i].op))
7557 remain_cnt++;
7559 if (!invalid && chain.length () - remain_cnt > 1)
7561 vec<stmt_vec_info> stmts;
7562 vec<tree> remain = vNULL;
7563 stmts.create (chain.length ());
7564 if (remain_cnt > 0)
7565 remain.create (remain_cnt);
7566 for (unsigned i = 0; i < chain.length (); ++i)
7568 stmt_vec_info stmt_info;
7569 if (chain[i].dt == vect_internal_def
7570 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7571 gimple_get_lhs (stmt_info->stmt) == chain[i].op))
7572 stmts.quick_push (stmt_info);
7573 else
7574 remain.quick_push (chain[i].op);
7576 vec<stmt_vec_info> roots;
7577 roots.create (chain_stmts.length ());
7578 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7579 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7580 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7581 stmts, roots, remain));
7588 /* Walk the grouped store chains and replace entries with their
7589 pattern variant if any. */
7591 static void
7592 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7594 stmt_vec_info first_element;
7595 unsigned i;
7597 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7599 /* We also have CTORs in this array. */
7600 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7601 continue;
7602 if (STMT_VINFO_IN_PATTERN_P (first_element))
7604 stmt_vec_info orig = first_element;
7605 first_element = STMT_VINFO_RELATED_STMT (first_element);
7606 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7607 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7608 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7609 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7610 vinfo->grouped_stores[i] = first_element;
7612 stmt_vec_info prev = first_element;
7613 while (DR_GROUP_NEXT_ELEMENT (prev))
7615 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7616 if (STMT_VINFO_IN_PATTERN_P (elt))
7618 stmt_vec_info orig = elt;
7619 elt = STMT_VINFO_RELATED_STMT (elt);
7620 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7621 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7622 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7624 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7625 prev = elt;
7630 /* Check if the region described by BB_VINFO can be vectorized, returning
7631 true if so. When returning false, set FATAL to true if the same failure
7632 would prevent vectorization at other vector sizes, false if it is still
7633 worth trying other sizes. N_STMTS is the number of statements in the
7634 region. */
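/* A rough outline of the checks below: analyze data references and their
   accesses, detect SLP roots (constructors, lane-insert chains and
   reduction chains), run pattern recognition, build and optimize the SLP
   trees, verify alignment and dependences per instance, analyze the
   operations and finally partition the graph for costing.  */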
7636 static bool
7637 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7638 vec<int> *dataref_groups)
7640 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7642 slp_instance instance;
7643 int i;
7644 poly_uint64 min_vf = 2;
7646 /* The first group of checks is independent of the vector size. */
7647 fatal = true;
7649 /* Analyze the data references. */
7651 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7653 if (dump_enabled_p ())
7654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7655 "not vectorized: unhandled data-ref in basic "
7656 "block.\n");
7657 return false;
7660 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7662 if (dump_enabled_p ())
7663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664 "not vectorized: unhandled data access in "
7665 "basic block.\n");
7666 return false;
7669 vect_slp_check_for_roots (bb_vinfo);
7671 /* If there are no grouped stores and no constructors in the region
7672 there is no need to continue with pattern recog as vect_analyze_slp
7673 will fail anyway. */
7674 if (bb_vinfo->grouped_stores.is_empty ()
7675 && bb_vinfo->roots.is_empty ())
7677 if (dump_enabled_p ())
7678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7679 "not vectorized: no grouped stores in "
7680 "basic block.\n");
7681 return false;
7684   /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal.  */
7685 fatal = false;
7687 vect_pattern_recog (bb_vinfo);
7689 /* Update store groups from pattern processing. */
7690 vect_fixup_store_groups_with_patterns (bb_vinfo);
7692 /* Check the SLP opportunities in the basic block, analyze and build SLP
7693 trees. */
7694 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7696 if (dump_enabled_p ())
7698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699 "Failed to SLP the basic block.\n");
7700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 "not vectorized: failed to find SLP opportunities "
7702 "in basic block.\n");
7704 return false;
7707 /* Optimize permutations. */
7708 vect_optimize_slp (bb_vinfo);
7710 /* Gather the loads reachable from the SLP graph entries. */
7711 vect_gather_slp_loads (bb_vinfo);
7713 vect_record_base_alignments (bb_vinfo);
7715 /* Analyze and verify the alignment of data references and the
7716 dependence in the SLP instances. */
7717 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7719 vect_location = instance->location ();
7720 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7721 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7723 slp_tree node = SLP_INSTANCE_TREE (instance);
7724 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7725 if (dump_enabled_p ())
7726 dump_printf_loc (MSG_NOTE, vect_location,
7727 "removing SLP instance operations starting from: %G",
7728 stmt_info->stmt);
7729 vect_free_slp_instance (instance);
7730 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7731 continue;
7734 /* Mark all the statements that we want to vectorize as pure SLP and
7735 relevant. */
7736 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7737 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7738 unsigned j;
7739 stmt_vec_info root;
7740 /* Likewise consider instance root stmts as vectorized. */
7741 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7742 STMT_SLP_TYPE (root) = pure_slp;
7744 i++;
7746 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7747 return false;
7749 if (!vect_slp_analyze_operations (bb_vinfo))
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7753 "not vectorized: bad operation in basic block.\n");
7754 return false;
7757 vect_bb_partition_graph (bb_vinfo);
7759 return true;
7762 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7763 basic blocks in BBS, returning true on success.
7764 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7766 static bool
7767 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7768 vec<int> *dataref_groups, unsigned int n_stmts,
7769 loop_p orig_loop)
7771 bb_vec_info bb_vinfo;
7772 auto_vector_modes vector_modes;
7774 /* Autodetect first vector size we try. */
7775 machine_mode next_vector_mode = VOIDmode;
7776 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7777 unsigned int mode_i = 0;
7779 vec_info_shared shared;
7781 machine_mode autodetected_vector_mode = VOIDmode;
7782 while (1)
7784 bool vectorized = false;
7785 bool fatal = false;
7786 bb_vinfo = new _bb_vec_info (bbs, &shared);
7788 bool first_time_p = shared.datarefs.is_empty ();
7789 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7790 if (first_time_p)
7791 bb_vinfo->shared->save_datarefs ();
7792 else
7793 bb_vinfo->shared->check_datarefs ();
7794 bb_vinfo->vector_mode = next_vector_mode;
7796 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7798 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_NOTE, vect_location,
7801 "***** Analysis succeeded with vector mode"
7802 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7803 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7806 bb_vinfo->shared->check_datarefs ();
7808 bool force_clear = false;
7809 auto_vec<slp_instance> profitable_subgraphs;
7810 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7812 if (instance->subgraph_entries.is_empty ())
7813 continue;
7815 dump_user_location_t saved_vect_location = vect_location;
7816 vect_location = instance->location ();
7817 if (!unlimited_cost_model (NULL)
7818 && !vect_bb_vectorization_profitable_p
7819 (bb_vinfo, instance->subgraph_entries, orig_loop))
7821 if (dump_enabled_p ())
7822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7823 "not vectorized: vectorization is not "
7824 "profitable.\n");
7825 vect_location = saved_vect_location;
7826 continue;
7829 vect_location = saved_vect_location;
7830 if (!dbg_cnt (vect_slp))
7832 force_clear = true;
7833 continue;
7836 profitable_subgraphs.safe_push (instance);
7839 /* When we're vectorizing an if-converted loop body make sure
7840 we vectorized all if-converted code. */
7841 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7843 gcc_assert (bb_vinfo->bbs.length () == 1);
7844 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7845 !gsi_end_p (gsi); gsi_next (&gsi))
7847 /* The costing above left us with DCEable vectorized scalar
7848 stmts having the visited flag set on profitable
7849 subgraphs. Do the delayed clearing of the flag here. */
7850 if (gimple_visited_p (gsi_stmt (gsi)))
7852 gimple_set_visited (gsi_stmt (gsi), false);
7853 continue;
7855 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7856 continue;
7858 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7859 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7861 if (!profitable_subgraphs.is_empty ()
7862 && dump_enabled_p ())
7863 dump_printf_loc (MSG_NOTE, vect_location,
7864 "not profitable because of "
7865 "unprofitable if-converted scalar "
7866 "code\n");
7867 profitable_subgraphs.truncate (0);
7872 /* Finally schedule the profitable subgraphs. */
7873 for (slp_instance instance : profitable_subgraphs)
7875 if (!vectorized && dump_enabled_p ())
7876 dump_printf_loc (MSG_NOTE, vect_location,
7877 "Basic block will be vectorized "
7878 "using SLP\n");
7879 vectorized = true;
7881 /* Dump before scheduling as store vectorization will remove
7882 the original stores and mess with the instance tree
7883 so querying its location will eventually ICE. */
7884 if (flag_checking)
7885 for (slp_instance sub : instance->subgraph_entries)
7886 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7887 unsigned HOST_WIDE_INT bytes;
7888 if (dump_enabled_p ())
7889 for (slp_instance sub : instance->subgraph_entries)
7891 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7892 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7893 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7894 sub->location (),
7895 "basic block part vectorized using %wu "
7896 "byte vectors\n", bytes);
7897 else
7898 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7899 sub->location (),
7900 "basic block part vectorized using "
7901 "variable length vectors\n");
7904 dump_user_location_t saved_vect_location = vect_location;
7905 vect_location = instance->location ();
7907 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7909 vect_location = saved_vect_location;
7912 else
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_NOTE, vect_location,
7916 "***** Analysis failed with vector mode %s\n",
7917 GET_MODE_NAME (bb_vinfo->vector_mode));
7920 if (mode_i == 0)
7921 autodetected_vector_mode = bb_vinfo->vector_mode;
7923 if (!fatal)
7924 while (mode_i < vector_modes.length ()
7925 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7927 if (dump_enabled_p ())
7928 dump_printf_loc (MSG_NOTE, vect_location,
7929 "***** The result for vector mode %s would"
7930 " be the same\n",
7931 GET_MODE_NAME (vector_modes[mode_i]));
7932 mode_i += 1;
7935 delete bb_vinfo;
7937 if (mode_i < vector_modes.length ()
7938 && VECTOR_MODE_P (autodetected_vector_mode)
7939 && (related_vector_mode (vector_modes[mode_i],
7940 GET_MODE_INNER (autodetected_vector_mode))
7941 == autodetected_vector_mode)
7942 && (related_vector_mode (autodetected_vector_mode,
7943 GET_MODE_INNER (vector_modes[mode_i]))
7944 == vector_modes[mode_i]))
7946 if (dump_enabled_p ())
7947 dump_printf_loc (MSG_NOTE, vect_location,
7948 "***** Skipping vector mode %s, which would"
7949 " repeat the analysis for %s\n",
7950 GET_MODE_NAME (vector_modes[mode_i]),
7951 GET_MODE_NAME (autodetected_vector_mode));
7952 mode_i += 1;
7955 if (vectorized
7956 || mode_i == vector_modes.length ()
7957 || autodetected_vector_mode == VOIDmode
7958 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7959 vector sizes will fail do not bother iterating. */
7960 || fatal)
7961 return vectorized;
7963 /* Try the next biggest vector size. */
7964 next_vector_mode = vector_modes[mode_i++];
7965 if (dump_enabled_p ())
7966 dump_printf_loc (MSG_NOTE, vect_location,
7967 "***** Re-trying analysis with vector mode %s\n",
7968 GET_MODE_NAME (next_vector_mode));
7973 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7974 true if anything in the basic-block was vectorized. */
7976 static bool
7977 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7979 vec<data_reference_p> datarefs = vNULL;
7980 auto_vec<int> dataref_groups;
7981 int insns = 0;
7982 int current_group = 0;
7984 for (unsigned i = 0; i < bbs.length (); i++)
7986 basic_block bb = bbs[i];
7987 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7988 gsi_next (&gsi))
7990 gimple *stmt = gsi_stmt (gsi);
7991 if (is_gimple_debug (stmt))
7992 continue;
7994 insns++;
7996 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7997 vect_location = stmt;
7999 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8000 &dataref_groups, current_group))
8001 ++current_group;
8003 /* New BBs always start a new DR group. */
8004 ++current_group;
8007 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8010 /* Special entry for the BB vectorizer. Analyze and transform a single
8011    if-converted BB with ORIG_LOOP's body being the not-if-converted
8012 representation. Returns true if anything in the basic-block was
8013 vectorized. */
8015 bool
8016 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8018 auto_vec<basic_block> bbs;
8019 bbs.safe_push (bb);
8020 return vect_slp_bbs (bbs, orig_loop);
8023 /* Main entry for the BB vectorizer.  Analyze and transform the whole
8024    function FUN, returning true if anything was vectorized.  */
8026 bool
8027 vect_slp_function (function *fun)
8029 bool r = false;
8030 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8031 auto_bitmap exit_bbs;
8032 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8033 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8034 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8035 true, rpo, NULL);
8037 /* For the moment split the function into pieces to avoid making
8038 the iteration on the vector mode moot. Split at points we know
8039 to not handle well which is CFG merges (SLP discovery doesn't
8040 handle non-loop-header PHIs) and loop exits. Since pattern
8041 recog requires reverse iteration to visit uses before defs
8042 simply chop RPO into pieces. */
8043 auto_vec<basic_block> bbs;
8044 for (unsigned i = 0; i < n; i++)
8046 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8047 bool split = false;
8049 /* Split when a BB is not dominated by the first block. */
8050 if (!bbs.is_empty ()
8051 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8053 if (dump_enabled_p ())
8054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8055 "splitting region at dominance boundary bb%d\n",
8056 bb->index);
8057 split = true;
8059 /* Split when the loop determined by the first block
8060 is exited. This is because we eventually insert
8061 invariants at region begin. */
8062 else if (!bbs.is_empty ()
8063 && bbs[0]->loop_father != bb->loop_father
8064 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8066 if (dump_enabled_p ())
8067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8068 "splitting region at loop %d exit at bb%d\n",
8069 bbs[0]->loop_father->num, bb->index);
8070 split = true;
8072 else if (!bbs.is_empty ()
8073 && bb->loop_father->header == bb
8074 && bb->loop_father->dont_vectorize)
8076 if (dump_enabled_p ())
8077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8078 "splitting region at dont-vectorize loop %d "
8079 "entry at bb%d\n",
8080 bb->loop_father->num, bb->index);
8081 split = true;
8084 if (split && !bbs.is_empty ())
8086 r |= vect_slp_bbs (bbs, NULL);
8087 bbs.truncate (0);
8090 if (bbs.is_empty ())
8092 /* We need to be able to insert at the head of the region which
8093	     we cannot do for a region starting with a returns-twice call.  */
8094 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8095 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8097 if (dump_enabled_p ())
8098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8099 "skipping bb%d as start of region as it "
8100 "starts with returns-twice call\n",
8101 bb->index);
8102 continue;
8104 /* If the loop this BB belongs to is marked as not to be vectorized
8105 honor that also for BB vectorization. */
8106 if (bb->loop_father->dont_vectorize)
8107 continue;
8110 bbs.safe_push (bb);
8112 	  /* When a stmt ends this block and defines a value, inserting after
8113	     it for a vector containing its definition would require inserting
8114	     on edges.  Avoid this for now.  */
8115 if (gimple *last = *gsi_last_bb (bb))
8116 if (gimple_get_lhs (last)
8117 && is_ctrl_altering_stmt (last))
8119 if (dump_enabled_p ())
8120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8121 "splitting region at control altering "
8122 "definition %G", last);
8123 r |= vect_slp_bbs (bbs, NULL);
8124 bbs.truncate (0);
8128 if (!bbs.is_empty ())
8129 r |= vect_slp_bbs (bbs, NULL);
8131 free (rpo);
8133 return r;
8136 /* Build a variable-length vector in which the elements in ELTS are repeated
8137    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8138 RESULTS and add any new instructions to SEQ.
8140 The approach we use is:
8142 (1) Find a vector mode VM with integer elements of mode IM.
8144 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8145 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8146 from small vectors to IM.
8148 (3) Duplicate each ELTS'[I] into a vector of mode VM.
8150 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8151 correct byte contents.
8153 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8155 We try to find the largest IM for which this sequence works, in order
8156 to cut down on the number of interleaves. */
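/* As an illustration (a sketch, ignoring endianness details and not tied
   to a particular target): with ELTS = { a, b, c, d } of a 32-bit element
   type and IM = DImode we get NELTS' == 2, so ELTS' = { ab, cd } where ab
   and cd are DImode views of { a, b } and { c, d }.  Step (3) splats each
   of them into a VM vector and a single interleave in step (4) produces
   the repeating pattern a b c d a b c d ... which step (5) converts back
   to VECTOR_TYPE.  */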
8158 void
8159 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8160 const vec<tree> &elts, unsigned int nresults,
8161 vec<tree> &results)
8163 unsigned int nelts = elts.length ();
8164 tree element_type = TREE_TYPE (vector_type);
8166 /* (1) Find a vector mode VM with integer elements of mode IM. */
8167 unsigned int nvectors = 1;
8168 tree new_vector_type;
8169 tree permutes[2];
8170 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8171 &nvectors, &new_vector_type,
8172 permutes))
8173 gcc_unreachable ();
8175 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8176 unsigned int partial_nelts = nelts / nvectors;
8177 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8179 tree_vector_builder partial_elts;
8180 auto_vec<tree, 32> pieces (nvectors * 2);
8181 pieces.quick_grow_cleared (nvectors * 2);
8182 for (unsigned int i = 0; i < nvectors; ++i)
8184 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8185 ELTS' has mode IM. */
8186 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8187 for (unsigned int j = 0; j < partial_nelts; ++j)
8188 partial_elts.quick_push (elts[i * partial_nelts + j]);
8189 tree t = gimple_build_vector (seq, &partial_elts);
8190 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8191 TREE_TYPE (new_vector_type), t);
8193 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8194 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8197 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8198 correct byte contents.
8200 Conceptually, we need to repeat the following operation log2(nvectors)
8201 times, where hi_start = nvectors / 2:
8203 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8204 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8206 However, if each input repeats every N elements and the VF is
8207 a multiple of N * 2, the HI result is the same as the LO result.
8208 This will be true for the first N1 iterations of the outer loop,
8209 followed by N2 iterations for which both the LO and HI results
8210 are needed. I.e.:
8212 N1 + N2 = log2(nvectors)
8214 Each "N1 iteration" doubles the number of redundant vectors and the
8215 effect of the process as a whole is to have a sequence of nvectors/2**N1
8216 vectors that repeats 2**N1 times. Rather than generate these redundant
8217 vectors, we halve the number of vectors for each N1 iteration. */
8218 unsigned int in_start = 0;
8219 unsigned int out_start = nvectors;
8220 unsigned int new_nvectors = nvectors;
8221 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8223 unsigned int hi_start = new_nvectors / 2;
8224 unsigned int out_i = 0;
8225 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8227 if ((in_i & 1) != 0
8228 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8229 2 * in_repeat))
8230 continue;
8232 tree output = make_ssa_name (new_vector_type);
8233 tree input1 = pieces[in_start + (in_i / 2)];
8234 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8235 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8236 input1, input2,
8237 permutes[in_i & 1]);
8238 gimple_seq_add_stmt (seq, stmt);
8239 pieces[out_start + out_i] = output;
8240 out_i += 1;
8242 std::swap (in_start, out_start);
8243 new_nvectors = out_i;
8246 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8247 results.reserve (nresults);
8248 for (unsigned int i = 0; i < nresults; ++i)
8249 if (i < new_nvectors)
8250 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8251 pieces[in_start + i]));
8252 else
8253 results.quick_push (results[i - new_nvectors]);
8257 /* For constant and loop invariant defs in OP_NODE this function creates
8258 vector defs that will be used in the vectorized stmts and stores them
8259 to SLP_TREE_VEC_DEFS of OP_NODE. */
8261 static void
8262 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8264 unsigned HOST_WIDE_INT nunits;
8265 tree vec_cst;
8266 unsigned j, number_of_places_left_in_vector;
8267 tree vector_type;
8268 tree vop;
8269 int group_size = op_node->ops.length ();
8270 unsigned int vec_num, i;
8271 unsigned number_of_copies = 1;
8272 bool constant_p;
8273 gimple_seq ctor_seq = NULL;
8274 auto_vec<tree, 16> permute_results;
8276 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8277 vector_type = SLP_TREE_VECTYPE (op_node);
8279 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8280 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8281 auto_vec<tree> voprnds (number_of_vectors);
8283 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8284 created vectors. It is greater than 1 if unrolling is performed.
8286 For example, we have two scalar operands, s1 and s2 (e.g., group of
8287 strided accesses of size two), while NUNITS is four (i.e., four scalars
8288 of this type can be packed in a vector). The output vector will contain
8289 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8290 will be 2).
8292 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8293 containing the operands.
8295 For example, NUNITS is four as before, and the group size is 8
8296 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8297 {s5, s6, s7, s8}. */
8299 /* When using duplicate_and_interleave, we just need one element for
8300 each scalar statement. */
8301 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8302 nunits = group_size;
8304 number_of_copies = nunits * number_of_vectors / group_size;
8306 number_of_places_left_in_vector = nunits;
8307 constant_p = true;
8308 tree uniform_elt = NULL_TREE;
8309 tree_vector_builder elts (vector_type, nunits, 1);
8310 elts.quick_grow (nunits);
8311 stmt_vec_info insert_after = NULL;
8312 for (j = 0; j < number_of_copies; j++)
8314 tree op;
8315 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8317 /* Create 'vect_ = {op0,op1,...,opn}'. */
8318 tree orig_op = op;
8319 if (number_of_places_left_in_vector == nunits)
8320 uniform_elt = op;
8321 else if (uniform_elt && operand_equal_p (uniform_elt, op))
8322 op = elts[number_of_places_left_in_vector];
8323 else
8324 uniform_elt = NULL_TREE;
8325 number_of_places_left_in_vector--;
8326 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8328 if (CONSTANT_CLASS_P (op))
8330 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8332 /* Can't use VIEW_CONVERT_EXPR for booleans because
8333 of possibly different sizes of scalar value and
8334 vector element. */
8335 if (integer_zerop (op))
8336 op = build_int_cst (TREE_TYPE (vector_type), 0);
8337 else if (integer_onep (op))
8338 op = build_all_ones_cst (TREE_TYPE (vector_type));
8339 else
8340 gcc_unreachable ();
8342 else
8343 op = fold_unary (VIEW_CONVERT_EXPR,
8344 TREE_TYPE (vector_type), op);
8345 gcc_assert (op && CONSTANT_CLASS_P (op));
8347 else
8349 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8350 gimple *init_stmt;
8351 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8353 tree true_val
8354 = build_all_ones_cst (TREE_TYPE (vector_type));
8355 tree false_val
8356 = build_zero_cst (TREE_TYPE (vector_type));
8357 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8358 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8359 op, true_val,
8360 false_val);
8362 else
8364 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8365 op);
8366 init_stmt
8367 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8368 op);
8370 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8371 op = new_temp;
8374 elts[number_of_places_left_in_vector] = op;
8375 if (!CONSTANT_CLASS_P (op))
8376 constant_p = false;
8377 /* For BB vectorization we have to compute an insert location
8378 when a def is inside the analyzed region since we cannot
8379 simply insert at the BB start in this case. */
8380 stmt_vec_info opdef;
8381 if (TREE_CODE (orig_op) == SSA_NAME
8382 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8383 && is_a <bb_vec_info> (vinfo)
8384 && (opdef = vinfo->lookup_def (orig_op)))
8386 if (!insert_after)
8387 insert_after = opdef;
8388 else
8389 insert_after = get_later_stmt (insert_after, opdef);
8392 if (number_of_places_left_in_vector == 0)
8394 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8395 if (uniform_elt)
8396 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8397 elts[0]);
8398 else if (constant_p
8399 ? multiple_p (type_nunits, nunits)
8400 : known_eq (type_nunits, nunits))
8401 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8402 else
8404 if (permute_results.is_empty ())
8405 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8406 elts, number_of_vectors,
8407 permute_results);
8408 vec_cst = permute_results[number_of_vectors - j - 1];
8410 if (!gimple_seq_empty_p (ctor_seq))
8412 if (insert_after)
8414 gimple_stmt_iterator gsi;
8415 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8417 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8418 gsi_insert_seq_before (&gsi, ctor_seq,
8419 GSI_CONTINUE_LINKING);
8421 else if (!stmt_ends_bb_p (insert_after->stmt))
8423 gsi = gsi_for_stmt (insert_after->stmt);
8424 gsi_insert_seq_after (&gsi, ctor_seq,
8425 GSI_CONTINUE_LINKING);
8427 else
8429 /* When we want to insert after a def where the
8430 defining stmt throws then insert on the fallthru
8431 edge. */
8432 edge e = find_fallthru_edge
8433 (gimple_bb (insert_after->stmt)->succs);
8434 basic_block new_bb
8435 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8436 gcc_assert (!new_bb);
8439 else
8440 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8441 ctor_seq = NULL;
8443 voprnds.quick_push (vec_cst);
8444 insert_after = NULL;
8445 number_of_places_left_in_vector = nunits;
8446 constant_p = true;
8447 elts.new_vector (vector_type, nunits, 1);
8448 elts.quick_grow (nunits);
8453   /* The vectors were created in reverse order, so push them in reverse
8454      to restore the required order.  */
8455 vec_num = voprnds.length ();
8456 for (j = vec_num; j != 0; j--)
8458 vop = voprnds[j - 1];
8459 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8462   /* If VF is greater than the unrolling factor needed for the SLP
8463 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8464 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8465 to replicate the vectors. */
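  /* For example (a sketch): with the two distinct defs V1 = {s1,s2,s3,s4}
     and V2 = {s5,s6,s7,s8} but NUMBER_OF_VECTORS == 4 the final defs
     become { V1, V2, V1, V2 }.  */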
8466 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8467 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8468 i++)
8469 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8472 /* Get the Ith vectorized definition from SLP_NODE. */
8474 tree
8475 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8477 return SLP_TREE_VEC_DEFS (slp_node)[i];
8480 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8482 void
8483 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8485 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8486 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8489 /* Get N vectorized definitions for SLP_NODE. */
8491 void
8492 vect_get_slp_defs (vec_info *,
8493 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8495 if (n == -1U)
8496 n = SLP_TREE_CHILDREN (slp_node).length ();
8498 for (unsigned i = 0; i < n; ++i)
8500 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8501 vec<tree> vec_defs = vNULL;
8502 vect_get_slp_defs (child, &vec_defs);
8503 vec_oprnds->quick_push (vec_defs);
8507 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8508 - PERM gives the permutation that the caller wants to use for NODE,
8509 which might be different from SLP_LOAD_PERMUTATION.
8510 - DUMP_P controls whether the function dumps information. */
8512 static bool
8513 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8514 load_permutation_t &perm,
8515 const vec<tree> &dr_chain,
8516 gimple_stmt_iterator *gsi, poly_uint64 vf,
8517 bool analyze_only, bool dump_p,
8518 unsigned *n_perms, unsigned int *n_loads,
8519 bool dce_chain)
8521 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8522 int vec_index = 0;
8523 tree vectype = SLP_TREE_VECTYPE (node);
8524 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8525 unsigned int mask_element;
8526 unsigned dr_group_size;
8527 machine_mode mode;
8529 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8530 dr_group_size = 1;
8531 else
8533 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8534 dr_group_size = DR_GROUP_SIZE (stmt_info);
8537 mode = TYPE_MODE (vectype);
8538 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8539 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8541 /* Initialize the vect stmts of NODE to properly insert the generated
8542 stmts later. */
8543 if (! analyze_only)
8544 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8545 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8547 /* Generate permutation masks for every NODE. Number of masks for each NODE
8548 is equal to GROUP_SIZE.
8549 E.g., we have a group of three nodes with three loads from the same
8550      location in each node, and the vector size is 4.  I.e., we have an
8551 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8552 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8553 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8556 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8557 The last mask is illegal since we assume two operands for permute
8558 operation, and the mask element values can't be outside that range.
8559 Hence, the last mask must be converted into {2,5,5,5}.
8560 For the first two permutations we need the first and the second input
8561 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8562 we need the second and the third vectors: {b1,c1,a2,b2} and
8563 {c2,a3,b3,c3}. */
8565 int vect_stmts_counter = 0;
8566 unsigned int index = 0;
8567 int first_vec_index = -1;
8568 int second_vec_index = -1;
8569 bool noop_p = true;
8570 *n_perms = 0;
8572 vec_perm_builder mask;
8573 unsigned int nelts_to_build;
8574 unsigned int nvectors_per_build;
8575 unsigned int in_nlanes;
8576 bool repeating_p = (group_size == dr_group_size
8577 && multiple_p (nunits, group_size));
8578 if (repeating_p)
8580 /* A single vector contains a whole number of copies of the node, so:
8581 (a) all permutes can use the same mask; and
8582 (b) the permutes only need a single vector input. */
8583 mask.new_vector (nunits, group_size, 3);
8584 nelts_to_build = mask.encoded_nelts ();
8585 /* It's possible to obtain zero nstmts during analyze_only, so make
8586 it at least one to ensure the later computation for n_perms
8587      proceeds.  */
8588 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8589 in_nlanes = dr_group_size * 3;
8591 else
8593 /* We need to construct a separate mask for each vector statement. */
8594 unsigned HOST_WIDE_INT const_nunits, const_vf;
8595 if (!nunits.is_constant (&const_nunits)
8596 || !vf.is_constant (&const_vf))
8597 return false;
8598 mask.new_vector (const_nunits, const_nunits, 1);
8599 nelts_to_build = const_vf * group_size;
8600 nvectors_per_build = 1;
8601 in_nlanes = const_vf * dr_group_size;
8603 auto_sbitmap used_in_lanes (in_nlanes);
8604 bitmap_clear (used_in_lanes);
8605 auto_bitmap used_defs;
8607 unsigned int count = mask.encoded_nelts ();
8608 mask.quick_grow (count);
8609 vec_perm_indices indices;
8611 for (unsigned int j = 0; j < nelts_to_build; j++)
8613 unsigned int iter_num = j / group_size;
8614 unsigned int stmt_num = j % group_size;
8615 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8616 bitmap_set_bit (used_in_lanes, i);
8617 if (repeating_p)
8619 first_vec_index = 0;
8620 mask_element = i;
8622 else
8624 /* Enforced before the loop when !repeating_p. */
8625 unsigned int const_nunits = nunits.to_constant ();
8626 vec_index = i / const_nunits;
8627 mask_element = i % const_nunits;
8628 if (vec_index == first_vec_index
8629 || first_vec_index == -1)
8631 first_vec_index = vec_index;
8633 else if (vec_index == second_vec_index
8634 || second_vec_index == -1)
8636 second_vec_index = vec_index;
8637 mask_element += const_nunits;
8639 else
8641 if (dump_p)
8642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8643 "permutation requires at "
8644 "least three vectors %G",
8645 stmt_info->stmt);
8646 gcc_assert (analyze_only);
8647 return false;
8650 gcc_assert (mask_element < 2 * const_nunits);
8653 if (mask_element != index)
8654 noop_p = false;
8655 mask[index++] = mask_element;
8657 if (index == count)
8659 if (!noop_p)
8661 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8662 if (!can_vec_perm_const_p (mode, mode, indices))
8664 if (dump_p)
8666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8667 "unsupported vect permute { ");
8668 for (i = 0; i < count; ++i)
8670 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8671 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8673 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8675 gcc_assert (analyze_only);
8676 return false;
8679 tree mask_vec = NULL_TREE;
8680 if (!analyze_only)
8681 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8683 if (second_vec_index == -1)
8684 second_vec_index = first_vec_index;
8686 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8688 ++*n_perms;
8689 if (analyze_only)
8690 continue;
8691 /* Generate the permute statement if necessary. */
8692 tree first_vec = dr_chain[first_vec_index + ri];
8693 tree second_vec = dr_chain[second_vec_index + ri];
8694 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8695 tree perm_dest
8696 = vect_create_destination_var (gimple_assign_lhs (stmt),
8697 vectype);
8698 perm_dest = make_ssa_name (perm_dest);
8699 gimple *perm_stmt
8700 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8701 second_vec, mask_vec);
8702 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8703 gsi);
8704 if (dce_chain)
8706 bitmap_set_bit (used_defs, first_vec_index + ri);
8707 bitmap_set_bit (used_defs, second_vec_index + ri);
8710 /* Store the vector statement in NODE. */
8711 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8714 else if (!analyze_only)
8716 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8718 tree first_vec = dr_chain[first_vec_index + ri];
8719 /* If mask was NULL_TREE generate the requested
8720 identity transform. */
8721 if (dce_chain)
8722 bitmap_set_bit (used_defs, first_vec_index + ri);
8724 /* Store the vector statement in NODE. */
8725 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8729 index = 0;
8730 first_vec_index = -1;
8731 second_vec_index = -1;
8732 noop_p = true;
8736 if (n_loads)
8738 if (repeating_p)
8739 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8740 else
8742 /* Enforced above when !repeating_p. */
8743 unsigned int const_nunits = nunits.to_constant ();
8744 *n_loads = 0;
8745 bool load_seen = false;
8746 for (unsigned i = 0; i < in_nlanes; ++i)
8748 if (i % const_nunits == 0)
8750 if (load_seen)
8751 *n_loads += 1;
8752 load_seen = false;
8754 if (bitmap_bit_p (used_in_lanes, i))
8755 load_seen = true;
8757 if (load_seen)
8758 *n_loads += 1;
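/* Remove defs in DR_CHAIN that no generated permute uses, walking up
   through single-use VIEW_CONVERT_EXPR / CONSTRUCTOR feeding statements
   so those are elided as well.  */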
8762 if (dce_chain)
8763 for (unsigned i = 0; i < dr_chain.length (); ++i)
8764 if (!bitmap_bit_p (used_defs, i))
8766 tree def = dr_chain[i];
8769 gimple *stmt = SSA_NAME_DEF_STMT (def);
8770 if (is_gimple_assign (stmt)
8771 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8772 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8773 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8774 else
8775 def = NULL;
8776 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8777 gsi_remove (&rgsi, true);
8778 release_defs (stmt);
8780 while (def);
8783 return true;
8786 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8787 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8788 permute statements for the SLP node NODE. Store the number of vector
8789 permute instructions in *N_PERMS and the number of vector load
8790 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8791 that were not needed. */
8793 bool
8794 vect_transform_slp_perm_load (vec_info *vinfo,
8795 slp_tree node, const vec<tree> &dr_chain,
8796 gimple_stmt_iterator *gsi, poly_uint64 vf,
8797 bool analyze_only, unsigned *n_perms,
8798 unsigned int *n_loads, bool dce_chain)
8800 return vect_transform_slp_perm_load_1 (vinfo, node,
8801 SLP_TREE_LOAD_PERMUTATION (node),
8802 dr_chain, gsi, vf, analyze_only,
8803 dump_enabled_p (), n_perms, n_loads,
8804 dce_chain);
8807 /* Produce the next vector result for SLP permutation NODE by adding a vector
8808 statement at GSI. If MASK_VEC is nonnull, add:
8810 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8812 otherwise add:
8814 <new SSA name> = FIRST_DEF. */
8816 static void
8817 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8818 slp_tree node, tree first_def, tree second_def,
8819 tree mask_vec, poly_uint64 identity_offset)
8821 tree vectype = SLP_TREE_VECTYPE (node);
8823 /* ??? We SLP match existing vector element extracts but
8824 allow punning, which we need to re-instantiate at uses
8825 since we have no good way of representing it explicitly. */
8826 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8827 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8829 gassign *conv_stmt
8830 = gimple_build_assign (make_ssa_name (vectype),
8831 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8832 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8833 first_def = gimple_assign_lhs (conv_stmt);
8835 gassign *perm_stmt;
8836 tree perm_dest = make_ssa_name (vectype);
8837 if (mask_vec)
8839 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8840 TYPE_SIZE (vectype))
8841 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8843 gassign *conv_stmt
8844 = gimple_build_assign (make_ssa_name (vectype),
8845 build1 (VIEW_CONVERT_EXPR,
8846 vectype, second_def));
8847 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8848 second_def = gimple_assign_lhs (conv_stmt);
8850 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8851 first_def, second_def,
8852 mask_vec);
8854 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8856 /* For identity permutes we still need to handle the case
8857 of offsetted extracts or concats. */
8858 unsigned HOST_WIDE_INT c;
8859 auto first_def_nunits
8860 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8861 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8863 unsigned HOST_WIDE_INT elsz
8864 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8865 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8866 TYPE_SIZE (vectype),
8867 bitsize_int (identity_offset * elsz));
8868 perm_stmt = gimple_build_assign (perm_dest, lowpart);
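/* Otherwise, when the result has exactly twice as many elements as the
   input, concatenate the two input vectors with a CONSTRUCTOR.  */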
8870 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8871 first_def_nunits, &c) && c == 2)
8873 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8874 NULL_TREE, second_def);
8875 perm_stmt = gimple_build_assign (perm_dest, ctor);
8877 else
8878 gcc_unreachable ();
8880 else
8882 /* We need a copy here in case the def was external. */
8883 perm_stmt = gimple_build_assign (perm_dest, first_def);
8885 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8886 /* Store the vector statement in NODE. */
8887 node->push_vec_def (perm_stmt);
8890 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8891 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8892 If GSI is nonnull, emit the permutation there.
8894 When GSI is null, the only purpose of NODE is to give properties
8895 of the result, such as the vector type and number of SLP lanes.
8896 The node does not need to be a VEC_PERM_EXPR.
8898 If the target supports the operation, return the number of individual
8899 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8900 dump file if DUMP_P is true. */
8902 static int
8903 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8904 slp_tree node, lane_permutation_t &perm,
8905 vec<slp_tree> &children, bool dump_p)
8907 tree vectype = SLP_TREE_VECTYPE (node);
8909 /* ??? We currently only support all inputs having the same vector type
8910 while the SLP IL should really do a concat + select and thus accept
8911 arbitrary mismatches. */
8912 slp_tree child;
8913 unsigned i;
8914 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8915 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8916 tree op_vectype = NULL_TREE;
8917 FOR_EACH_VEC_ELT (children, i, child)
8918 if (SLP_TREE_VECTYPE (child))
8920 op_vectype = SLP_TREE_VECTYPE (child);
8921 break;
8923 if (!op_vectype)
8924 op_vectype = vectype;
8925 FOR_EACH_VEC_ELT (children, i, child)
8927 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8928 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8929 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8930 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8932 if (dump_p)
8933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8934 "Unsupported vector types in lane permutation\n");
8935 return -1;
8937 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8938 repeating_p = false;
8941 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8942 if (dump_p)
8944 dump_printf_loc (MSG_NOTE, vect_location,
8945 "vectorizing permutation");
8946 for (unsigned i = 0; i < perm.length (); ++i)
8947 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8948 if (repeating_p)
8949 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8950 dump_printf (MSG_NOTE, "\n");
8953 /* REPEATING_P is true if every output vector is guaranteed to use the
8954 same permute vector. We can handle that case for both variable-length
8955 and constant-length vectors, but we only handle other cases for
8956 constant-length vectors.
8958 Set:
8960 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8961 mask vector that we want to build.
8963 - NCOPIES to the number of copies of PERM that we need in order
8964 to build the necessary permute mask vectors.
8966 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8967 for each permute mask vector. This is only relevant when GSI is
8968 nonnull. */
8969 uint64_t npatterns;
8970 unsigned nelts_per_pattern;
8971 uint64_t ncopies;
8972 unsigned noutputs_per_mask;
8973 if (repeating_p)
8975 /* We need a single permute mask vector that has the form:
8977 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8979 In other words, the original n-element permute in PERM is
8980 "unrolled" to fill a full vector. The stepped vector encoding
8981 that we use for permutes requires 3n elements. */
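/* For example, a single-input two-lane swap op0[1] op0[0] is encoded
   as the mask { 1, 0, 3, 2, 5, 4 }.  */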
8982 npatterns = SLP_TREE_LANES (node);
8983 nelts_per_pattern = ncopies = 3;
8984 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8986 else
8988 /* Calculate every element of every permute mask vector explicitly,
8989 instead of relying on the pattern described above. */
8990 if (!nunits.is_constant (&npatterns)
8991 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
8992 return -1;
8993 nelts_per_pattern = ncopies = 1;
8994 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8995 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8996 return -1;
8997 noutputs_per_mask = 1;
8999 unsigned olanes = ncopies * SLP_TREE_LANES (node);
9000 gcc_assert (repeating_p || multiple_p (olanes, nunits));
9002 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
9003 from the { SLP operand, scalar lane } permutation as recorded in the
9004 SLP node as an intermediate step. This part should already work
9005 with SLP children with an arbitrary number of lanes. */
9006 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9007 auto_vec<unsigned> active_lane;
9008 vperm.create (olanes);
9009 active_lane.safe_grow_cleared (children.length (), true);
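/* ACTIVE_LANE[j] tracks the first scalar lane of child J consumed by the
   current copy of PERM.  */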
9010 for (unsigned i = 0; i < ncopies; ++i)
9012 for (unsigned pi = 0; pi < perm.length (); ++pi)
9014 std::pair<unsigned, unsigned> p = perm[pi];
9015 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9016 if (repeating_p)
9017 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9018 else
9020 /* We checked above that the vectors are constant-length. */
9021 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9022 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9023 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9024 vperm.quick_push ({{p.first, vi}, vl});
9027 /* Advance to the next group. */
9028 for (unsigned j = 0; j < children.length (); ++j)
9029 active_lane[j] += SLP_TREE_LANES (children[j]);
9032 if (dump_p)
9034 dump_printf_loc (MSG_NOTE, vect_location,
9035 "vectorizing permutation");
9036 for (unsigned i = 0; i < perm.length (); ++i)
9037 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9038 if (repeating_p)
9039 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9040 dump_printf (MSG_NOTE, "\n");
9041 dump_printf_loc (MSG_NOTE, vect_location, "as");
9042 for (unsigned i = 0; i < vperm.length (); ++i)
9044 if (i != 0
9045 && (repeating_p
9046 ? multiple_p (i, npatterns)
9047 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9048 dump_printf (MSG_NOTE, ",");
9049 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9050 vperm[i].first.first, vperm[i].first.second,
9051 vperm[i].second);
9053 dump_printf (MSG_NOTE, "\n");
9056 /* We can only handle two-vector permutes; everything else should
9057 be lowered on the SLP level. The following is closely inspired
9058 by vect_transform_slp_perm_load and is supposed to eventually
9059 replace it.
9060 ??? As an intermediate step, do code-gen in the SLP tree representation
9061 somehow? */
9062 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9063 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9064 unsigned int index = 0;
9065 poly_uint64 mask_element;
9066 vec_perm_builder mask;
9067 mask.new_vector (nunits, npatterns, nelts_per_pattern);
9068 unsigned int count = mask.encoded_nelts ();
9069 mask.quick_grow (count);
9070 vec_perm_indices indices;
9071 unsigned nperms = 0;
9072 for (unsigned i = 0; i < vperm.length (); ++i)
9074 mask_element = vperm[i].second;
9075 if (first_vec.first == -1U
9076 || first_vec == vperm[i].first)
9077 first_vec = vperm[i].first;
9078 else if (second_vec.first == -1U
9079 || second_vec == vperm[i].first)
9081 second_vec = vperm[i].first;
9082 mask_element += nunits;
9084 else
9086 if (dump_p)
9087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9088 "permutation requires at "
9089 "least three vectors\n");
9090 gcc_assert (!gsi);
9091 return -1;
9094 mask[index++] = mask_element;
9096 if (index == count)
9098 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9099 TYPE_VECTOR_SUBPARTS (op_vectype));
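/* The permute is a (possibly offsetted) identity when it selects
   consecutive lanes starting at a multiple of the vector length.  */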
9100 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9101 && constant_multiple_p (mask[0], nunits));
9102 machine_mode vmode = TYPE_MODE (vectype);
9103 machine_mode op_vmode = TYPE_MODE (op_vectype);
9104 unsigned HOST_WIDE_INT c;
9105 if ((!identity_p
9106 && !can_vec_perm_const_p (vmode, op_vmode, indices))
9107 || (identity_p
9108 && !known_le (nunits,
9109 TYPE_VECTOR_SUBPARTS (op_vectype))
9110 && (!constant_multiple_p (nunits,
9111 TYPE_VECTOR_SUBPARTS (op_vectype),
9112 &c) || c != 2)))
9114 if (dump_p)
9116 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9117 vect_location,
9118 "unsupported vect permute { ");
9119 for (i = 0; i < count; ++i)
9121 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9122 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9124 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9126 gcc_assert (!gsi);
9127 return -1;
9130 if (!identity_p)
9131 nperms++;
9132 if (gsi)
9134 if (second_vec.first == -1U)
9135 second_vec = first_vec;
9137 slp_tree
9138 first_node = children[first_vec.first],
9139 second_node = children[second_vec.first];
9141 tree mask_vec = NULL_TREE;
9142 if (!identity_p)
9143 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9145 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9147 tree first_def
9148 = vect_get_slp_vect_def (first_node,
9149 first_vec.second + vi);
9150 tree second_def
9151 = vect_get_slp_vect_def (second_node,
9152 second_vec.second + vi);
9153 vect_add_slp_permutation (vinfo, gsi, node, first_def,
9154 second_def, mask_vec, mask[0]);
9158 index = 0;
9159 first_vec = std::make_pair (-1U, -1U);
9160 second_vec = std::make_pair (-1U, -1U);
9164 return nperms;
9167 /* Vectorize the SLP permutations in NODE as specified
9168 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9169 child number and lane number.
9170 Interleaving of two two-lane two-child SLP subtrees (not supported):
9171 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9172 A blend of two four-lane two-child SLP subtrees:
9173 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9174 Highpart of a four-lane one-child SLP subtree (not supported):
9175 [ { 0, 2 }, { 0, 3 } ]
9176 Currently only a subset of these is supported by the code generation below. */
9178 static bool
9179 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9180 slp_tree node, stmt_vector_for_cost *cost_vec)
9182 tree vectype = SLP_TREE_VECTYPE (node);
9183 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9184 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9185 SLP_TREE_CHILDREN (node),
9186 dump_enabled_p ());
9187 if (nperms < 0)
9188 return false;
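/* When only analyzing (no GSI), account one vec_perm cost for each
   VEC_PERM_EXPR that would be generated.  */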
9190 if (!gsi)
9191 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9193 return true;
9196 /* Vectorize SLP NODE. */
9198 static void
9199 vect_schedule_slp_node (vec_info *vinfo,
9200 slp_tree node, slp_instance instance)
9202 gimple_stmt_iterator si;
9203 int i;
9204 slp_tree child;
9206 /* Vectorize externals and constants. */
9207 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9208 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9210 /* ??? vectorizable_shift can end up using a scalar operand which is
9211 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9212 node in this case. */
9213 if (!SLP_TREE_VECTYPE (node))
9214 return;
9216 /* There are two reasons vector defs might already exist. The first
9217 is that we are vectorizing an existing vector def. The second is
9218 that when performing BB vectorization, shared constant/external nodes
9219 are not split apart during partitioning, so during the code-gen
9220 DFS walk we can end up visiting them twice. */
9221 if (! SLP_TREE_VEC_DEFS (node).exists ())
9222 vect_create_constant_vectors (vinfo, node);
9223 return;
9226 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9228 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9230 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9231 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9233 if (dump_enabled_p ())
9234 dump_printf_loc (MSG_NOTE, vect_location,
9235 "------>vectorizing SLP node starting from: %G",
9236 stmt_info->stmt);
9238 if (STMT_VINFO_DATA_REF (stmt_info)
9239 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9241 /* Vectorized loads go before the first scalar load to make it
9242 ready early; vectorized stores go before the last scalar
9243 stmt, which is where all uses are ready. */
9244 stmt_vec_info last_stmt_info = NULL;
9245 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9246 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9247 else /* DR_IS_WRITE */
9248 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9249 si = gsi_for_stmt (last_stmt_info->stmt);
9251 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9252 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9253 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9254 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9256 /* For PHI node vectorization we do not use the insertion iterator. */
9257 si = gsi_none ();
9259 else
9261 /* Emit other stmts after the children's vectorized defs, which is the
9262 earliest possible place. */
9263 gimple *last_stmt = NULL;
9264 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9265 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9266 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9268 /* But avoid scheduling internal defs outside of the loop when
9269 we might have only implicitly tracked loop mask/len defs. */
9270 gimple_stmt_iterator si
9271 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9272 last_stmt = *si;
9274 bool seen_vector_def = false;
9275 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9276 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9278 /* For fold-left reductions we are retaining the scalar
9279 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9280 set so the representation isn't perfect. Resort to the
9281 last scalar def here. */
9282 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9284 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9285 == cycle_phi_info_type);
9286 gphi *phi = as_a <gphi *>
9287 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9288 if (!last_stmt
9289 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9290 last_stmt = phi;
9292 /* We are emitting all vectorized stmts in the same place, so
9293 the last vector def is generated by the last emitted stmt.
9294 ??? Unless we have a load permutation applied and that
9295 figures to re-use an earlier generated load. */
9296 unsigned j;
9297 tree vdef;
9298 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9300 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9301 if (!last_stmt
9302 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9303 last_stmt = vstmt;
9306 else if (!SLP_TREE_VECTYPE (child))
9308 /* For externals that stay unvectorized we look at all their scalar defs. */
9309 unsigned j;
9310 tree def;
9311 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9312 if (TREE_CODE (def) == SSA_NAME
9313 && !SSA_NAME_IS_DEFAULT_DEF (def))
9315 gimple *stmt = SSA_NAME_DEF_STMT (def);
9316 if (!last_stmt
9317 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9318 last_stmt = stmt;
9321 else
9323 /* For externals we have to look at all defs since their
9324 insertion place is decided per vector. But beware
9325 of pre-existing vectors where we need to make sure
9326 we do not insert before the region boundary. */
9327 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9328 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9329 seen_vector_def = true;
9330 else
9332 unsigned j;
9333 tree vdef;
9334 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9335 if (TREE_CODE (vdef) == SSA_NAME
9336 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9338 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9339 if (!last_stmt
9340 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9341 last_stmt = vstmt;
9345 /* This can happen when all children are pre-existing vectors or
9346 constants. */
9347 if (!last_stmt)
9348 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9349 if (!last_stmt)
9351 gcc_assert (seen_vector_def);
9352 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9354 else if (is_ctrl_altering_stmt (last_stmt))
9356 /* We split regions to vectorize at control-altering stmts
9357 with a definition, so this must be an external which
9358 we can insert at the start of the region. */
9359 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9361 else if (is_a <bb_vec_info> (vinfo)
9362 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9363 && gimple_could_trap_p (stmt_info->stmt))
9365 /* We've constrained possibly trapping operations to all come
9366 from the same basic-block; if vectorized defs would allow earlier
9367 scheduling, still force the vectorized stmts to the original block.
9368 This is only necessary for BB vectorization since for loop vect
9369 all operations are in a single BB and scalar stmt based
9370 placement doesn't play well with epilogue vectorization. */
9371 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9372 gimple_bb (stmt_info->stmt),
9373 gimple_bb (last_stmt)));
9374 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9376 else if (is_a <gphi *> (last_stmt))
9377 si = gsi_after_labels (gimple_bb (last_stmt));
9378 else
9380 si = gsi_for_stmt (last_stmt);
9381 gsi_next (&si);
9385 /* Handle purely internal nodes. */
9386 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9388 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9389 be shared with different SLP nodes (but usually it's the same
9390 operation apart from the case where the stmt is only there to denote
9391 the actual scalar lane defs ...). So do not call vect_transform_stmt
9392 but open-code it here (partly). */
9393 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9394 gcc_assert (done);
9395 stmt_vec_info slp_stmt_info;
9396 unsigned int i;
9397 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9398 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9400 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9401 instance, i, true, NULL);
9402 gcc_assert (done);
9405 else
9406 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9409 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9410 For loop vectorization this is done in vectorizable_call, but for SLP
9411 it needs to be deferred until the end of vect_schedule_slp, because multiple
9412 SLP instances may refer to the same scalar stmt. */
9414 static void
9415 vect_remove_slp_scalar_calls (vec_info *vinfo,
9416 slp_tree node, hash_set<slp_tree> &visited)
9418 gimple *new_stmt;
9419 gimple_stmt_iterator gsi;
9420 int i;
9421 slp_tree child;
9422 tree lhs;
9423 stmt_vec_info stmt_info;
9425 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9426 return;
9428 if (visited.add (node))
9429 return;
9431 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9432 vect_remove_slp_scalar_calls (vinfo, child, visited);
9434 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9436 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9437 if (!stmt || gimple_bb (stmt) == NULL)
9438 continue;
9439 if (is_pattern_stmt_p (stmt_info)
9440 || !PURE_SLP_STMT (stmt_info))
9441 continue;
9442 lhs = gimple_call_lhs (stmt);
9443 if (lhs)
9444 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9445 else
9447 new_stmt = gimple_build_nop ();
9448 unlink_stmt_vdef (stmt_info->stmt);
9450 gsi = gsi_for_stmt (stmt);
9451 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9452 if (lhs)
9453 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9457 static void
9458 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9460 hash_set<slp_tree> visited;
9461 vect_remove_slp_scalar_calls (vinfo, node, visited);
9464 /* Vectorize the instance root. */
9466 void
9467 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9469 gassign *rstmt = NULL;
9471 if (instance->kind == slp_inst_kind_ctor)
9473 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9475 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9476 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9477 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9478 TREE_TYPE (vect_lhs)))
9479 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9480 vect_lhs);
9481 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9483 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9485 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9486 tree child_def;
9487 int j;
9488 vec<constructor_elt, va_gc> *v;
9489 vec_alloc (v, nelts);
9491 /* A CTOR can handle V16HI composition from VNx8HI so we
9492 do not need to convert vector elements if the types
9493 do not match. */
9494 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9495 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9496 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9497 tree rtype
9498 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9499 tree r_constructor = build_constructor (rtype, v);
9500 rstmt = gimple_build_assign (lhs, r_constructor);
9503 else if (instance->kind == slp_inst_kind_bb_reduc)
9505 /* Largely inspired by reduction chain epilogue handling in
9506 vect_create_epilog_for_reduction. */
9507 vec<tree> vec_defs = vNULL;
9508 vect_get_slp_defs (node, &vec_defs);
9509 enum tree_code reduc_code
9510 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9511 /* ??? We actually have to reflect signs somewhere. */
9512 if (reduc_code == MINUS_EXPR)
9513 reduc_code = PLUS_EXPR;
9514 gimple_seq epilogue = NULL;
9515 /* We may end up with more than one vector result; reduce them
9516 to a single vector. */
9517 tree vec_def = vec_defs[0];
9518 tree vectype = TREE_TYPE (vec_def);
9519 tree compute_vectype = vectype;
9520 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9521 && TYPE_OVERFLOW_UNDEFINED (vectype)
9522 && operation_can_overflow (reduc_code));
9523 if (pun_for_overflow_p)
9525 compute_vectype = unsigned_type_for (vectype);
9526 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9527 compute_vectype, vec_def);
9529 for (unsigned i = 1; i < vec_defs.length (); ++i)
9531 tree def = vec_defs[i];
9532 if (pun_for_overflow_p)
9533 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9534 compute_vectype, def);
9535 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9536 vec_def, def);
9538 vec_defs.release ();
9539 /* ??? Support other schemes than direct internal fn. */
9540 internal_fn reduc_fn;
9541 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9542 || reduc_fn == IFN_LAST)
9543 gcc_unreachable ();
9544 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9545 TREE_TYPE (compute_vectype), vec_def);
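/* Fold any scalar defs that remained outside the vectorized part into
   the reduction result.  */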
9546 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9548 tree rem_def = NULL_TREE;
9549 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9551 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9552 if (!rem_def)
9553 rem_def = def;
9554 else
9555 rem_def = gimple_build (&epilogue, reduc_code,
9556 TREE_TYPE (scalar_def),
9557 rem_def, def);
9559 scalar_def = gimple_build (&epilogue, reduc_code,
9560 TREE_TYPE (scalar_def),
9561 scalar_def, rem_def);
9563 scalar_def = gimple_convert (&epilogue,
9564 TREE_TYPE (vectype), scalar_def);
9565 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9566 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9567 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9568 update_stmt (gsi_stmt (rgsi));
9569 return;
9571 else
9572 gcc_unreachable ();
9574 gcc_assert (rstmt);
9576 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9577 gsi_replace (&rgsi, rstmt, true);
9580 struct slp_scc_info
9582 bool on_stack;
9583 int dfs;
9584 int lowlink;
9587 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
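/* This follows Tarjan's SCC algorithm: DFS and lowlink numbers identify
   SCC roots and the members of an SCC stay on STACK until the root is
   reached.  */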
9589 static void
9590 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9591 hash_map<slp_tree, slp_scc_info> &scc_info,
9592 int &maxdfs, vec<slp_tree> &stack)
9594 bool existed_p;
9595 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9596 gcc_assert (!existed_p);
9597 info->dfs = maxdfs;
9598 info->lowlink = maxdfs;
9599 maxdfs++;
9601 /* Leaf. */
9602 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9604 info->on_stack = false;
9605 vect_schedule_slp_node (vinfo, node, instance);
9606 return;
9609 info->on_stack = true;
9610 stack.safe_push (node);
9612 unsigned i;
9613 slp_tree child;
9614 /* DFS recurse. */
9615 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9617 if (!child)
9618 continue;
9619 slp_scc_info *child_info = scc_info.get (child);
9620 if (!child_info)
9622 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9623 /* Recursion might have re-allocated the SCC info entries. */
9624 info = scc_info.get (node);
9625 child_info = scc_info.get (child);
9626 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9628 else if (child_info->on_stack)
9629 info->lowlink = MIN (info->lowlink, child_info->dfs);
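/* If this node is not the root of an SCC, defer scheduling to the
   SCC root.  */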
9631 if (info->lowlink != info->dfs)
9632 return;
9634 auto_vec<slp_tree, 4> phis_to_fixup;
9636 /* Singleton. */
9637 if (stack.last () == node)
9639 stack.pop ();
9640 info->on_stack = false;
9641 vect_schedule_slp_node (vinfo, node, instance);
9642 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9643 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9644 phis_to_fixup.quick_push (node);
9646 else
9648 /* SCC. */
9649 int last_idx = stack.length () - 1;
9650 while (stack[last_idx] != node)
9651 last_idx--;
9652 /* We can break the cycle at PHIs that have at least one child
9653 code generated. Then we could re-start the DFS walk until
9654 all nodes in the SCC are covered (we might have new entries
9655 for only back-reachable nodes). But it's simpler to just
9656 iterate and schedule those that are ready. */
9657 unsigned todo = stack.length () - last_idx;
9660 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9662 slp_tree entry = stack[idx];
9663 if (!entry)
9664 continue;
9665 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9666 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9667 bool ready = !phi;
9668 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9669 if (!child)
9671 gcc_assert (phi);
9672 ready = true;
9673 break;
9675 else if (scc_info.get (child)->on_stack)
9677 if (!phi)
9679 ready = false;
9680 break;
9683 else
9685 if (phi)
9687 ready = true;
9688 break;
9691 if (ready)
9693 vect_schedule_slp_node (vinfo, entry, instance);
9694 scc_info.get (entry)->on_stack = false;
9695 stack[idx] = NULL;
9696 todo--;
9697 if (phi)
9698 phis_to_fixup.safe_push (entry);
9702 while (todo != 0);
9704 /* Pop the SCC. */
9705 stack.truncate (last_idx);
9708 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9709 slp_tree phi_node;
9710 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9712 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9713 edge_iterator ei;
9714 edge e;
9715 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9717 unsigned dest_idx = e->dest_idx;
9718 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9719 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9720 continue;
9721 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9722 /* Simply fill all args. */
9723 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9724 != vect_first_order_recurrence)
9725 for (unsigned i = 0; i < n; ++i)
9727 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9728 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9729 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9730 e, gimple_phi_arg_location (phi, dest_idx));
9732 else
9734 /* Unless it is a first order recurrence which needs
9735 args filled in for both the PHI node and the permutes. */
9736 gimple *perm
9737 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9738 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9739 add_phi_arg (as_a <gphi *> (rphi),
9740 vect_get_slp_vect_def (child, n - 1),
9741 e, gimple_phi_arg_location (phi, dest_idx));
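/* Each permute blends the previous vector of the recurrence (rhs1)
   with the current one (rhs2).  */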
9742 for (unsigned i = 0; i < n; ++i)
9744 gimple *perm
9745 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9746 if (i > 0)
9747 gimple_assign_set_rhs1 (perm,
9748 vect_get_slp_vect_def (child, i - 1));
9749 gimple_assign_set_rhs2 (perm,
9750 vect_get_slp_vect_def (child, i));
9751 update_stmt (perm);
9758 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9760 void
9761 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9763 slp_instance instance;
9764 unsigned int i;
9766 hash_map<slp_tree, slp_scc_info> scc_info;
9767 int maxdfs = 0;
9768 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9770 slp_tree node = SLP_INSTANCE_TREE (instance);
9771 if (dump_enabled_p ())
9773 dump_printf_loc (MSG_NOTE, vect_location,
9774 "Vectorizing SLP tree:\n");
9775 /* ??? Dump all? */
9776 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9777 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9778 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9779 vect_print_slp_graph (MSG_NOTE, vect_location,
9780 SLP_INSTANCE_TREE (instance));
9782 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9783 have a PHI be the node breaking the cycle. */
9784 auto_vec<slp_tree> stack;
9785 if (!scc_info.get (node))
9786 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9788 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9789 vectorize_slp_instance_root_stmt (node, instance);
9791 if (dump_enabled_p ())
9792 dump_printf_loc (MSG_NOTE, vect_location,
9793 "vectorizing stmts using SLP.\n");
9796 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9798 slp_tree root = SLP_INSTANCE_TREE (instance);
9799 stmt_vec_info store_info;
9800 unsigned int j;
9802 /* Remove scalar call stmts. Do not do this for basic-block
9803 vectorization as not all uses may be vectorized.
9804 ??? Why should this be necessary? DCE should be able to
9805 remove the stmts itself.
9806 ??? For BB vectorization we can as well remove scalar
9807 stmts starting from the SLP tree root if they have no
9808 uses. */
9809 if (is_a <loop_vec_info> (vinfo))
9810 vect_remove_slp_scalar_calls (vinfo, root);
9812 /* Remove vectorized stores' original scalar stmts. */
9813 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9815 if (!STMT_VINFO_DATA_REF (store_info)
9816 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9817 break;
9819 store_info = vect_orig_stmt (store_info);
9820 /* Free the attached stmt_vec_info and remove the stmt. */
9821 vinfo->remove_stmt (store_info);
9823 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9824 so that we do not crash in vect_free_slp_tree later. */
9825 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9826 SLP_TREE_REPRESENTATIVE (root) = NULL;