gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize an SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
121 SLP_TREE_CODE (this) = ERROR_MARK;
122 SLP_TREE_VECTYPE (this) = NULL_TREE;
123 SLP_TREE_REPRESENTATIVE (this) = NULL;
124 SLP_TREE_REF_COUNT (this) = 1;
125 this->failed = NULL;
126 this->max_nunits = 1;
127 this->lanes = 0;
130 /* Tear down an SLP node. */
132 _slp_tree::~_slp_tree ()
134 if (this->prev_node)
135 this->prev_node->next_node = this->next_node;
136 else
137 slp_first_node = this->next_node;
138 if (this->next_node)
139 this->next_node->prev_node = this->prev_node;
140 SLP_TREE_CHILDREN (this).release ();
141 SLP_TREE_SCALAR_STMTS (this).release ();
142 SLP_TREE_SCALAR_OPS (this).release ();
143 SLP_TREE_VEC_DEFS (this).release ();
144 SLP_TREE_LOAD_PERMUTATION (this).release ();
145 SLP_TREE_LANE_PERMUTATION (this).release ();
146 if (this->failed)
147 free (failed);
150 /* Push the single SSA definition in DEF to the vector of vector defs. */
152 void
153 _slp_tree::push_vec_def (gimple *def)
155 if (gphi *phi = dyn_cast <gphi *> (def))
156 vec_defs.quick_push (gimple_phi_result (phi));
157 else
159 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
160 vec_defs.quick_push (get_def_from_ptr (defop));
164 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
166 void
167 vect_free_slp_tree (slp_tree node)
169 int i;
170 slp_tree child;
172 if (--SLP_TREE_REF_COUNT (node) != 0)
173 return;
175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
176 if (child)
177 vect_free_slp_tree (child);
179 /* If the node defines any SLP only patterns then those patterns are no
180 longer valid and should be removed. */
181 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
182 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
184 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
185 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
186 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 delete node;
192 /* Return a location suitable for dumps related to the SLP instance. */
194 dump_user_location_t
195 _slp_instance::location () const
197 if (!root_stmts.is_empty ())
198 return root_stmts[0]->stmt;
199 else
200 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 /* Free the memory allocated for the SLP instance. */
206 void
207 vect_free_slp_instance (slp_instance instance)
209 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
210 SLP_INSTANCE_LOADS (instance).release ();
211 SLP_INSTANCE_ROOT_STMTS (instance).release ();
212 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
213 instance->subgraph_entries.release ();
214 instance->cost_vec.release ();
215 free (instance);
219 /* Create an SLP node for SCALAR_STMTS. */
221 slp_tree
222 vect_create_new_slp_node (unsigned nops, tree_code code)
224 slp_tree node = new _slp_tree;
225 SLP_TREE_SCALAR_STMTS (node) = vNULL;
226 SLP_TREE_CHILDREN (node).create (nops);
227 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
228 SLP_TREE_CODE (node) = code;
229 return node;
231 /* Create an SLP node for SCALAR_STMTS. */
233 static slp_tree
234 vect_create_new_slp_node (slp_tree node,
235 vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
238 SLP_TREE_CHILDREN (node).create (nops);
239 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
240 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
241 SLP_TREE_LANES (node) = scalar_stmts.length ();
242 return node;
245 /* Create an SLP node for SCALAR_STMTS. */
247 static slp_tree
248 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
250 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 /* Create an SLP node for OPS. */
255 static slp_tree
256 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
258 SLP_TREE_SCALAR_OPS (node) = ops;
259 SLP_TREE_DEF_TYPE (node) = vect_external_def;
260 SLP_TREE_LANES (node) = ops.length ();
261 return node;
264 /* Create an SLP node for OPS. */
266 static slp_tree
267 vect_create_new_slp_node (vec<tree> ops)
269 return vect_create_new_slp_node (new _slp_tree, ops);
273 /* This structure is used in creation of an SLP tree. Each instance
274 corresponds to the same operand in a group of scalar stmts in an SLP
275 node. */
276 typedef struct _slp_oprnd_info
278 /* Def-stmts for the operands. */
279 vec<stmt_vec_info> def_stmts;
280 /* Operands. */
281 vec<tree> ops;
282 /* Information about the first statement, its vector def-type, type, the
283 operand itself in case it's constant, and an indication if it's a pattern
284 stmt. */
285 tree first_op_type;
286 enum vect_def_type first_dt;
287 bool any_pattern;
288 } *slp_oprnd_info;
291 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
292 operand. */
293 static vec<slp_oprnd_info>
294 vect_create_oprnd_info (int nops, int group_size)
296 int i;
297 slp_oprnd_info oprnd_info;
298 vec<slp_oprnd_info> oprnds_info;
300 oprnds_info.create (nops);
301 for (i = 0; i < nops; i++)
303 oprnd_info = XNEW (struct _slp_oprnd_info);
304 oprnd_info->def_stmts.create (group_size);
305 oprnd_info->ops.create (group_size);
306 oprnd_info->first_dt = vect_uninitialized_def;
307 oprnd_info->first_op_type = NULL_TREE;
308 oprnd_info->any_pattern = false;
309 oprnds_info.quick_push (oprnd_info);
312 return oprnds_info;
316 /* Free operands info. */
318 static void
319 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
321 int i;
322 slp_oprnd_info oprnd_info;
324 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
326 oprnd_info->def_stmts.release ();
327 oprnd_info->ops.release ();
328 XDELETE (oprnd_info);
331 oprnds_info.release ();
334 /* Return the execution frequency of NODE (so that a higher value indicates
335 a "more important" node when optimizing for speed). */
337 static sreal
338 vect_slp_node_weight (slp_tree node)
340 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
341 basic_block bb = gimple_bb (stmt_info->stmt);
342 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
345 /* Return true if STMTS contains a pattern statement. */
347 static bool
348 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
350 stmt_vec_info stmt_info;
351 unsigned int i;
352 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
353 if (is_pattern_stmt_p (stmt_info))
354 return true;
355 return false;
358 /* Return true when all lanes in the external or constant NODE have
359 the same value. */
361 static bool
362 vect_slp_tree_uniform_p (slp_tree node)
364 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
365 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
367 /* Pre-existing vectors. */
368 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
369 return false;
371 unsigned i;
372 tree op, first = NULL_TREE;
373 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
374 if (!first)
375 first = op;
376 else if (!operand_equal_p (first, op, 0))
377 return false;
379 return true;
382 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
383 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
384 of the chain. */
387 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
388 stmt_vec_info first_stmt_info)
390 stmt_vec_info next_stmt_info = first_stmt_info;
391 int result = 0;
393 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
394 return -1;
398 if (next_stmt_info == stmt_info)
399 return result;
400 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
401 if (next_stmt_info)
402 result += DR_GROUP_GAP (next_stmt_info);
404 while (next_stmt_info);
406 return -1;
409 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
410 using the method implemented by duplicate_and_interleave. Return true
411 if so, returning the number of intermediate vectors in *NVECTORS_OUT
412 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
413 (if nonnull). */
415 bool
416 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
417 tree elt_type, unsigned int *nvectors_out,
418 tree *vector_type_out,
419 tree *permutes)
421 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
422 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
423 return false;
425 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
426 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
427 unsigned int nvectors = 1;
428 for (;;)
430 scalar_int_mode int_mode;
431 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
432 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
434 /* Get the natural vector type for this SLP group size. */
435 tree int_type = build_nonstandard_integer_type
436 (GET_MODE_BITSIZE (int_mode), 1);
437 tree vector_type
438 = get_vectype_for_scalar_type (vinfo, int_type, count);
439 poly_int64 half_nelts;
440 if (vector_type
441 && VECTOR_MODE_P (TYPE_MODE (vector_type))
442 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
443 GET_MODE_SIZE (base_vector_mode))
444 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
445 2, &half_nelts))
447 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
448 together into elements of type INT_TYPE and using the result
449 to build NVECTORS vectors. */
450 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
451 vec_perm_builder sel1 (nelts, 2, 3);
452 vec_perm_builder sel2 (nelts, 2, 3);
454 for (unsigned int i = 0; i < 3; ++i)
456 sel1.quick_push (i);
457 sel1.quick_push (i + nelts);
458 sel2.quick_push (half_nelts + i);
459 sel2.quick_push (half_nelts + i + nelts);
461 vec_perm_indices indices1 (sel1, 2, nelts);
462 vec_perm_indices indices2 (sel2, 2, nelts);
463 machine_mode vmode = TYPE_MODE (vector_type);
464 if (can_vec_perm_const_p (vmode, vmode, indices1)
465 && can_vec_perm_const_p (vmode, vmode, indices2))
467 if (nvectors_out)
468 *nvectors_out = nvectors;
469 if (vector_type_out)
470 *vector_type_out = vector_type;
471 if (permutes)
473 permutes[0] = vect_gen_perm_mask_checked (vector_type,
474 indices1);
475 permutes[1] = vect_gen_perm_mask_checked (vector_type,
476 indices2);
478 return true;
482 if (!multiple_p (elt_bytes, 2, &elt_bytes))
483 return false;
484 nvectors *= 2;
488 /* Return true if DTA and DTB match. */
490 static bool
491 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
493 return (dta == dtb
494 || ((dta == vect_external_def || dta == vect_constant_def)
495 && (dtb == vect_external_def || dtb == vect_constant_def)));
498 static const int cond_expr_maps[3][5] = {
499 { 4, -1, -2, 1, 2 },
500 { 4, -2, -1, 1, 2 },
501 { 4, -1, -2, 2, 1 }
503 static const int arg1_map[] = { 1, 1 };
504 static const int arg2_map[] = { 1, 2 };
505 static const int arg1_arg4_map[] = { 2, 1, 4 };
506 static const int arg3_arg2_map[] = { 2, 3, 2 };
507 static const int op1_op0_map[] = { 2, 1, 0 };
509 /* For most SLP statements, there is a one-to-one mapping between
510 gimple arguments and child nodes. If that is not true for STMT,
511 return an array that contains:
513 - the number of child nodes, followed by
514 - for each child node, the index of the argument associated with that node.
515 The special index -1 refers to the first operand of an embedded comparison and
516 the special index -2 to the second operand of an embedded comparison.
518 SWAP is as for vect_get_and_check_slp_defs. */
520 static const int *
521 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
523 if (auto assign = dyn_cast<const gassign *> (stmt))
525 if (gimple_assign_rhs_code (assign) == COND_EXPR
526 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
527 return cond_expr_maps[swap];
528 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
529 && swap)
530 return op1_op0_map;
532 gcc_assert (!swap);
533 if (auto call = dyn_cast<const gcall *> (stmt))
535 if (gimple_call_internal_p (call))
536 switch (gimple_call_internal_fn (call))
538 case IFN_MASK_LOAD:
539 return arg2_map;
541 case IFN_GATHER_LOAD:
542 return arg1_map;
544 case IFN_MASK_GATHER_LOAD:
545 return arg1_arg4_map;
547 case IFN_MASK_STORE:
548 return arg3_arg2_map;
550 default:
551 break;
554 return nullptr;
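/* For illustration, reading the maps above: arg3_arg2_map == { 2, 3, 2 }
   says an IFN_MASK_STORE stmt gets two SLP children, child 0 built from
   call argument 3 (the stored value) and child 1 from call argument 2
   (the mask); cond_expr_maps[0] == { 4, -1, -2, 1, 2 } says a COND_EXPR
   with an embedded comparison gets four children: the two operands of the
   comparison followed by gimple arguments 1 and 2 (the then and else
   values).  */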
557 /* Return the SLP node child index for operand OP of STMT. */
560 vect_slp_child_index_for_operand (const gimple *stmt, int op)
562 const int *opmap = vect_get_operand_map (stmt);
563 if (!opmap)
564 return op;
565 for (int i = 1; i < 1 + opmap[0]; ++i)
566 if (opmap[i] == op)
567 return i - 1;
568 gcc_unreachable ();
571 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
572 they are of a valid type and that they match the defs of the first stmt of
573 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
574 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
575 indicates swap is required for cond_expr stmts. Specifically, SWAP
576 is 1 if STMT is cond and operands of comparison need to be swapped;
577 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
579 If there was a fatal error return -1; if the error could be corrected by
580 swapping operands of the father node of this one, return 1; if everything is
581 ok return 0. */
582 static int
583 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
584 bool *skip_args,
585 vec<stmt_vec_info> stmts, unsigned stmt_num,
586 vec<slp_oprnd_info> *oprnds_info)
588 stmt_vec_info stmt_info = stmts[stmt_num];
589 tree oprnd;
590 unsigned int i, number_of_oprnds;
591 enum vect_def_type dt = vect_uninitialized_def;
592 slp_oprnd_info oprnd_info;
593 unsigned int commutative_op = -1U;
594 bool first = stmt_num == 0;
596 if (!is_a<gcall *> (stmt_info->stmt)
597 && !is_a<gassign *> (stmt_info->stmt)
598 && !is_a<gphi *> (stmt_info->stmt))
599 return -1;
601 number_of_oprnds = gimple_num_args (stmt_info->stmt);
602 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
603 if (map)
604 number_of_oprnds = *map++;
605 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
607 if (gimple_call_internal_p (stmt))
609 internal_fn ifn = gimple_call_internal_fn (stmt);
610 commutative_op = first_commutative_argument (ifn);
613 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
615 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
616 commutative_op = 0;
619 bool swapped = (swap != 0);
620 bool backedge = false;
621 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
622 for (i = 0; i < number_of_oprnds; i++)
624 int opno = map ? map[i] : int (i);
625 if (opno < 0)
626 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
627 else
629 oprnd = gimple_arg (stmt_info->stmt, opno);
630 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
632 edge e = gimple_phi_arg_edge (stmt, opno);
633 backedge = (is_a <bb_vec_info> (vinfo)
634 ? e->flags & EDGE_DFS_BACK
635 : dominated_by_p (CDI_DOMINATORS, e->src,
636 gimple_bb (stmt_info->stmt)));
639 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
640 oprnd = TREE_OPERAND (oprnd, 0);
642 oprnd_info = (*oprnds_info)[i];
644 stmt_vec_info def_stmt_info;
645 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
647 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "Build SLP failed: can't analyze def for %T\n",
650 oprnd);
652 return -1;
655 if (skip_args[i])
657 oprnd_info->def_stmts.quick_push (NULL);
658 oprnd_info->ops.quick_push (NULL_TREE);
659 oprnd_info->first_dt = vect_uninitialized_def;
660 continue;
663 oprnd_info->def_stmts.quick_push (def_stmt_info);
664 oprnd_info->ops.quick_push (oprnd);
666 if (def_stmt_info
667 && is_pattern_stmt_p (def_stmt_info))
669 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
670 != def_stmt_info)
671 oprnd_info->any_pattern = true;
672 else
673 /* If we promote this to external use the original stmt def. */
674 oprnd_info->ops.last ()
675 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
678 /* If there's an extern def on a backedge make sure we can
679 code-generate at the region start.
680 ??? This is another case that could be fixed by adjusting
681 how we split the function but at the moment we'd have conflicting
682 goals there. */
683 if (backedge
684 && dts[i] == vect_external_def
685 && is_a <bb_vec_info> (vinfo)
686 && TREE_CODE (oprnd) == SSA_NAME
687 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
688 && !dominated_by_p (CDI_DOMINATORS,
689 as_a <bb_vec_info> (vinfo)->bbs[0],
690 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
692 if (dump_enabled_p ())
693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
694 "Build SLP failed: extern def %T only defined "
695 "on backedge\n", oprnd);
696 return -1;
699 if (first)
701 tree type = TREE_TYPE (oprnd);
702 dt = dts[i];
703 if ((dt == vect_constant_def
704 || dt == vect_external_def)
705 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
706 && (TREE_CODE (type) == BOOLEAN_TYPE
707 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
708 type)))
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: invalid type of def "
713 "for variable-length SLP %T\n", oprnd);
714 return -1;
717 /* For the swapping logic below force vect_reduction_def
718 for the reduction op in an SLP reduction group. */
719 if (!STMT_VINFO_DATA_REF (stmt_info)
720 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
721 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
722 && def_stmt_info)
723 dts[i] = dt = vect_reduction_def;
725 /* Check the types of the definition. */
726 switch (dt)
728 case vect_external_def:
729 case vect_constant_def:
730 case vect_internal_def:
731 case vect_reduction_def:
732 case vect_induction_def:
733 case vect_nested_cycle:
734 case vect_first_order_recurrence:
735 break;
737 default:
738 /* FORNOW: Not supported. */
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
741 "Build SLP failed: illegal type of def %T\n",
742 oprnd);
743 return -1;
746 oprnd_info->first_dt = dt;
747 oprnd_info->first_op_type = type;
750 if (first)
751 return 0;
753 /* Now match the operand definition types to that of the first stmt. */
754 for (i = 0; i < number_of_oprnds;)
756 if (skip_args[i])
758 ++i;
759 continue;
762 oprnd_info = (*oprnds_info)[i];
763 dt = dts[i];
764 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
765 oprnd = oprnd_info->ops[stmt_num];
766 tree type = TREE_TYPE (oprnd);
768 if (!types_compatible_p (oprnd_info->first_op_type, type))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "Build SLP failed: different operand types\n");
773 return 1;
776 /* Not first stmt of the group, check that the def-stmt/s match
777 the def-stmt/s of the first stmt. Allow different definition
778 types for reduction chains: the first stmt must be a
779 vect_reduction_def (a phi node), and the rest
780 end in the reduction chain. */
781 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
782 && !(oprnd_info->first_dt == vect_reduction_def
783 && !STMT_VINFO_DATA_REF (stmt_info)
784 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
785 && def_stmt_info
786 && !STMT_VINFO_DATA_REF (def_stmt_info)
787 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
788 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
789 || (!STMT_VINFO_DATA_REF (stmt_info)
790 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
791 && ((!def_stmt_info
792 || STMT_VINFO_DATA_REF (def_stmt_info)
793 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
794 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
795 != (oprnd_info->first_dt != vect_reduction_def))))
797 /* Try swapping operands if we got a mismatch. For BB
798 vectorization only in case it will clearly improve things. */
799 if (i == commutative_op && !swapped
800 && (!is_a <bb_vec_info> (vinfo)
801 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
802 dts[i+1])
803 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
804 || vect_def_types_match
805 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
807 if (dump_enabled_p ())
808 dump_printf_loc (MSG_NOTE, vect_location,
809 "trying swapped operands\n");
810 std::swap (dts[i], dts[i+1]);
811 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
812 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
813 std::swap ((*oprnds_info)[i]->ops[stmt_num],
814 (*oprnds_info)[i+1]->ops[stmt_num]);
815 swapped = true;
816 continue;
819 if (is_a <bb_vec_info> (vinfo)
820 && !oprnd_info->any_pattern)
822 /* Now for commutative ops we should see whether we can
823 make the other operand match. */
824 if (dump_enabled_p ())
825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
826 "treating operand as external\n");
827 oprnd_info->first_dt = dt = vect_external_def;
829 else
831 if (dump_enabled_p ())
832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 "Build SLP failed: different types\n");
834 return 1;
838 /* Make sure to demote the overall operand to external. */
839 if (dt == vect_external_def)
840 oprnd_info->first_dt = vect_external_def;
841 /* For an SLP reduction chain we want to duplicate the reduction to
842 each of the chain members. That gets us a sane SLP graph (still
843 the stmts are not 100% correct wrt the initial values). */
844 else if ((dt == vect_internal_def
845 || dt == vect_reduction_def)
846 && oprnd_info->first_dt == vect_reduction_def
847 && !STMT_VINFO_DATA_REF (stmt_info)
848 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
849 && !STMT_VINFO_DATA_REF (def_stmt_info)
850 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
851 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
853 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
854 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
857 ++i;
860 /* Swap operands. */
861 if (swapped)
863 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location,
865 "swapped operands to match def types in %G",
866 stmt_info->stmt);
869 return 0;
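/* A minimal, hypothetical scalar example (not from the original sources)
   of the operand swapping performed above: with the commutative PLUS_EXPR
   the first lane has operand def types (internal, constant) while the
   second has (constant, internal), and swapping the second lane's operands
   makes the per-operand def types match.  */
#if 0
int r[2], a[2];
void
example_swapped_operands (void)
{
  r[0] = a[0] + 5;	/* operands: (load, constant) */
  r[1] = 5 + a[1];	/* operands: (constant, load), matched by swapping */
}
#endif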
872 /* Return true if call statements CALL1 and CALL2 are similar enough
873 to be combined into the same SLP group. */
875 bool
876 compatible_calls_p (gcall *call1, gcall *call2)
878 unsigned int nargs = gimple_call_num_args (call1);
879 if (nargs != gimple_call_num_args (call2))
880 return false;
882 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
883 return false;
885 if (gimple_call_internal_p (call1))
887 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
888 TREE_TYPE (gimple_call_lhs (call2))))
889 return false;
890 for (unsigned int i = 0; i < nargs; ++i)
891 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
892 TREE_TYPE (gimple_call_arg (call2, i))))
893 return false;
895 else
897 if (!operand_equal_p (gimple_call_fn (call1),
898 gimple_call_fn (call2), 0))
899 return false;
901 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
902 return false;
905 /* Check that any unvectorized arguments are equal. */
906 if (const int *map = vect_get_operand_map (call1))
908 unsigned int nkept = *map++;
909 unsigned int mapi = 0;
910 for (unsigned int i = 0; i < nargs; ++i)
911 if (mapi < nkept && map[mapi] == int (i))
912 mapi += 1;
913 else if (!operand_equal_p (gimple_call_arg (call1, i),
914 gimple_call_arg (call2, i)))
915 return false;
918 return true;
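/* A minimal, hypothetical scalar example (not from the original sources)
   of a call pair accepted by compatible_calls_p: both lanes invoke the
   same combined function with compatible argument and return types.  */
#if 0
float r[2], a[2];
void
example_compatible_calls (void)
{
  r[0] = __builtin_sqrtf (a[0]);
  r[1] = __builtin_sqrtf (a[1]);
}
#endif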
921 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
922 caller's attempt to find the vector type in STMT_INFO with the narrowest
923 element type. Return true if VECTYPE is nonnull and if it is valid
924 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
925 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
926 vect_build_slp_tree. */
928 static bool
929 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
930 unsigned int group_size,
931 tree vectype, poly_uint64 *max_nunits)
933 if (!vectype)
935 if (dump_enabled_p ())
936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
937 "Build SLP failed: unsupported data-type in %G\n",
938 stmt_info->stmt);
939 /* Fatal mismatch. */
940 return false;
943 /* If populating the vector type requires unrolling then fail
944 before adjusting *max_nunits for basic-block vectorization. */
945 if (is_a <bb_vec_info> (vinfo)
946 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
950 "Build SLP failed: unrolling required "
951 "in basic block SLP\n");
952 /* Fatal mismatch. */
953 return false;
956 /* In case of multiple types we need to detect the smallest type. */
957 vect_update_max_nunits (max_nunits, vectype);
958 return true;
961 /* Verify whether the scalar stmts STMTS are isomorphic, whether they require
962 data permutation, or whether they use unsupported types of operation. Return
963 true if they are isomorphic, otherwise return false and indicate in *MATCHES
964 which stmts are not isomorphic to the first one. If MATCHES[0]
965 is false then this indicates the comparison could not be
966 carried out or the stmts will never be vectorized by SLP.
968 Note COND_EXPR is possibly isomorphic to another one after swapping its
969 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
970 the first stmt by swapping the two operands of comparison; set SWAP[i]
971 to 2 if stmt I is isomorphic to the first stmt by inverting the code
972 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
973 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
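/* For illustration, a hypothetical group of scalar stmts matching the
   description above:

     r[0] = a[0] >= b[0] ? x[0] : y[0];
     r[1] = b[1] <= a[1] ? x[1] : y[1];   SWAP[1] == 1 (swapped comparison)
     r[2] = a[2] <  b[2] ? y[2] : x[2];   SWAP[2] == 2 (inverted comparison)
     r[3] = a[3] >= b[3] ? x[3] : y[3];   SWAP[3] == 0 (already isomorphic)  */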
975 static bool
976 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
977 vec<stmt_vec_info> stmts, unsigned int group_size,
978 poly_uint64 *max_nunits, bool *matches,
979 bool *two_operators, tree *node_vectype)
981 unsigned int i;
982 stmt_vec_info first_stmt_info = stmts[0];
983 code_helper first_stmt_code = ERROR_MARK;
984 code_helper alt_stmt_code = ERROR_MARK;
985 code_helper rhs_code = ERROR_MARK;
986 code_helper first_cond_code = ERROR_MARK;
987 tree lhs;
988 bool need_same_oprnds = false;
989 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
990 stmt_vec_info first_load = NULL, prev_first_load = NULL;
991 bool first_stmt_ldst_p = false, ldst_p = false;
992 bool first_stmt_phi_p = false, phi_p = false;
993 bool maybe_soft_fail = false;
994 tree soft_fail_nunits_vectype = NULL_TREE;
996 /* For every stmt in NODE find its def stmt/s. */
997 stmt_vec_info stmt_info;
998 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1000 gimple *stmt = stmt_info->stmt;
1001 swap[i] = 0;
1002 matches[i] = false;
1004 if (dump_enabled_p ())
1005 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1007 /* Fail to vectorize statements marked as unvectorizable, throw
1008 or are volatile. */
1009 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1010 || stmt_can_throw_internal (cfun, stmt)
1011 || gimple_has_volatile_ops (stmt))
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1015 "Build SLP failed: unvectorizable statement %G",
1016 stmt);
1017 /* ??? For BB vectorization we want to commutate operands in a way
1018 to shuffle all unvectorizable defs into one operand and have
1019 the other still vectorized. The following doesn't reliably
1020 work for this, but it's the easiest we can do here. */
1021 if (is_a <bb_vec_info> (vinfo) && i != 0)
1022 continue;
1023 /* Fatal mismatch. */
1024 matches[0] = false;
1025 return false;
1028 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1029 lhs = gimple_get_lhs (stmt);
1030 if (lhs == NULL_TREE
1031 && (!call_stmt
1032 || !gimple_call_internal_p (stmt)
1033 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1035 if (dump_enabled_p ())
1036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1037 "Build SLP failed: not GIMPLE_ASSIGN nor "
1038 "GIMPLE_CALL %G", stmt);
1039 if (is_a <bb_vec_info> (vinfo) && i != 0)
1040 continue;
1041 /* Fatal mismatch. */
1042 matches[0] = false;
1043 return false;
1046 tree nunits_vectype;
1047 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1048 &nunits_vectype, group_size))
1050 if (is_a <bb_vec_info> (vinfo) && i != 0)
1051 continue;
1052 /* Fatal mismatch. */
1053 matches[0] = false;
1054 return false;
1056 /* Record nunits required but continue analysis, producing matches[]
1057 as if nunits was not an issue. This allows splitting of groups
1058 to happen. */
1059 if (nunits_vectype
1060 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1061 nunits_vectype, max_nunits))
1063 gcc_assert (is_a <bb_vec_info> (vinfo));
1064 maybe_soft_fail = true;
1065 soft_fail_nunits_vectype = nunits_vectype;
1068 gcc_assert (vectype);
1070 if (call_stmt)
1072 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1073 if (cfn != CFN_LAST)
1074 rhs_code = cfn;
1075 else
1076 rhs_code = CALL_EXPR;
1078 if (cfn == CFN_MASK_LOAD
1079 || cfn == CFN_GATHER_LOAD
1080 || cfn == CFN_MASK_GATHER_LOAD)
1081 ldst_p = true;
1082 else if (cfn == CFN_MASK_STORE)
1084 ldst_p = true;
1085 rhs_code = CFN_MASK_STORE;
1087 else if ((internal_fn_p (cfn)
1088 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1089 || gimple_call_tail_p (call_stmt)
1090 || gimple_call_noreturn_p (call_stmt)
1091 || gimple_call_chain (call_stmt))
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095 "Build SLP failed: unsupported call type %G",
1096 (gimple *) call_stmt);
1097 if (is_a <bb_vec_info> (vinfo) && i != 0)
1098 continue;
1099 /* Fatal mismatch. */
1100 matches[0] = false;
1101 return false;
1104 else if (gimple_code (stmt) == GIMPLE_PHI)
1106 rhs_code = ERROR_MARK;
1107 phi_p = true;
1109 else
1111 rhs_code = gimple_assign_rhs_code (stmt);
1112 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1115 /* Check the operation. */
1116 if (i == 0)
1118 *node_vectype = vectype;
1119 first_stmt_code = rhs_code;
1120 first_stmt_ldst_p = ldst_p;
1121 first_stmt_phi_p = phi_p;
1123 /* Shift arguments should be equal in all the packed stmts for a
1124 vector shift with scalar shift operand. */
1125 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1126 || rhs_code == LROTATE_EXPR
1127 || rhs_code == RROTATE_EXPR)
1129 /* First see if we have a vector/vector shift. */
1130 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1132 /* No vector/vector shift, try for a vector/scalar shift. */
1133 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1135 if (dump_enabled_p ())
1136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137 "Build SLP failed: "
1138 "op not supported by target.\n");
1139 if (is_a <bb_vec_info> (vinfo) && i != 0)
1140 continue;
1141 /* Fatal mismatch. */
1142 matches[0] = false;
1143 return false;
1145 need_same_oprnds = true;
1146 first_op1 = gimple_assign_rhs2 (stmt);
1149 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1151 need_same_oprnds = true;
1152 first_op1 = gimple_assign_rhs2 (stmt);
1154 else if (!ldst_p
1155 && rhs_code == BIT_FIELD_REF)
1157 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1158 if (!is_a <bb_vec_info> (vinfo)
1159 || TREE_CODE (vec) != SSA_NAME
1160 /* When the element types are not compatible we pun the
1161 source to the target vectype which requires equal size. */
1162 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1163 || !types_compatible_p (TREE_TYPE (vectype),
1164 TREE_TYPE (TREE_TYPE (vec))))
1165 && !operand_equal_p (TYPE_SIZE (vectype),
1166 TYPE_SIZE (TREE_TYPE (vec)))))
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "Build SLP failed: "
1171 "BIT_FIELD_REF not supported\n");
1172 /* Fatal mismatch. */
1173 matches[0] = false;
1174 return false;
1177 else if (rhs_code == CFN_DIV_POW2)
1179 need_same_oprnds = true;
1180 first_op1 = gimple_call_arg (call_stmt, 1);
1183 else
1185 if (first_stmt_code != rhs_code
1186 && alt_stmt_code == ERROR_MARK)
1187 alt_stmt_code = rhs_code;
1188 if ((first_stmt_code != rhs_code
1189 && (first_stmt_code != IMAGPART_EXPR
1190 || rhs_code != REALPART_EXPR)
1191 && (first_stmt_code != REALPART_EXPR
1192 || rhs_code != IMAGPART_EXPR)
1193 /* Handle mismatches in plus/minus by computing both
1194 and merging the results. */
1195 && !((first_stmt_code == PLUS_EXPR
1196 || first_stmt_code == MINUS_EXPR)
1197 && (alt_stmt_code == PLUS_EXPR
1198 || alt_stmt_code == MINUS_EXPR)
1199 && rhs_code == alt_stmt_code)
1200 && !(first_stmt_code.is_tree_code ()
1201 && rhs_code.is_tree_code ()
1202 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1203 == tcc_comparison)
1204 && (swap_tree_comparison (tree_code (first_stmt_code))
1205 == tree_code (rhs_code)))
1206 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1207 && (first_stmt_code == ARRAY_REF
1208 || first_stmt_code == BIT_FIELD_REF
1209 || first_stmt_code == INDIRECT_REF
1210 || first_stmt_code == COMPONENT_REF
1211 || first_stmt_code == MEM_REF)
1212 && (rhs_code == ARRAY_REF
1213 || rhs_code == BIT_FIELD_REF
1214 || rhs_code == INDIRECT_REF
1215 || rhs_code == COMPONENT_REF
1216 || rhs_code == MEM_REF)))
1217 || first_stmt_ldst_p != ldst_p
1218 || first_stmt_phi_p != phi_p)
1220 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different operation "
1224 "in stmt %G", stmt);
1225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1226 "original stmt %G", first_stmt_info->stmt);
1228 /* Mismatch. */
1229 continue;
1232 if (!ldst_p
1233 && first_stmt_code == BIT_FIELD_REF
1234 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1235 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "Build SLP failed: different BIT_FIELD_REF "
1240 "arguments in %G", stmt);
1241 /* Mismatch. */
1242 continue;
1245 if (call_stmt
1246 && first_stmt_code != CFN_MASK_LOAD
1247 && first_stmt_code != CFN_MASK_STORE)
1249 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1250 call_stmt))
1252 if (dump_enabled_p ())
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "Build SLP failed: different calls in %G",
1255 stmt);
1256 /* Mismatch. */
1257 continue;
1261 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1262 && (gimple_bb (first_stmt_info->stmt)
1263 != gimple_bb (stmt_info->stmt)))
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "Build SLP failed: different BB for PHI "
1268 "or possibly trapping operation in %G", stmt);
1269 /* Mismatch. */
1270 continue;
1273 if (need_same_oprnds)
1275 tree other_op1 = gimple_arg (stmt, 1);
1276 if (!operand_equal_p (first_op1, other_op1, 0))
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1280 "Build SLP failed: different shift "
1281 "arguments in %G", stmt);
1282 /* Mismatch. */
1283 continue;
1287 if (!types_compatible_p (vectype, *node_vectype))
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "Build SLP failed: different vector type "
1292 "in %G", stmt);
1293 /* Mismatch. */
1294 continue;
1298 /* Grouped store or load. */
1299 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1301 gcc_assert (ldst_p);
1302 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1304 /* Store. */
1305 gcc_assert (rhs_code == CFN_MASK_STORE
1306 || REFERENCE_CLASS_P (lhs)
1307 || DECL_P (lhs));
1309 else
1311 /* Load. */
1312 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1313 if (prev_first_load)
1315 /* Check that there are no loads from different interleaving
1316 chains in the same node. */
1317 if (prev_first_load != first_load)
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1321 vect_location,
1322 "Build SLP failed: different "
1323 "interleaving chains in one node %G",
1324 stmt);
1325 /* Mismatch. */
1326 continue;
1329 else
1330 prev_first_load = first_load;
1333 /* Non-grouped store or load. */
1334 else if (ldst_p)
1336 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1337 && rhs_code != CFN_GATHER_LOAD
1338 && rhs_code != CFN_MASK_GATHER_LOAD
1339 /* Not grouped loads are handled as externals for BB
1340 vectorization. For loop vectorization we can handle
1341 splats the same way we handle single element interleaving. */
1342 && (is_a <bb_vec_info> (vinfo)
1343 || stmt_info != first_stmt_info
1344 || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
1346 /* Not grouped load. */
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "Build SLP failed: not grouped load %G", stmt);
1351 if (i != 0)
1352 continue;
1353 /* Fatal mismatch. */
1354 matches[0] = false;
1355 return false;
1358 /* Not memory operation. */
1359 else
1361 if (!phi_p
1362 && rhs_code.is_tree_code ()
1363 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1364 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1365 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1366 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1367 && rhs_code != VIEW_CONVERT_EXPR
1368 && rhs_code != CALL_EXPR
1369 && rhs_code != BIT_FIELD_REF)
1371 if (dump_enabled_p ())
1372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1373 "Build SLP failed: operation unsupported %G",
1374 stmt);
1375 if (is_a <bb_vec_info> (vinfo) && i != 0)
1376 continue;
1377 /* Fatal mismatch. */
1378 matches[0] = false;
1379 return false;
1382 if (rhs_code == COND_EXPR)
1384 tree cond_expr = gimple_assign_rhs1 (stmt);
1385 enum tree_code cond_code = TREE_CODE (cond_expr);
1386 enum tree_code swap_code = ERROR_MARK;
1387 enum tree_code invert_code = ERROR_MARK;
1389 if (i == 0)
1390 first_cond_code = TREE_CODE (cond_expr);
1391 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1393 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1394 swap_code = swap_tree_comparison (cond_code);
1395 invert_code = invert_tree_comparison (cond_code, honor_nans);
1398 if (first_cond_code == cond_code)
1400 /* Isomorphism can be achieved by swapping. */
1401 else if (first_cond_code == swap_code)
1402 swap[i] = 1;
1403 /* Isomorphism can be achieved by inverting. */
1404 else if (first_cond_code == invert_code)
1405 swap[i] = 2;
1406 else
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "Build SLP failed: different"
1411 " operation %G", stmt);
1412 /* Mismatch. */
1413 continue;
1417 if (rhs_code.is_tree_code ()
1418 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1419 && (swap_tree_comparison ((tree_code)first_stmt_code)
1420 == (tree_code)rhs_code))
1421 swap[i] = 1;
1424 matches[i] = true;
1427 for (i = 0; i < group_size; ++i)
1428 if (!matches[i])
1429 return false;
1431 /* If we allowed a two-operation SLP node verify the target can cope
1432 with the permute we are going to use. */
1433 if (alt_stmt_code != ERROR_MARK
1434 && (!alt_stmt_code.is_tree_code ()
1435 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1436 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1438 *two_operators = true;
1441 if (maybe_soft_fail)
1443 unsigned HOST_WIDE_INT const_nunits;
1444 if (!TYPE_VECTOR_SUBPARTS
1445 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1446 || const_nunits > group_size)
1447 matches[0] = false;
1448 else
1450 /* With constant vector elements simulate a mismatch at the
1451 point we need to split. */
1452 unsigned tail = group_size & (const_nunits - 1);
1453 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1455 return false;
1458 return true;
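/* A minimal, hypothetical scalar example (not from the original sources)
   of a "two operators" group accepted above: the PLUS/MINUS mismatch is
   allowed, alt_stmt_code becomes MINUS_EXPR, *TWO_OPERATORS is set and the
   lanes are later blended with a VEC_PERM_EXPR (see
   vect_slp_build_two_operator_nodes below).  */
#if 0
double r[2], a[2], b[2];
void
example_two_operators (void)
{
  r[0] = a[0] + b[0];
  r[1] = a[1] - b[1];
}
#endif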
1461 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1462 Note we never remove apart from at destruction time so we do not
1463 need a special value for deleted that differs from empty. */
1464 struct bst_traits
1466 typedef vec <stmt_vec_info> value_type;
1467 typedef vec <stmt_vec_info> compare_type;
1468 static inline hashval_t hash (value_type);
1469 static inline bool equal (value_type existing, value_type candidate);
1470 static inline bool is_empty (value_type x) { return !x.exists (); }
1471 static inline bool is_deleted (value_type x) { return !x.exists (); }
1472 static const bool empty_zero_p = true;
1473 static inline void mark_empty (value_type &x) { x.release (); }
1474 static inline void mark_deleted (value_type &x) { x.release (); }
1475 static inline void remove (value_type &x) { x.release (); }
1477 inline hashval_t
1478 bst_traits::hash (value_type x)
1480 inchash::hash h;
1481 for (unsigned i = 0; i < x.length (); ++i)
1482 h.add_int (gimple_uid (x[i]->stmt));
1483 return h.end ();
1485 inline bool
1486 bst_traits::equal (value_type existing, value_type candidate)
1488 if (existing.length () != candidate.length ())
1489 return false;
1490 for (unsigned i = 0; i < existing.length (); ++i)
1491 if (existing[i] != candidate[i])
1492 return false;
1493 return true;
1496 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1497 but then vec::insert does memmove and that's not compatible with
1498 std::pair. */
1499 struct chain_op_t
1501 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1502 : code (code_), dt (dt_), op (op_) {}
1503 tree_code code;
1504 vect_def_type dt;
1505 tree op;
1508 /* Comparator for sorting associatable chains. */
1510 static int
1511 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1513 auto *op1 = (const chain_op_t *) op1_;
1514 auto *op2 = (const chain_op_t *) op2_;
1515 if (op1->dt != op2->dt)
1516 return (int)op1->dt - (int)op2->dt;
1517 return (int)op1->code - (int)op2->code;
1520 /* Linearize the associatable expression chain at START with the
1521 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1522 filling CHAIN with the result and using WORKLIST as intermediate storage.
1523 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1524 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1525 stmts, starting with START. */
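/* For illustration: linearizing the lane r = ((a + b) - c) + d with
   CODE == PLUS_EXPR yields the CHAIN entries (+, a), (+, b), (-, c) and
   (+, d) (modulo visiting order), with CODE_STMT pointing at one of the
   PLUS_EXPR stmts and ALT_CODE_STMT at the MINUS_EXPR stmt.  */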
1527 static void
1528 vect_slp_linearize_chain (vec_info *vinfo,
1529 vec<std::pair<tree_code, gimple *> > &worklist,
1530 vec<chain_op_t> &chain,
1531 enum tree_code code, gimple *start,
1532 gimple *&code_stmt, gimple *&alt_code_stmt,
1533 vec<gimple *> *chain_stmts)
1535 /* For each lane linearize the addition/subtraction (or other
1536 uniform associatable operation) expression tree. */
1537 worklist.safe_push (std::make_pair (code, start));
1538 while (!worklist.is_empty ())
1540 auto entry = worklist.pop ();
1541 gassign *stmt = as_a <gassign *> (entry.second);
1542 enum tree_code in_code = entry.first;
1543 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1544 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1545 if (!code_stmt
1546 && gimple_assign_rhs_code (stmt) == code)
1547 code_stmt = stmt;
1548 else if (!alt_code_stmt
1549 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1550 alt_code_stmt = stmt;
1551 if (chain_stmts)
1552 chain_stmts->safe_push (stmt);
1553 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1555 tree op = gimple_op (stmt, opnum);
1556 vect_def_type dt;
1557 stmt_vec_info def_stmt_info;
1558 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1559 gcc_assert (res);
1560 if (dt == vect_internal_def
1561 && is_pattern_stmt_p (def_stmt_info))
1562 op = gimple_get_lhs (def_stmt_info->stmt);
1563 gimple *use_stmt;
1564 use_operand_p use_p;
1565 if (dt == vect_internal_def
1566 && single_imm_use (op, &use_p, &use_stmt)
1567 && is_gimple_assign (def_stmt_info->stmt)
1568 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1569 || (code == PLUS_EXPR
1570 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1571 == MINUS_EXPR))))
1573 tree_code op_def_code = this_code;
1574 if (op_def_code == MINUS_EXPR && opnum == 1)
1575 op_def_code = PLUS_EXPR;
1576 if (in_code == MINUS_EXPR)
1577 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1578 worklist.safe_push (std::make_pair (op_def_code,
1579 def_stmt_info->stmt));
1581 else
1583 tree_code op_def_code = this_code;
1584 if (op_def_code == MINUS_EXPR && opnum == 1)
1585 op_def_code = PLUS_EXPR;
1586 if (in_code == MINUS_EXPR)
1587 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1588 chain.safe_push (chain_op_t (op_def_code, dt, op));
1594 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1595 simple_hashmap_traits <bst_traits, slp_tree> >
1596 scalar_stmts_to_slp_tree_map_t;
1598 static slp_tree
1599 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1600 vec<stmt_vec_info> stmts, unsigned int group_size,
1601 poly_uint64 *max_nunits,
1602 bool *matches, unsigned *limit, unsigned *tree_size,
1603 scalar_stmts_to_slp_tree_map_t *bst_map);
1605 static slp_tree
1606 vect_build_slp_tree (vec_info *vinfo,
1607 vec<stmt_vec_info> stmts, unsigned int group_size,
1608 poly_uint64 *max_nunits,
1609 bool *matches, unsigned *limit, unsigned *tree_size,
1610 scalar_stmts_to_slp_tree_map_t *bst_map)
1612 if (slp_tree *leader = bst_map->get (stmts))
1614 if (dump_enabled_p ())
1615 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1616 !(*leader)->failed ? "" : "failed ",
1617 (void *) *leader);
1618 if (!(*leader)->failed)
1620 SLP_TREE_REF_COUNT (*leader)++;
1621 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1622 stmts.release ();
1623 return *leader;
1625 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1626 return NULL;
1629 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1630 so we can pick up backedge destinations during discovery. */
1631 slp_tree res = new _slp_tree;
1632 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1633 SLP_TREE_SCALAR_STMTS (res) = stmts;
1634 bst_map->put (stmts.copy (), res);
1636 if (*limit == 0)
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_NOTE, vect_location,
1640 "SLP discovery limit exceeded\n");
1641 /* Mark the node invalid so we can detect those when still in use
1642 as backedge destinations. */
1643 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1644 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1645 res->failed = XNEWVEC (bool, group_size);
1646 memset (res->failed, 0, sizeof (bool) * group_size);
1647 memset (matches, 0, sizeof (bool) * group_size);
1648 return NULL;
1650 --*limit;
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_NOTE, vect_location,
1654 "starting SLP discovery for node %p\n", (void *) res);
1656 poly_uint64 this_max_nunits = 1;
1657 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1658 &this_max_nunits,
1659 matches, limit, tree_size, bst_map);
1660 if (!res_)
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_NOTE, vect_location,
1664 "SLP discovery for node %p failed\n", (void *) res);
1665 /* Mark the node invalid so we can detect those when still in use
1666 as backedge destinations. */
1667 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1668 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1669 res->failed = XNEWVEC (bool, group_size);
1670 if (flag_checking)
1672 unsigned i;
1673 for (i = 0; i < group_size; ++i)
1674 if (!matches[i])
1675 break;
1676 gcc_assert (i < group_size);
1678 memcpy (res->failed, matches, sizeof (bool) * group_size);
1680 else
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "SLP discovery for node %p succeeded\n",
1685 (void *) res);
1686 gcc_assert (res_ == res);
1687 res->max_nunits = this_max_nunits;
1688 vect_update_max_nunits (max_nunits, this_max_nunits);
1689 /* Keep a reference for the bst_map use. */
1690 SLP_TREE_REF_COUNT (res)++;
1692 return res_;
1695 /* Helper for building an associated SLP node chain. */
1697 static void
1698 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1699 slp_tree op0, slp_tree op1,
1700 stmt_vec_info oper1, stmt_vec_info oper2,
1701 vec<std::pair<unsigned, unsigned> > lperm)
1703 unsigned group_size = SLP_TREE_LANES (op1);
1705 slp_tree child1 = new _slp_tree;
1706 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1707 SLP_TREE_VECTYPE (child1) = vectype;
1708 SLP_TREE_LANES (child1) = group_size;
1709 SLP_TREE_CHILDREN (child1).create (2);
1710 SLP_TREE_CHILDREN (child1).quick_push (op0);
1711 SLP_TREE_CHILDREN (child1).quick_push (op1);
1712 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1714 slp_tree child2 = new _slp_tree;
1715 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1716 SLP_TREE_VECTYPE (child2) = vectype;
1717 SLP_TREE_LANES (child2) = group_size;
1718 SLP_TREE_CHILDREN (child2).create (2);
1719 SLP_TREE_CHILDREN (child2).quick_push (op0);
1720 SLP_TREE_REF_COUNT (op0)++;
1721 SLP_TREE_CHILDREN (child2).quick_push (op1);
1722 SLP_TREE_REF_COUNT (op1)++;
1723 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1725 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1726 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1727 SLP_TREE_VECTYPE (perm) = vectype;
1728 SLP_TREE_LANES (perm) = group_size;
1729 /* ??? We should set this to NULL but that's not expected. */
1730 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1731 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1732 SLP_TREE_CHILDREN (perm).quick_push (child1);
1733 SLP_TREE_CHILDREN (perm).quick_push (child2);
1736 /* Recursively build an SLP tree starting from NODE.
1737 Fail (and return NULL) if def-stmts are not
1738 isomorphic, require data permutation or are of unsupported types of
1739 operation. Otherwise, return the built SLP node.
1740 On failure, MATCHES indicates which lanes did not match
1741 the first one. */
1743 static slp_tree
1744 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1745 vec<stmt_vec_info> stmts, unsigned int group_size,
1746 poly_uint64 *max_nunits,
1747 bool *matches, unsigned *limit, unsigned *tree_size,
1748 scalar_stmts_to_slp_tree_map_t *bst_map)
1750 unsigned nops, i, this_tree_size = 0;
1751 poly_uint64 this_max_nunits = *max_nunits;
1753 matches[0] = false;
1755 stmt_vec_info stmt_info = stmts[0];
1756 if (!is_a<gcall *> (stmt_info->stmt)
1757 && !is_a<gassign *> (stmt_info->stmt)
1758 && !is_a<gphi *> (stmt_info->stmt))
1759 return NULL;
1761 nops = gimple_num_args (stmt_info->stmt);
1762 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1763 nops = map[0];
1765 /* If the SLP node is a PHI (induction or reduction), terminate
1766 the recursion. */
1767 bool *skip_args = XALLOCAVEC (bool, nops);
1768 memset (skip_args, 0, sizeof (bool) * nops);
1769 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1770 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1772 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1773 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1774 group_size);
1775 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1776 max_nunits))
1777 return NULL;
1779 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1780 if (def_type == vect_induction_def)
1782 /* Induction PHIs are not cycles but walk the initial
1783 value. Only for inner loops though; for outer loops
1784 we need to pick up the value from the actual PHIs
1785 to more easily support peeling and epilogue vectorization. */
1786 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1787 if (!nested_in_vect_loop_p (loop, stmt_info))
1788 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1789 else
1790 loop = loop->inner;
1791 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1793 else if (def_type == vect_reduction_def
1794 || def_type == vect_double_reduction_def
1795 || def_type == vect_nested_cycle
1796 || def_type == vect_first_order_recurrence)
1798 /* Else def types have to match. */
1799 stmt_vec_info other_info;
1800 bool all_same = true;
1801 FOR_EACH_VEC_ELT (stmts, i, other_info)
1803 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1804 return NULL;
1805 if (other_info != stmt_info)
1806 all_same = false;
1808 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1809 /* Reduction initial values are not explicitly represented. */
1810 if (def_type != vect_first_order_recurrence
1811 && !nested_in_vect_loop_p (loop, stmt_info))
1812 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1813 /* Reduction chain backedge defs are filled manually.
1814 ??? Need a better way to identify an SLP reduction chain PHI.
1815 Or a better overall way to SLP match those. */
1816 if (all_same && def_type == vect_reduction_def)
1817 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1819 else if (def_type != vect_internal_def)
1820 return NULL;
1824 bool two_operators = false;
1825 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1826 tree vectype = NULL_TREE;
1827 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1828 &this_max_nunits, matches, &two_operators,
1829 &vectype))
1830 return NULL;
1832 /* If the SLP node is a load, terminate the recursion unless masked. */
1833 if (STMT_VINFO_DATA_REF (stmt_info)
1834 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1836 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1837 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1838 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1839 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1840 else
1842 *max_nunits = this_max_nunits;
1843 (*tree_size)++;
1844 node = vect_create_new_slp_node (node, stmts, 0);
1845 SLP_TREE_VECTYPE (node) = vectype;
1846 /* And compute the load permutation. Whether it is actually
1847 a permutation depends on the unrolling factor which is
1848 decided later. */
1849 vec<unsigned> load_permutation;
1850 int j;
1851 stmt_vec_info load_info;
1852 load_permutation.create (group_size);
1853 stmt_vec_info first_stmt_info
1854 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1857 int load_place;
1858 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1859 load_place = vect_get_place_in_interleaving_chain
1860 (load_info, first_stmt_info);
1861 else
1862 load_place = 0;
1863 gcc_assert (load_place != -1);
1864 load_permutation.safe_push (load_place);
1866 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1867 return node;
1870 else if (gimple_assign_single_p (stmt_info->stmt)
1871 && !gimple_vuse (stmt_info->stmt)
1872 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1874 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1875 the same SSA name vector whose type is compatible with vectype. */
1876 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1877 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1878 stmt_vec_info estmt_info;
1879 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1881 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1882 tree bfref = gimple_assign_rhs1 (estmt);
1883 HOST_WIDE_INT lane;
1884 if (!known_eq (bit_field_size (bfref),
1885 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1886 || !constant_multiple_p (bit_field_offset (bfref),
1887 bit_field_size (bfref), &lane))
1889 lperm.release ();
1890 matches[0] = false;
1891 return NULL;
1893 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1895 slp_tree vnode = vect_create_new_slp_node (vNULL);
1896 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1897 /* ??? We record vectype here but we hide eventually necessary
1898 punning and instead rely on code generation to materialize
1899 VIEW_CONVERT_EXPRs as necessary. We instead should make
1900 this explicit somehow. */
1901 SLP_TREE_VECTYPE (vnode) = vectype;
1902 else
1904 /* For different size but compatible elements we can still
1905 use VEC_PERM_EXPR without punning. */
1906 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1907 && types_compatible_p (TREE_TYPE (vectype),
1908 TREE_TYPE (TREE_TYPE (vec))));
1909 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1911 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1912 unsigned HOST_WIDE_INT const_nunits;
1913 if (nunits.is_constant (&const_nunits))
1914 SLP_TREE_LANES (vnode) = const_nunits;
1915 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1916 /* We are always building a permutation node even if it is an identity
1917 permute to shield the rest of the vectorizer from the odd node
1918 representing an actual vector without any scalar ops.
1919 ??? We could hide it completely by making the permute node
1920 external? */
1921 node = vect_create_new_slp_node (node, stmts, 1);
1922 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1923 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1924 SLP_TREE_VECTYPE (node) = vectype;
1925 SLP_TREE_CHILDREN (node).quick_push (vnode);
1926 return node;
1928 /* When discovery reaches an associatable operation, see whether we can
1929 improve that to match up lanes in a way superior to the operand
1930 swapping code which at most looks at two defs.
1931 ??? For BB vectorization we cannot do the brute-force search
1932 for matching as we can succeed by means of builds from scalars
1933 and have no good way to "cost" one build against another. */
1934 else if (is_a <loop_vec_info> (vinfo)
1935 /* ??? We don't handle !vect_internal_def defs below. */
1936 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1937 && is_gimple_assign (stmt_info->stmt)
1938 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1939 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1940 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1941 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1942 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1944 /* See if we have a chain of (mixed) adds or subtracts or other
1945 associatable ops. */
1946 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1947 if (code == MINUS_EXPR)
1948 code = PLUS_EXPR;
1949 stmt_vec_info other_op_stmt_info = NULL;
1950 stmt_vec_info op_stmt_info = NULL;
1951 unsigned chain_len = 0;
1952 auto_vec<chain_op_t> chain;
1953 auto_vec<std::pair<tree_code, gimple *> > worklist;
1954 auto_vec<vec<chain_op_t> > chains (group_size);
1955 auto_vec<slp_tree, 4> children;
1956 bool hard_fail = true;
1957 for (unsigned lane = 0; lane < group_size; ++lane)
1959 /* For each lane linearize the addition/subtraction (or other
1960 uniform associatable operation) expression tree. */
1961 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1962 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1963 stmts[lane]->stmt, op_stmt, other_op_stmt,
1964 NULL);
1965 if (!op_stmt_info && op_stmt)
1966 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1967 if (!other_op_stmt_info && other_op_stmt)
1968 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1969 if (chain.length () == 2)
1971 /* In a chain of just two elements resort to the regular
1972 operand swapping scheme. If we run into a length
1973 mismatch still hard-FAIL. */
1974 if (chain_len == 0)
1975 hard_fail = false;
1976 else
1978 matches[lane] = false;
1979 /* ??? We might want to process the other lanes, but
1980 make sure to not give false matching hints to the
1981 caller for lanes we did not process. */
1982 if (lane != group_size - 1)
1983 matches[0] = false;
1985 break;
1987 else if (chain_len == 0)
1988 chain_len = chain.length ();
1989 else if (chain.length () != chain_len)
1991 /* ??? Here we could slip in magic to compensate with
1992 neutral operands. */
1993 matches[lane] = false;
1994 if (lane != group_size - 1)
1995 matches[0] = false;
1996 break;
1998 chains.quick_push (chain.copy ());
1999 chain.truncate (0);
2001 if (chains.length () == group_size)
2003 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2004 if (!op_stmt_info)
2006 hard_fail = false;
2007 goto out;
2009 /* Now we have a set of chains with the same length. */
2010 /* 1. pre-sort according to def_type and operation. */
2011 for (unsigned lane = 0; lane < group_size; ++lane)
2012 chains[lane].stablesort (dt_sort_cmp, vinfo);
2013 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "pre-sorted chains of %s\n",
2017 get_tree_code_name (code));
2018 for (unsigned lane = 0; lane < group_size; ++lane)
2020 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2021 dump_printf (MSG_NOTE, "%s %T ",
2022 get_tree_code_name (chains[lane][opnum].code),
2023 chains[lane][opnum].op);
2024 dump_printf (MSG_NOTE, "\n");
2027 /* 2. try to build children nodes, associating as necessary. */
2028 for (unsigned n = 0; n < chain_len; ++n)
2030 vect_def_type dt = chains[0][n].dt;
2031 unsigned lane;
2032 for (lane = 0; lane < group_size; ++lane)
2033 if (chains[lane][n].dt != dt)
2035 if (dt == vect_constant_def
2036 && chains[lane][n].dt == vect_external_def)
2037 dt = vect_external_def;
2038 else if (dt == vect_external_def
2039 && chains[lane][n].dt == vect_constant_def)
2041 else
2042 break;
2044 if (lane != group_size)
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "giving up on chain due to mismatched "
2049 "def types\n");
2050 matches[lane] = false;
2051 if (lane != group_size - 1)
2052 matches[0] = false;
2053 goto out;
2055 if (dt == vect_constant_def
2056 || dt == vect_external_def)
2058 /* Check whether we can build the invariant. If we can't
2059 we never will be able to. */
2060 tree type = TREE_TYPE (chains[0][n].op);
2061 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2062 && (TREE_CODE (type) == BOOLEAN_TYPE
2063 || !can_duplicate_and_interleave_p (vinfo, group_size,
2064 type)))
2066 matches[0] = false;
2067 goto out;
2069 vec<tree> ops;
2070 ops.create (group_size);
2071 for (lane = 0; lane < group_size; ++lane)
2072 ops.quick_push (chains[lane][n].op);
2073 slp_tree child = vect_create_new_slp_node (ops);
2074 SLP_TREE_DEF_TYPE (child) = dt;
2075 children.safe_push (child);
2077 else if (dt != vect_internal_def)
2079 /* Not sure, we might need sth special.
2080 gcc.dg/vect/pr96854.c,
2081 gfortran.dg/vect/fast-math-pr37021.f90
2082 and gfortran.dg/vect/pr61171.f trigger. */
2083 /* Soft-fail for now. */
2084 hard_fail = false;
2085 goto out;
2087 else
2089 vec<stmt_vec_info> op_stmts;
2090 op_stmts.create (group_size);
2091 slp_tree child = NULL;
2092 /* Brute-force our way. We have to consider a lane
2093 failing after fixing an earlier fail up in the
2094 SLP discovery recursion. So track the current
2095 permute per lane. */
2096 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2097 memset (perms, 0, sizeof (unsigned) * group_size);
2100 op_stmts.truncate (0);
2101 for (lane = 0; lane < group_size; ++lane)
2102 op_stmts.quick_push
2103 (vinfo->lookup_def (chains[lane][n].op));
2104 child = vect_build_slp_tree (vinfo, op_stmts,
2105 group_size, &this_max_nunits,
2106 matches, limit,
2107 &this_tree_size, bst_map);
2108 /* ??? We're likely getting too many fatal mismatches
2109 here so maybe we want to ignore them (but then we
2110 have no idea which lanes fatally mismatched). */
2111 if (child || !matches[0])
2112 break;
2113 /* Swap another lane we have not yet matched up into
2114 lanes that did not match. If we run out of
2115 permute possibilities for a lane terminate the
2116 search. */
2117 bool term = false;
2118 for (lane = 1; lane < group_size; ++lane)
2119 if (!matches[lane])
2121 if (n + perms[lane] + 1 == chain_len)
2123 term = true;
2124 break;
2126 std::swap (chains[lane][n],
2127 chains[lane][n + perms[lane] + 1]);
2128 perms[lane]++;
2130 if (term)
2131 break;
2133 while (1);
2134 if (!child)
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "failed to match up op %d\n", n);
2139 op_stmts.release ();
2140 if (lane != group_size - 1)
2141 matches[0] = false;
2142 else
2143 matches[lane] = false;
2144 goto out;
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE, vect_location,
2149 "matched up op %d to\n", n);
2150 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2152 children.safe_push (child);
2155 /* 3. build SLP nodes to combine the chain. */
2156 for (unsigned lane = 0; lane < group_size; ++lane)
2157 if (chains[lane][0].code != code)
2159 /* See if there's any alternate all-PLUS entry. */
2160 unsigned n;
2161 for (n = 1; n < chain_len; ++n)
2163 for (lane = 0; lane < group_size; ++lane)
2164 if (chains[lane][n].code != code)
2165 break;
2166 if (lane == group_size)
2167 break;
2169 if (n != chain_len)
2171 /* Swap that in at first position. */
2172 std::swap (children[0], children[n]);
2173 for (lane = 0; lane < group_size; ++lane)
2174 std::swap (chains[lane][0], chains[lane][n]);
2176 else
2178 /* ??? When this triggers and we end up with two
2179 vect_constant/external_def up-front things break (ICE)
2180 spectacularly finding an insertion place for the
2181 all-constant op. We should have a fully
2182 vect_internal_def operand though(?) so we can swap
2183 that into first place and then prepend the all-zero
2184 constant. */
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 "inserting constant zero to compensate "
2188 "for (partially) negated first "
2189 "operand\n");
2190 chain_len++;
2191 for (lane = 0; lane < group_size; ++lane)
2192 chains[lane].safe_insert
2193 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2194 vec<tree> zero_ops;
2195 zero_ops.create (group_size);
2196 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2197 for (lane = 1; lane < group_size; ++lane)
2198 zero_ops.quick_push (zero_ops[0]);
2199 slp_tree zero = vect_create_new_slp_node (zero_ops);
2200 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2201 children.safe_insert (0, zero);
2203 break;
2205 for (unsigned i = 1; i < children.length (); ++i)
2207 slp_tree op0 = children[i - 1];
2208 slp_tree op1 = children[i];
2209 bool this_two_op = false;
2210 for (unsigned lane = 0; lane < group_size; ++lane)
2211 if (chains[lane][i].code != chains[0][i].code)
2213 this_two_op = true;
2214 break;
2216 slp_tree child;
2217 if (i == children.length () - 1)
2218 child = vect_create_new_slp_node (node, stmts, 2);
2219 else
2220 child = vect_create_new_slp_node (2, ERROR_MARK);
2221 if (this_two_op)
2223 vec<std::pair<unsigned, unsigned> > lperm;
2224 lperm.create (group_size);
2225 for (unsigned lane = 0; lane < group_size; ++lane)
2226 lperm.quick_push (std::make_pair
2227 (chains[lane][i].code != chains[0][i].code, lane));
2228 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2229 (chains[0][i].code == code
2230 ? op_stmt_info
2231 : other_op_stmt_info),
2232 (chains[0][i].code == code
2233 ? other_op_stmt_info
2234 : op_stmt_info),
2235 lperm);
2237 else
2239 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2240 SLP_TREE_VECTYPE (child) = vectype;
2241 SLP_TREE_LANES (child) = group_size;
2242 SLP_TREE_CHILDREN (child).quick_push (op0);
2243 SLP_TREE_CHILDREN (child).quick_push (op1);
2244 SLP_TREE_REPRESENTATIVE (child)
2245 = (chains[0][i].code == code
2246 ? op_stmt_info : other_op_stmt_info);
2248 children[i] = child;
2250 *tree_size += this_tree_size + 1;
2251 *max_nunits = this_max_nunits;
2252 while (!chains.is_empty ())
2253 chains.pop ().release ();
2254 return node;
2256 out:
2257 while (!children.is_empty ())
2258 vect_free_slp_tree (children.pop ());
2259 while (!chains.is_empty ())
2260 chains.pop ().release ();
2261 /* Hard-fail, otherwise we might run into quadratic processing of the
2262 chains starting one stmt into the chain again. */
2263 if (hard_fail)
2264 return NULL;
2265 /* Fall thru to normal processing. */
2268 /* Get at the operands, verifying they are compatible. */
2269 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2270 slp_oprnd_info oprnd_info;
2271 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2273 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2274 stmts, i, &oprnds_info);
2275 if (res != 0)
2276 matches[(res == -1) ? 0 : i] = false;
2277 if (!matches[0])
2278 break;
2280 for (i = 0; i < group_size; ++i)
2281 if (!matches[i])
2283 vect_free_oprnd_info (oprnds_info);
2284 return NULL;
2286 swap = NULL;
2288 auto_vec<slp_tree, 4> children;
2290 stmt_info = stmts[0];
2292 /* Create SLP_TREE nodes for the definition node/s. */
2293 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2295 slp_tree child;
2296 unsigned int j;
2298 /* We're skipping certain operands from processing, for example
2299 outer loop reduction initial defs. */
2300 if (skip_args[i])
2302 children.safe_push (NULL);
2303 continue;
2306 if (oprnd_info->first_dt == vect_uninitialized_def)
2308 /* COND_EXPRs may end up with one operand too many when the condition
2309 is an SSA name. */
2310 gcc_assert (i == 3 && nops == 4);
2311 continue;
2314 if (is_a <bb_vec_info> (vinfo)
2315 && oprnd_info->first_dt == vect_internal_def
2316 && !oprnd_info->any_pattern)
2318 /* For BB vectorization, if all defs are the same do not
2319 bother to continue the build along the single-lane
2320 graph but use a splat of the scalar value. */
2321 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2322 for (j = 1; j < group_size; ++j)
2323 if (oprnd_info->def_stmts[j] != first_def)
2324 break;
2325 if (j == group_size
2326 /* But avoid doing this for loads where we may be
2327 able to CSE things, unless the stmt is not
2328 vectorizable. */
2329 && (!STMT_VINFO_VECTORIZABLE (first_def)
2330 || !gimple_vuse (first_def->stmt)))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "Using a splat of the uniform operand %G",
2335 first_def->stmt);
2336 oprnd_info->first_dt = vect_external_def;
2340 if (oprnd_info->first_dt == vect_external_def
2341 || oprnd_info->first_dt == vect_constant_def)
2343 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2344 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2345 oprnd_info->ops = vNULL;
2346 children.safe_push (invnode);
2347 continue;
2350 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2351 group_size, &this_max_nunits,
2352 matches, limit,
2353 &this_tree_size, bst_map)) != NULL)
2355 oprnd_info->def_stmts = vNULL;
2356 children.safe_push (child);
2357 continue;
2360 /* If the SLP build for operand zero failed and operand zero
2361 and one can be commutated try that for the scalar stmts
2362 that failed the match. */
2363 if (i == 0
2364 /* A first scalar stmt mismatch signals a fatal mismatch. */
2365 && matches[0]
2366 /* ??? For COND_EXPRs we can swap the comparison operands
2367 as well as the arms under some constraints. */
2368 && nops == 2
2369 && oprnds_info[1]->first_dt == vect_internal_def
2370 && is_gimple_assign (stmt_info->stmt)
2371 /* Swapping operands for reductions breaks assumptions later on. */
2372 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2373 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2375 /* See whether we can swap the matching or the non-matching
2376 stmt operands. */
2377 bool swap_not_matching = true;
2380 for (j = 0; j < group_size; ++j)
2382 if (matches[j] != !swap_not_matching)
2383 continue;
2384 stmt_vec_info stmt_info = stmts[j];
2385 /* Verify if we can swap operands of this stmt. */
2386 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2387 if (!stmt
2388 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2390 if (!swap_not_matching)
2391 goto fail;
2392 swap_not_matching = false;
2393 break;
2397 while (j != group_size);
2399 /* Swap mismatched definition stmts. */
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "Re-trying with swapped operands of stmts ");
2403 for (j = 0; j < group_size; ++j)
2404 if (matches[j] == !swap_not_matching)
2406 std::swap (oprnds_info[0]->def_stmts[j],
2407 oprnds_info[1]->def_stmts[j]);
2408 std::swap (oprnds_info[0]->ops[j],
2409 oprnds_info[1]->ops[j]);
2410 if (dump_enabled_p ())
2411 dump_printf (MSG_NOTE, "%d ", j);
2413 if (dump_enabled_p ())
2414 dump_printf (MSG_NOTE, "\n");
2415 /* After swapping some operands we lost track of whether an
2416 operand has any pattern defs, so be conservative here. */
2417 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2418 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2419 /* And try again with scratch 'matches' ... */
2420 bool *tem = XALLOCAVEC (bool, group_size);
2421 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2422 group_size, &this_max_nunits,
2423 tem, limit,
2424 &this_tree_size, bst_map)) != NULL)
2426 oprnd_info->def_stmts = vNULL;
2427 children.safe_push (child);
2428 continue;
2431 fail:
2433 /* If the SLP build failed and we analyze a basic-block
2434 simply treat nodes we fail to build as externally defined
2435 (and thus build vectors from the scalar defs).
2436 The cost model will reject outright expensive cases.
2437 ??? This doesn't treat cases where permutation ultimately
2438 fails (or we don't try permutation below). Ideally we'd
2439 even compute a permutation that will end up with the maximum
2440 SLP tree size... */
2441 if (is_a <bb_vec_info> (vinfo)
2442 /* ??? Rejecting patterns this way doesn't work. We'd have to
2443 do extra work to cancel the pattern so the uses see the
2444 scalar version. */
2445 && !is_pattern_stmt_p (stmt_info)
2446 && !oprnd_info->any_pattern)
2448 /* But if there's a leading vector-sized set of matching stmts
2449 fail here so we can split the group. This matches the condition
2450 vect_analyze_slp_instance uses. */
2451 /* ??? We might want to split here and combine the results to support
2452 multiple vector sizes better. */
2453 for (j = 0; j < group_size; ++j)
2454 if (!matches[j])
2455 break;
2456 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "Building vector operands from scalars\n");
2461 this_tree_size++;
2462 child = vect_create_new_slp_node (oprnd_info->ops);
2463 children.safe_push (child);
2464 oprnd_info->ops = vNULL;
2465 continue;
2469 gcc_assert (child == NULL);
2470 FOR_EACH_VEC_ELT (children, j, child)
2471 if (child)
2472 vect_free_slp_tree (child);
2473 vect_free_oprnd_info (oprnds_info);
2474 return NULL;
2477 vect_free_oprnd_info (oprnds_info);
2479 /* If all children of a node are built up from uniform scalars, or
2480 building them needs more than one possibly expensive vector
2481 construction, throw the node away so it is built up from scalars
2482 instead. The exception is the SLP node for the vector store. */
2483 if (is_a <bb_vec_info> (vinfo)
2484 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2485 /* ??? Rejecting patterns this way doesn't work. We'd have to
2486 do extra work to cancel the pattern so the uses see the
2487 scalar version. */
2488 && !is_pattern_stmt_p (stmt_info))
2490 slp_tree child;
2491 unsigned j;
2492 bool all_uniform_p = true;
2493 unsigned n_vector_builds = 0;
2494 FOR_EACH_VEC_ELT (children, j, child)
2496 if (!child)
2498 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2499 all_uniform_p = false;
2500 else if (!vect_slp_tree_uniform_p (child))
2502 all_uniform_p = false;
2503 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2504 n_vector_builds++;
2507 if (all_uniform_p
2508 || n_vector_builds > 1
2509 || (n_vector_builds == children.length ()
2510 && is_a <gphi *> (stmt_info->stmt)))
2512 /* Roll back. */
2513 matches[0] = false;
2514 FOR_EACH_VEC_ELT (children, j, child)
2515 if (child)
2516 vect_free_slp_tree (child);
2518 if (dump_enabled_p ())
2519 dump_printf_loc (MSG_NOTE, vect_location,
2520 "Building parent vector operands from "
2521 "scalars instead\n");
2522 return NULL;
2526 *tree_size += this_tree_size + 1;
2527 *max_nunits = this_max_nunits;
2529 if (two_operators)
2531 /* ??? We'd likely want to either cache in bst_map sth like
2532 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2533 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2534 explicit stmts to put in so the keying on 'stmts' doesn't
2535 work (but we have the same issue with nodes that use 'ops'). */
2536 slp_tree one = new _slp_tree;
2537 slp_tree two = new _slp_tree;
2538 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2539 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2540 SLP_TREE_VECTYPE (one) = vectype;
2541 SLP_TREE_VECTYPE (two) = vectype;
2542 SLP_TREE_CHILDREN (one).safe_splice (children);
2543 SLP_TREE_CHILDREN (two).safe_splice (children);
2544 slp_tree child;
2545 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2546 SLP_TREE_REF_COUNT (child)++;
2548 /* Here we record the original defs since this
2549 node represents the final lane configuration. */
2550 node = vect_create_new_slp_node (node, stmts, 2);
2551 SLP_TREE_VECTYPE (node) = vectype;
2552 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2553 SLP_TREE_CHILDREN (node).quick_push (one);
2554 SLP_TREE_CHILDREN (node).quick_push (two);
2555 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2556 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2557 enum tree_code ocode = ERROR_MARK;
2558 stmt_vec_info ostmt_info;
2559 unsigned j = 0;
2560 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2562 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2563 if (gimple_assign_rhs_code (ostmt) != code0)
2565 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2566 ocode = gimple_assign_rhs_code (ostmt);
2567 j = i;
2569 else
2570 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2572 SLP_TREE_CODE (one) = code0;
2573 SLP_TREE_CODE (two) = ocode;
2574 SLP_TREE_LANES (one) = stmts.length ();
2575 SLP_TREE_LANES (two) = stmts.length ();
2576 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2577 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2578 return node;
2581 node = vect_create_new_slp_node (node, stmts, nops);
2582 SLP_TREE_VECTYPE (node) = vectype;
2583 SLP_TREE_CHILDREN (node).splice (children);
2584 return node;
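/* As an illustration (values hypothetical): for a two_operators group
   { x0 + y0, x1 - y1, x2 + y2, x3 - y3 } the code above builds one
   child computing all PLUS lanes and one computing all MINUS lanes and
   blends them with a VEC_PERM_EXPR node whose lane permutation is
   { 0[0], 1[1], 0[2], 1[3] }, i.e. lane i is taken from child 0 (PLUS)
   or child 1 (MINUS) according to the scalar operation in that lane.  */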
2587 /* Dump a single SLP tree NODE. */
2589 static void
2590 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2591 slp_tree node)
2593 unsigned i, j;
2594 slp_tree child;
2595 stmt_vec_info stmt_info;
2596 tree op;
2598 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2599 dump_user_location_t user_loc = loc.get_user_location ();
2600 dump_printf_loc (metadata, user_loc,
2601 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2602 ", refcnt=%u)",
2603 SLP_TREE_DEF_TYPE (node) == vect_external_def
2604 ? " (external)"
2605 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2606 ? " (constant)"
2607 : ""), (void *) node,
2608 estimated_poly_value (node->max_nunits),
2609 SLP_TREE_REF_COUNT (node));
2610 if (SLP_TREE_VECTYPE (node))
2611 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2612 dump_printf (metadata, "\n");
2613 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2615 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2616 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2617 else
2618 dump_printf_loc (metadata, user_loc, "op template: %G",
2619 SLP_TREE_REPRESENTATIVE (node)->stmt);
2621 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2622 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2623 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2624 else
2626 dump_printf_loc (metadata, user_loc, "\t{ ");
2627 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2628 dump_printf (metadata, "%T%s ", op,
2629 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2630 dump_printf (metadata, "}\n");
2632 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2634 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2635 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2636 dump_printf (dump_kind, " %u", j);
2637 dump_printf (dump_kind, " }\n");
2639 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2641 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2642 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2643 dump_printf (dump_kind, " %u[%u]",
2644 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2645 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2646 dump_printf (dump_kind, " }\n");
2648 if (SLP_TREE_CHILDREN (node).is_empty ())
2649 return;
2650 dump_printf_loc (metadata, user_loc, "\tchildren");
2651 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2652 dump_printf (dump_kind, " %p", (void *)child);
2653 dump_printf (dump_kind, "\n");
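/* The dump produced above looks roughly like the following (pointers,
   statements and counts are purely illustrative):

     node 0x47a28e0 (max_nunits=4, refcnt=2) vector(4) int
     op template: _5 = _3 + _4;
        stmt 0 _5 = _3 + _4;
        stmt 1 _8 = _6 + _7;
        children 0x47a2970 0x47a2a00  */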
2656 DEBUG_FUNCTION void
2657 debug (slp_tree node)
2659 debug_dump_context ctx;
2660 vect_print_slp_tree (MSG_NOTE,
2661 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2662 node);
2665 /* Recursive helper for the dot producer below. */
2667 static void
2668 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2670 if (visited.add (node))
2671 return;
2673 fprintf (f, "\"%p\" [label=\"", (void *)node);
2674 vect_print_slp_tree (MSG_NOTE,
2675 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2676 node);
2677 fprintf (f, "\"];\n");
2680 for (slp_tree child : SLP_TREE_CHILDREN (node))
2681 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2683 for (slp_tree child : SLP_TREE_CHILDREN (node))
2684 if (child)
2685 dot_slp_tree (f, child, visited);
2688 DEBUG_FUNCTION void
2689 dot_slp_tree (const char *fname, slp_tree node)
2691 FILE *f = fopen (fname, "w");
2692 fprintf (f, "digraph {\n");
2693 fflush (f);
2695 debug_dump_context ctx (f);
2696 hash_set<slp_tree> visited;
2697 dot_slp_tree (f, node, visited);
2699 fflush (f);
2700 fprintf (f, "}\n");
2701 fclose (f);
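/* The file-writing overload above is most useful from the debugger,
   e.g. (file name illustrative)

     (gdb) call dot_slp_tree ("/tmp/slp.dot", slp_node)

   which produces a digraph whose vertices are labelled with the
   vect_print_slp_tree dump of each node and whose edges follow
   SLP_TREE_CHILDREN, viewable with dot -Tpdf or similar.  */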
2704 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2706 static void
2707 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2708 slp_tree node, hash_set<slp_tree> &visited)
2710 unsigned i;
2711 slp_tree child;
2713 if (visited.add (node))
2714 return;
2716 vect_print_slp_tree (dump_kind, loc, node);
2718 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2719 if (child)
2720 vect_print_slp_graph (dump_kind, loc, child, visited);
2723 static void
2724 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2725 slp_tree entry)
2727 hash_set<slp_tree> visited;
2728 vect_print_slp_graph (dump_kind, loc, entry, visited);
2731 /* Mark the tree rooted at NODE with PURE_SLP. */
2733 static void
2734 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2736 int i;
2737 stmt_vec_info stmt_info;
2738 slp_tree child;
2740 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2741 return;
2743 if (visited.add (node))
2744 return;
2746 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2747 STMT_SLP_TYPE (stmt_info) = pure_slp;
2749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2750 if (child)
2751 vect_mark_slp_stmts (child, visited);
2754 static void
2755 vect_mark_slp_stmts (slp_tree node)
2757 hash_set<slp_tree> visited;
2758 vect_mark_slp_stmts (node, visited);
2761 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2763 static void
2764 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2766 int i;
2767 stmt_vec_info stmt_info;
2768 slp_tree child;
2770 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2771 return;
2773 if (visited.add (node))
2774 return;
2776 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2778 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2779 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2780 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2783 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2784 if (child)
2785 vect_mark_slp_stmts_relevant (child, visited);
2788 static void
2789 vect_mark_slp_stmts_relevant (slp_tree node)
2791 hash_set<slp_tree> visited;
2792 vect_mark_slp_stmts_relevant (node, visited);
2796 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2798 static void
2799 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2800 hash_set<slp_tree> &visited)
2802 if (!node || visited.add (node))
2803 return;
2805 if (SLP_TREE_CHILDREN (node).length () == 0)
2807 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2808 return;
2809 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2810 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2811 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2812 loads.safe_push (node);
2814 else
2816 unsigned i;
2817 slp_tree child;
2818 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2819 vect_gather_slp_loads (loads, child, visited);
2824 /* Find the last scalar stmt in NODE. */
2826 stmt_vec_info
2827 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2829 stmt_vec_info last = NULL;
2830 stmt_vec_info stmt_vinfo;
2832 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2834 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2835 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2838 return last;
2841 /* Find the first stmt in NODE. */
2843 stmt_vec_info
2844 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2846 stmt_vec_info first = NULL;
2847 stmt_vec_info stmt_vinfo;
2849 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2851 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2852 if (!first
2853 || get_later_stmt (stmt_vinfo, first) == first)
2854 first = stmt_vinfo;
2857 return first;
2860 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2861 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2862 (also containing the first GROUP1_SIZE stmts, since stores are
2863 consecutive), the second containing the remainder.
2864 Return the first stmt in the second group. */
2866 static stmt_vec_info
2867 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2869 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2870 gcc_assert (group1_size > 0);
2871 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2872 gcc_assert (group2_size > 0);
2873 DR_GROUP_SIZE (first_vinfo) = group1_size;
2875 stmt_vec_info stmt_info = first_vinfo;
2876 for (unsigned i = group1_size; i > 1; i--)
2878 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2879 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2881 /* STMT is now the last element of the first group. */
2882 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2883 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2885 DR_GROUP_SIZE (group2) = group2_size;
2886 for (stmt_info = group2; stmt_info;
2887 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2889 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2890 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2893 /* For the second group, the DR_GROUP_GAP is that before the original group,
2894 plus skipping over the first group. */
2895 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2897 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2898 DR_GROUP_GAP (first_vinfo) += group2_size;
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2902 group1_size, group2_size);
2904 return group2;
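/* For example (numbers illustrative): splitting a store group of
   DR_GROUP_SIZE 8 and DR_GROUP_GAP 0 with GROUP1_SIZE 4 leaves the
   first group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 0 + 4 (it now has
   to step over the second group's elements) and creates a second group
   with DR_GROUP_SIZE 4 and DR_GROUP_GAP 0 + 4 (stepping over the first
   group).  */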
2907 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2908 statements and a vector of NUNITS elements. */
2910 static poly_uint64
2911 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2913 return exact_div (common_multiple (nunits, group_size), group_size);
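/* E.g. with NUNITS 4 and GROUP_SIZE 6 this computes
   common_multiple (4, 6) / 6 = 12 / 6 = 2, i.e. two copies of the group
   are needed to fill whole vectors, while with GROUP_SIZE 8 the result
   is 8 / 8 = 1 and no unrolling is required.  */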
2916 /* Helper that checks to see if a node is a load node. */
2918 static inline bool
2919 vect_is_slp_load_node (slp_tree root)
2921 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2922 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2923 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2927 /* Helper function of optimize_load_redistribution that performs the operation
2928 recursively. */
2930 static slp_tree
2931 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2932 vec_info *vinfo, unsigned int group_size,
2933 hash_map<slp_tree, slp_tree> *load_map,
2934 slp_tree root)
2936 if (slp_tree *leader = load_map->get (root))
2937 return *leader;
2939 slp_tree node;
2940 unsigned i;
2942 /* For now, we don't know anything about externals so do not do anything. */
2943 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2944 return NULL;
2945 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2947 /* First convert this node into a load node and add it to the leaves
2948 list, flattening the lane permute into a load permutation. If it's
2949 unneeded it will be elided later. */
2950 vec<stmt_vec_info> stmts;
2951 stmts.create (SLP_TREE_LANES (root));
2952 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2953 for (unsigned j = 0; j < lane_perm.length (); j++)
2955 std::pair<unsigned, unsigned> perm = lane_perm[j];
2956 node = SLP_TREE_CHILDREN (root)[perm.first];
2958 if (!vect_is_slp_load_node (node)
2959 || SLP_TREE_CHILDREN (node).exists ())
2961 stmts.release ();
2962 goto next;
2965 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE, vect_location,
2970 "converting stmts on permute node %p\n",
2971 (void *) root);
2973 bool *matches = XALLOCAVEC (bool, group_size);
2974 poly_uint64 max_nunits = 1;
2975 unsigned tree_size = 0, limit = 1;
2976 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2977 matches, &limit, &tree_size, bst_map);
2978 if (!node)
2979 stmts.release ();
2981 load_map->put (root, node);
2982 return node;
2985 next:
2986 load_map->put (root, NULL);
2988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2990 slp_tree value
2991 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2992 node);
2993 if (value)
2995 SLP_TREE_REF_COUNT (value)++;
2996 SLP_TREE_CHILDREN (root)[i] = value;
2997 /* ??? We know the original leafs of the replaced nodes will
2998 be referenced by bst_map, only the permutes created by
2999 pattern matching are not. */
3000 if (SLP_TREE_REF_COUNT (node) == 1)
3001 load_map->remove (node);
3002 vect_free_slp_tree (node);
3006 return NULL;
3009 /* Temporary workaround for loads not being CSEd during SLP build. This
3010 function will traverse the SLP tree rooted in ROOT and find
3011 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3012 same DR such that the final operation is equal to a permuted load. Such
3013 NODES are then directly converted into LOADS themselves. The nodes are
3014 CSEd using BST_MAP. */
3016 static void
3017 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3018 vec_info *vinfo, unsigned int group_size,
3019 hash_map<slp_tree, slp_tree> *load_map,
3020 slp_tree root)
3022 slp_tree node;
3023 unsigned i;
3025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3027 slp_tree value
3028 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3029 node);
3030 if (value)
3032 SLP_TREE_REF_COUNT (value)++;
3033 SLP_TREE_CHILDREN (root)[i] = value;
3034 /* ??? We know the original leafs of the replaced nodes will
3035 be referenced by bst_map, only the permutes created by
3036 pattern matching are not. */
3037 if (SLP_TREE_REF_COUNT (node) == 1)
3038 load_map->remove (node);
3039 vect_free_slp_tree (node);
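/* Sketch of the transform (node names illustrative): a pattern-created

     VEC_PERM { load_a[1], load_b[0] }

   where load_a and load_b are leaf load nodes reading from the same
   interleaved access is re-discovered, via vect_build_slp_tree on the
   referenced scalar stmts, as a single load node whose load permutation
   encodes the two lanes' positions in the interleaving chain.  */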
3044 /* Helper function of vect_match_slp_patterns.
3046 Attempts to match patterns against the slp tree rooted in REF_NODE using
3047 VINFO. Patterns are matched in post-order traversal.
3049 If any pattern matches, the matched nodes are updated in place and true is
3050 returned; otherwise the tree is left unchanged and false is returned. */
3052 static bool
3053 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3054 slp_tree_to_load_perm_map_t *perm_cache,
3055 slp_compat_nodes_map_t *compat_cache,
3056 hash_set<slp_tree> *visited)
3058 unsigned i;
3059 slp_tree node = *ref_node;
3060 bool found_p = false;
3061 if (!node || visited->add (node))
3062 return false;
3064 slp_tree child;
3065 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3066 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3067 vinfo, perm_cache, compat_cache,
3068 visited);
3070 for (unsigned x = 0; x < num__slp_patterns; x++)
3072 vect_pattern *pattern
3073 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3074 if (pattern)
3076 pattern->build (vinfo);
3077 delete pattern;
3078 found_p = true;
3082 return found_p;
3085 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3086 vec_info VINFO.
3088 Returns true if any pattern matched, modifying the tree in place. Patterns
3089 are tried in order and multiple patterns may match. */
3091 static bool
3092 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3093 hash_set<slp_tree> *visited,
3094 slp_tree_to_load_perm_map_t *perm_cache,
3095 slp_compat_nodes_map_t *compat_cache)
3097 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3098 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 "Analyzing SLP tree %p for patterns\n",
3103 (void *) SLP_INSTANCE_TREE (instance));
3105 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3106 visited);
3109 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3110 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3111 Return true if we could use IFN_STORE_LANES instead and if that appears
3112 to be the better approach. */
3114 static bool
3115 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3116 unsigned int group_size,
3117 unsigned int new_group_size)
3119 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3120 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3121 if (!vectype)
3122 return false;
3123 /* Allow the split if one of the two new groups would operate on full
3124 vectors *within* rather than across one scalar loop iteration.
3125 This is purely a heuristic, but it should work well for group
3126 sizes of 3 and 4, where the possible splits are:
3128 3->2+1: OK if the vector has exactly two elements
3129 4->2+2: Likewise
3130 4->3+1: Less clear-cut. */
3131 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3132 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3133 return false;
3134 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
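/* Concretely: splitting a group of 4 into 2 + 2 with a two-element
   vector type never prefers store-lanes, since each half then fills a
   whole vector and one of the multiple_p tests above fires.  Splitting
   a group of 3 into 2 + 1 with a four-element vector type leaves
   neither part filling a vector, so store-lanes is preferred whenever
   vect_store_lanes_supported reports an IFN for that vector type and
   group size.  */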
3137 /* Analyze an SLP instance starting from a group of grouped stores. Call
3138 vect_build_slp_tree to build a tree of packed stmts if possible.
3139 Return FALSE if it's impossible to SLP any stmt in the loop. */
3141 static bool
3142 vect_analyze_slp_instance (vec_info *vinfo,
3143 scalar_stmts_to_slp_tree_map_t *bst_map,
3144 stmt_vec_info stmt_info, slp_instance_kind kind,
3145 unsigned max_tree_size, unsigned *limit);
3147 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3148 of KIND. Return true if successful. */
3150 static bool
3151 vect_build_slp_instance (vec_info *vinfo,
3152 slp_instance_kind kind,
3153 vec<stmt_vec_info> &scalar_stmts,
3154 vec<stmt_vec_info> &root_stmt_infos,
3155 vec<tree> &remain,
3156 unsigned max_tree_size, unsigned *limit,
3157 scalar_stmts_to_slp_tree_map_t *bst_map,
3158 /* ??? We need stmt_info for group splitting. */
3159 stmt_vec_info stmt_info_)
3161 if (kind == slp_inst_kind_ctor)
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "Analyzing vectorizable constructor: %G\n",
3166 root_stmt_infos[0]->stmt);
3169 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE, vect_location,
3172 "Starting SLP discovery for\n");
3173 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 " %G", scalar_stmts[i]->stmt);
3178 /* When a BB reduction doesn't have an even number of lanes
3179 strip it down, treating the remaining lane as scalar.
3180 ??? Selecting the optimal set of lanes to vectorize would be nice
3181 but SLP build for all lanes will fail quickly because we think
3182 we're going to need unrolling. */
3183 if (kind == slp_inst_kind_bb_reduc
3184 && (scalar_stmts.length () & 1))
3185 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3187 /* Build the tree for the SLP instance. */
3188 unsigned int group_size = scalar_stmts.length ();
3189 bool *matches = XALLOCAVEC (bool, group_size);
3190 poly_uint64 max_nunits = 1;
3191 unsigned tree_size = 0;
3192 unsigned i;
3193 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3194 &max_nunits, matches, limit,
3195 &tree_size, bst_map);
3196 if (node != NULL)
3198 /* Calculate the unrolling factor based on the smallest type. */
3199 poly_uint64 unrolling_factor
3200 = calculate_unrolling_factor (max_nunits, group_size);
3202 if (maybe_ne (unrolling_factor, 1U)
3203 && is_a <bb_vec_info> (vinfo))
3205 unsigned HOST_WIDE_INT const_max_nunits;
3206 if (!max_nunits.is_constant (&const_max_nunits)
3207 || const_max_nunits > group_size)
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3211 "Build SLP failed: store group "
3212 "size not a multiple of the vector size "
3213 "in basic block SLP\n");
3214 vect_free_slp_tree (node);
3215 return false;
3217 /* Fatal mismatch. */
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "SLP discovery succeeded but node needs "
3221 "splitting\n");
3222 memset (matches, true, group_size);
3223 matches[group_size / const_max_nunits * const_max_nunits] = false;
3224 vect_free_slp_tree (node);
3226 else
3228 /* Create a new SLP instance. */
3229 slp_instance new_instance = XNEW (class _slp_instance);
3230 SLP_INSTANCE_TREE (new_instance) = node;
3231 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3232 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3233 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3234 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3235 SLP_INSTANCE_KIND (new_instance) = kind;
3236 new_instance->reduc_phis = NULL;
3237 new_instance->cost_vec = vNULL;
3238 new_instance->subgraph_entries = vNULL;
3240 if (dump_enabled_p ())
3241 dump_printf_loc (MSG_NOTE, vect_location,
3242 "SLP size %u vs. limit %u.\n",
3243 tree_size, max_tree_size);
3245 /* Fixup SLP reduction chains. */
3246 if (kind == slp_inst_kind_reduc_chain)
3248 /* If this is a reduction chain with a conversion in front
3249 amend the SLP tree with a node for that. */
3250 gimple *scalar_def
3251 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3252 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3254 /* Get at the conversion stmt - we know it's the single use
3255 of the last stmt of the reduction chain. */
3256 use_operand_p use_p;
3257 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3258 &use_p, &scalar_def);
3259 gcc_assert (r);
3260 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3261 next_info = vect_stmt_to_vectorize (next_info);
3262 scalar_stmts = vNULL;
3263 scalar_stmts.create (group_size);
3264 for (unsigned i = 0; i < group_size; ++i)
3265 scalar_stmts.quick_push (next_info);
3266 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3267 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3268 SLP_TREE_CHILDREN (conv).quick_push (node);
3269 SLP_INSTANCE_TREE (new_instance) = conv;
3270 /* We also have to fake this conversion stmt as SLP reduction
3271 group so we don't have to mess with too much code
3272 elsewhere. */
3273 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3274 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3276 /* Fill the backedge child of the PHI SLP node. The
3277 general matching code cannot find it because the
3278 scalar code does not reflect how we vectorize the
3279 reduction. */
3280 use_operand_p use_p;
3281 imm_use_iterator imm_iter;
3282 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3283 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3284 gimple_get_lhs (scalar_def))
3285 /* There are exactly two non-debug uses, the reduction
3286 PHI and the loop-closed PHI node. */
3287 if (!is_gimple_debug (USE_STMT (use_p))
3288 && gimple_bb (USE_STMT (use_p)) == loop->header)
3290 auto_vec<stmt_vec_info, 64> phis (group_size);
3291 stmt_vec_info phi_info
3292 = vinfo->lookup_stmt (USE_STMT (use_p));
3293 for (unsigned i = 0; i < group_size; ++i)
3294 phis.quick_push (phi_info);
3295 slp_tree *phi_node = bst_map->get (phis);
3296 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3297 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3298 = SLP_INSTANCE_TREE (new_instance);
3299 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3303 vinfo->slp_instances.safe_push (new_instance);
3305 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3306 the number of scalar stmts in the root in a few places.
3307 Verify that assumption holds. */
3308 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3309 .length () == group_size);
3311 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "Final SLP tree for instance %p:\n",
3315 (void *) new_instance);
3316 vect_print_slp_graph (MSG_NOTE, vect_location,
3317 SLP_INSTANCE_TREE (new_instance));
3320 return true;
3323 else
3325 /* Failed to SLP. */
3326 /* Free the allocated memory. */
3327 scalar_stmts.release ();
3330 stmt_vec_info stmt_info = stmt_info_;
3331 /* Try to break the group up into pieces. */
3332 if (kind == slp_inst_kind_store)
3334 /* ??? We could delay all the actual splitting of store-groups
3335 until after SLP discovery of the original group completed.
3336 Then we can recurse to vect_build_slp_instance directly. */
3337 for (i = 0; i < group_size; i++)
3338 if (!matches[i])
3339 break;
3341 /* For basic block SLP, try to break the group up into multiples of
3342 a vector size. */
3343 if (is_a <bb_vec_info> (vinfo)
3344 && (i > 1 && i < group_size))
3346 tree scalar_type
3347 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3348 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3349 1 << floor_log2 (i));
3350 unsigned HOST_WIDE_INT const_nunits;
3351 if (vectype
3352 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3354 /* Split into two groups at the first vector boundary. */
3355 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3356 unsigned group1_size = i & ~(const_nunits - 1);
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "Splitting SLP group at stmt %u\n", i);
3361 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3362 group1_size);
3363 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3364 kind, max_tree_size,
3365 limit);
3366 /* Split the rest at the failure point and possibly
3367 re-analyze the remaining matching part if it has
3368 at least two lanes. */
3369 if (group1_size < i
3370 && (i + 1 < group_size
3371 || i - group1_size > 1))
3373 stmt_vec_info rest2 = rest;
3374 rest = vect_split_slp_store_group (rest, i - group1_size);
3375 if (i - group1_size > 1)
3376 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3377 kind, max_tree_size,
3378 limit);
3380 /* Re-analyze the non-matching tail if it has at least
3381 two lanes. */
3382 if (i + 1 < group_size)
3383 res |= vect_analyze_slp_instance (vinfo, bst_map,
3384 rest, kind, max_tree_size,
3385 limit);
3386 return res;
3390 /* For loop vectorization split into arbitrary pieces of size > 1. */
3391 if (is_a <loop_vec_info> (vinfo)
3392 && (i > 1 && i < group_size)
3393 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3395 unsigned group1_size = i;
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "Splitting SLP group at stmt %u\n", i);
3401 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3402 group1_size);
3403 /* Loop vectorization cannot handle gaps in stores, make sure
3404 the split group appears as strided. */
3405 STMT_VINFO_STRIDED_P (rest) = 1;
3406 DR_GROUP_GAP (rest) = 0;
3407 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3408 DR_GROUP_GAP (stmt_info) = 0;
3410 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3411 kind, max_tree_size, limit);
3412 if (i + 1 < group_size)
3413 res |= vect_analyze_slp_instance (vinfo, bst_map,
3414 rest, kind, max_tree_size, limit);
3416 return res;
3419 /* Even though the first vector did not all match, we might be able to SLP
3420 (some) of the remainder. FORNOW ignore this possibility. */
3423 /* Failed to SLP. */
3424 if (dump_enabled_p ())
3425 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3426 return false;
3430 /* Analyze an SLP instance starting from a group of grouped stores. Call
3431 vect_build_slp_tree to build a tree of packed stmts if possible.
3432 Return FALSE if it's impossible to SLP any stmt in the loop. */
3434 static bool
3435 vect_analyze_slp_instance (vec_info *vinfo,
3436 scalar_stmts_to_slp_tree_map_t *bst_map,
3437 stmt_vec_info stmt_info,
3438 slp_instance_kind kind,
3439 unsigned max_tree_size, unsigned *limit)
3441 unsigned int i;
3442 vec<stmt_vec_info> scalar_stmts;
3444 if (is_a <bb_vec_info> (vinfo))
3445 vect_location = stmt_info->stmt;
3447 stmt_vec_info next_info = stmt_info;
3448 if (kind == slp_inst_kind_store)
3450 /* Collect the stores and store them in scalar_stmts. */
3451 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3452 while (next_info)
3454 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3455 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3458 else if (kind == slp_inst_kind_reduc_chain)
3460 /* Collect the reduction stmts and store them in scalar_stmts. */
3461 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3462 while (next_info)
3464 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3465 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3467 /* Mark the first element of the reduction chain as reduction to properly
3468 transform the node. In the reduction analysis phase only the last
3469 element of the chain is marked as reduction. */
3470 STMT_VINFO_DEF_TYPE (stmt_info)
3471 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3472 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3473 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3475 else if (kind == slp_inst_kind_reduc_group)
3477 /* Collect reduction statements. */
3478 const vec<stmt_vec_info> &reductions
3479 = as_a <loop_vec_info> (vinfo)->reductions;
3480 scalar_stmts.create (reductions.length ());
3481 for (i = 0; reductions.iterate (i, &next_info); i++)
3482 if ((STMT_VINFO_RELEVANT_P (next_info)
3483 || STMT_VINFO_LIVE_P (next_info))
3484 /* ??? Make sure we didn't skip a conversion around a reduction
3485 path. In that case we'd have to reverse engineer that conversion
3486 stmt following the chain using reduc_idx and from the PHI
3487 using reduc_def. */
3488 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3489 scalar_stmts.quick_push (next_info);
3490 /* If less than two were relevant/live there's nothing to SLP. */
3491 if (scalar_stmts.length () < 2)
3492 return false;
3494 else
3495 gcc_unreachable ();
3497 vec<stmt_vec_info> roots = vNULL;
3498 vec<tree> remain = vNULL;
3499 /* Build the tree for the SLP instance. */
3500 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3501 roots, remain,
3502 max_tree_size, limit, bst_map,
3503 kind == slp_inst_kind_store
3504 ? stmt_info : NULL);
3506 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3507 where we should do store group splitting. */
3509 return res;
3512 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3513 trees of packed scalar stmts if SLP is possible. */
3515 opt_result
3516 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3518 unsigned int i;
3519 stmt_vec_info first_element;
3520 slp_instance instance;
3522 DUMP_VECT_SCOPE ("vect_analyze_slp");
3524 unsigned limit = max_tree_size;
3526 scalar_stmts_to_slp_tree_map_t *bst_map
3527 = new scalar_stmts_to_slp_tree_map_t ();
3529 /* Find SLP sequences starting from groups of grouped stores. */
3530 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3531 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3532 slp_inst_kind_store, max_tree_size, &limit);
3534 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3536 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3538 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3539 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3540 bb_vinfo->roots[i].stmts,
3541 bb_vinfo->roots[i].roots,
3542 bb_vinfo->roots[i].remain,
3543 max_tree_size, &limit, bst_map, NULL))
3545 bb_vinfo->roots[i].stmts = vNULL;
3546 bb_vinfo->roots[i].roots = vNULL;
3547 bb_vinfo->roots[i].remain = vNULL;
3552 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3554 /* Find SLP sequences starting from reduction chains. */
3555 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3556 if (! STMT_VINFO_RELEVANT_P (first_element)
3557 && ! STMT_VINFO_LIVE_P (first_element))
3559 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3560 slp_inst_kind_reduc_chain,
3561 max_tree_size, &limit))
3563 /* Dissolve reduction chain group. */
3564 stmt_vec_info vinfo = first_element;
3565 stmt_vec_info last = NULL;
3566 while (vinfo)
3568 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3569 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3570 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3571 last = vinfo;
3572 vinfo = next;
3574 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3575 /* It can still be vectorized as part of an SLP reduction. */
3576 loop_vinfo->reductions.safe_push (last);
3579 /* Find SLP sequences starting from groups of reductions. */
3580 if (loop_vinfo->reductions.length () > 1)
3581 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3582 slp_inst_kind_reduc_group, max_tree_size,
3583 &limit);
3586 hash_set<slp_tree> visited_patterns;
3587 slp_tree_to_load_perm_map_t perm_cache;
3588 slp_compat_nodes_map_t compat_cache;
3590 /* See if any patterns can be found in the SLP tree. */
3591 bool pattern_found = false;
3592 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3593 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3594 &visited_patterns, &perm_cache,
3595 &compat_cache);
3597 /* If any were found optimize permutations of loads. */
3598 if (pattern_found)
3600 hash_map<slp_tree, slp_tree> load_map;
3601 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3603 slp_tree root = SLP_INSTANCE_TREE (instance);
3604 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3605 &load_map, root);
3611 /* The map keeps a reference on SLP nodes built, release that. */
3612 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3613 it != bst_map->end (); ++it)
3614 if ((*it).second)
3615 vect_free_slp_tree ((*it).second);
3616 delete bst_map;
3618 if (pattern_found && dump_enabled_p ())
3620 dump_printf_loc (MSG_NOTE, vect_location,
3621 "Pattern matched SLP tree\n");
3622 hash_set<slp_tree> visited;
3623 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3624 vect_print_slp_graph (MSG_NOTE, vect_location,
3625 SLP_INSTANCE_TREE (instance), visited);
3628 return opt_result::success ();
3631 /* Estimates the cost of inserting layout changes into the SLP graph.
3632 It can also say that the insertion is impossible. */
3634 struct slpg_layout_cost
3636 slpg_layout_cost () = default;
3637 slpg_layout_cost (sreal, bool);
3639 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3640 bool is_possible () const { return depth != sreal::max (); }
3642 bool operator== (const slpg_layout_cost &) const;
3643 bool operator!= (const slpg_layout_cost &) const;
3645 bool is_better_than (const slpg_layout_cost &, bool) const;
3647 void add_parallel_cost (const slpg_layout_cost &);
3648 void add_serial_cost (const slpg_layout_cost &);
3649 void split (unsigned int);
3651 /* The longest sequence of layout changes needed during any traversal
3652 of the partition dag, weighted by execution frequency.
3654 This is the most important metric when optimizing for speed, since
3655 it helps to ensure that we keep the number of operations on
3656 critical paths to a minimum. */
3657 sreal depth = 0;
3659 /* An estimate of the total number of operations needed. It is weighted by
3660 execution frequency when optimizing for speed but not when optimizing for
3661 size. In order to avoid double-counting, a node with a fanout of N will
3662 distribute 1/N of its total cost to each successor.
3664 This is the most important metric when optimizing for size, since
3665 it helps to keep the total number of operations to a minimum. */
3666 sreal total = 0;
3669 /* Construct costs for a node with weight WEIGHT. A higher weight
3670 indicates more frequent execution. IS_FOR_SIZE is true if we are
3671 optimizing for size rather than speed. */
3673 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3674 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
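/* For instance, slpg_layout_cost (10, false) gives { depth 10, total 10 }
   (optimizing for speed), while slpg_layout_cost (10, true) gives
   { depth 10, total 1 } (optimizing for size), since for size only the
   number of layout changes is counted, not how often they execute.  */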
3678 bool
3679 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3681 return depth == other.depth && total == other.total;
3684 bool
3685 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3687 return !operator== (other);
3690 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3691 true if we are optimizing for size rather than speed. */
3693 bool
3694 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3695 bool is_for_size) const
3697 if (is_for_size)
3699 if (total != other.total)
3700 return total < other.total;
3701 return depth < other.depth;
3703 else
3705 if (depth != other.depth)
3706 return depth < other.depth;
3707 return total < other.total;
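/* For instance, when optimizing for speed a cost of { depth 2, total 10 }
   is better than { depth 3, total 1 } because depth is compared first,
   whereas when optimizing for size the comparison goes the other way.  */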
3711 /* Increase the costs to account for something with cost INPUT_COST
3712 happening in parallel with the current costs. */
3714 void
3715 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3717 depth = std::max (depth, input_cost.depth);
3718 total += input_cost.total;
3721 /* Increase the costs to account for something with cost INPUT_COST
3722 happening in series with the current costs. */
3724 void
3725 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3727 depth += other.depth;
3728 total += other.total;
3731 /* Split the total cost among TIMES successors or predecessors. */
3733 void
3734 slpg_layout_cost::split (unsigned int times)
3736 if (times > 1)
3737 total /= times;
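/* As an illustration of how the primitives above combine when optimizing
   for speed (so that TOTAL is frequency-weighted like DEPTH):

     slpg_layout_cost a (2, false), b (2, false);
     a.add_parallel_cost (b);            --> a == { depth 2, total 4 }
     a.add_serial_cost ({ 1, false });   --> a == { depth 3, total 5 }
     a.split (2);                        --> a == { depth 3, total 2.5 }

   Parallel inputs share the depth but accumulate the total, serial steps
   accumulate both, and split spreads the total (but not the depth) over
   multiple consumers.  */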
3740 /* Information about one node in the SLP graph, for use during
3741 vect_optimize_slp_pass. */
3743 struct slpg_vertex
3745 slpg_vertex (slp_tree node_) : node (node_) {}
3747 /* The node itself. */
3748 slp_tree node;
3750 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3751 partitions are flexible; they can have whichever layout consumers
3752 want them to have. */
3753 int partition = -1;
3755 /* The number of nodes that directly use the result of this one
3756 (i.e. the number of nodes that count this one as a child). */
3757 unsigned int out_degree = 0;
3759 /* The execution frequency of the node. */
3760 sreal weight = 0;
3762 /* The total execution frequency of all nodes that directly use the
3763 result of this one. */
3764 sreal out_weight = 0;
3767 /* Information about one partition of the SLP graph, for use during
3768 vect_optimize_slp_pass. */
3770 struct slpg_partition_info
3772 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3773 of m_partitioned_nodes. */
3774 unsigned int node_begin = 0;
3775 unsigned int node_end = 0;
3777 /* Which layout we've chosen to use for this partition, or -1 if
3778 we haven't picked one yet. */
3779 int layout = -1;
3781 /* The number of predecessors and successors in the partition dag.
3782 The predecessors always have lower partition numbers and the
3783 successors always have higher partition numbers.
3785 Note that the directions of these edges are not necessarily the
3786 same as in the data flow graph. For example, if an SCC has separate
3787 partitions for an inner loop and an outer loop, the inner loop's
3788 partition will have at least two incoming edges from the outer loop's
3789 partition: one for a live-in value and one for a live-out value.
3790 In data flow terms, one of these edges would also be from the outer loop
3791 to the inner loop, but the other would be in the opposite direction. */
3792 unsigned int in_degree = 0;
3793 unsigned int out_degree = 0;
3796 /* Information about the costs of using a particular layout for a
3797 particular partition. It can also say that the combination is
3798 impossible. */
3800 struct slpg_partition_layout_costs
3802 bool is_possible () const { return internal_cost.is_possible (); }
3803 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3805 /* The costs inherited from predecessor partitions. */
3806 slpg_layout_cost in_cost;
3808 /* The inherent cost of the layout within the node itself. For example,
3809 this is nonzero for a load if choosing a particular layout would require
3810 the load to permute the loaded elements. It is nonzero for a
3811 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3812 to full-vector moves. */
3813 slpg_layout_cost internal_cost;
3815 /* The costs inherited from successor partitions. */
3816 slpg_layout_cost out_cost;
3819 /* This class tries to optimize the layout of vectors in order to avoid
3820 unnecessary shuffling. At the moment, the set of possible layouts is
3821 restricted to bijective permutations.
3823 The goal of the pass depends on whether we're optimizing for size or
3824 for speed. When optimizing for size, the goal is to reduce the overall
3825 number of layout changes (including layout changes implied by things
3826 like load permutations). When optimizing for speed, the goal is to
3827 reduce the maximum latency attributable to layout changes on any
3828 non-cyclical path through the data flow graph.
3830 For example, when optimizing a loop nest for speed, we will prefer
3831 to make layout changes outside of a loop rather than inside of a loop,
3832 and will prefer to make layout changes in parallel rather than serially,
3833 even if that increases the overall number of layout changes.
3835 The high-level procedure is:
3837 (1) Build a graph in which edges go from uses (parents) to definitions
3838 (children).
3840 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3842 (3) When optimizing for speed, partition the nodes in each SCC based
3843 on their containing cfg loop. When optimizing for size, treat
3844 each SCC as a single partition.
3846 This gives us a dag of partitions. The goal is now to assign a
3847 layout to each partition.
3849 (4) Construct a set of vector layouts that are worth considering.
3850 Record which nodes must keep their current layout.
3852 (5) Perform a forward walk over the partition dag (from loads to stores)
3853 accumulating the "forward" cost of using each layout. When visiting
3854 each partition, assign a tentative choice of layout to the partition
3855 and use that choice when calculating the cost of using a different
3856 layout in successor partitions.
3858 (6) Perform a backward walk over the partition dag (from stores to loads),
3859 accumulating the "backward" cost of using each layout. When visiting
3860 each partition, make a final choice of layout for that partition based
3861 on the accumulated forward costs (from (5)) and backward costs
3862 (from (6)).
3864 (7) Apply the chosen layouts to the SLP graph.
3866 For example, consider the SLP statements:
3868 S1: a_1 = load
3869 loop:
3870 S2: a_2 = PHI<a_1, a_3>
3871 S3: b_1 = load
3872 S4: a_3 = a_2 + b_1
3873 exit:
3874 S5: a_4 = PHI<a_3>
3875 S6: store a_4
3877 S2 and S4 form an SCC and are part of the same loop. Every other
3878 statement is in a singleton SCC. In this example there is a one-to-one
3879 mapping between SCCs and partitions and the partition dag looks like this:
3881      S1    S3
3882       \    /
3883       S2+S4
3884         |
3885        S5
3886         |
3887        S6
3889 S2, S3 and S4 will have a higher execution frequency than the other
3890 statements, so when optimizing for speed, the goal is to avoid any
3891 layout changes:
3893 - within S3
3894 - within S2+S4
3895 - on the S3->S2+S4 edge
3897 For example, if S3 was originally a reversing load, the goal of the
3898 pass is to make it an unreversed load and change the layout on the
3899 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3900 on S1->S2+S4 and S5->S6 would also be acceptable.)
3902 The difference between SCCs and partitions becomes important if we
3903 add an outer loop:
3905 S1: a_1 = ...
3906 loop1:
3907 S2: a_2 = PHI<a_1, a_6>
3908 S3: b_1 = load
3909 S4: a_3 = a_2 + b_1
3910 loop2:
3911 S5: a_4 = PHI<a_3, a_5>
3912 S6: c_1 = load
3913 S7: a_5 = a_4 + c_1
3914 exit2:
3915 S8: a_6 = PHI<a_5>
3916 S9: store a_6
3917 exit1:
3919 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3920 for speed, we usually do not want restrictions in the outer loop to "infect"
3921 the decision for the inner loop. For example, if an outer-loop node
3922 in the SCC contains a statement with a fixed layout, that should not
3923 prevent the inner loop from using a different layout. Conversely,
3924 the inner loop should not dictate a layout to the outer loop: if the
3925 outer loop does a lot of computation, then it may not be efficient to
3926 do all of that computation in the inner loop's preferred layout.
3928 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3929 and S5+S7 (inner). We also try to arrange partitions so that:
3931 - the partition for an outer loop comes before the partition for
3932 an inner loop
3934 - if a sibling loop A dominates a sibling loop B, A's partition
3935 comes before B's
3937 This gives the following partition dag for the example above:
3939      S1    S3
3940       \    /
3941      S2+S4+S8    S6
3942        |    \\   /
3943        |    S5+S7
3944        |
3945       S9
3947 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3948 one for a reversal of the edge S7->S8.
3950 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3951 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3952 preferred layout against the cost of changing the layout on entry to the
3953 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3955 Although this works well when optimizing for speed, it has the downside
3956 when optimizing for size that the choice of layout for S5+S7 is completely
3957 independent of S9, which lessens the chance of reducing the overall number
3958 of permutations. We therefore do not partition SCCs when optimizing
3959 for size.
3961 To give a concrete example of the difference between optimizing
3962 for size and speed, consider:
3964 a[0] = (b[1] << c[3]) - d[1];
3965 a[1] = (b[0] << c[2]) - d[0];
3966 a[2] = (b[3] << c[1]) - d[3];
3967 a[3] = (b[2] << c[0]) - d[2];
3969 There are three different layouts here: one for a, one for b and d,
3970 and one for c. When optimizing for speed it is better to permute each
3971 of b, c and d into the order required by a, since those permutations
3972 happen in parallel. But when optimizing for size, it is better to:
3974 - permute c into the same order as b
3975 - do the arithmetic
3976 - permute the result into the order required by a
3978 This gives 2 permutations rather than 3. */
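/* Concretely, a size-optimized sequence for that example could be
   (illustrative pseudo statements only):

     cp = { c[2], c[3], c[0], c[1] }    <-- permute c into b's and d's order
     t  = (b << cp) - d                 <-- arithmetic in b's and d's order
     a  = { t[1], t[0], t[3], t[2] }    <-- permute the result into a's order

   which uses the two permutations CP and A, whereas the speed-optimized
   sequence permutes each of b, c and d into a's order up front.  */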
3980 class vect_optimize_slp_pass
3982 public:
3983 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3984 void run ();
3986 private:
3987 /* Graph building. */
3988 struct loop *containing_loop (slp_tree);
3989 bool is_cfg_latch_edge (graph_edge *);
3990 void build_vertices (hash_set<slp_tree> &, slp_tree);
3991 void build_vertices ();
3992 void build_graph ();
3994 /* Partitioning. */
3995 void create_partitions ();
3996 template<typename T> void for_each_partition_edge (unsigned int, T);
3998 /* Layout selection. */
3999 bool is_compatible_layout (slp_tree, unsigned int);
4000 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4001 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4002 unsigned int);
4003 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4004 int, unsigned int);
4005 int internal_node_cost (slp_tree, int, unsigned int);
4006 void start_choosing_layouts ();
4008 /* Cost propagation. */
4009 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4010 unsigned int, unsigned int);
4011 slpg_layout_cost total_in_cost (unsigned int);
4012 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4013 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4014 void forward_pass ();
4015 void backward_pass ();
4017 /* Rematerialization. */
4018 slp_tree get_result_with_layout (slp_tree, unsigned int);
4019 void materialize ();
4021 /* Clean-up. */
4022 void remove_redundant_permutations ();
4024 void dump ();
4026 vec_info *m_vinfo;
4028 /* True if we should optimize the graph for size, false if we should
4029 optimize it for speed. (It wouldn't be easy to make this decision
4030 more locally.) */
4031 bool m_optimize_size;
4033 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4034 In other words, a node's predecessors are its slp_tree parents and
4035 a node's successors are its slp_tree children. */
4036 graph *m_slpg = nullptr;
4038 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4039 auto_vec<slpg_vertex> m_vertices;
4041 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4042 and loads. */
4043 auto_vec<int> m_leafs;
4045 /* This array has one entry for every vector layout that we're considering.
4046 Element 0 is null and indicates "no change". Other entries describe
4047 permutations that are inherent in the current graph and that we would
4048 like to reverse if possible.
4050 For example, a permutation { 1, 2, 3, 0 } means that something has
4051 effectively been permuted in that way, such as a load group
4052 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4053 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4054 in order to put things "back" in order. */
4055 auto_vec<vec<unsigned> > m_perms;
4057 /* A partitioning of the nodes for which a layout must be chosen.
4058 Each partition represents an <SCC, cfg loop> pair; that is,
4059 nodes in different SCCs belong to different partitions, and nodes
4060 within an SCC can be further partitioned according to a containing
4061 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4063 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4064 from leaves (such as loads) to roots (such as stores).
4066 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4067 auto_vec<slpg_partition_info> m_partitions;
4069 /* The list of all nodes for which a layout must be chosen. Nodes for
4070 partition P come before the nodes for partition P+1. Nodes within a
4071 partition are in reverse postorder. */
4072 auto_vec<unsigned int> m_partitioned_nodes;
4074 /* Index P * num-layouts + L contains the cost of using layout L
4075 for partition P. */
4076 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4078 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4079 original output of node N adjusted to have layout L. */
4080 auto_vec<slp_tree> m_node_layouts;
4083 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4084 Also record whether we should optimize anything for speed rather
4085 than size. */
4087 void
4088 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4089 slp_tree node)
4091 unsigned i;
4092 slp_tree child;
4094 if (visited.add (node))
4095 return;
4097 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4099 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4100 if (optimize_bb_for_speed_p (bb))
4101 m_optimize_size = false;
4104 node->vertex = m_vertices.length ();
4105 m_vertices.safe_push (slpg_vertex (node));
4107 bool leaf = true;
4108 bool force_leaf = false;
4109 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4110 if (child)
4112 leaf = false;
4113 build_vertices (visited, child);
4115 else
4116 force_leaf = true;
4117 /* Since SLP discovery works along use-def edges all cycles have an
4118 entry - but there's the exception of cycles where we do not handle
4119 the entry explicitly (but with a NULL SLP node), like some reductions
4120 and inductions. Force those SLP PHIs to act as leafs to make them
4121 backwards reachable. */
4122 if (leaf || force_leaf)
4123 m_leafs.safe_push (node->vertex);
4126 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4128 void
4129 vect_optimize_slp_pass::build_vertices ()
4131 hash_set<slp_tree> visited;
4132 unsigned i;
4133 slp_instance instance;
4134 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4135 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4138 /* Apply (reverse) bijective PERM to VEC. */
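/* For example, if PERM is { 1, 2, 3, 0 } and VEC starts as { A, B, C, D },
   then REVERSE false turns VEC into { B, C, D, A } (VEC permuted by PERM),
   while REVERSE true turns the original VEC into { D, A, B, C } (VEC
   permuted by the inverse of PERM).  */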
4140 template <class T>
4141 static void
4142 vect_slp_permute (vec<unsigned> perm,
4143 vec<T> &vec, bool reverse)
4145 auto_vec<T, 64> saved;
4146 saved.create (vec.length ());
4147 for (unsigned i = 0; i < vec.length (); ++i)
4148 saved.quick_push (vec[i]);
4150 if (reverse)
4152 for (unsigned i = 0; i < vec.length (); ++i)
4153 vec[perm[i]] = saved[i];
4154 for (unsigned i = 0; i < vec.length (); ++i)
4155 gcc_assert (vec[perm[i]] == saved[i]);
4157 else
4159 for (unsigned i = 0; i < vec.length (); ++i)
4160 vec[i] = saved[perm[i]];
4161 for (unsigned i = 0; i < vec.length (); ++i)
4162 gcc_assert (vec[i] == saved[perm[i]]);
4166 /* Return the cfg loop that contains NODE. */
4168 struct loop *
4169 vect_optimize_slp_pass::containing_loop (slp_tree node)
4171 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4172 if (!rep)
4173 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4174 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4177 /* Return true if UD (an edge from a use to a definition) is associated
4178 with a loop latch edge in the cfg. */
4180 bool
4181 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4183 slp_tree use = m_vertices[ud->src].node;
4184 slp_tree def = m_vertices[ud->dest].node;
4185 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4186 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4187 return false;
4189 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4190 return (is_a<gphi *> (use_rep->stmt)
4191 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4192 && containing_loop (def) == containing_loop (use));
4195 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4196 a nonnull data field. */
4198 void
4199 vect_optimize_slp_pass::build_graph ()
4201 m_optimize_size = true;
4202 build_vertices ();
4204 m_slpg = new_graph (m_vertices.length ());
4205 for (slpg_vertex &v : m_vertices)
4206 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4207 if (child)
4209 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4210 if (is_cfg_latch_edge (ud))
4211 ud->data = this;
4215 /* Return true if E corresponds to a loop latch edge in the cfg. */
4217 static bool
4218 skip_cfg_latch_edges (graph_edge *e)
4220 return e->data;
4223 /* Create the node partitions. */
4225 void
4226 vect_optimize_slp_pass::create_partitions ()
4228 /* Calculate a postorder of the graph, ignoring edges that correspond
4229 to natural latch edges in the cfg. Reading the vector from the end
4230 to the beginning gives the reverse postorder. */
4231 auto_vec<int> initial_rpo;
4232 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4233 false, NULL, skip_cfg_latch_edges);
4234 gcc_assert (initial_rpo.length () == m_vertices.length ());
4236 /* Calculate the strongly connected components of the graph. */
4237 auto_vec<int> scc_grouping;
4238 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4240 /* Create a new index order in which all nodes from the same SCC are
4241 consecutive. Use scc_pos to record the index of the first node in
4242 each SCC. */
4243 auto_vec<unsigned int> scc_pos (num_sccs);
4244 int last_component = -1;
4245 unsigned int node_count = 0;
4246 for (unsigned int node_i : scc_grouping)
4248 if (last_component != m_slpg->vertices[node_i].component)
4250 last_component = m_slpg->vertices[node_i].component;
4251 gcc_assert (last_component == int (scc_pos.length ()));
4252 scc_pos.quick_push (node_count);
4254 node_count += 1;
4256 gcc_assert (node_count == initial_rpo.length ()
4257 && last_component + 1 == int (num_sccs));
4259 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4260 inside each SCC following the RPO we calculated above. The fact that
4261 we ignored natural latch edges when calculating the RPO should ensure
4262 that, for natural loop nests:
4264 - the first node that we encounter in a cfg loop is the loop header phi
4265 - the loop header phis are in dominance order
4267 Arranging for this is an optimization (see below) rather than a
4268 correctness issue. Unnatural loops with a tangled mess of backedges
4269 will still work correctly, but might give poorer results.
4271 Also update scc_pos so that it gives 1 + the index of the last node
4272 in the SCC. */
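/* For instance, with three SCCs of sizes 2, 3 and 1, scc_pos is { 0, 2, 5 }
   on entry to the loop below and { 2, 5, 6 } afterwards.  */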
4273 m_partitioned_nodes.safe_grow (node_count);
4274 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4276 unsigned int node_i = initial_rpo[old_i];
4277 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4278 m_partitioned_nodes[new_i] = node_i;
4281 /* When optimizing for speed, partition each SCC based on the containing
4282 cfg loop. The order we constructed above should ensure that, for natural
4283 cfg loops, we'll create sub-SCC partitions for outer loops before
4284 the corresponding sub-SCC partitions for inner loops. Similarly,
4285 when one sibling loop A dominates another sibling loop B, we should
4286 create a sub-SCC partition for A before a sub-SCC partition for B.
4288 As above, nothing depends for correctness on whether this achieves
4289 a natural nesting, but we should get better results when it does. */
4290 m_partitions.reserve (m_vertices.length ());
4291 unsigned int next_partition_i = 0;
4292 hash_map<struct loop *, int> loop_partitions;
4293 unsigned int rpo_begin = 0;
4294 unsigned int num_partitioned_nodes = 0;
4295 for (unsigned int rpo_end : scc_pos)
4297 loop_partitions.empty ();
4298 unsigned int partition_i = next_partition_i;
4299 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4301 /* Handle externals and constants optimistically throughout.
4302 But treat existing vectors as fixed since we do not handle
4303 permuting them. */
4304 unsigned int node_i = m_partitioned_nodes[rpo_i];
4305 auto &vertex = m_vertices[node_i];
4306 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4307 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4308 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4309 vertex.partition = -1;
4310 else
4312 bool existed;
4313 if (m_optimize_size)
4314 existed = next_partition_i > partition_i;
4315 else
4317 struct loop *loop = containing_loop (vertex.node);
4318 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4319 if (!existed)
4320 entry = next_partition_i;
4321 partition_i = entry;
4323 if (!existed)
4325 m_partitions.quick_push (slpg_partition_info ());
4326 next_partition_i += 1;
4328 vertex.partition = partition_i;
4329 num_partitioned_nodes += 1;
4330 m_partitions[partition_i].node_end += 1;
4333 rpo_begin = rpo_end;
4336 /* Assign ranges of consecutive node indices to each partition,
4337 in partition order. Start with node_end being the same as
4338 node_begin so that the next loop can use it as a counter. */
4339 unsigned int node_begin = 0;
4340 for (auto &partition : m_partitions)
4342 partition.node_begin = node_begin;
4343 node_begin += partition.node_end;
4344 partition.node_end = partition.node_begin;
4346 gcc_assert (node_begin == num_partitioned_nodes);
4348 /* Finally build the list of nodes in partition order. */
4349 m_partitioned_nodes.truncate (num_partitioned_nodes);
4350 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4352 int partition_i = m_vertices[node_i].partition;
4353 if (partition_i >= 0)
4355 unsigned int order_i = m_partitions[partition_i].node_end++;
4356 m_partitioned_nodes[order_i] = node_i;
4361 /* Look for edges from earlier partitions into node NODE_I and edges from
4362 node NODE_I into later partitions. Call:
4364 FN (ud, other_node_i)
4366 for each such use-to-def edge ud, where other_node_i is the node at the
4367 other end of the edge. */
4369 template<typename T>
4370 void
4371 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4373 int partition_i = m_vertices[node_i].partition;
4374 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4375 pred; pred = pred->pred_next)
4377 int src_partition_i = m_vertices[pred->src].partition;
4378 if (src_partition_i >= 0 && src_partition_i != partition_i)
4379 fn (pred, pred->src);
4381 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4382 succ; succ = succ->succ_next)
4384 int dest_partition_i = m_vertices[succ->dest].partition;
4385 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4386 fn (succ, succ->dest);
4390 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4391 that NODE would operate on. This test is independent of NODE's actual
4392 operation. */
4394 bool
4395 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4396 unsigned int layout_i)
4398 if (layout_i == 0)
4399 return true;
4401 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4402 return false;
4404 return true;
4407 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4408 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4409 layouts is incompatible with NODE or if the change is not possible for
4410 some other reason.
4412 The properties taken from NODE include the number of lanes and the
4413 vector type. The actual operation doesn't matter. */
4416 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4417 unsigned int from_layout_i,
4418 unsigned int to_layout_i)
4420 if (!is_compatible_layout (node, from_layout_i)
4421 || !is_compatible_layout (node, to_layout_i))
4422 return -1;
4424 if (from_layout_i == to_layout_i)
4425 return 0;
4427 auto_vec<slp_tree, 1> children (1);
4428 children.quick_push (node);
4429 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4430 if (from_layout_i > 0)
4431 for (unsigned int i : m_perms[from_layout_i])
4432 perm.quick_push ({ 0, i });
4433 else
4434 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4435 perm.quick_push ({ 0, i });
4436 if (to_layout_i > 0)
4437 vect_slp_permute (m_perms[to_layout_i], perm, true);
4438 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4439 children, false);
4440 if (count >= 0)
4441 return MAX (count, 1);
4443 /* ??? In principle we could try changing via layout 0, giving two
4444 layout changes rather than 1. Doing that would require
4445 corresponding support in get_result_with_layout. */
4446 return -1;
4449 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4451 inline slpg_partition_layout_costs &
4452 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4453 unsigned int layout_i)
4455 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4458 /* Change PERM in one of two ways:
4460 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4461 chosen for child I of NODE.
4463 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4465 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4467 void
4468 vect_optimize_slp_pass::
4469 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4470 int in_layout_i, unsigned int out_layout_i)
4472 for (auto &entry : perm)
4474 int this_in_layout_i = in_layout_i;
4475 if (this_in_layout_i < 0)
4477 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4478 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4479 this_in_layout_i = m_partitions[in_partition_i].layout;
4481 if (this_in_layout_i > 0)
4482 entry.second = m_perms[this_in_layout_i][entry.second];
4484 if (out_layout_i > 0)
4485 vect_slp_permute (m_perms[out_layout_i], perm, true);
4488 /* Check whether the target allows NODE to be rearranged so that the node's
4489 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4490 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4492 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4493 NODE can adapt to the layout changes that have (perhaps provisionally)
4494 been chosen for NODE's children, so that no extra permutations are
4495 needed on either the input or the output of NODE.
4497 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4498 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4500 IN_LAYOUT_I has no meaning for other types of node.
4502 Keeping the node as-is is always valid. If the target doesn't appear
4503 to support the node as-is, but might realistically support other layouts,
4504 then layout 0 instead has the cost of a worst-case permutation. On the
4505 one hand, this ensures that every node has at least one valid layout,
4506 avoiding what would otherwise be an awkward special case. On the other,
4507 it still encourages the pass to change an invalid pre-existing layout
4508 choice into a valid one. */
4511 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4512 unsigned int out_layout_i)
4514 const int fallback_cost = 1;
4516 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4518 auto_lane_permutation_t tmp_perm;
4519 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4521 /* Check that the child nodes support the chosen layout. Checking
4522 the first child is enough, since any second child would have the
4523 same shape. */
4524 auto first_child = SLP_TREE_CHILDREN (node)[0];
4525 if (in_layout_i > 0
4526 && !is_compatible_layout (first_child, in_layout_i))
4527 return -1;
4529 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4530 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4531 node, tmp_perm,
4532 SLP_TREE_CHILDREN (node),
4533 false);
4534 if (count < 0)
4536 if (in_layout_i == 0 && out_layout_i == 0)
4538 /* Use the fallback cost if the node could in principle support
4539 some nonzero layout for both the inputs and the outputs.
4540 Otherwise assume that the node will be rejected later
4541 and rebuilt from scalars. */
4542 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4543 return fallback_cost;
4544 return 0;
4546 return -1;
4549 /* We currently have no way of telling whether the new layout is cheaper
4550 or more expensive than the old one. But at least in principle,
4551 it should be worth making zero permutations (whole-vector shuffles)
4552 cheaper than real permutations, in case the pass is able to remove
4553 the latter. */
4554 return count == 0 ? 0 : 1;
4557 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4558 if (rep
4559 && STMT_VINFO_DATA_REF (rep)
4560 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4561 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4563 auto_load_permutation_t tmp_perm;
4564 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4565 if (out_layout_i > 0)
4566 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4568 poly_uint64 vf = 1;
4569 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4570 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4571 unsigned int n_perms;
4572 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4573 nullptr, vf, true, false, &n_perms))
4575 auto rep = SLP_TREE_REPRESENTATIVE (node);
4576 if (out_layout_i == 0)
4578 /* Use the fallback cost if the load is an N-to-N permutation.
4579 Otherwise assume that the node will be rejected later
4580 and rebuilt from scalars. */
4581 if (STMT_VINFO_GROUPED_ACCESS (rep)
4582 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4583 == SLP_TREE_LANES (node)))
4584 return fallback_cost;
4585 return 0;
4587 return -1;
4590 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4591 return n_perms == 0 ? 0 : 1;
4594 return 0;
4597 /* Decide which element layouts we should consider using. Calculate the
4598 weights associated with inserting layout changes on partition edges.
4599 Also mark partitions that cannot change layout, by setting their
4600 layout to zero. */
4602 void
4603 vect_optimize_slp_pass::start_choosing_layouts ()
4605 /* Used to assign unique permutation indices. */
4606 using perm_hash = unbounded_hashmap_traits<
4607 vec_free_hash_base<int_hash_base<unsigned>>,
4608 int_hash<int, -1, -2>
4610 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4612 /* Layout 0 is "no change". */
4613 m_perms.safe_push (vNULL);
4615 /* Create layouts from existing permutations. */
4616 auto_load_permutation_t tmp_perm;
4617 for (unsigned int node_i : m_partitioned_nodes)
4619 /* Leafs also double as entries to the reverse graph. Allow the
4620 layout of those to be changed. */
4621 auto &vertex = m_vertices[node_i];
4622 auto &partition = m_partitions[vertex.partition];
4623 if (!m_slpg->vertices[node_i].succ)
4624 partition.layout = 0;
4626 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4627 slp_tree node = vertex.node;
4628 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4629 slp_tree child;
4630 unsigned HOST_WIDE_INT imin, imax = 0;
4631 bool any_permute = false;
4632 tmp_perm.truncate (0);
4633 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4635 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4636 unpermuted, record a layout that reverses this permutation.
4638 We would need more work to cope with loads that are internally
4639 permuted and also have inputs (such as masks for
4640 IFN_MASK_LOADs). */
4641 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4642 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4644 partition.layout = -1;
4645 continue;
4647 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4648 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4649 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4651 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4652 && SLP_TREE_CHILDREN (node).length () == 1
4653 && (child = SLP_TREE_CHILDREN (node)[0])
4654 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4655 .is_constant (&imin)))
4657 /* If the child has the same vector size as this node,
4658 reversing the permutation can make the permutation a no-op.
4659 In other cases it can change a true permutation into a
4660 full-vector extract. */
4661 tmp_perm.reserve (SLP_TREE_LANES (node));
4662 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4663 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4665 else
4666 continue;
4668 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4670 unsigned idx = tmp_perm[j];
4671 imin = MIN (imin, idx);
4672 imax = MAX (imax, idx);
4673 if (idx - tmp_perm[0] != j)
4674 any_permute = true;
4676 /* If the span doesn't match we'd disrupt VF computation, avoid
4677 that for now. */
4678 if (imax - imin + 1 != SLP_TREE_LANES (node))
4679 continue;
4680 /* If there's no permute, no need to split one out. In this case
4681 we can consider turning a load into a permuted load, if that
4682 turns out to be cheaper than alternatives. */
4683 if (!any_permute)
4685 partition.layout = -1;
4686 continue;
4689 /* For now only handle true permutes, like
4690 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4691 when permuting constants and invariants, keeping the permute
4692 bijective. */
4693 auto_sbitmap load_index (SLP_TREE_LANES (node));
4694 bitmap_clear (load_index);
4695 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4696 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4697 unsigned j;
4698 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4699 if (!bitmap_bit_p (load_index, j))
4700 break;
4701 if (j != SLP_TREE_LANES (node))
4702 continue;
4704 vec<unsigned> perm = vNULL;
4705 perm.safe_grow (SLP_TREE_LANES (node), true);
4706 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4707 perm[j] = tmp_perm[j] - imin;
4709 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4711 /* Continue to use existing layouts, but don't add any more. */
4712 int *entry = layout_ids.get (perm);
4713 partition.layout = entry ? *entry : 0;
4714 perm.release ();
4716 else
4718 bool existed;
4719 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4720 if (existed)
4721 perm.release ();
4722 else
4724 layout_i = m_perms.length ();
4725 m_perms.safe_push (perm);
4727 partition.layout = layout_i;
4731 /* Initially assume that every layout is possible and has zero cost
4732 in every partition. */
4733 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4734 * m_perms.length ());
4736 /* We have to mark as to be materialized any outgoing permutations facing
4737 non-associating-reduction graph entries that are not represented in the graph.
4738 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4739 for (slp_instance instance : m_vinfo->slp_instances)
4740 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4742 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4743 m_partitions[m_vertices[node_i].partition].layout = 0;
4745 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4747 stmt_vec_info stmt_info
4748 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4749 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4750 if (needs_fold_left_reduction_p (TREE_TYPE
4751 (gimple_get_lhs (stmt_info->stmt)),
4752 STMT_VINFO_REDUC_CODE (reduc_info)))
4754 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4755 m_partitions[m_vertices[node_i].partition].layout = 0;
4759 /* Check which layouts each node and partition can handle. Calculate the
4760 weights associated with inserting layout changes on edges. */
4761 for (unsigned int node_i : m_partitioned_nodes)
4763 auto &vertex = m_vertices[node_i];
4764 auto &partition = m_partitions[vertex.partition];
4765 slp_tree node = vertex.node;
4767 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4769 vertex.weight = vect_slp_node_weight (node);
4771 /* We do not handle stores with a permutation, so all
4772 incoming permutations must have been materialized.
4774 We also don't handle masked grouped loads, which lack a
4775 permutation vector. In this case the memory locations
4776 form an implicit second input to the loads, on top of the
4777 explicit mask input, and the memory input's layout cannot
4778 be changed.
4780 On the other hand, we do support permuting gather loads and
4781 masked gather loads, where each scalar load is independent
4782 of the others. This can be useful if the address/index input
4783 benefits from permutation. */
4784 if (STMT_VINFO_DATA_REF (rep)
4785 && STMT_VINFO_GROUPED_ACCESS (rep)
4786 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4787 partition.layout = 0;
4789 /* We cannot change the layout of an operation that does not
4790 operate on its lanes independently. Note this is an explicit
4791 negative list since that's much shorter than the respective
4792 positive one but it's critical to keep maintaining it. */
4793 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4794 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4796 case CFN_COMPLEX_ADD_ROT90:
4797 case CFN_COMPLEX_ADD_ROT270:
4798 case CFN_COMPLEX_MUL:
4799 case CFN_COMPLEX_MUL_CONJ:
4800 case CFN_VEC_ADDSUB:
4801 case CFN_VEC_FMADDSUB:
4802 case CFN_VEC_FMSUBADD:
4803 partition.layout = 0;
4804 default:;
4808 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4810 auto &other_vertex = m_vertices[other_node_i];
4812 /* Count the number of edges from earlier partitions and the number
4813 of edges to later partitions. */
4814 if (other_vertex.partition < vertex.partition)
4815 partition.in_degree += 1;
4816 else
4817 partition.out_degree += 1;
4819 /* If the current node uses the result of OTHER_NODE_I, accumulate
4820 the effects of that. */
4821 if (ud->src == int (node_i))
4823 other_vertex.out_weight += vertex.weight;
4824 other_vertex.out_degree += 1;
4827 for_each_partition_edge (node_i, process_edge);
4831 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4832 its current (provisional) choice of layout. The inputs do not necessarily
4833 have the same layout as each other. */
4835 slpg_layout_cost
4836 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4838 auto &vertex = m_vertices[node_i];
4839 slpg_layout_cost cost;
4840 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4842 auto &other_vertex = m_vertices[other_node_i];
4843 if (other_vertex.partition < vertex.partition)
4845 auto &other_partition = m_partitions[other_vertex.partition];
4846 auto &other_costs = partition_layout_costs (other_vertex.partition,
4847 other_partition.layout);
4848 slpg_layout_cost this_cost = other_costs.in_cost;
4849 this_cost.add_serial_cost (other_costs.internal_cost);
4850 this_cost.split (other_partition.out_degree);
4851 cost.add_parallel_cost (this_cost);
4854 for_each_partition_edge (node_i, add_cost);
4855 return cost;
4858 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4859 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4860 slpg_layout_cost::impossible () if the change isn't possible. */
4862 slpg_layout_cost
4863 vect_optimize_slp_pass::
4864 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4865 unsigned int layout2_i)
4867 auto &def_vertex = m_vertices[ud->dest];
4868 auto &use_vertex = m_vertices[ud->src];
4869 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4870 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4871 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4872 use_layout_i);
4873 if (factor < 0)
4874 return slpg_layout_cost::impossible ();
4876 /* We have a choice of putting the layout change at the site of the
4877 definition or at the site of the use. Prefer the former when
4878 optimizing for size or when the execution frequency of the
4879 definition is no greater than the combined execution frequencies of
4880 the uses. When putting the layout change at the site of the definition,
4881 divvy up the cost among all consumers. */
4882 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4884 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4885 cost.split (def_vertex.out_degree);
4886 return cost;
4888 return { use_vertex.weight * factor, m_optimize_size };
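/* For instance, suppose a definition outside a loop (weight 1) feeds two
   uses inside the loop (weight 10 each, so the definition's OUT_WEIGHT is 20
   and its OUT_DEGREE is 2), and suppose the layout change itself has a
   FACTOR of 1. Putting the change at the definition charges each edge
   { depth 1, total 0.5 }, whereas putting it at a use would charge
   { depth 10, total 10 }, so the definition site is preferred. If instead
   the definition were inside the loop and its single use outside it,
   the use site would be charged.  */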
4891 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4892 partition; FROM_NODE_I could be the definition node or the use node.
4893 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4894 Return the cost of any necessary fix-ups on edge UD, or return
4895 slpg_layout_cost::impossible () if the change isn't possible.
4897 At this point, FROM_NODE_I's partition has chosen the cheapest
4898 layout based on the information available so far, but this choice
4899 is only provisional. */
4901 slpg_layout_cost
4902 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4903 unsigned int to_layout_i)
4905 auto &from_vertex = m_vertices[from_node_i];
4906 unsigned int from_partition_i = from_vertex.partition;
4907 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4908 gcc_assert (from_partition.layout >= 0);
4910 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4911 with its current layout preference. */
4912 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4913 auto edge_cost = edge_layout_cost (ud, from_node_i,
4914 from_partition.layout, to_layout_i);
4915 if (edge_cost.is_possible ())
4917 auto &from_costs = partition_layout_costs (from_partition_i,
4918 from_partition.layout);
4919 cost = from_costs.in_cost;
4920 cost.add_serial_cost (from_costs.internal_cost);
4921 cost.split (from_partition.out_degree);
4922 cost.add_serial_cost (edge_cost);
4925 /* Take the minimum of that cost and the cost that applies if
4926 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4927 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4928 to_layout_i);
4929 if (direct_layout_costs.is_possible ())
4931 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4932 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4933 direct_cost.split (from_partition.out_degree);
4934 if (!cost.is_possible ()
4935 || direct_cost.is_better_than (cost, m_optimize_size))
4936 cost = direct_cost;
4939 return cost;
4942 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4943 partition; TO_NODE_I could be the definition node or the use node.
4944 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4945 return the cost of any necessary fix-ups on edge UD, or
4946 slpg_layout_cost::impossible () if the choice cannot be made.
4948 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4950 slpg_layout_cost
4951 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4952 unsigned int from_layout_i)
4954 auto &to_vertex = m_vertices[to_node_i];
4955 unsigned int to_partition_i = to_vertex.partition;
4956 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4957 gcc_assert (to_partition.layout >= 0);
4959 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4960 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4961 any other inputs keep their current choice of layout. */
4962 auto &to_costs = partition_layout_costs (to_partition_i,
4963 to_partition.layout);
4964 if (ud->src == int (to_node_i)
4965 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4967 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4968 auto old_layout = from_partition.layout;
4969 from_partition.layout = from_layout_i;
4970 int factor = internal_node_cost (to_vertex.node, -1,
4971 to_partition.layout);
4972 from_partition.layout = old_layout;
4973 if (factor >= 0)
4975 slpg_layout_cost cost = to_costs.out_cost;
4976 cost.add_serial_cost ({ to_vertex.weight * factor,
4977 m_optimize_size });
4978 cost.split (to_partition.in_degree);
4979 return cost;
4983 /* Compute the cost if we insert any necessary layout change on edge UD. */
4984 auto edge_cost = edge_layout_cost (ud, to_node_i,
4985 to_partition.layout, from_layout_i);
4986 if (edge_cost.is_possible ())
4988 slpg_layout_cost cost = to_costs.out_cost;
4989 cost.add_serial_cost (to_costs.internal_cost);
4990 cost.split (to_partition.in_degree);
4991 cost.add_serial_cost (edge_cost);
4992 return cost;
4995 return slpg_layout_cost::impossible ();
4998 /* Make a forward pass through the partitions, accumulating input costs.
4999 Make a tentative (provisional) choice of layout for each partition,
5000 ensuring that this choice still allows later partitions to keep
5001 their original layout. */
5003 void
5004 vect_optimize_slp_pass::forward_pass ()
5006 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5007 ++partition_i)
5009 auto &partition = m_partitions[partition_i];
5011 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5012 the incoming cost that would apply if every predecessor partition
5013 keeps its current layout. This is used within the loop below. */
5014 slpg_layout_cost in_cost;
5015 slp_tree single_node = nullptr;
5016 if (partition.node_end == partition.node_begin + 1)
5018 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5019 single_node = m_vertices[node_i].node;
5020 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5021 in_cost = total_in_cost (node_i);
5024 /* Go through the possible layouts. Decide which ones are valid
5025 for this partition and record which of the valid layouts has
5026 the lowest cost. */
5027 unsigned int min_layout_i = 0;
5028 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5029 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5031 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5032 if (!layout_costs.is_possible ())
5033 continue;
5035 /* If the recorded layout is already 0 then the layout cannot
5036 change. */
5037 if (partition.layout == 0 && layout_i != 0)
5039 layout_costs.mark_impossible ();
5040 continue;
5043 bool is_possible = true;
5044 for (unsigned int order_i = partition.node_begin;
5045 order_i < partition.node_end; ++order_i)
5047 unsigned int node_i = m_partitioned_nodes[order_i];
5048 auto &vertex = m_vertices[node_i];
5050 /* Reject the layout if it is individually incompatible
5051 with any node in the partition. */
5052 if (!is_compatible_layout (vertex.node, layout_i))
5054 is_possible = false;
5055 break;
5058 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5060 auto &other_vertex = m_vertices[other_node_i];
5061 if (other_vertex.partition < vertex.partition)
5063 /* Accumulate the incoming costs from earlier
5064 partitions, plus the cost of any layout changes
5065 on UD itself. */
5066 auto cost = forward_cost (ud, other_node_i, layout_i);
5067 if (!cost.is_possible ())
5068 is_possible = false;
5069 else
5070 layout_costs.in_cost.add_parallel_cost (cost);
5072 else
5073 /* Reject the layout if it would make layout 0 impossible
5074 for later partitions. This amounts to testing that the
5075 target supports reversing the layout change on edges
5076 to later partitions.
5078 In principle, it might be possible to push a layout
5079 change all the way down a graph, so that it never
5080 needs to be reversed and so that the target doesn't
5081 need to support the reverse operation. But it would
5082 be awkward to bail out if we hit a partition that
5083 does not support the new layout, especially since
5084 we are not dealing with a lattice. */
5085 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5086 layout_i).is_possible ();
5088 for_each_partition_edge (node_i, add_cost);
5090 /* Accumulate the cost of using LAYOUT_I within NODE,
5091 both for the inputs and the outputs. */
5092 int factor = internal_node_cost (vertex.node, layout_i,
5093 layout_i);
5094 if (factor < 0)
5096 is_possible = false;
5097 break;
5099 else if (factor)
5100 layout_costs.internal_cost.add_serial_cost
5101 ({ vertex.weight * factor, m_optimize_size });
5103 if (!is_possible)
5105 layout_costs.mark_impossible ();
5106 continue;
5109 /* Combine the incoming and partition-internal costs. */
5110 slpg_layout_cost combined_cost = layout_costs.in_cost;
5111 combined_cost.add_serial_cost (layout_costs.internal_cost);
5113 /* If this partition consists of a single VEC_PERM_EXPR, see
5114 if the VEC_PERM_EXPR can be changed to support output layout
5115 LAYOUT_I while keeping all the provisional choices of input
5116 layout. */
5117 if (single_node
5118 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5120 int factor = internal_node_cost (single_node, -1, layout_i);
5121 if (factor >= 0)
5123 auto weight = m_vertices[single_node->vertex].weight;
5124 slpg_layout_cost internal_cost
5125 = { weight * factor, m_optimize_size };
5127 slpg_layout_cost alt_cost = in_cost;
5128 alt_cost.add_serial_cost (internal_cost);
5129 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5131 combined_cost = alt_cost;
5132 layout_costs.in_cost = in_cost;
5133 layout_costs.internal_cost = internal_cost;
5138 /* Record the layout with the lowest cost. Prefer layout 0 in
5139 the event of a tie between it and another layout. */
5140 if (!min_layout_cost.is_possible ()
5141 || combined_cost.is_better_than (min_layout_cost,
5142 m_optimize_size))
5144 min_layout_i = layout_i;
5145 min_layout_cost = combined_cost;
5149 /* This loop's handling of earlier partitions should ensure that
5150 choosing the original layout for the current partition is no
5151 less valid than it was in the original graph, even with the
5152 provisional layout choices for those earlier partitions. */
5153 gcc_assert (min_layout_cost.is_possible ());
5154 partition.layout = min_layout_i;
5158 /* Make a backward pass through the partitions, accumulating output costs.
5159 Make a final choice of layout for each partition. */
5161 void
5162 vect_optimize_slp_pass::backward_pass ()
5164 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5166 auto &partition = m_partitions[partition_i];
5168 unsigned int min_layout_i = 0;
5169 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5170 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5172 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5173 if (!layout_costs.is_possible ())
5174 continue;
5176 /* Accumulate the costs from successor partitions. */
5177 bool is_possible = true;
5178 for (unsigned int order_i = partition.node_begin;
5179 order_i < partition.node_end; ++order_i)
5181 unsigned int node_i = m_partitioned_nodes[order_i];
5182 auto &vertex = m_vertices[node_i];
5183 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5185 auto &other_vertex = m_vertices[other_node_i];
5186 auto &other_partition = m_partitions[other_vertex.partition];
5187 if (other_vertex.partition > vertex.partition)
5189 /* Accumulate the incoming costs from later
5190 partitions, plus the cost of any layout changes
5191 on UD itself. */
5192 auto cost = backward_cost (ud, other_node_i, layout_i);
5193 if (!cost.is_possible ())
5194 is_possible = false;
5195 else
5196 layout_costs.out_cost.add_parallel_cost (cost);
5198 else
5199 /* Make sure that earlier partitions can (if necessary
5200 or beneficial) keep the layout that they chose in
5201 the forward pass. This ensures that there is at
5202 least one valid choice of layout. */
5203 is_possible &= edge_layout_cost (ud, other_node_i,
5204 other_partition.layout,
5205 layout_i).is_possible ();
5207 for_each_partition_edge (node_i, add_cost);
5209 if (!is_possible)
5211 layout_costs.mark_impossible ();
5212 continue;
5215 /* Locally combine the costs from the forward and backward passes.
5216 (This combined cost is not passed on, since that would lead
5217 to double counting.) */
5218 slpg_layout_cost combined_cost = layout_costs.in_cost;
5219 combined_cost.add_serial_cost (layout_costs.internal_cost);
5220 combined_cost.add_serial_cost (layout_costs.out_cost);
5222 /* Record the layout with the lowest cost. Prefer layout 0 in
5223 the event of a tie between it and another layout. */
5224 if (!min_layout_cost.is_possible ()
5225 || combined_cost.is_better_than (min_layout_cost,
5226 m_optimize_size))
5228 min_layout_i = layout_i;
5229 min_layout_cost = combined_cost;
5233 gcc_assert (min_layout_cost.is_possible ());
5234 partition.layout = min_layout_i;
5238 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5239 NODE already has the layout that was selected for its partition. */
5241 slp_tree
5242 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5243 unsigned int to_layout_i)
5245 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5246 slp_tree result = m_node_layouts[result_i];
5247 if (result)
5248 return result;
5250 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5251 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5252 /* We can't permute vector defs in place. */
5253 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5255 /* If the vector is uniform or unchanged, there's nothing to do. */
5256 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5257 result = node;
5258 else
5260 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5261 result = vect_create_new_slp_node (scalar_ops);
5262 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5265 else
5267 unsigned int partition_i = m_vertices[node->vertex].partition;
5268 unsigned int from_layout_i = m_partitions[partition_i].layout;
5269 if (from_layout_i == to_layout_i)
5270 return node;
5272 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5273 permutation instead of a serial one. Leave the new permutation
5274 in TMP_PERM on success. */
5275 auto_lane_permutation_t tmp_perm;
5276 unsigned int num_inputs = 1;
5277 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5279 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5280 if (from_layout_i != 0)
5281 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5282 if (to_layout_i != 0)
5283 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5284 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5285 tmp_perm,
5286 SLP_TREE_CHILDREN (node),
5287 false) >= 0)
5288 num_inputs = SLP_TREE_CHILDREN (node).length ();
5289 else
5290 tmp_perm.truncate (0);
5293 if (dump_enabled_p ())
5295 if (tmp_perm.length () > 0)
5296 dump_printf_loc (MSG_NOTE, vect_location,
5297 "duplicating permutation node %p with"
5298 " layout %d\n",
5299 (void *) node, to_layout_i);
5300 else
5301 dump_printf_loc (MSG_NOTE, vect_location,
5302 "inserting permutation node in place of %p\n",
5303 (void *) node);
5306 unsigned int num_lanes = SLP_TREE_LANES (node);
5307 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5308 if (SLP_TREE_SCALAR_STMTS (node).length ())
5310 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5311 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5312 if (from_layout_i != 0)
5313 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5314 if (to_layout_i != 0)
5315 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5317 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5318 SLP_TREE_LANES (result) = num_lanes;
5319 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5320 result->vertex = -1;
5322 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5323 if (tmp_perm.length ())
5325 lane_perm.safe_splice (tmp_perm);
5326 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5328 else
5330 lane_perm.create (num_lanes);
5331 for (unsigned j = 0; j < num_lanes; ++j)
5332 lane_perm.quick_push ({ 0, j });
5333 if (from_layout_i != 0)
5334 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5335 if (to_layout_i != 0)
5336 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5337 SLP_TREE_CHILDREN (result).safe_push (node);
5339 for (slp_tree child : SLP_TREE_CHILDREN (result))
5340 child->refcnt++;
5342 m_node_layouts[result_i] = result;
5343 return result;
5346 /* Apply the chosen vector layouts to the SLP graph. */
5348 void
5349 vect_optimize_slp_pass::materialize ()
5351 /* We no longer need the costs, so avoid having two O(N * P) arrays
5352 live at the same time. */
5353 m_partition_layout_costs.release ();
5354 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5356 auto_sbitmap fully_folded (m_vertices.length ());
5357 bitmap_clear (fully_folded);
5358 for (unsigned int node_i : m_partitioned_nodes)
5360 auto &vertex = m_vertices[node_i];
5361 slp_tree node = vertex.node;
5362 int layout_i = m_partitions[vertex.partition].layout;
5363 gcc_assert (layout_i >= 0);
5365 /* Rearrange the scalar statements to match the chosen layout. */
5366 if (layout_i > 0)
5367 vect_slp_permute (m_perms[layout_i],
5368 SLP_TREE_SCALAR_STMTS (node), true);
5370 /* Update load and lane permutations. */
5371 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5373 /* First try to absorb the input vector layouts. If that fails,
5374 force the inputs to have layout LAYOUT_I too. We checked that
5375 that was possible before deciding to use nonzero output layouts.
5376 (Note that at this stage we don't really have any guarantee that
5377 the target supports the original VEC_PERM_EXPR.) */
5378 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5379 auto_lane_permutation_t tmp_perm;
5380 tmp_perm.safe_splice (perm);
5381 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5382 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5383 tmp_perm,
5384 SLP_TREE_CHILDREN (node),
5385 false) >= 0)
5387 if (dump_enabled_p ()
5388 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5389 perm.begin ()))
5390 dump_printf_loc (MSG_NOTE, vect_location,
5391 "absorbing input layouts into %p\n",
5392 (void *) node);
5393 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5394 bitmap_set_bit (fully_folded, node_i);
5396 else
5398 /* Not MSG_MISSED because it would make no sense to users. */
5399 if (dump_enabled_p ())
5400 dump_printf_loc (MSG_NOTE, vect_location,
5401 "failed to absorb input layouts into %p\n",
5402 (void *) node);
5403 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5406 else
5408 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5409 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5410 if (layout_i > 0)
5411 /* ??? When we handle non-bijective permutes the idea
5412 is that we can force the load-permutation to be
5413 { min, min + 1, min + 2, ... max }. But then the
5414 scalar defs might no longer match the lane content
5415 which means wrong-code with live lane vectorization.
5416 So we possibly have to have NULL entries for those. */
5417 vect_slp_permute (m_perms[layout_i], load_perm, true);
5421 /* Do this before any nodes disappear, since it involves a walk
5422 over the leaves. */
5423 remove_redundant_permutations ();
5425 /* Replace each child with a correctly laid-out version. */
5426 for (unsigned int node_i : m_partitioned_nodes)
5428 /* Skip nodes that have already been handled above. */
5429 if (bitmap_bit_p (fully_folded, node_i))
5430 continue;
5432 auto &vertex = m_vertices[node_i];
5433 int in_layout_i = m_partitions[vertex.partition].layout;
5434 gcc_assert (in_layout_i >= 0);
5436 unsigned j;
5437 slp_tree child;
5438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5440 if (!child)
5441 continue;
5443 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5444 if (new_child != child)
5446 vect_free_slp_tree (child);
5447 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5448 new_child->refcnt += 1;
5454 /* Elide load permutations that are not necessary. Such permutations might
5455 be pre-existing, rather than created by the layout optimizations. */
5457 void
5458 vect_optimize_slp_pass::remove_redundant_permutations ()
5460 for (unsigned int node_i : m_leafs)
5462 slp_tree node = m_vertices[node_i].node;
5463 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5464 continue;
5466 /* In basic block vectorization we allow any subchain of an interleaving
5467 chain.
5468 FORNOW: not in loop SLP because of realignment complications. */
5469 if (is_a <bb_vec_info> (m_vinfo))
5471 bool subchain_p = true;
5472 stmt_vec_info next_load_info = NULL;
5473 stmt_vec_info load_info;
5474 unsigned j;
5475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5477 if (j != 0
5478 && (next_load_info != load_info
5479 || DR_GROUP_GAP (load_info) != 1))
5481 subchain_p = false;
5482 break;
5484 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5486 if (subchain_p)
5488 SLP_TREE_LOAD_PERMUTATION (node).release ();
5489 continue;
5492 else
5494 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5495 stmt_vec_info load_info;
5496 bool this_load_permuted = false;
5497 unsigned j;
5498 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5499 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5501 this_load_permuted = true;
5502 break;
5504 /* When this isn't a grouped access we know it's single element
5505 and contiguous. */
5506 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5508 if (!this_load_permuted
5509 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5510 || SLP_TREE_LANES (node) == 1))
5511 SLP_TREE_LOAD_PERMUTATION (node).release ();
5512 continue;
5514 stmt_vec_info first_stmt_info
5515 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5516 if (!this_load_permuted
5517 /* The load requires permutation when unrolling exposes
5518 a gap either because the group is larger than the SLP
5519 group-size or because there is a gap between the groups. */
5520 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5521 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5522 && DR_GROUP_GAP (first_stmt_info) == 0)))
5524 SLP_TREE_LOAD_PERMUTATION (node).release ();
5525 continue;
5531 /* Print the partition graph and layout information to the dump file. */
5533 void
5534 vect_optimize_slp_pass::dump ()
5536 dump_printf_loc (MSG_NOTE, vect_location,
5537 "SLP optimize permutations:\n");
5538 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5540 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5541 const char *sep = "";
5542 for (unsigned int idx : m_perms[layout_i])
5544 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5545 sep = ", ";
5547 dump_printf (MSG_NOTE, " }\n");
5549 dump_printf_loc (MSG_NOTE, vect_location,
5550 "SLP optimize partitions:\n");
5551 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5552 ++partition_i)
5554 auto &partition = m_partitions[partition_i];
5555 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5556 dump_printf_loc (MSG_NOTE, vect_location,
5557 " partition %d (layout %d):\n",
5558 partition_i, partition.layout);
5559 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5560 for (unsigned int order_i = partition.node_begin;
5561 order_i < partition.node_end; ++order_i)
5563 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5564 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5565 (void *) vertex.node);
5566 dump_printf_loc (MSG_NOTE, vect_location,
5567 " weight: %f\n",
5568 vertex.weight.to_double ());
5569 if (vertex.out_degree)
5570 dump_printf_loc (MSG_NOTE, vect_location,
5571 " out weight: %f (degree %d)\n",
5572 vertex.out_weight.to_double (),
5573 vertex.out_degree);
5574 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5575 dump_printf_loc (MSG_NOTE, vect_location,
5576 " op: VEC_PERM_EXPR\n");
5577 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5578 dump_printf_loc (MSG_NOTE, vect_location,
5579 " op template: %G", rep->stmt);
5581 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5582 for (unsigned int order_i = partition.node_begin;
5583 order_i < partition.node_end; ++order_i)
5585 unsigned int node_i = m_partitioned_nodes[order_i];
5586 auto &vertex = m_vertices[node_i];
5587 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5589 auto &other_vertex = m_vertices[other_node_i];
5590 if (other_vertex.partition < vertex.partition)
5591 dump_printf_loc (MSG_NOTE, vect_location,
5592 " - %p [%d] --> %p\n",
5593 (void *) other_vertex.node,
5594 other_vertex.partition,
5595 (void *) vertex.node);
5596 else
5597 dump_printf_loc (MSG_NOTE, vect_location,
5598 " - %p --> [%d] %p\n",
5599 (void *) vertex.node,
5600 other_vertex.partition,
5601 (void *) other_vertex.node);
5603 for_each_partition_edge (node_i, print_edge);
5606 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5608 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5609 if (layout_costs.is_possible ())
5611 dump_printf_loc (MSG_NOTE, vect_location,
5612 " layout %d:%s\n", layout_i,
5613 partition.layout == int (layout_i)
5614 ? " (*)" : "");
5615 slpg_layout_cost combined_cost = layout_costs.in_cost;
5616 combined_cost.add_serial_cost (layout_costs.internal_cost);
5617 combined_cost.add_serial_cost (layout_costs.out_cost);
5618 #define TEMPLATE "{depth: %f, total: %f}"
5619 dump_printf_loc (MSG_NOTE, vect_location,
5620 " " TEMPLATE "\n",
5621 layout_costs.in_cost.depth.to_double (),
5622 layout_costs.in_cost.total.to_double ());
5623 dump_printf_loc (MSG_NOTE, vect_location,
5624 " + " TEMPLATE "\n",
5625 layout_costs.internal_cost.depth.to_double (),
5626 layout_costs.internal_cost.total.to_double ());
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 " + " TEMPLATE "\n",
5629 layout_costs.out_cost.depth.to_double (),
5630 layout_costs.out_cost.total.to_double ());
5631 dump_printf_loc (MSG_NOTE, vect_location,
5632 " = " TEMPLATE "\n",
5633 combined_cost.depth.to_double (),
5634 combined_cost.total.to_double ());
5635 #undef TEMPLATE
5637 else
5638 dump_printf_loc (MSG_NOTE, vect_location,
5639 " layout %d: rejected\n", layout_i);
5644 /* Main entry point for the SLP graph optimization pass. */
5646 void
5647 vect_optimize_slp_pass::run ()
5649 build_graph ();
5650 create_partitions ();
5651 start_choosing_layouts ();
5652 if (m_perms.length () > 1)
5654 forward_pass ();
5655 backward_pass ();
5656 if (dump_enabled_p ())
5657 dump ();
5658 materialize ();
5659 while (!m_perms.is_empty ())
5660 m_perms.pop ().release ();
5662 else
5663 remove_redundant_permutations ();
5664 free_graph (m_slpg);
5667 /* Optimize the SLP graph of VINFO. */
5669 void
5670 vect_optimize_slp (vec_info *vinfo)
5672 if (vinfo->slp_instances.is_empty ())
5673 return;
5674 vect_optimize_slp_pass (vinfo).run ();
5677 /* Gather loads reachable from the individual SLP graph entries. */
5679 void
5680 vect_gather_slp_loads (vec_info *vinfo)
5682 unsigned i;
5683 slp_instance instance;
5684 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5686 hash_set<slp_tree> visited;
5687 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5688 SLP_INSTANCE_TREE (instance), visited);
5693 /* For each possible SLP instance decide whether to SLP it and calculate overall
5694 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5695 least one instance. */
5697 bool
5698 vect_make_slp_decision (loop_vec_info loop_vinfo)
5700 unsigned int i;
5701 poly_uint64 unrolling_factor = 1;
5702 const vec<slp_instance> &slp_instances
5703 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5704 slp_instance instance;
5705 int decided_to_slp = 0;
5707 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5709 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5711 /* FORNOW: SLP if you can. */
5712 /* All unroll factors have the form:
5714 GET_MODE_SIZE (vinfo->vector_mode) * X
5716 for some rational X, so they must have a common multiple. */
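      /* Illustrative example: instances with unrolling factors 2 and 3
	 would force a combined unrolling factor of 6 here.  */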
5717 unrolling_factor
5718 = force_common_multiple (unrolling_factor,
5719 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5721 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5722 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5723 loop-based vectorization. Such stmts will be marked as HYBRID. */
5724 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5725 decided_to_slp++;
5728 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5730 if (decided_to_slp && dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "Decided to SLP %d instances. Unrolling factor ",
5734 decided_to_slp);
5735 dump_dec (MSG_NOTE, unrolling_factor);
5736 dump_printf (MSG_NOTE, "\n");
5739 return (decided_to_slp > 0);
5742 /* Private data for vect_detect_hybrid_slp. */
5743 struct vdhs_data
5745 loop_vec_info loop_vinfo;
5746 vec<stmt_vec_info> *worklist;
5749 /* Walker for walk_gimple_op. */
5751 static tree
5752 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5754 walk_stmt_info *wi = (walk_stmt_info *)data;
5755 vdhs_data *dat = (vdhs_data *)wi->info;
5757 if (wi->is_lhs)
5758 return NULL_TREE;
5760 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5761 if (!def_stmt_info)
5762 return NULL_TREE;
5763 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5764 if (PURE_SLP_STMT (def_stmt_info))
5766 if (dump_enabled_p ())
5767 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5768 def_stmt_info->stmt);
5769 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5770 dat->worklist->safe_push (def_stmt_info);
5773 return NULL_TREE;
5776 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5777    if so, otherwise push it to WORKLIST.  */
5779 static void
5780 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5781 vec<stmt_vec_info> &worklist,
5782 stmt_vec_info stmt_info)
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_NOTE, vect_location,
5786 "Processing hybrid candidate : %G", stmt_info->stmt);
5787 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5788 imm_use_iterator iter2;
5789 ssa_op_iter iter1;
5790 use_operand_p use_p;
5791 def_operand_p def_p;
5792 bool any_def = false;
5793 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5795 any_def = true;
5796 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5798 if (is_gimple_debug (USE_STMT (use_p)))
5799 continue;
5800 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5801	  /* An out-of-loop use means this is a loop_vect sink.  */
5802 if (!use_info)
5804 if (dump_enabled_p ())
5805 dump_printf_loc (MSG_NOTE, vect_location,
5806 "Found loop_vect sink: %G", stmt_info->stmt);
5807 worklist.safe_push (stmt_info);
5808 return;
5810 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5812 if (dump_enabled_p ())
5813 dump_printf_loc (MSG_NOTE, vect_location,
5814 "Found loop_vect use: %G", use_info->stmt);
5815 worklist.safe_push (stmt_info);
5816 return;
5820   /* No def means this is a loop_vect sink.  */
5821 if (!any_def)
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE, vect_location,
5825 "Found loop_vect sink: %G", stmt_info->stmt);
5826 worklist.safe_push (stmt_info);
5827 return;
5829 if (dump_enabled_p ())
5830 dump_printf_loc (MSG_NOTE, vect_location,
5831 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5832 STMT_SLP_TYPE (stmt_info) = pure_slp;
5835 /* Find stmts that must be both vectorized and SLPed. */
5837 void
5838 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5840 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5842 /* All stmts participating in SLP are marked pure_slp, all other
5843 stmts are loop_vect.
5844 First collect all loop_vect stmts into a worklist.
5845 SLP patterns cause not all original scalar stmts to appear in
5846 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5847    Rectify this here and do a backward walk over the IL, only considering
5848    stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
5849    marking them as pure_slp.  */
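   /* Illustrative scenario: a def produced by a pure_slp stmt but consumed
      by a remaining loop_vect stmt (e.g. a non-SLP reduction update) must be
      vectorized by both schemes and is marked hybrid by the walk below.  */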
5850 auto_vec<stmt_vec_info> worklist;
5851 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5853 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5854 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5855 gsi_next (&gsi))
5857 gphi *phi = gsi.phi ();
5858 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5859 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5860 maybe_push_to_hybrid_worklist (loop_vinfo,
5861 worklist, stmt_info);
5863 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5864 gsi_prev (&gsi))
5866 gimple *stmt = gsi_stmt (gsi);
5867 if (is_gimple_debug (stmt))
5868 continue;
5869 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5870 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5872 for (gimple_stmt_iterator gsi2
5873 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5874 !gsi_end_p (gsi2); gsi_next (&gsi2))
5876 stmt_vec_info patt_info
5877 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5878 if (!STMT_SLP_TYPE (patt_info)
5879 && STMT_VINFO_RELEVANT (patt_info))
5880 maybe_push_to_hybrid_worklist (loop_vinfo,
5881 worklist, patt_info);
5883 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5885 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5886 maybe_push_to_hybrid_worklist (loop_vinfo,
5887 worklist, stmt_info);
5891 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5892 mark any SLP vectorized stmt as hybrid.
5893 ??? We're visiting def stmts N times (once for each non-SLP and
5894 once for each hybrid-SLP use). */
5895 walk_stmt_info wi;
5896 vdhs_data dat;
5897 dat.worklist = &worklist;
5898 dat.loop_vinfo = loop_vinfo;
5899 memset (&wi, 0, sizeof (wi));
5900 wi.info = (void *)&dat;
5901 while (!worklist.is_empty ())
5903 stmt_vec_info stmt_info = worklist.pop ();
5904 /* Since SSA operands are not set up for pattern stmts we need
5905 to use walk_gimple_op. */
5906 wi.is_lhs = 0;
5907 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5908       /* For gather/scatter make sure to walk the offset operand, which
5909	  can be a scaling and conversion away.  */
5910 gather_scatter_info gs_info;
5911 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5912 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5914 int dummy;
5915 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5921 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5923 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5924 : vec_info (vec_info::bb, shared),
5925 bbs (_bbs),
5926 roots (vNULL)
5928 for (unsigned i = 0; i < bbs.length (); ++i)
5930 if (i != 0)
5931 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5932 gsi_next (&si))
5934 gphi *phi = si.phi ();
5935 gimple_set_uid (phi, 0);
5936 add_stmt (phi);
5938 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5939 !gsi_end_p (gsi); gsi_next (&gsi))
5941 gimple *stmt = gsi_stmt (gsi);
5942 gimple_set_uid (stmt, 0);
5943 if (is_gimple_debug (stmt))
5944 continue;
5945 add_stmt (stmt);
5951 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5952 stmts in the basic block. */
5954 _bb_vec_info::~_bb_vec_info ()
5956 /* Reset region marker. */
5957 for (unsigned i = 0; i < bbs.length (); ++i)
5959 if (i != 0)
5960 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5961 gsi_next (&si))
5963 gphi *phi = si.phi ();
5964 gimple_set_uid (phi, -1);
5966 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5967 !gsi_end_p (gsi); gsi_next (&gsi))
5969 gimple *stmt = gsi_stmt (gsi);
5970 gimple_set_uid (stmt, -1);
5974 for (unsigned i = 0; i < roots.length (); ++i)
5976 roots[i].stmts.release ();
5977 roots[i].roots.release ();
5978 roots[i].remain.release ();
5980 roots.release ();
5983 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5984    given that child nodes have already been processed, and that
5985 their def types currently match their SLP node's def type. */
5987 static bool
5988 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5989 slp_instance node_instance,
5990 stmt_vector_for_cost *cost_vec)
5992 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5994 /* Calculate the number of vector statements to be created for the
5995 scalar stmts in this node. For SLP reductions it is equal to the
5996 number of vector statements in the children (which has already been
5997 calculated by the recursive call). Otherwise it is the number of
5998 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5999 VF divided by the number of elements in a vector. */
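  /* Illustrative numbers: a node with 4 lanes, VF 2 and vector type V4SI
     needs (2 * 4) / 4 == 2 vector statements.  */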
6000 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6001 && !STMT_VINFO_DATA_REF (stmt_info)
6002 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6004 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6005 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6007 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6008 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6009 break;
6012 else
6014 poly_uint64 vf;
6015 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6016 vf = loop_vinfo->vectorization_factor;
6017 else
6018 vf = 1;
6019 unsigned int group_size = SLP_TREE_LANES (node);
6020 tree vectype = SLP_TREE_VECTYPE (node);
6021 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6022 = vect_get_num_vectors (vf * group_size, vectype);
6025 /* Handle purely internal nodes. */
6026 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6028 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6029 return false;
6031 stmt_vec_info slp_stmt_info;
6032 unsigned int i;
6033 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6035 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6036 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6037 node_instance, i,
6038 false, cost_vec))
6039 return false;
6041 return true;
6044 bool dummy;
6045 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6046 node, node_instance, cost_vec);
6049 /* Try to build NODE from scalars, returning true on success.
6050 NODE_INSTANCE is the SLP instance that contains NODE. */
6052 static bool
6053 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6054 slp_instance node_instance)
6056 stmt_vec_info stmt_info;
6057 unsigned int i;
6059 if (!is_a <bb_vec_info> (vinfo)
6060 || node == SLP_INSTANCE_TREE (node_instance)
6061 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6062 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6063 /* Force the mask use to be built from scalars instead. */
6064 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6065 return false;
6067 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_NOTE, vect_location,
6069 "Building vector operands of %p from scalars instead\n",
6070 (void *) node);
6072 /* Don't remove and free the child nodes here, since they could be
6073 referenced by other structures. The analysis and scheduling phases
6074 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6075 unsigned int group_size = SLP_TREE_LANES (node);
6076 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6077 /* Invariants get their vector type from the uses. */
6078 SLP_TREE_VECTYPE (node) = NULL_TREE;
6079 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6080 SLP_TREE_LOAD_PERMUTATION (node).release ();
6081 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6083 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6084 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6086 return true;
6089 /* Return true if all elements of the slice are the same. */
6090 bool
6091 vect_scalar_ops_slice::all_same_p () const
6093 for (unsigned int i = 1; i < length; ++i)
6094 if (!operand_equal_p (op (0), op (i)))
6095 return false;
6096 return true;
6099 hashval_t
6100 vect_scalar_ops_slice_hash::hash (const value_type &s)
6102 hashval_t hash = 0;
6103 for (unsigned i = 0; i < s.length; ++i)
6104 hash = iterative_hash_expr (s.op (i), hash);
6105 return hash;
6108 bool
6109 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6110 const compare_type &s2)
6112 if (s1.length != s2.length)
6113 return false;
6114 for (unsigned i = 0; i < s1.length; ++i)
6115 if (!operand_equal_p (s1.op (i), s2.op (i)))
6116 return false;
6117 return true;
6120 /* Compute the prologue cost for invariant or constant operands represented
6121 by NODE. */
6123 static void
6124 vect_prologue_cost_for_slp (slp_tree node,
6125 stmt_vector_for_cost *cost_vec)
6127   /* There's a special case of an existing vector, which costs nothing.  */
6128 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6129 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6130 return;
6131 /* Without looking at the actual initializer a vector of
6132      constants can be implemented as a load from the constant pool.
6133 When all elements are the same we can use a splat. */
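  /* Illustrative cases: a constant vector such as { 1, 2, 3, 4 } is costed
     as a constant-pool load (vector_load), an external { x, x, x, x } as a
     splat (scalar_to_vec), and distinct external defs { a, b, c, d } as a
     vec_construct; see the classification below.  */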
6134 tree vectype = SLP_TREE_VECTYPE (node);
6135 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6136 unsigned HOST_WIDE_INT const_nunits;
6137 unsigned nelt_limit;
6138 auto ops = &SLP_TREE_SCALAR_OPS (node);
6139 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6140 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6141 && ! multiple_p (const_nunits, group_size))
6143 nelt_limit = const_nunits;
6144 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6145 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6146 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6147 starts.quick_push (i * const_nunits);
6149 else
6151 /* If either the vector has variable length or the vectors
6152 are composed of repeated whole groups we only need to
6153 cost construction once. All vectors will be the same. */
6154 nelt_limit = group_size;
6155 starts.quick_push (0);
6157 /* ??? We're just tracking whether vectors in a single node are the same.
6158 Ideally we'd do something more global. */
6159 bool passed = false;
6160 for (unsigned int start : starts)
6162 vect_cost_for_stmt kind;
6163 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6164 kind = vector_load;
6165 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6166 kind = scalar_to_vec;
6167 else
6168 kind = vec_construct;
6169 /* The target cost hook has no idea which part of the SLP node
6170 we are costing so avoid passing it down more than once. Pass
6171 it to the first vec_construct or scalar_to_vec part since for those
6172 the x86 backend tries to account for GPR to XMM register moves. */
6173 record_stmt_cost (cost_vec, 1, kind,
6174 (kind != vector_load && !passed) ? node : nullptr,
6175 vectype, 0, vect_prologue);
6176 if (kind != vector_load)
6177 passed = true;
6181 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6182    the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6184 Return true if the operations are supported. */
6186 static bool
6187 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6188 slp_instance node_instance,
6189 hash_set<slp_tree> &visited_set,
6190 vec<slp_tree> &visited_vec,
6191 stmt_vector_for_cost *cost_vec)
6193 int i, j;
6194 slp_tree child;
6196 /* Assume we can code-generate all invariants. */
6197 if (!node
6198 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6199 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6200 return true;
6202 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE, vect_location,
6206 "Failed cyclic SLP reference in %p\n", (void *) node);
6207 return false;
6209 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6211 /* If we already analyzed the exact same set of scalar stmts we're done.
6212 We share the generated vector stmts for those. */
6213 if (visited_set.add (node))
6214 return true;
6215 visited_vec.safe_push (node);
6217 bool res = true;
6218 unsigned visited_rec_start = visited_vec.length ();
6219 unsigned cost_vec_rec_start = cost_vec->length ();
6220 bool seen_non_constant_child = false;
6221 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6223 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6224 visited_set, visited_vec,
6225 cost_vec);
6226 if (!res)
6227 break;
6228 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6229 seen_non_constant_child = true;
6231 /* We're having difficulties scheduling nodes with just constant
6232 operands and no scalar stmts since we then cannot compute a stmt
6233 insertion place. */
6234 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6236 if (dump_enabled_p ())
6237 dump_printf_loc (MSG_NOTE, vect_location,
6238 "Cannot vectorize all-constant op node %p\n",
6239 (void *) node);
6240 res = false;
6243 if (res)
6244 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6245 cost_vec);
6246 /* If analysis failed we have to pop all recursive visited nodes
6247 plus ourselves. */
6248 if (!res)
6250 while (visited_vec.length () >= visited_rec_start)
6251 visited_set.remove (visited_vec.pop ());
6252 cost_vec->truncate (cost_vec_rec_start);
6255 /* When the node can be vectorized, cost the invariant nodes it references.
6256    This is not done in DFS order to allow the referring node's
6257    vectorizable_* calls to nail down the invariant nodes' vector type
6258 and possibly unshare it if it needs a different vector type than
6259 other referrers. */
6260 if (res)
6261 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6262 if (child
6263 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6264 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6265 /* Perform usual caching, note code-generation still
6266 code-gens these nodes multiple times but we expect
6267 to CSE them later. */
6268 && !visited_set.add (child))
6270 visited_vec.safe_push (child);
6271 /* ??? After auditing more code paths make a "default"
6272 and push the vector type from NODE to all children
6273 if it is not already set. */
6274 /* Compute the number of vectors to be generated. */
6275 tree vector_type = SLP_TREE_VECTYPE (child);
6276 if (!vector_type)
6278 /* For shifts with a scalar argument we don't need
6279 to cost or code-generate anything.
6280	 ??? Represent this more explicitly.  */
6281 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6282 == shift_vec_info_type)
6283 && j == 1);
6284 continue;
6286 unsigned group_size = SLP_TREE_LANES (child);
6287 poly_uint64 vf = 1;
6288 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6289 vf = loop_vinfo->vectorization_factor;
6290 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6291 = vect_get_num_vectors (vf * group_size, vector_type);
6292 /* And cost them. */
6293 vect_prologue_cost_for_slp (child, cost_vec);
6296 /* If this node or any of its children can't be vectorized, try pruning
6297 the tree here rather than felling the whole thing. */
6298 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6300 /* We'll need to revisit this for invariant costing and number
6301 of vectorized stmt setting. */
6302 res = true;
6305 return res;
6308 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6309 region and that can be vectorized using vectorizable_live_operation
6310    with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
6311    the scalar code computing them to be retained.  */
6313 static void
6314 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6315 slp_instance instance,
6316 stmt_vector_for_cost *cost_vec,
6317 hash_set<stmt_vec_info> &svisited,
6318 hash_set<slp_tree> &visited)
6320 if (visited.add (node))
6321 return;
6323 unsigned i;
6324 stmt_vec_info stmt_info;
6325 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6326 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6328 if (svisited.contains (stmt_info))
6329 continue;
6330 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6331 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6332 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6333 /* Only the pattern root stmt computes the original scalar value. */
6334 continue;
6335 bool mark_visited = true;
6336 gimple *orig_stmt = orig_stmt_info->stmt;
6337 ssa_op_iter op_iter;
6338 def_operand_p def_p;
6339 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6341 imm_use_iterator use_iter;
6342 gimple *use_stmt;
6343 stmt_vec_info use_stmt_info;
6344 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6345 if (!is_gimple_debug (use_stmt))
6347 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6348 if (!use_stmt_info
6349 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6351 STMT_VINFO_LIVE_P (stmt_info) = true;
6352 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6353 node, instance, i,
6354 false, cost_vec))
6355 /* ??? So we know we can vectorize the live stmt
6356 from one SLP node. If we cannot do so from all
6357 or none consistently we'd have to record which
6358 SLP node (and lane) we want to use for the live
6359 operation. So make sure we can code-generate
6360 from all nodes. */
6361 mark_visited = false;
6362 else
6363 STMT_VINFO_LIVE_P (stmt_info) = false;
6364 break;
6367 /* We have to verify whether we can insert the lane extract
6368 before all uses. The following is a conservative approximation.
6369 We cannot put this into vectorizable_live_operation because
6370 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6371 doesn't work.
6372 Note that while the fact that we emit code for loads at the
6373	 first load should make this a non-problem, leafs we construct
6374 from scalars are vectorized after the last scalar def.
6375 ??? If we'd actually compute the insert location during
6376 analysis we could use sth less conservative than the last
6377 scalar stmt in the node for the dominance check. */
6378 /* ??? What remains is "live" uses in vector CTORs in the same
6379 SLP graph which is where those uses can end up code-generated
6380 right after their definition instead of close to their original
6381 use. But that would restrict us to code-generate lane-extracts
6382 from the latest stmt in a node. So we compensate for this
6383 during code-generation, simply not replacing uses for those
6384 hopefully rare cases. */
6385 if (STMT_VINFO_LIVE_P (stmt_info))
6386 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6387 if (!is_gimple_debug (use_stmt)
6388 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6389 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6390 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6394 "Cannot determine insertion place for "
6395 "lane extract\n");
6396 STMT_VINFO_LIVE_P (stmt_info) = false;
6397 mark_visited = true;
6400 if (mark_visited)
6401 svisited.add (stmt_info);
6404 slp_tree child;
6405 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6406 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6407 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6408 cost_vec, svisited, visited);
6411 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6413 static bool
6414 vectorizable_bb_reduc_epilogue (slp_instance instance,
6415 stmt_vector_for_cost *cost_vec)
6417 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6418 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6419 if (reduc_code == MINUS_EXPR)
6420 reduc_code = PLUS_EXPR;
6421 internal_fn reduc_fn;
6422 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6423 if (!vectype
6424 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6425 || reduc_fn == IFN_LAST
6426 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6427 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6428 TREE_TYPE (vectype)))
6430 if (dump_enabled_p ())
6431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6432 "not vectorized: basic block reduction epilogue "
6433 "operation unsupported.\n");
6434 return false;
6437 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6438 cost log2 vector operations plus shuffles and one extraction. */
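  /* Illustrative numbers: for a 4-element vector this records 2 vector_stmt,
     2 vec_perm and 1 vec_to_scalar cost entries (floor_log2 (4) == 2).  */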
6439 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6440 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6441 vectype, 0, vect_body);
6442 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6443 vectype, 0, vect_body);
6444 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6445 vectype, 0, vect_body);
6447 /* Since we replace all stmts of a possibly longer scalar reduction
6448    chain, account for the extra scalar stmts for that.  */
6449 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6450 instance->root_stmts[0], 0, vect_body);
6451 return true;
6454 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6455 and recurse to children. */
6457 static void
6458 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6459 hash_set<slp_tree> &visited)
6461 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6462 || visited.add (node))
6463 return;
6465 stmt_vec_info stmt;
6466 unsigned i;
6467 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6468 roots.remove (vect_orig_stmt (stmt));
6470 slp_tree child;
6471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6472 if (child)
6473 vect_slp_prune_covered_roots (child, roots, visited);
6476 /* Analyze statements in SLP instances of VINFO. Return true if the
6477 operations are supported. */
6479 bool
6480 vect_slp_analyze_operations (vec_info *vinfo)
6482 slp_instance instance;
6483 int i;
6485 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6487 hash_set<slp_tree> visited;
6488 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6490 auto_vec<slp_tree> visited_vec;
6491 stmt_vector_for_cost cost_vec;
6492 cost_vec.create (2);
6493 if (is_a <bb_vec_info> (vinfo))
6494 vect_location = instance->location ();
6495 if (!vect_slp_analyze_node_operations (vinfo,
6496 SLP_INSTANCE_TREE (instance),
6497 instance, visited, visited_vec,
6498 &cost_vec)
6499 /* CTOR instances require vectorized defs for the SLP tree root. */
6500 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6501 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6502 != vect_internal_def
6503 /* Make sure we vectorized with the expected type. */
6504 || !useless_type_conversion_p
6505 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6506 (instance->root_stmts[0]->stmt))),
6507 TREE_TYPE (SLP_TREE_VECTYPE
6508 (SLP_INSTANCE_TREE (instance))))))
6509 /* Check we can vectorize the reduction. */
6510 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6511 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6513 slp_tree node = SLP_INSTANCE_TREE (instance);
6514 stmt_vec_info stmt_info;
6515 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6516 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6517 else
6518 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_NOTE, vect_location,
6521 "removing SLP instance operations starting from: %G",
6522 stmt_info->stmt);
6523 vect_free_slp_instance (instance);
6524 vinfo->slp_instances.ordered_remove (i);
6525 cost_vec.release ();
6526 while (!visited_vec.is_empty ())
6527 visited.remove (visited_vec.pop ());
6529 else
6531 i++;
6532 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6534 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6535 cost_vec.release ();
6537 else
6538 /* For BB vectorization remember the SLP graph entry
6539 cost for later. */
6540 instance->cost_vec = cost_vec;
6544 /* Now look for SLP instances with a root that are covered by other
6545 instances and remove them. */
6546 hash_set<stmt_vec_info> roots;
6547 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6548 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6549 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6550 if (!roots.is_empty ())
6552 visited.empty ();
6553 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6554 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6555 visited);
6556 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6557 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6558 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6560 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_NOTE, vect_location,
6563 "removing SLP instance operations starting "
6564 "from: %G", root->stmt);
6565 vect_free_slp_instance (instance);
6566 vinfo->slp_instances.ordered_remove (i);
6568 else
6569 ++i;
6572 /* Compute vectorizable live stmts. */
6573 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6575 hash_set<stmt_vec_info> svisited;
6576 hash_set<slp_tree> visited;
6577 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6579 vect_location = instance->location ();
6580 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6581 instance, &instance->cost_vec, svisited,
6582 visited);
6586 return !vinfo->slp_instances.is_empty ();
6589 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
6590 closing the eventual chain. */
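/* Illustrative example: with leader entries I1 -> I2, I2 -> I3 and I3 -> I3,
   looking up I1 returns I3 and rewrites the I1 and I2 entries to point at I3
   directly.  */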
6592 static slp_instance
6593 get_ultimate_leader (slp_instance instance,
6594 hash_map<slp_instance, slp_instance> &instance_leader)
6596 auto_vec<slp_instance *, 8> chain;
6597 slp_instance *tem;
6598 while (*(tem = instance_leader.get (instance)) != instance)
6600 chain.safe_push (tem);
6601 instance = *tem;
6603 while (!chain.is_empty ())
6604 *chain.pop () = instance;
6605 return instance;
6608 namespace {
6609 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6610 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6611 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6613 INSTANCE_LEADER is as for get_ultimate_leader. */
6615 template<typename T>
6616 bool
6617 vect_map_to_instance (slp_instance instance, T key,
6618 hash_map<T, slp_instance> &key_to_instance,
6619 hash_map<slp_instance, slp_instance> &instance_leader)
6621 bool existed_p;
6622 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6623 if (!existed_p)
6625 else if (key_instance != instance)
6627 /* If we're running into a previously marked key make us the
6628 leader of the current ultimate leader. This keeps the
6629 leader chain acyclic and works even when the current instance
6630 connects two previously independent graph parts. */
6631 slp_instance key_leader
6632 = get_ultimate_leader (key_instance, instance_leader);
6633 if (key_leader != instance)
6634 instance_leader.put (key_leader, instance);
6636 key_instance = instance;
6637 return existed_p;
6641 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6643 static void
6644 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6645 slp_instance instance, slp_tree node,
6646 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6647 hash_map<slp_tree, slp_instance> &node_to_instance,
6648 hash_map<slp_instance, slp_instance> &instance_leader)
6650 stmt_vec_info stmt_info;
6651 unsigned i;
6653 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6654 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6655 instance_leader);
6657 if (vect_map_to_instance (instance, node, node_to_instance,
6658 instance_leader))
6659 return;
6661 slp_tree child;
6662 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6663 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6664 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6665 node_to_instance, instance_leader);
6668 /* Partition the SLP graph into pieces that can be costed independently. */
6670 static void
6671 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6673 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6675 /* First walk the SLP graph assigning each involved scalar stmt a
6676 corresponding SLP graph entry and upon visiting a previously
6677    marked stmt, make the stmt's leader the current SLP graph entry.  */
6678 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6679 hash_map<slp_tree, slp_instance> node_to_instance;
6680 hash_map<slp_instance, slp_instance> instance_leader;
6681 slp_instance instance;
6682 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6684 instance_leader.put (instance, instance);
6685 vect_bb_partition_graph_r (bb_vinfo,
6686 instance, SLP_INSTANCE_TREE (instance),
6687 stmt_to_instance, node_to_instance,
6688 instance_leader);
6691 /* Then collect entries to each independent subgraph. */
6692 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6694 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6695 leader->subgraph_entries.safe_push (instance);
6696 if (dump_enabled_p ()
6697 && leader != instance)
6698 dump_printf_loc (MSG_NOTE, vect_location,
6699 "instance %p is leader of %p\n",
6700 (void *) leader, (void *) instance);
6704 /* Compute the set of scalar stmts participating in internal and external
6705 nodes. */
6707 static void
6708 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6709 hash_set<slp_tree> &visited,
6710 hash_set<stmt_vec_info> &vstmts,
6711 hash_set<stmt_vec_info> &estmts)
6713 int i;
6714 stmt_vec_info stmt_info;
6715 slp_tree child;
6717 if (visited.add (node))
6718 return;
6720 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6722 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6723 vstmts.add (stmt_info);
6725 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6726 if (child)
6727 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6728 vstmts, estmts);
6730 else
6731 for (tree def : SLP_TREE_SCALAR_OPS (node))
6733 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6734 if (def_stmt)
6735 estmts.add (def_stmt);
6740 /* Compute the scalar cost of the SLP node NODE and its children
6741    and record it in COST_VEC.  Do not account defs that are marked in LIFE and
6742 update LIFE according to uses of NODE. */
6744 static void
6745 vect_bb_slp_scalar_cost (vec_info *vinfo,
6746 slp_tree node, vec<bool, va_heap> *life,
6747 stmt_vector_for_cost *cost_vec,
6748 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6749 hash_set<slp_tree> &visited)
6751 unsigned i;
6752 stmt_vec_info stmt_info;
6753 slp_tree child;
6755 if (visited.add (node))
6756 return;
6758 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6760 ssa_op_iter op_iter;
6761 def_operand_p def_p;
6763 if ((*life)[i])
6764 continue;
6766 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6767 gimple *orig_stmt = orig_stmt_info->stmt;
6769 /* If there is a non-vectorized use of the defs then the scalar
6770 stmt is kept live in which case we do not account it or any
6771 required defs in the SLP children in the scalar cost. This
6772 way we make the vectorization more costly when compared to
6773 the scalar cost. */
6774 if (!STMT_VINFO_LIVE_P (stmt_info))
6776 auto_vec<gimple *, 8> worklist;
6777 hash_set<gimple *> *worklist_visited = NULL;
6778 worklist.quick_push (orig_stmt);
6781 gimple *work_stmt = worklist.pop ();
6782 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6784 imm_use_iterator use_iter;
6785 gimple *use_stmt;
6786 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6787 DEF_FROM_PTR (def_p))
6788 if (!is_gimple_debug (use_stmt))
6790 stmt_vec_info use_stmt_info
6791 = vinfo->lookup_stmt (use_stmt);
6792 if (!use_stmt_info
6793 || !vectorized_scalar_stmts.contains (use_stmt_info))
6795 if (use_stmt_info
6796 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6798 /* For stmts participating in patterns we have
6799       to check their uses recursively.  */
6800 if (!worklist_visited)
6801 worklist_visited = new hash_set<gimple *> ();
6802 if (!worklist_visited->add (use_stmt))
6803 worklist.safe_push (use_stmt);
6804 continue;
6806 (*life)[i] = true;
6807 goto next_lane;
6812 while (!worklist.is_empty ());
6813 next_lane:
6814 if (worklist_visited)
6815 delete worklist_visited;
6816 if ((*life)[i])
6817 continue;
6820 /* Count scalar stmts only once. */
6821 if (gimple_visited_p (orig_stmt))
6822 continue;
6823 gimple_set_visited (orig_stmt, true);
6825 vect_cost_for_stmt kind;
6826 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6828 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6829 kind = scalar_load;
6830 else
6831 kind = scalar_store;
6833 else if (vect_nop_conversion_p (orig_stmt_info))
6834 continue;
6835 /* For single-argument PHIs assume coalescing which means zero cost
6836 for the scalar and the vector PHIs. This avoids artificially
6837 favoring the vector path (but may pessimize it in some cases). */
6838 else if (is_a <gphi *> (orig_stmt_info->stmt)
6839 && gimple_phi_num_args
6840 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6841 continue;
6842 else
6843 kind = scalar_stmt;
6844 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6845 SLP_TREE_VECTYPE (node), 0, vect_body);
6848 auto_vec<bool, 20> subtree_life;
6849 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6851 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6853 /* Do not directly pass LIFE to the recursive call, copy it to
6854 confine changes in the callee to the current child/subtree. */
6855 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6857 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6858 for (unsigned j = 0;
6859 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6861 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6862 if (perm.first == i)
6863 subtree_life[perm.second] = (*life)[j];
6866 else
6868 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6869 subtree_life.safe_splice (*life);
6871 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6872 vectorized_scalar_stmts, visited);
6873 subtree_life.truncate (0);
6878 /* Comparator for the loop-index sorted cost vectors. */
6880 static int
6881 li_cost_vec_cmp (const void *a_, const void *b_)
6883 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6884 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6885 if (a->first < b->first)
6886 return -1;
6887 else if (a->first == b->first)
6888 return 0;
6889 return 1;
6892 /* Check if vectorization of the basic block is profitable for the
6893 subgraph denoted by SLP_INSTANCES. */
6895 static bool
6896 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6897 vec<slp_instance> slp_instances,
6898 loop_p orig_loop)
6900 slp_instance instance;
6901 int i;
6902 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6903 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6905 if (dump_enabled_p ())
6907 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6908 hash_set<slp_tree> visited;
6909 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6910 vect_print_slp_graph (MSG_NOTE, vect_location,
6911 SLP_INSTANCE_TREE (instance), visited);
6914 /* Compute the set of scalar stmts we know will go away 'locally' when
6915 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6916 not accurate for nodes promoted extern late or for scalar stmts that
6917 are used both in extern defs and in vectorized defs. */
6918 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6919 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6920 hash_set<slp_tree> visited;
6921 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6923 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6924 SLP_INSTANCE_TREE (instance),
6925 visited,
6926 vectorized_scalar_stmts,
6927 scalar_stmts_in_externs);
6928 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6929 vectorized_scalar_stmts.add (rstmt);
6931 /* Scalar stmts used as defs in external nodes need to be preserved, so
6932 remove them from vectorized_scalar_stmts. */
6933 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6934 vectorized_scalar_stmts.remove (stmt);
6936 /* Calculate scalar cost and sum the cost for the vector stmts
6937 previously collected. */
6938 stmt_vector_for_cost scalar_costs = vNULL;
6939 stmt_vector_for_cost vector_costs = vNULL;
6940 visited.empty ();
6941 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6943 auto_vec<bool, 20> life;
6944 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6945 true);
6946 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6947 record_stmt_cost (&scalar_costs,
6948 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6949 scalar_stmt,
6950 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6951 vect_bb_slp_scalar_cost (bb_vinfo,
6952 SLP_INSTANCE_TREE (instance),
6953 &life, &scalar_costs, vectorized_scalar_stmts,
6954 visited);
6955 vector_costs.safe_splice (instance->cost_vec);
6956 instance->cost_vec.release ();
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6962 /* When costing non-loop vectorization we need to consider each covered
6963 loop independently and make sure vectorization is profitable. For
6964    now we assume a loop may not be entered or may execute an arbitrary
6965 number of iterations (??? static information can provide more
6966 precise info here) which means we can simply cost each containing
6967    loop's stmts separately.  */
6969 /* First produce cost vectors sorted by loop index. */
6970 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6971 li_scalar_costs (scalar_costs.length ());
6972 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6973 li_vector_costs (vector_costs.length ());
6974 stmt_info_for_cost *cost;
6975 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6977 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6978 li_scalar_costs.quick_push (std::make_pair (l, cost));
6980 /* Use an arbitrary covered loop as a fallback in case the first vector_costs
6981 entry does not have a stmt_info associated with it. */
6982 unsigned l = li_scalar_costs[0].first;
6983 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6985 /* We inherit from the previous COST, invariants, externals and
6986 extracts immediately follow the cost for the related stmt. */
6987 if (cost->stmt_info)
6988 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6989 li_vector_costs.quick_push (std::make_pair (l, cost));
6991 li_scalar_costs.qsort (li_cost_vec_cmp);
6992 li_vector_costs.qsort (li_cost_vec_cmp);
6994 /* Now cost the portions individually. */
6995 unsigned vi = 0;
6996 unsigned si = 0;
6997 bool profitable = true;
6998 while (si < li_scalar_costs.length ()
6999 && vi < li_vector_costs.length ())
7001 unsigned sl = li_scalar_costs[si].first;
7002 unsigned vl = li_vector_costs[vi].first;
7003 if (sl != vl)
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_NOTE, vect_location,
7007 "Scalar %d and vector %d loop part do not "
7008 "match up, skipping scalar part\n", sl, vl);
7009 /* Skip the scalar part, assuming zero cost on the vector side. */
7012 si++;
7014 while (si < li_scalar_costs.length ()
7015 && li_scalar_costs[si].first == sl);
7016 continue;
7019 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7022 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7023 si++;
7025 while (si < li_scalar_costs.length ()
7026 && li_scalar_costs[si].first == sl);
7027 unsigned dummy;
7028 finish_cost (scalar_target_cost_data, nullptr,
7029 &dummy, &scalar_cost, &dummy);
7031 /* Complete the target-specific vector cost calculation. */
7032 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7035 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7036 vi++;
7038 while (vi < li_vector_costs.length ()
7039 && li_vector_costs[vi].first == vl);
7040 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7041 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7042 delete scalar_target_cost_data;
7043 delete vect_target_cost_data;
7045 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7047 if (dump_enabled_p ())
7049 dump_printf_loc (MSG_NOTE, vect_location,
7050 "Cost model analysis for part in loop %d:\n", sl);
7051 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7052 vec_inside_cost + vec_outside_cost);
7053 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7056 /* Vectorization is profitable if its cost is no more than the cost of
7057 the scalar version. Note that we err on the vector side for equal cost
7058 because the cost estimate is otherwise quite pessimistic (constant uses
7059 are free on the scalar side but cost a load on the vector side, for
7060 example). */
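/* With purely illustrative numbers: vec_inside_cost == 4, vec_outside_cost
   == 2 and scalar_cost == 6 is still considered profitable (6 <= 6), while
   scalar_cost == 5 is not (6 > 5). */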
7061 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7063 profitable = false;
7064 break;
7067 if (profitable && vi < li_vector_costs.length ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_NOTE, vect_location,
7071 "Excess vector cost for part in loop %d:\n",
7072 li_vector_costs[vi].first);
7073 profitable = false;
7076 /* Unset visited flag. This is delayed when the subgraph is profitable
7077 and we process the loop for remaining unvectorized if-converted code. */
7078 if (!orig_loop || !profitable)
7079 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7080 gimple_set_visited (cost->stmt_info->stmt, false);
7082 scalar_costs.release ();
7083 vector_costs.release ();
7085 return profitable;
7088 /* qsort comparator for lane defs. */
7090 static int
7091 vld_cmp (const void *a_, const void *b_)
7093 auto *a = (const std::pair<unsigned, tree> *)a_;
7094 auto *b = (const std::pair<unsigned, tree> *)b_;
7095 return a->first - b->first;
7098 /* Return true if USE_STMT is a vector lane insert into VEC and set
7099 *THIS_LANE to the lane number that is set. */
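/* For example (illustrative GIMPLE), with 32-bit vector elements
     vec_2 = BIT_INSERT_EXPR <vec_1, x_3, 64>;
   inserts x_3 at bit position 64, i.e. lane 64 / 32 == 2, so *THIS_LANE
   is set to 2. */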
7101 static bool
7102 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7104 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7105 if (!use_ass
7106 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7107 || (vec
7108 ? gimple_assign_rhs1 (use_ass) != vec
7109 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7110 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7111 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7112 || !constant_multiple_p
7113 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7114 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7115 this_lane))
7116 return false;
7117 return true;
7120 /* Find any vectorizable constructors, lane-insert chains and reduction
7121 chains in the region and record them as SLP roots in BB_VINFO. */
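/* An illustrative sketch (made-up SSA names) of the roots recognized below:

     v_7 = {a_5, b_6};                        <- CONSTRUCTOR root

     w_8 = BIT_INSERT_EXPR <w_1, a_5, 0>;
     w_9 = BIT_INSERT_EXPR <w_8, b_6, 32>;    <- lane-insert chain root

   plus association chains like a_1 + b_2 + c_3 + d_4 whose chain end is
   recorded as an slp_inst_kind_bb_reduc root. */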
7123 static void
7124 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7126 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7127 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7128 !gsi_end_p (gsi); gsi_next (&gsi))
7130 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7131 if (!assign)
7132 continue;
7134 tree rhs = gimple_assign_rhs1 (assign);
7135 enum tree_code code = gimple_assign_rhs_code (assign);
7136 use_operand_p use_p;
7137 gimple *use_stmt;
7138 if (code == CONSTRUCTOR)
7140 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7141 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7142 CONSTRUCTOR_NELTS (rhs))
7143 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7144 || uniform_vector_p (rhs))
7145 continue;
7147 unsigned j;
7148 tree val;
7149 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7150 if (TREE_CODE (val) != SSA_NAME
7151 || !bb_vinfo->lookup_def (val))
7152 break;
7153 if (j != CONSTRUCTOR_NELTS (rhs))
7154 continue;
7156 vec<stmt_vec_info> roots = vNULL;
7157 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7158 vec<stmt_vec_info> stmts;
7159 stmts.create (CONSTRUCTOR_NELTS (rhs));
7160 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7161 stmts.quick_push
7162 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7163 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7164 stmts, roots));
7166 else if (code == BIT_INSERT_EXPR
7167 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7168 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7169 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7170 && integer_zerop (gimple_assign_rhs3 (assign))
7171 && useless_type_conversion_p
7172 (TREE_TYPE (TREE_TYPE (rhs)),
7173 TREE_TYPE (gimple_assign_rhs2 (assign)))
7174 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7176 /* We start to match on the insert to lane zero, but since the
7177 inserts need not be ordered we'd have to search both
7178 the def and the use chains. */
7179 tree vectype = TREE_TYPE (rhs);
7180 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7181 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7182 auto_sbitmap lanes (nlanes);
7183 bitmap_clear (lanes);
7184 bitmap_set_bit (lanes, 0);
7185 tree def = gimple_assign_lhs (assign);
7186 lane_defs.quick_push
7187 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7188 unsigned lanes_found = 1;
7189 /* Start with the use chains; the last stmt will be the root. */
7190 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7191 vec<stmt_vec_info> roots = vNULL;
7192 roots.safe_push (last);
7195 use_operand_p use_p;
7196 gimple *use_stmt;
7197 if (!single_imm_use (def, &use_p, &use_stmt))
7198 break;
7199 unsigned this_lane;
7200 if (!bb_vinfo->lookup_stmt (use_stmt)
7201 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7202 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7203 break;
7204 if (bitmap_bit_p (lanes, this_lane))
7205 break;
7206 lanes_found++;
7207 bitmap_set_bit (lanes, this_lane);
7208 gassign *use_ass = as_a <gassign *> (use_stmt);
7209 lane_defs.quick_push (std::make_pair
7210 (this_lane, gimple_assign_rhs2 (use_ass)));
7211 last = bb_vinfo->lookup_stmt (use_ass);
7212 roots.safe_push (last);
7213 def = gimple_assign_lhs (use_ass);
7215 while (lanes_found < nlanes);
7216 if (roots.length () > 1)
7217 std::swap(roots[0], roots[roots.length () - 1]);
7218 if (lanes_found < nlanes)
7220 /* Now search the def chain. */
7221 def = gimple_assign_rhs1 (assign);
7224 if (TREE_CODE (def) != SSA_NAME
7225 || !has_single_use (def))
7226 break;
7227 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7228 unsigned this_lane;
7229 if (!bb_vinfo->lookup_stmt (def_stmt)
7230 || !vect_slp_is_lane_insert (def_stmt,
7231 NULL_TREE, &this_lane)
7232 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7233 break;
7234 if (bitmap_bit_p (lanes, this_lane))
7235 break;
7236 lanes_found++;
7237 bitmap_set_bit (lanes, this_lane);
7238 lane_defs.quick_push (std::make_pair
7239 (this_lane,
7240 gimple_assign_rhs2 (def_stmt)));
7241 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7242 def = gimple_assign_rhs1 (def_stmt);
7244 while (lanes_found < nlanes);
7246 if (lanes_found == nlanes)
7248 /* Sort lane_defs by the lane index and register the root. */
7249 lane_defs.qsort (vld_cmp);
7250 vec<stmt_vec_info> stmts;
7251 stmts.create (nlanes);
7252 for (unsigned i = 0; i < nlanes; ++i)
7253 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7254 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7255 stmts, roots));
7257 else
7258 roots.release ();
7260 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7261 && (associative_tree_code (code) || code == MINUS_EXPR)
7262 /* ??? This pessimizes a two-element reduction. PR54400.
7263 ??? In-order reduction could be handled if we only
7264 traverse one operand chain in vect_slp_linearize_chain. */
7265 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7266 /* Ops with constants at the tail can be stripped here. */
7267 && TREE_CODE (rhs) == SSA_NAME
7268 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7269 /* Should be the chain end. */
7270 && (!single_imm_use (gimple_assign_lhs (assign),
7271 &use_p, &use_stmt)
7272 || !is_gimple_assign (use_stmt)
7273 || (gimple_assign_rhs_code (use_stmt) != code
7274 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7275 || (gimple_assign_rhs_code (use_stmt)
7276 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7278 /* We start the match at the end of a possible association
7279 chain. */
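/* A minimal sketch with made-up SSA names:
     _5 = a_1 + b_2;
     _6 = _5 + c_3;
     _7 = _6 + d_4;
   The match starts at _7 (the chain end); vect_slp_linearize_chain collects
   the leaf operands a_1, b_2, c_3 and d_4 and the chain stmts become the
   roots of an slp_inst_kind_bb_reduc instance. */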
7280 auto_vec<chain_op_t> chain;
7281 auto_vec<std::pair<tree_code, gimple *> > worklist;
7282 auto_vec<gimple *> chain_stmts;
7283 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7284 if (code == MINUS_EXPR)
7285 code = PLUS_EXPR;
7286 internal_fn reduc_fn;
7287 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7288 || reduc_fn == IFN_LAST)
7289 continue;
7290 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7291 /* ??? */
7292 code_stmt, alt_code_stmt, &chain_stmts);
7293 if (chain.length () > 1)
7295 /* Sort the chain according to def_type and operation. */
7296 chain.sort (dt_sort_cmp, bb_vinfo);
7297 /* ??? Now we'd want to strip externals and constants
7298 but record those to be handled in the epilogue. */
7299 /* ??? For now do not allow mixing ops or externs/constants. */
7300 bool invalid = false;
7301 unsigned remain_cnt = 0;
7302 for (unsigned i = 0; i < chain.length (); ++i)
7304 if (chain[i].code != code)
7306 invalid = true;
7307 break;
7309 if (chain[i].dt != vect_internal_def)
7310 remain_cnt++;
7312 if (!invalid && chain.length () - remain_cnt > 1)
7314 vec<stmt_vec_info> stmts;
7315 vec<tree> remain = vNULL;
7316 stmts.create (chain.length ());
7317 if (remain_cnt > 0)
7318 remain.create (remain_cnt);
7319 for (unsigned i = 0; i < chain.length (); ++i)
7321 if (chain[i].dt == vect_internal_def)
7322 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7323 else
7324 remain.quick_push (chain[i].op);
7326 vec<stmt_vec_info> roots;
7327 roots.create (chain_stmts.length ());
7328 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7329 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7330 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7331 stmts, roots, remain));
7338 /* Walk the grouped store chains and replace entries with their
7339 pattern variant if any. */
7341 static void
7342 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7344 stmt_vec_info first_element;
7345 unsigned i;
7347 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7349 /* We also have CTORs in this array. */
7350 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7351 continue;
7352 if (STMT_VINFO_IN_PATTERN_P (first_element))
7354 stmt_vec_info orig = first_element;
7355 first_element = STMT_VINFO_RELATED_STMT (first_element);
7356 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7357 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7358 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7359 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7360 vinfo->grouped_stores[i] = first_element;
7362 stmt_vec_info prev = first_element;
7363 while (DR_GROUP_NEXT_ELEMENT (prev))
7365 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7366 if (STMT_VINFO_IN_PATTERN_P (elt))
7368 stmt_vec_info orig = elt;
7369 elt = STMT_VINFO_RELATED_STMT (elt);
7370 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7371 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7372 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7374 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7375 prev = elt;
7380 /* Check if the region described by BB_VINFO can be vectorized, returning
7381 true if so. When returning false, set FATAL to true if the same failure
7382 would prevent vectorization at other vector sizes, false if it is still
7383 worth trying other sizes. N_STMTS is the number of statements in the
7384 region. */
7386 static bool
7387 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7388 vec<int> *dataref_groups)
7390 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7392 slp_instance instance;
7393 int i;
7394 poly_uint64 min_vf = 2;
7396 /* The first group of checks is independent of the vector size. */
7397 fatal = true;
7399 /* Analyze the data references. */
7401 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7405 "not vectorized: unhandled data-ref in basic "
7406 "block.\n");
7407 return false;
7410 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7414 "not vectorized: unhandled data access in "
7415 "basic block.\n");
7416 return false;
7419 vect_slp_check_for_roots (bb_vinfo);
7421 /* If there are no grouped stores and no constructors in the region
7422 there is no need to continue with pattern recog as vect_analyze_slp
7423 will fail anyway. */
7424 if (bb_vinfo->grouped_stores.is_empty ()
7425 && bb_vinfo->roots.is_empty ())
7427 if (dump_enabled_p ())
7428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7429 "not vectorized: no grouped stores in "
7430 "basic block.\n");
7431 return false;
7435 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not fatal for other vector sizes. */
7435 fatal = false;
7437 vect_pattern_recog (bb_vinfo);
7439 /* Update store groups from pattern processing. */
7440 vect_fixup_store_groups_with_patterns (bb_vinfo);
7442 /* Check the SLP opportunities in the basic block, analyze and build SLP
7443 trees. */
7444 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7446 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7449 "Failed to SLP the basic block.\n");
7450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7451 "not vectorized: failed to find SLP opportunities "
7452 "in basic block.\n");
7454 return false;
7457 /* Optimize permutations. */
7458 vect_optimize_slp (bb_vinfo);
7460 /* Gather the loads reachable from the SLP graph entries. */
7461 vect_gather_slp_loads (bb_vinfo);
7463 vect_record_base_alignments (bb_vinfo);
7465 /* Analyze and verify the alignment of data references and the
7466 dependence in the SLP instances. */
7467 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7469 vect_location = instance->location ();
7470 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7471 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7473 slp_tree node = SLP_INSTANCE_TREE (instance);
7474 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7475 if (dump_enabled_p ())
7476 dump_printf_loc (MSG_NOTE, vect_location,
7477 "removing SLP instance operations starting from: %G",
7478 stmt_info->stmt);
7479 vect_free_slp_instance (instance);
7480 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7481 continue;
7484 /* Mark all the statements that we want to vectorize as pure SLP and
7485 relevant. */
7486 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7487 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7488 unsigned j;
7489 stmt_vec_info root;
7490 /* Likewise consider instance root stmts as vectorized. */
7491 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7492 STMT_SLP_TYPE (root) = pure_slp;
7494 i++;
7496 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7497 return false;
7499 if (!vect_slp_analyze_operations (bb_vinfo))
7501 if (dump_enabled_p ())
7502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7503 "not vectorized: bad operation in basic block.\n");
7504 return false;
7507 vect_bb_partition_graph (bb_vinfo);
7509 return true;
7512 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7513 basic blocks in BBS, returning true on success.
7514 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7516 static bool
7517 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7518 vec<int> *dataref_groups, unsigned int n_stmts,
7519 loop_p orig_loop)
7521 bb_vec_info bb_vinfo;
7522 auto_vector_modes vector_modes;
7524 /* Autodetect first vector size we try. */
7525 machine_mode next_vector_mode = VOIDmode;
7526 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7527 unsigned int mode_i = 0;
7529 vec_info_shared shared;
7531 machine_mode autodetected_vector_mode = VOIDmode;
7532 while (1)
7534 bool vectorized = false;
7535 bool fatal = false;
7536 bb_vinfo = new _bb_vec_info (bbs, &shared);
7538 bool first_time_p = shared.datarefs.is_empty ();
7539 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7540 if (first_time_p)
7541 bb_vinfo->shared->save_datarefs ();
7542 else
7543 bb_vinfo->shared->check_datarefs ();
7544 bb_vinfo->vector_mode = next_vector_mode;
7546 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7548 if (dump_enabled_p ())
7550 dump_printf_loc (MSG_NOTE, vect_location,
7551 "***** Analysis succeeded with vector mode"
7552 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7553 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7556 bb_vinfo->shared->check_datarefs ();
7558 auto_vec<slp_instance> profitable_subgraphs;
7559 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7561 if (instance->subgraph_entries.is_empty ())
7562 continue;
7564 dump_user_location_t saved_vect_location = vect_location;
7565 vect_location = instance->location ();
7566 if (!unlimited_cost_model (NULL)
7567 && !vect_bb_vectorization_profitable_p
7568 (bb_vinfo, instance->subgraph_entries, orig_loop))
7570 if (dump_enabled_p ())
7571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7572 "not vectorized: vectorization is not "
7573 "profitable.\n");
7574 vect_location = saved_vect_location;
7575 continue;
7578 vect_location = saved_vect_location;
7579 if (!dbg_cnt (vect_slp))
7580 continue;
7582 profitable_subgraphs.safe_push (instance);
7585 /* When we're vectorizing an if-converted loop body make sure
7586 we vectorized all if-converted code. */
7587 if (!profitable_subgraphs.is_empty ()
7588 && orig_loop)
7590 gcc_assert (bb_vinfo->bbs.length () == 1);
7591 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7592 !gsi_end_p (gsi); gsi_next (&gsi))
7594 /* The costing above left us with DCEable vectorized scalar
7595 stmts having the visited flag set on profitable
7596 subgraphs. Do the delayed clearing of the flag here. */
7597 if (gimple_visited_p (gsi_stmt (gsi)))
7599 gimple_set_visited (gsi_stmt (gsi), false);
7600 continue;
7602 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7603 continue;
7605 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7606 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7608 if (!profitable_subgraphs.is_empty ()
7609 && dump_enabled_p ())
7610 dump_printf_loc (MSG_NOTE, vect_location,
7611 "not profitable because of "
7612 "unprofitable if-converted scalar "
7613 "code\n");
7614 profitable_subgraphs.truncate (0);
7619 /* Finally schedule the profitable subgraphs. */
7620 for (slp_instance instance : profitable_subgraphs)
7622 if (!vectorized && dump_enabled_p ())
7623 dump_printf_loc (MSG_NOTE, vect_location,
7624 "Basic block will be vectorized "
7625 "using SLP\n");
7626 vectorized = true;
7628 /* Dump before scheduling as store vectorization will remove
7629 the original stores and mess with the instance tree
7630 so querying its location will eventually ICE. */
7631 if (flag_checking)
7632 for (slp_instance sub : instance->subgraph_entries)
7633 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7634 unsigned HOST_WIDE_INT bytes;
7635 if (dump_enabled_p ())
7636 for (slp_instance sub : instance->subgraph_entries)
7638 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7639 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7640 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7641 sub->location (),
7642 "basic block part vectorized using %wu "
7643 "byte vectors\n", bytes);
7644 else
7645 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7646 sub->location (),
7647 "basic block part vectorized using "
7648 "variable length vectors\n");
7651 dump_user_location_t saved_vect_location = vect_location;
7652 vect_location = instance->location ();
7654 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7656 vect_location = saved_vect_location;
7659 else
7661 if (dump_enabled_p ())
7662 dump_printf_loc (MSG_NOTE, vect_location,
7663 "***** Analysis failed with vector mode %s\n",
7664 GET_MODE_NAME (bb_vinfo->vector_mode));
7667 if (mode_i == 0)
7668 autodetected_vector_mode = bb_vinfo->vector_mode;
7670 if (!fatal)
7671 while (mode_i < vector_modes.length ()
7672 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7674 if (dump_enabled_p ())
7675 dump_printf_loc (MSG_NOTE, vect_location,
7676 "***** The result for vector mode %s would"
7677 " be the same\n",
7678 GET_MODE_NAME (vector_modes[mode_i]));
7679 mode_i += 1;
7682 delete bb_vinfo;
7684 if (mode_i < vector_modes.length ()
7685 && VECTOR_MODE_P (autodetected_vector_mode)
7686 && (related_vector_mode (vector_modes[mode_i],
7687 GET_MODE_INNER (autodetected_vector_mode))
7688 == autodetected_vector_mode)
7689 && (related_vector_mode (autodetected_vector_mode,
7690 GET_MODE_INNER (vector_modes[mode_i]))
7691 == vector_modes[mode_i]))
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_NOTE, vect_location,
7695 "***** Skipping vector mode %s, which would"
7696 " repeat the analysis for %s\n",
7697 GET_MODE_NAME (vector_modes[mode_i]),
7698 GET_MODE_NAME (autodetected_vector_mode));
7699 mode_i += 1;
7702 if (vectorized
7703 || mode_i == vector_modes.length ()
7704 || autodetected_vector_mode == VOIDmode
7705 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7706 vector sizes will fail do not bother iterating. */
7707 || fatal)
7708 return vectorized;
7710 /* Try the next biggest vector size. */
7711 next_vector_mode = vector_modes[mode_i++];
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_NOTE, vect_location,
7714 "***** Re-trying analysis with vector mode %s\n",
7715 GET_MODE_NAME (next_vector_mode));
7720 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
7721 true if anything in the basic blocks was vectorized. */
7723 static bool
7724 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7726 vec<data_reference_p> datarefs = vNULL;
7727 auto_vec<int> dataref_groups;
7728 int insns = 0;
7729 int current_group = 0;
7731 for (unsigned i = 0; i < bbs.length (); i++)
7733 basic_block bb = bbs[i];
7734 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7735 gsi_next (&gsi))
7737 gimple *stmt = gsi_stmt (gsi);
7738 if (is_gimple_debug (stmt))
7739 continue;
7741 insns++;
7743 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7744 vect_location = stmt;
7746 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7747 &dataref_groups, current_group))
7748 ++current_group;
7750 /* New BBs always start a new DR group. */
7751 ++current_group;
7754 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7757 /* Special entry for the BB vectorizer. Analyze and transform a single
7758 if-converted BB, with ORIG_LOOP's body being the not-if-converted
7759 representation. Returns true if anything in the basic block was
7760 vectorized. */
7762 bool
7763 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7765 auto_vec<basic_block> bbs;
7766 bbs.safe_push (bb);
7767 return vect_slp_bbs (bbs, orig_loop);
7770 /* Main entry for the BB vectorizer. Analyze and transform the basic
7771 blocks of FUN, returning true if anything was vectorized. */
7773 bool
7774 vect_slp_function (function *fun)
7776 bool r = false;
7777 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7778 auto_bitmap exit_bbs;
7779 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7780 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7781 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7782 true, rpo, NULL);
7784 /* For the moment split the function into pieces to avoid making
7785 the iteration on the vector mode moot. Split at points we know
7786 we do not handle well, which are CFG merges (SLP discovery doesn't
7787 handle non-loop-header PHIs) and loop exits. Since pattern
7788 recog requires reverse iteration to visit uses before defs,
7789 simply chop the RPO into pieces. */
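/* Illustrative example: with an RPO of bb2 bb3 bb4 where bb4 is a CFG merge
   not dominated by bb2, the accumulated region { bb2, bb3 } is handed to
   vect_slp_bbs on its own and a new region is started at bb4. */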
7790 auto_vec<basic_block> bbs;
7791 for (unsigned i = 0; i < n; i++)
7793 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7794 bool split = false;
7796 /* Split when a BB is not dominated by the first block. */
7797 if (!bbs.is_empty ()
7798 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7800 if (dump_enabled_p ())
7801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7802 "splitting region at dominance boundary bb%d\n",
7803 bb->index);
7804 split = true;
7806 /* Split when the loop determined by the first block
7807 is exited. This is because we eventually insert
7808 invariants at region begin. */
7809 else if (!bbs.is_empty ()
7810 && bbs[0]->loop_father != bb->loop_father
7811 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7813 if (dump_enabled_p ())
7814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7815 "splitting region at loop %d exit at bb%d\n",
7816 bbs[0]->loop_father->num, bb->index);
7817 split = true;
7819 else if (!bbs.is_empty ()
7820 && bb->loop_father->header == bb
7821 && bb->loop_father->dont_vectorize)
7823 if (dump_enabled_p ())
7824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7825 "splitting region at dont-vectorize loop %d "
7826 "entry at bb%d\n",
7827 bb->loop_father->num, bb->index);
7828 split = true;
7831 if (split && !bbs.is_empty ())
7833 r |= vect_slp_bbs (bbs, NULL);
7834 bbs.truncate (0);
7837 if (bbs.is_empty ())
7839 /* We need to be able to insert at the head of the region, which
7840 we cannot do for a region starting with a returns-twice call. */
7841 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7842 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7846 "skipping bb%d as start of region as it "
7847 "starts with returns-twice call\n",
7848 bb->index);
7849 continue;
7851 /* If the loop this BB belongs to is marked as not to be vectorized
7852 honor that also for BB vectorization. */
7853 if (bb->loop_father->dont_vectorize)
7854 continue;
7857 bbs.safe_push (bb);
7859 /* When a stmt ends this block and defines a value, inserting a
7860 vector containing its definition after it would require inserting
7861 on edges. Avoid this for now. */
7862 if (gimple *last = *gsi_last_bb (bb))
7863 if (gimple_get_lhs (last)
7864 && is_ctrl_altering_stmt (last))
7866 if (dump_enabled_p ())
7867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7868 "splitting region at control altering "
7869 "definition %G", last);
7870 r |= vect_slp_bbs (bbs, NULL);
7871 bbs.truncate (0);
7875 if (!bbs.is_empty ())
7876 r |= vect_slp_bbs (bbs, NULL);
7878 free (rpo);
7880 return r;
7883 /* Build a variable-length vector in which the elements in ELTS are repeated
7884 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7885 RESULTS and add any new instructions to SEQ.
7887 The approach we use is:
7889 (1) Find a vector mode VM with integer elements of mode IM.
7891 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7892 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7893 from small vectors to IM.
7895 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7897 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7898 correct byte contents.
7900 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7902 We try to find the largest IM for which this sequence works, in order
7903 to cut down on the number of interleaves. */
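/* A worked example with made-up modes: for ELTS = { a, b, c, d } of SImode
   and NRESULTS == 1 the search may pick IM = DImode and a DImode vector
   mode VM, giving NVECTORS == 2. Then { a, b } and { c, d } are each
   VIEW_CONVERT_EXPRed to DImode (step 2), each value is duplicated across
   a VM vector (step 3), a single interleaving VEC_PERM_EXPR recreates the
   byte pattern abcd abcd ... (step 4) and a final VIEW_CONVERT_EXPR yields
   the requested VECTOR_TYPE (step 5). */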
7905 void
7906 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7907 const vec<tree> &elts, unsigned int nresults,
7908 vec<tree> &results)
7910 unsigned int nelts = elts.length ();
7911 tree element_type = TREE_TYPE (vector_type);
7913 /* (1) Find a vector mode VM with integer elements of mode IM. */
7914 unsigned int nvectors = 1;
7915 tree new_vector_type;
7916 tree permutes[2];
7917 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7918 &nvectors, &new_vector_type,
7919 permutes))
7920 gcc_unreachable ();
7922 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7923 unsigned int partial_nelts = nelts / nvectors;
7924 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7926 tree_vector_builder partial_elts;
7927 auto_vec<tree, 32> pieces (nvectors * 2);
7928 pieces.quick_grow_cleared (nvectors * 2);
7929 for (unsigned int i = 0; i < nvectors; ++i)
7931 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7932 ELTS' has mode IM. */
7933 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7934 for (unsigned int j = 0; j < partial_nelts; ++j)
7935 partial_elts.quick_push (elts[i * partial_nelts + j]);
7936 tree t = gimple_build_vector (seq, &partial_elts);
7937 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7938 TREE_TYPE (new_vector_type), t);
7940 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7941 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7944 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7945 correct byte contents.
7947 Conceptually, we need to repeat the following operation log2(nvectors)
7948 times, where hi_start = nvectors / 2:
7950 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7951 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7953 However, if each input repeats every N elements and the VF is
7954 a multiple of N * 2, the HI result is the same as the LO result.
7955 This will be true for the first N1 iterations of the outer loop,
7956 followed by N2 iterations for which both the LO and HI results
7957 are needed. I.e.:
7959 N1 + N2 = log2(nvectors)
7961 Each "N1 iteration" doubles the number of redundant vectors and the
7962 effect of the process as a whole is to have a sequence of nvectors/2**N1
7963 vectors that repeats 2**N1 times. Rather than generate these redundant
7964 vectors, we halve the number of vectors for each N1 iteration. */
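/* Illustrative numbers: with NVECTORS == 4 there are log2(4) == 2 outer
   iterations. If the first one is an "N1 iteration" (HI equals LO), only
   the LO halves are kept and NEW_NVECTORS drops from 4 to 2; the remaining
   "N2 iteration" produces both LO and HI, leaving 2 distinct vectors that
   repeat twice, as described above. */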
7965 unsigned int in_start = 0;
7966 unsigned int out_start = nvectors;
7967 unsigned int new_nvectors = nvectors;
7968 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7970 unsigned int hi_start = new_nvectors / 2;
7971 unsigned int out_i = 0;
7972 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7974 if ((in_i & 1) != 0
7975 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7976 2 * in_repeat))
7977 continue;
7979 tree output = make_ssa_name (new_vector_type);
7980 tree input1 = pieces[in_start + (in_i / 2)];
7981 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7982 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7983 input1, input2,
7984 permutes[in_i & 1]);
7985 gimple_seq_add_stmt (seq, stmt);
7986 pieces[out_start + out_i] = output;
7987 out_i += 1;
7989 std::swap (in_start, out_start);
7990 new_nvectors = out_i;
7993 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7994 results.reserve (nresults);
7995 for (unsigned int i = 0; i < nresults; ++i)
7996 if (i < new_nvectors)
7997 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7998 pieces[in_start + i]));
7999 else
8000 results.quick_push (results[i - new_nvectors]);
8004 /* For constant and loop invariant defs in OP_NODE this function creates
8005 vector defs that will be used in the vectorized stmts and stores them
8006 to SLP_TREE_VEC_DEFS of OP_NODE. */
8008 static void
8009 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8011 unsigned HOST_WIDE_INT nunits;
8012 tree vec_cst;
8013 unsigned j, number_of_places_left_in_vector;
8014 tree vector_type;
8015 tree vop;
8016 int group_size = op_node->ops.length ();
8017 unsigned int vec_num, i;
8018 unsigned number_of_copies = 1;
8019 bool constant_p;
8020 gimple_seq ctor_seq = NULL;
8021 auto_vec<tree, 16> permute_results;
8023 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8024 vector_type = SLP_TREE_VECTYPE (op_node);
8026 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8027 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8028 auto_vec<tree> voprnds (number_of_vectors);
8030 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8031 created vectors. It is greater than 1 if unrolling is performed.
8033 For example, we have two scalar operands, s1 and s2 (e.g., group of
8034 strided accesses of size two), while NUNITS is four (i.e., four scalars
8035 of this type can be packed in a vector). The output vector will contain
8036 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8037 will be 2).
8039 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8040 containing the operands.
8042 For example, NUNITS is four as before, and the group size is 8
8043 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8044 {s5, s6, s7, s8}. */
8046 /* When using duplicate_and_interleave, we just need one element for
8047 each scalar statement. */
8048 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8049 nunits = group_size;
8051 number_of_copies = nunits * number_of_vectors / group_size;
8053 number_of_places_left_in_vector = nunits;
8054 constant_p = true;
8055 tree_vector_builder elts (vector_type, nunits, 1);
8056 elts.quick_grow (nunits);
8057 stmt_vec_info insert_after = NULL;
8058 for (j = 0; j < number_of_copies; j++)
8060 tree op;
8061 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8063 /* Create 'vect_ = {op0,op1,...,opn}'. */
8064 number_of_places_left_in_vector--;
8065 tree orig_op = op;
8066 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8068 if (CONSTANT_CLASS_P (op))
8070 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8072 /* Can't use VIEW_CONVERT_EXPR for booleans because
8073 of possibly different sizes of scalar value and
8074 vector element. */
8075 if (integer_zerop (op))
8076 op = build_int_cst (TREE_TYPE (vector_type), 0);
8077 else if (integer_onep (op))
8078 op = build_all_ones_cst (TREE_TYPE (vector_type));
8079 else
8080 gcc_unreachable ();
8082 else
8083 op = fold_unary (VIEW_CONVERT_EXPR,
8084 TREE_TYPE (vector_type), op);
8085 gcc_assert (op && CONSTANT_CLASS_P (op));
8087 else
8089 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8090 gimple *init_stmt;
8091 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8093 tree true_val
8094 = build_all_ones_cst (TREE_TYPE (vector_type));
8095 tree false_val
8096 = build_zero_cst (TREE_TYPE (vector_type));
8097 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8098 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8099 op, true_val,
8100 false_val);
8102 else
8104 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8105 op);
8106 init_stmt
8107 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8108 op);
8110 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8111 op = new_temp;
8114 elts[number_of_places_left_in_vector] = op;
8115 if (!CONSTANT_CLASS_P (op))
8116 constant_p = false;
8117 /* For BB vectorization we have to compute an insert location
8118 when a def is inside the analyzed region since we cannot
8119 simply insert at the BB start in this case. */
8120 stmt_vec_info opdef;
8121 if (TREE_CODE (orig_op) == SSA_NAME
8122 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8123 && is_a <bb_vec_info> (vinfo)
8124 && (opdef = vinfo->lookup_def (orig_op)))
8126 if (!insert_after)
8127 insert_after = opdef;
8128 else
8129 insert_after = get_later_stmt (insert_after, opdef);
8132 if (number_of_places_left_in_vector == 0)
8134 if (constant_p
8135 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8136 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8137 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8138 else
8140 if (permute_results.is_empty ())
8141 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8142 elts, number_of_vectors,
8143 permute_results);
8144 vec_cst = permute_results[number_of_vectors - j - 1];
8146 if (!gimple_seq_empty_p (ctor_seq))
8148 if (insert_after)
8150 gimple_stmt_iterator gsi;
8151 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8153 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8154 gsi_insert_seq_before (&gsi, ctor_seq,
8155 GSI_CONTINUE_LINKING);
8157 else if (!stmt_ends_bb_p (insert_after->stmt))
8159 gsi = gsi_for_stmt (insert_after->stmt);
8160 gsi_insert_seq_after (&gsi, ctor_seq,
8161 GSI_CONTINUE_LINKING);
8163 else
8165 /* When we want to insert after a def whose
8166 defining stmt throws, insert on the fallthru
8167 edge instead. */
8168 edge e = find_fallthru_edge
8169 (gimple_bb (insert_after->stmt)->succs);
8170 basic_block new_bb
8171 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8172 gcc_assert (!new_bb);
8175 else
8176 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8177 ctor_seq = NULL;
8179 voprnds.quick_push (vec_cst);
8180 insert_after = NULL;
8181 number_of_places_left_in_vector = nunits;
8182 constant_p = true;
8183 elts.new_vector (vector_type, nunits, 1);
8184 elts.quick_grow (nunits);
8189 /* Since the vectors are created in the reverse order, we reverse them
8190 here to restore the original order. */
8191 vec_num = voprnds.length ();
8192 for (j = vec_num; j != 0; j--)
8194 vop = voprnds[j - 1];
8195 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8198 /* If the VF is greater than the unrolling factor needed for the SLP
8199 group of stmts, the number of vectors to be created (NUMBER_OF_VECTORS)
8200 is greater than NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS,
8201 and hence we have to replicate the vectors created above. */
8202 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8203 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8204 i++)
8205 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8208 /* Get the Ith vectorized definition from SLP_NODE. */
8210 tree
8211 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8213 return SLP_TREE_VEC_DEFS (slp_node)[i];
8216 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8218 void
8219 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8221 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8222 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8225 /* Get N vectorized definitions for SLP_NODE. */
8227 void
8228 vect_get_slp_defs (vec_info *,
8229 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8231 if (n == -1U)
8232 n = SLP_TREE_CHILDREN (slp_node).length ();
8234 for (unsigned i = 0; i < n; ++i)
8236 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8237 vec<tree> vec_defs = vNULL;
8238 vect_get_slp_defs (child, &vec_defs);
8239 vec_oprnds->quick_push (vec_defs);
8243 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8244 - PERM gives the permutation that the caller wants to use for NODE,
8245 which might be different from SLP_LOAD_PERMUTATION.
8246 - DUMP_P controls whether the function dumps information. */
8248 static bool
8249 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8250 load_permutation_t &perm,
8251 const vec<tree> &dr_chain,
8252 gimple_stmt_iterator *gsi, poly_uint64 vf,
8253 bool analyze_only, bool dump_p,
8254 unsigned *n_perms, unsigned int *n_loads,
8255 bool dce_chain)
8257 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8258 int vec_index = 0;
8259 tree vectype = SLP_TREE_VECTYPE (node);
8260 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8261 unsigned int mask_element;
8262 unsigned dr_group_size;
8263 machine_mode mode;
8265 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8266 dr_group_size = 1;
8267 else
8269 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8270 dr_group_size = DR_GROUP_SIZE (stmt_info);
8273 mode = TYPE_MODE (vectype);
8274 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8275 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8277 /* Initialize the vect stmts of NODE to properly insert the generated
8278 stmts later. */
8279 if (! analyze_only)
8280 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8281 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8283 /* Generate permutation masks for every NODE. Number of masks for each NODE
8284 is equal to GROUP_SIZE.
8285 E.g., we have a group of three nodes with three loads from the same
8286 location in each node, and the vector size is 4. I.e., we have an
8287 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8288 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8289 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8292 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8293 The last mask is illegal since we assume two operands for a permute
8294 operation, and the mask element values can't be outside that range.
8295 Hence, the last mask must be converted into {2,5,5,5}.
8296 For the first two permutations we need the first and the second input
8297 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8298 we need the second and the third vectors: {b1,c1,a2,b2} and
8299 {c2,a3,b3,c3}. */
8301 int vect_stmts_counter = 0;
8302 unsigned int index = 0;
8303 int first_vec_index = -1;
8304 int second_vec_index = -1;
8305 bool noop_p = true;
8306 *n_perms = 0;
8308 vec_perm_builder mask;
8309 unsigned int nelts_to_build;
8310 unsigned int nvectors_per_build;
8311 unsigned int in_nlanes;
8312 bool repeating_p = (group_size == dr_group_size
8313 && multiple_p (nunits, group_size));
8314 if (repeating_p)
8316 /* A single vector contains a whole number of copies of the node, so:
8317 (a) all permutes can use the same mask; and
8318 (b) the permutes only need a single vector input. */
8319 mask.new_vector (nunits, group_size, 3);
8320 nelts_to_build = mask.encoded_nelts ();
8321 /* It's possible to obtain zero nstmts during analyze_only, so make
8322 it at least one to ensure the later computation for n_perms
8323 proceeds. */
8324 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8325 in_nlanes = dr_group_size * 3;
8327 else
8329 /* We need to construct a separate mask for each vector statement. */
8330 unsigned HOST_WIDE_INT const_nunits, const_vf;
8331 if (!nunits.is_constant (&const_nunits)
8332 || !vf.is_constant (&const_vf))
8333 return false;
8334 mask.new_vector (const_nunits, const_nunits, 1);
8335 nelts_to_build = const_vf * group_size;
8336 nvectors_per_build = 1;
8337 in_nlanes = const_vf * dr_group_size;
8339 auto_sbitmap used_in_lanes (in_nlanes);
8340 bitmap_clear (used_in_lanes);
8341 auto_bitmap used_defs;
8343 unsigned int count = mask.encoded_nelts ();
8344 mask.quick_grow (count);
8345 vec_perm_indices indices;
8347 for (unsigned int j = 0; j < nelts_to_build; j++)
8349 unsigned int iter_num = j / group_size;
8350 unsigned int stmt_num = j % group_size;
8351 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8352 bitmap_set_bit (used_in_lanes, i);
8353 if (repeating_p)
8355 first_vec_index = 0;
8356 mask_element = i;
8358 else
8360 /* Enforced before the loop when !repeating_p. */
8361 unsigned int const_nunits = nunits.to_constant ();
8362 vec_index = i / const_nunits;
8363 mask_element = i % const_nunits;
8364 if (vec_index == first_vec_index
8365 || first_vec_index == -1)
8367 first_vec_index = vec_index;
8369 else if (vec_index == second_vec_index
8370 || second_vec_index == -1)
8372 second_vec_index = vec_index;
8373 mask_element += const_nunits;
8375 else
8377 if (dump_p)
8378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8379 "permutation requires at "
8380 "least three vectors %G",
8381 stmt_info->stmt);
8382 gcc_assert (analyze_only);
8383 return false;
8386 gcc_assert (mask_element < 2 * const_nunits);
8389 if (mask_element != index)
8390 noop_p = false;
8391 mask[index++] = mask_element;
8393 if (index == count)
8395 if (!noop_p)
8397 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8398 if (!can_vec_perm_const_p (mode, mode, indices))
8400 if (dump_p)
8402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8403 "unsupported vect permute { ");
8404 for (i = 0; i < count; ++i)
8406 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8407 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8409 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8411 gcc_assert (analyze_only);
8412 return false;
8415 tree mask_vec = NULL_TREE;
8416 if (!analyze_only)
8417 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8419 if (second_vec_index == -1)
8420 second_vec_index = first_vec_index;
8422 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8424 ++*n_perms;
8425 if (analyze_only)
8426 continue;
8427 /* Generate the permute statement if necessary. */
8428 tree first_vec = dr_chain[first_vec_index + ri];
8429 tree second_vec = dr_chain[second_vec_index + ri];
8430 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8431 tree perm_dest
8432 = vect_create_destination_var (gimple_assign_lhs (stmt),
8433 vectype);
8434 perm_dest = make_ssa_name (perm_dest);
8435 gimple *perm_stmt
8436 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8437 second_vec, mask_vec);
8438 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8439 gsi);
8440 if (dce_chain)
8442 bitmap_set_bit (used_defs, first_vec_index + ri);
8443 bitmap_set_bit (used_defs, second_vec_index + ri);
8446 /* Store the vector statement in NODE. */
8447 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8450 else if (!analyze_only)
8452 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8454 tree first_vec = dr_chain[first_vec_index + ri];
8455 /* If mask was NULL_TREE generate the requested
8456 identity transform. */
8457 if (dce_chain)
8458 bitmap_set_bit (used_defs, first_vec_index + ri);
8460 /* Store the vector statement in NODE. */
8461 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8465 index = 0;
8466 first_vec_index = -1;
8467 second_vec_index = -1;
8468 noop_p = true;
8472 if (n_loads)
8474 if (repeating_p)
8475 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8476 else
8478 /* Enforced above when !repeating_p. */
8479 unsigned int const_nunits = nunits.to_constant ();
8480 *n_loads = 0;
8481 bool load_seen = false;
8482 for (unsigned i = 0; i < in_nlanes; ++i)
8484 if (i % const_nunits == 0)
8486 if (load_seen)
8487 *n_loads += 1;
8488 load_seen = false;
8490 if (bitmap_bit_p (used_in_lanes, i))
8491 load_seen = true;
8493 if (load_seen)
8494 *n_loads += 1;
8498 if (dce_chain)
8499 for (unsigned i = 0; i < dr_chain.length (); ++i)
8500 if (!bitmap_bit_p (used_defs, i))
8502 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8503 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8504 gsi_remove (&rgsi, true);
8505 release_defs (stmt);
8508 return true;
8511 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8512 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8513 permute statements for the SLP node NODE. Store the number of vector
8514 permute instructions in *N_PERMS and the number of vector load
8515 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8516 that were not needed. */
8518 bool
8519 vect_transform_slp_perm_load (vec_info *vinfo,
8520 slp_tree node, const vec<tree> &dr_chain,
8521 gimple_stmt_iterator *gsi, poly_uint64 vf,
8522 bool analyze_only, unsigned *n_perms,
8523 unsigned int *n_loads, bool dce_chain)
8525 return vect_transform_slp_perm_load_1 (vinfo, node,
8526 SLP_TREE_LOAD_PERMUTATION (node),
8527 dr_chain, gsi, vf, analyze_only,
8528 dump_enabled_p (), n_perms, n_loads,
8529 dce_chain);
8532 /* Produce the next vector result for SLP permutation NODE by adding a vector
8533 statement at GSI. If MASK_VEC is nonnull, add:
8535 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8537 otherwise add:
8539 <new SSA name> = FIRST_DEF. */
8541 static void
8542 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8543 slp_tree node, tree first_def, tree second_def,
8544 tree mask_vec, poly_uint64 identity_offset)
8546 tree vectype = SLP_TREE_VECTYPE (node);
8548 /* ??? We SLP match existing vector element extracts but
8549 allow punning, which we need to re-instantiate at uses
8550 but have no good way of representing explicitly. */
8551 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8552 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8554 gassign *conv_stmt
8555 = gimple_build_assign (make_ssa_name (vectype),
8556 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8557 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8558 first_def = gimple_assign_lhs (conv_stmt);
8560 gassign *perm_stmt;
8561 tree perm_dest = make_ssa_name (vectype);
8562 if (mask_vec)
8564 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8565 TYPE_SIZE (vectype))
8566 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8568 gassign *conv_stmt
8569 = gimple_build_assign (make_ssa_name (vectype),
8570 build1 (VIEW_CONVERT_EXPR,
8571 vectype, second_def));
8572 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8573 second_def = gimple_assign_lhs (conv_stmt);
8575 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8576 first_def, second_def,
8577 mask_vec);
8579 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8581 /* For identity permutes we still need to handle the case
8582 of offsetted extracts or concats. */
8583 unsigned HOST_WIDE_INT c;
8584 auto first_def_nunits
8585 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8586 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8588 unsigned HOST_WIDE_INT elsz
8589 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8590 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8591 TYPE_SIZE (vectype),
8592 bitsize_int (identity_offset * elsz));
8593 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8595 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8596 first_def_nunits, &c) && c == 2)
8598 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8599 NULL_TREE, second_def);
8600 perm_stmt = gimple_build_assign (perm_dest, ctor);
8602 else
8603 gcc_unreachable ();
8605 else
8607 /* We need a copy here in case the def was external. */
8608 perm_stmt = gimple_build_assign (perm_dest, first_def);
8610 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8611 /* Store the vector statement in NODE. */
8612 node->push_vec_def (perm_stmt);
8615 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8616 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8617 If GSI is nonnull, emit the permutation there.
8619 When GSI is null, the only purpose of NODE is to give properties
8620 of the result, such as the vector type and number of SLP lanes.
8621 The node does not need to be a VEC_PERM_EXPR.
8623 If the target supports the operation, return the number of individual
8624 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8625 dump file if DUMP_P is true. */
8627 static int
8628 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8629 slp_tree node, lane_permutation_t &perm,
8630 vec<slp_tree> &children, bool dump_p)
8632 tree vectype = SLP_TREE_VECTYPE (node);
8634 /* ??? We currently only support inputs that all have the same vector
8635 type, while the SLP IL should really do a concat + select and thus
8636 accept arbitrary mismatches. */
8637 slp_tree child;
8638 unsigned i;
8639 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8640 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8641 tree op_vectype = NULL_TREE;
8642 FOR_EACH_VEC_ELT (children, i, child)
8643 if (SLP_TREE_VECTYPE (child))
8645 op_vectype = SLP_TREE_VECTYPE (child);
8646 break;
8648 if (!op_vectype)
8649 op_vectype = vectype;
8650 FOR_EACH_VEC_ELT (children, i, child)
8652 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8653 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8654 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8655 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8657 if (dump_p)
8658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8659 "Unsupported vector types in lane permutation\n");
8660 return -1;
8662 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8663 repeating_p = false;
8666 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8667 if (dump_p)
8669 dump_printf_loc (MSG_NOTE, vect_location,
8670 "vectorizing permutation");
8671 for (unsigned i = 0; i < perm.length (); ++i)
8672 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8673 if (repeating_p)
8674 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8675 dump_printf (MSG_NOTE, "\n");
8678 /* REPEATING_P is true if every output vector is guaranteed to use the
8679 same permute vector. We can handle that case for both variable-length
8680 and constant-length vectors, but we only handle other cases for
8681 constant-length vectors.
8683 Set:
8685 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8686 mask vector that we want to build.
8688 - NCOPIES to the number of copies of PERM that we need in order
8689 to build the necessary permute mask vectors.
8691 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8692 for each permute mask vector. This is only relevant when GSI is
8693 nonnull. */
8694 uint64_t npatterns;
8695 unsigned nelts_per_pattern;
8696 uint64_t ncopies;
8697 unsigned noutputs_per_mask;
8698 if (repeating_p)
8700 /* We need a single permute mask vector that has the form:
8702 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8704 In other words, the original n-element permute in PERM is
8705 "unrolled" to fill a full vector. The stepped vector encoding
8706 that we use for permutes requires 3n elements. */
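/* Illustrative example: for a two-lane node with PERM = { op0[1], op0[0] }
   the mask is encoded with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 as
   { 1, 0, 3, 2, 5, 4, ... }, i.e. the lane swap "unrolled" across the
   whole vector. */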
8707 npatterns = SLP_TREE_LANES (node);
8708 nelts_per_pattern = ncopies = 3;
8709 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8711 else
8713 /* Calculate every element of every permute mask vector explicitly,
8714 instead of relying on the pattern described above. */
8715 if (!nunits.is_constant (&npatterns))
8716 return -1;
8717 nelts_per_pattern = ncopies = 1;
8718 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8719 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8720 return -1;
8721 noutputs_per_mask = 1;
8723 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8724 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8726 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8727 from the { SLP operand, scalar lane } permutation as recorded in the
8728 SLP node as an intermediate step. This part should already work
8729 with SLP children that have an arbitrary number of lanes. */
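/* A worked example under assumed conditions (a repeating 2-lane blend
   over two 2-lane children, so ncopies == 3): PERM == { op0[0], op1[1] }
   expands to {{0,0},0} {{1,0},1} {{0,0},2} {{1,0},3} {{0,0},4} {{1,0},5};
   the vector index stays 0 and only the lane advances, matching the 3n
   stepped mask encoding described above.  */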
8730 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8731 auto_vec<unsigned> active_lane;
8732 vperm.create (olanes);
8733 active_lane.safe_grow_cleared (children.length (), true);
8734 for (unsigned i = 0; i < ncopies; ++i)
8736 for (unsigned pi = 0; pi < perm.length (); ++pi)
8738 std::pair<unsigned, unsigned> p = perm[pi];
8739 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8740 if (repeating_p)
8741 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8742 else
8744 /* We checked above that the vectors are constant-length. */
8745 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8746 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8747 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8748 vperm.quick_push ({{p.first, vi}, vl});
8751 /* Advance to the next group. */
8752 for (unsigned j = 0; j < children.length (); ++j)
8753 active_lane[j] += SLP_TREE_LANES (children[j]);
8756 if (dump_p)
8758 dump_printf_loc (MSG_NOTE, vect_location,
8759 "vectorizing permutation");
8760 for (unsigned i = 0; i < perm.length (); ++i)
8761 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8762 if (repeating_p)
8763 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8764 dump_printf (MSG_NOTE, "\n");
8765 dump_printf_loc (MSG_NOTE, vect_location, "as");
8766 for (unsigned i = 0; i < vperm.length (); ++i)
8768 if (i != 0
8769 && (repeating_p
8770 ? multiple_p (i, npatterns)
8771 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8772 dump_printf (MSG_NOTE, ",");
8773 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8774 vperm[i].first.first, vperm[i].first.second,
8775 vperm[i].second);
8777 dump_printf (MSG_NOTE, "\n");
8780 /* We can only handle two-vector permutes; everything else should
8781 be lowered on the SLP level. The following is closely inspired
8782 by vect_transform_slp_perm_load and is supposed to eventually
8783 replace it.
8784 ??? As intermediate step do code-gen in the SLP tree representation
8785 somehow? */
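/* Continuing the illustrative blend above with V4SI vectors: the first
   entry makes first_vec == {0,0}, the second makes second_vec == {1,0}
   and biases its lane by nunits, so the encoded mask becomes
   { 0, 5, 2, 7, 4, 9 }, which for 4-element vectors is the two-input
   permute { 0, 5, 2, 7 }.  */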
8786 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8787 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8788 unsigned int index = 0;
8789 poly_uint64 mask_element;
8790 vec_perm_builder mask;
8791 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8792 unsigned int count = mask.encoded_nelts ();
8793 mask.quick_grow (count);
8794 vec_perm_indices indices;
8795 unsigned nperms = 0;
8796 for (unsigned i = 0; i < vperm.length (); ++i)
8798 mask_element = vperm[i].second;
8799 if (first_vec.first == -1U
8800 || first_vec == vperm[i].first)
8801 first_vec = vperm[i].first;
8802 else if (second_vec.first == -1U
8803 || second_vec == vperm[i].first)
8805 second_vec = vperm[i].first;
8806 mask_element += nunits;
8808 else
8810 if (dump_p)
8811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8812 "permutation requires at "
8813 "least three vectors\n");
8814 gcc_assert (!gsi);
8815 return -1;
8818 mask[index++] = mask_element;
8820 if (index == count)
8822 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8823 TYPE_VECTOR_SUBPARTS (op_vectype));
8824 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8825 && constant_multiple_p (mask[0], nunits));
8826 machine_mode vmode = TYPE_MODE (vectype);
8827 machine_mode op_vmode = TYPE_MODE (op_vectype);
8828 unsigned HOST_WIDE_INT c;
8829 if ((!identity_p
8830 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8831 || (identity_p
8832 && !known_le (nunits,
8833 TYPE_VECTOR_SUBPARTS (op_vectype))
8834 && (!constant_multiple_p (nunits,
8835 TYPE_VECTOR_SUBPARTS (op_vectype),
8836 &c) || c != 2)))
8838 if (dump_p)
8840 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8841 vect_location,
8842 "unsupported vect permute { ");
8843 for (i = 0; i < count; ++i)
8845 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8846 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8848 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8850 gcc_assert (!gsi);
8851 return -1;
8854 if (!identity_p)
8855 nperms++;
8856 if (gsi)
8858 if (second_vec.first == -1U)
8859 second_vec = first_vec;
8861 slp_tree
8862 first_node = children[first_vec.first],
8863 second_node = children[second_vec.first];
8865 tree mask_vec = NULL_TREE;
8866 if (!identity_p)
8867 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8869 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8871 tree first_def
8872 = vect_get_slp_vect_def (first_node,
8873 first_vec.second + vi);
8874 tree second_def
8875 = vect_get_slp_vect_def (second_node,
8876 second_vec.second + vi);
8877 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8878 second_def, mask_vec, mask[0]);
8882 index = 0;
8883 first_vec = std::make_pair (-1U, -1U);
8884 second_vec = std::make_pair (-1U, -1U);
8888 return nperms;
8891 /* Vectorize the SLP permutations in NODE as specified
8892 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8893 child number and lane number.
8894 Interleaving of two two-lane two-child SLP subtrees (not supported):
8895 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8896 A blend of two four-lane two-child SLP subtrees:
8897 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8898 Highpart of a four-lane one-child SLP subtree (not supported):
8899 [ { 0, 2 }, { 0, 3 } ]
8900 Only a subset of these is currently supported by the code generation below. */
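/* A note inferred from the code above: identity permutes are not
   counted, so the returned NPERMS covers only the VEC_PERM_EXPRs that
   will actually be emitted, and that is what gets costed as vec_perm
   below when GSI is NULL.  */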
8902 static bool
8903 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8904 slp_tree node, stmt_vector_for_cost *cost_vec)
8906 tree vectype = SLP_TREE_VECTYPE (node);
8907 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8908 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8909 SLP_TREE_CHILDREN (node),
8910 dump_enabled_p ());
8911 if (nperms < 0)
8912 return false;
8914 if (!gsi)
8915 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8917 return true;
8920 /* Vectorize SLP NODE. */
8922 static void
8923 vect_schedule_slp_node (vec_info *vinfo,
8924 slp_tree node, slp_instance instance)
8926 gimple_stmt_iterator si;
8927 int i;
8928 slp_tree child;
8930 /* For existing vectors there's nothing to do. */
8931 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
8932 && SLP_TREE_VEC_DEFS (node).exists ())
8933 return;
8935 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
8937 /* Vectorize externals and constants. */
8938 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8939 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8941 /* ??? vectorizable_shift can end up using a scalar operand which is
8942 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8943 node in this case. */
8944 if (!SLP_TREE_VECTYPE (node))
8945 return;
8947 vect_create_constant_vectors (vinfo, node);
8948 return;
8951 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8953 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8954 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8956 if (dump_enabled_p ())
8957 dump_printf_loc (MSG_NOTE, vect_location,
8958 "------>vectorizing SLP node starting from: %G",
8959 stmt_info->stmt);
8961 if (STMT_VINFO_DATA_REF (stmt_info)
8962 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8964 /* Vectorized loads go before the first scalar load to make the
8965 result available early; vectorized stores go before the last scalar
8966 stmt of the group, which is where all uses are ready. */
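/* A concrete illustration (hypothetical group): the vector store for
   a group of scalar stores a[0] ... a[3] is inserted right before the
   a[3] store, whereas the vector load for b[0] ... b[3] is inserted
   right before the b[0] load.  */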
8967 stmt_vec_info last_stmt_info = NULL;
8968 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8969 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8970 else /* DR_IS_WRITE */
8971 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8972 si = gsi_for_stmt (last_stmt_info->stmt);
8974 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8975 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8976 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8977 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8979 /* For PHI node vectorization we do not use the insertion iterator. */
8980 si = gsi_none ();
8982 else
8984 /* Emit other stmts after the children's vectorized defs, which is
8985 the earliest possible place. */
8986 gimple *last_stmt = NULL;
8987 bool seen_vector_def = false;
8988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8989 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8991 /* For fold-left reductions we are retaining the scalar
8992 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
8993 set so the representation isn't perfect. Resort to the
8994 last scalar def here. */
8995 if (SLP_TREE_VEC_DEFS (child).is_empty ())
8997 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8998 == cycle_phi_info_type);
8999 gphi *phi = as_a <gphi *>
9000 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9001 if (!last_stmt
9002 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9003 last_stmt = phi;
9005 /* We emit all vectorized stmts at the same place, so the last
9006 def in SLP_TREE_VEC_DEFS is also the last one emitted.
9007 ??? Unless we have a load permutation applied and that happens
9008 to re-use an earlier generated load. */
9009 unsigned j;
9010 tree vdef;
9011 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9013 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9014 if (!last_stmt
9015 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9016 last_stmt = vstmt;
9019 else if (!SLP_TREE_VECTYPE (child))
9021 /* For externals used unvectorized we look at all their scalar defs. */
9022 unsigned j;
9023 tree def;
9024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9025 if (TREE_CODE (def) == SSA_NAME
9026 && !SSA_NAME_IS_DEFAULT_DEF (def))
9028 gimple *stmt = SSA_NAME_DEF_STMT (def);
9029 if (!last_stmt
9030 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9031 last_stmt = stmt;
9034 else
9036 /* For externals we have to look at all defs since their
9037 insertion place is decided per vector. But beware
9038 of pre-existing vectors where we need to make sure
9039 we do not insert before the region boundary. */
9040 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9041 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9042 seen_vector_def = true;
9043 else
9045 unsigned j;
9046 tree vdef;
9047 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9048 if (TREE_CODE (vdef) == SSA_NAME
9049 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9051 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9052 if (!last_stmt
9053 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9054 last_stmt = vstmt;
9058 /* This can happen when all children are pre-existing vectors or
9059 constants. */
9060 if (!last_stmt)
9061 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9062 if (!last_stmt)
9064 gcc_assert (seen_vector_def);
9065 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9067 else if (is_ctrl_altering_stmt (last_stmt))
9069 /* We split regions to vectorize at control altering stmts
9070 with a definition so this must be an external which
9071 we can insert at the start of the region. */
9072 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9074 else if (is_a <bb_vec_info> (vinfo)
9075 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9076 && gimple_could_trap_p (stmt_info->stmt))
9078 /* We've constrained possibly trapping operations to all come
9079 from the same basic-block; even if vectorized defs would allow
9080 earlier scheduling, still force the vectorized stmts into the
9081 original block. This is only necessary for BB vectorization since
9082 for loop vect all operations are in a single BB and scalar stmt
9083 based placement doesn't play well with epilogue vectorization. */
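/* For example (an assumed scenario): a division, which
   gimple_could_trap_p flags, is still emitted at the start of the
   basic block of its scalar stmt via gsi_after_labels even when its
   vectorized operands are defined in a dominating block.  */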
9084 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9085 gimple_bb (stmt_info->stmt),
9086 gimple_bb (last_stmt)));
9087 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9089 else if (is_a <gphi *> (last_stmt))
9090 si = gsi_after_labels (gimple_bb (last_stmt));
9091 else
9093 si = gsi_for_stmt (last_stmt);
9094 gsi_next (&si);
9098 /* Handle purely internal nodes. */
9099 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9101 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9102 be shared among different SLP nodes (but usually it's the same
9103 operation, apart from the case where the stmt is only there to denote
9104 the actual scalar lane defs ...). So do not call vect_transform_stmt
9105 but open-code it here (partly). */
9106 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9107 gcc_assert (done);
9108 stmt_vec_info slp_stmt_info;
9109 unsigned int i;
9110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9111 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9113 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9114 instance, i, true, NULL);
9115 gcc_assert (done);
9118 else
9119 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9122 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
9123 For loop vectorization this is done in vectorizable_call, but for SLP
9124 it needs to be deferred until end of vect_schedule_slp, because multiple
9125 SLP instances may refer to the same scalar stmt. */
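/* For instance (the names are made up): a fully vectorized scalar call
   x_1 = sqrtf (a_2) is rewritten to x_1 = 0.0f, while a call without a
   lhs is replaced by a GIMPLE nop with its virtual defs unlinked, as
   done below.  */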
9127 static void
9128 vect_remove_slp_scalar_calls (vec_info *vinfo,
9129 slp_tree node, hash_set<slp_tree> &visited)
9131 gimple *new_stmt;
9132 gimple_stmt_iterator gsi;
9133 int i;
9134 slp_tree child;
9135 tree lhs;
9136 stmt_vec_info stmt_info;
9138 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9139 return;
9141 if (visited.add (node))
9142 return;
9144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9145 vect_remove_slp_scalar_calls (vinfo, child, visited);
9147 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9149 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9150 if (!stmt || gimple_bb (stmt) == NULL)
9151 continue;
9152 if (is_pattern_stmt_p (stmt_info)
9153 || !PURE_SLP_STMT (stmt_info))
9154 continue;
9155 lhs = gimple_call_lhs (stmt);
9156 if (lhs)
9157 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9158 else
9160 new_stmt = gimple_build_nop ();
9161 unlink_stmt_vdef (stmt_info->stmt);
9163 gsi = gsi_for_stmt (stmt);
9164 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9165 if (lhs)
9166 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9170 static void
9171 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9173 hash_set<slp_tree> visited;
9174 vect_remove_slp_scalar_calls (vinfo, node, visited);
9177 /* Vectorize the instance root. */
9179 void
9180 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9182 gassign *rstmt = NULL;
9184 if (instance->kind == slp_inst_kind_ctor)
9186 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9188 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9189 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9190 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9191 TREE_TYPE (vect_lhs)))
9192 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9193 vect_lhs);
9194 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9196 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9198 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9199 tree child_def;
9200 int j;
9201 vec<constructor_elt, va_gc> *v;
9202 vec_alloc (v, nelts);
9204 /* A CTOR can handle V16HI composition from VNx8HI so we
9205 do not need to convert vector elements if the types
9206 do not match. */
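/* E.g. (illustrative only): with two V4SI vector defs v1 and v2 the
   root statement is rewritten as lhs = { v1, v2 }, a CONSTRUCTOR of
   the root rhs type built from the per-vector defs.  */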
9207 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9208 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9209 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9210 tree rtype
9211 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9212 tree r_constructor = build_constructor (rtype, v);
9213 rstmt = gimple_build_assign (lhs, r_constructor);
9216 else if (instance->kind == slp_inst_kind_bb_reduc)
9218 /* Largely inspired by reduction chain epilogue handling in
9219 vect_create_epilog_for_reduction. */
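/* A sketch of the generated epilogue under assumed inputs (two V4SI
   defs, PLUS_EXPR, undefined signed overflow): both defs are
   VIEW_CONVERTed to the unsigned vector type, summed into a single
   vector, reduced with .REDUC_PLUS, any SLP_INSTANCE_REMAIN_DEFS are
   added on, and the scalar result is converted back to the signed
   element type before replacing the root statement's rhs.  */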
9220 vec<tree> vec_defs = vNULL;
9221 vect_get_slp_defs (node, &vec_defs);
9222 enum tree_code reduc_code
9223 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9224 /* ??? We actually have to reflect signs somewhere. */
9225 if (reduc_code == MINUS_EXPR)
9226 reduc_code = PLUS_EXPR;
9227 gimple_seq epilogue = NULL;
9228 /* We may end up with more than one vector result; reduce them
9229 to a single vector. */
9230 tree vec_def = vec_defs[0];
9231 tree vectype = TREE_TYPE (vec_def);
9232 tree compute_vectype = vectype;
9233 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9234 && TYPE_OVERFLOW_UNDEFINED (vectype)
9235 && operation_can_overflow (reduc_code));
9236 if (pun_for_overflow_p)
9238 compute_vectype = unsigned_type_for (vectype);
9239 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9240 compute_vectype, vec_def);
9242 for (unsigned i = 1; i < vec_defs.length (); ++i)
9244 tree def = vec_defs[i];
9245 if (pun_for_overflow_p)
9246 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9247 compute_vectype, def);
9248 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9249 vec_def, def);
9251 vec_defs.release ();
9252 /* ??? Support schemes other than a direct internal fn. */
9253 internal_fn reduc_fn;
9254 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9255 || reduc_fn == IFN_LAST)
9256 gcc_unreachable ();
9257 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9258 TREE_TYPE (compute_vectype), vec_def);
9259 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9261 tree rem_def = NULL_TREE;
9262 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9264 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9265 if (!rem_def)
9266 rem_def = def;
9267 else
9268 rem_def = gimple_build (&epilogue, reduc_code,
9269 TREE_TYPE (scalar_def),
9270 rem_def, def);
9272 scalar_def = gimple_build (&epilogue, reduc_code,
9273 TREE_TYPE (scalar_def),
9274 scalar_def, rem_def);
9276 scalar_def = gimple_convert (&epilogue,
9277 TREE_TYPE (vectype), scalar_def);
9278 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9279 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9280 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9281 update_stmt (gsi_stmt (rgsi));
9282 return;
9284 else
9285 gcc_unreachable ();
9287 gcc_assert (rstmt);
9289 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9290 gsi_replace (&rgsi, rstmt, true);
9293 struct slp_scc_info
9295 bool on_stack;
9296 int dfs;
9297 int lowlink;
9300 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9302 static void
9303 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9304 hash_map<slp_tree, slp_scc_info> &scc_info,
9305 int &maxdfs, vec<slp_tree> &stack)
9307 bool existed_p;
9308 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9309 gcc_assert (!existed_p);
9310 info->dfs = maxdfs;
9311 info->lowlink = maxdfs;
9312 maxdfs++;
9314 /* Leaf. */
9315 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9317 info->on_stack = false;
9318 vect_schedule_slp_node (vinfo, node, instance);
9319 return;
9322 info->on_stack = true;
9323 stack.safe_push (node);
9325 unsigned i;
9326 slp_tree child;
9327 /* DFS recurse. */
9328 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9330 if (!child)
9331 continue;
9332 slp_scc_info *child_info = scc_info.get (child);
9333 if (!child_info)
9335 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9336 /* Recursion might have re-allocated the node. */
9337 info = scc_info.get (node);
9338 child_info = scc_info.get (child);
9339 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9341 else if (child_info->on_stack)
9342 info->lowlink = MIN (info->lowlink, child_info->dfs);
9344 if (info->lowlink != info->dfs)
9345 return;
9347 auto_vec<slp_tree, 4> phis_to_fixup;
9349 /* Singleton. */
9350 if (stack.last () == node)
9352 stack.pop ();
9353 info->on_stack = false;
9354 vect_schedule_slp_node (vinfo, node, instance);
9355 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9356 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9357 phis_to_fixup.quick_push (node);
9359 else
9361 /* SCC. */
9362 int last_idx = stack.length () - 1;
9363 while (stack[last_idx] != node)
9364 last_idx--;
9365 /* We can break the cycle at PHIs which have at least one child
9366 code generated. Then we could re-start the DFS walk until
9367 all nodes in the SCC are covered (we might have new entries
9368 for only back-reachable nodes). But it's simpler to just
9369 iterate and schedule those that are ready. */
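/* E.g. (an assumed reduction cycle of a PHI and an addition): the PHI
   is ready because its preheader child is already code generated, so
   it is scheduled first and taken off the stack; the addition then
   becomes ready on the next iteration, and the still-missing backedge
   PHI arguments are filled in by the fixup loop below.  */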
9370 unsigned todo = stack.length () - last_idx;
9373 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9375 slp_tree entry = stack[idx];
9376 if (!entry)
9377 continue;
9378 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9379 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9380 bool ready = !phi;
9381 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9382 if (!child)
9384 gcc_assert (phi);
9385 ready = true;
9386 break;
9388 else if (scc_info.get (child)->on_stack)
9390 if (!phi)
9392 ready = false;
9393 break;
9396 else
9398 if (phi)
9400 ready = true;
9401 break;
9404 if (ready)
9406 vect_schedule_slp_node (vinfo, entry, instance);
9407 scc_info.get (entry)->on_stack = false;
9408 stack[idx] = NULL;
9409 todo--;
9410 if (phi)
9411 phis_to_fixup.safe_push (entry);
9415 while (todo != 0);
9417 /* Pop the SCC. */
9418 stack.truncate (last_idx);
9421 /* Now fix up the backedge defs of the vectorized PHIs in this SCC. */
9422 slp_tree phi_node;
9423 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9425 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9426 edge_iterator ei;
9427 edge e;
9428 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9430 unsigned dest_idx = e->dest_idx;
9431 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9432 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9433 continue;
9434 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9435 /* Simply fill all args. */
9436 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9437 != vect_first_order_recurrence)
9438 for (unsigned i = 0; i < n; ++i)
9440 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9441 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9442 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9443 e, gimple_phi_arg_location (phi, dest_idx));
9445 else
9447 /* Unless it is a first order recurrence which needs
9448 args filled in for both the PHI node and the permutes. */
9449 gimple *perm
9450 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9451 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9452 add_phi_arg (as_a <gphi *> (rphi),
9453 vect_get_slp_vect_def (child, n - 1),
9454 e, gimple_phi_arg_location (phi, dest_idx));
9455 for (unsigned i = 0; i < n; ++i)
9457 gimple *perm
9458 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9459 if (i > 0)
9460 gimple_assign_set_rhs1 (perm,
9461 vect_get_slp_vect_def (child, i - 1));
9462 gimple_assign_set_rhs2 (perm,
9463 vect_get_slp_vect_def (child, i));
9464 update_stmt (perm);
9471 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9473 void
9474 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9476 slp_instance instance;
9477 unsigned int i;
9479 hash_map<slp_tree, slp_scc_info> scc_info;
9480 int maxdfs = 0;
9481 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9483 slp_tree node = SLP_INSTANCE_TREE (instance);
9484 if (dump_enabled_p ())
9486 dump_printf_loc (MSG_NOTE, vect_location,
9487 "Vectorizing SLP tree:\n");
9488 /* ??? Dump all? */
9489 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9490 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9491 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9492 vect_print_slp_graph (MSG_NOTE, vect_location,
9493 SLP_INSTANCE_TREE (instance));
9495 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9496 have a PHI be the node breaking the cycle. */
9497 auto_vec<slp_tree> stack;
9498 if (!scc_info.get (node))
9499 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9501 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9502 vectorize_slp_instance_root_stmt (node, instance);
9504 if (dump_enabled_p ())
9505 dump_printf_loc (MSG_NOTE, vect_location,
9506 "vectorizing stmts using SLP.\n");
9509 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9511 slp_tree root = SLP_INSTANCE_TREE (instance);
9512 stmt_vec_info store_info;
9513 unsigned int j;
9515 /* Remove scalar call stmts. Do not do this for basic-block
9516 vectorization as not all uses may be vectorized.
9517 ??? Why should this be necessary? DCE should be able to
9518 remove the stmts itself.
9519 ??? For BB vectorization we can as well remove scalar
9520 stmts starting from the SLP tree root if they have no
9521 uses. */
9522 if (is_a <loop_vec_info> (vinfo))
9523 vect_remove_slp_scalar_calls (vinfo, root);
9525 /* Remove the original scalar stmts of vectorized stores. */
9526 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9528 if (!STMT_VINFO_DATA_REF (store_info)
9529 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9530 break;
9532 store_info = vect_orig_stmt (store_info);
9533 /* Free the attached stmt_vec_info and remove the stmt. */
9534 vinfo->remove_stmt (store_info);
9536 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9537 so we do not crash in vect_free_slp_tree later. */
9538 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9539 SLP_TREE_REPRESENTATIVE (root) = NULL;