[official-gcc.git] / gcc / tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_STMTS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_STMTS (this).release ();
145 SLP_TREE_VEC_DEFS (this).release ();
146 SLP_TREE_LOAD_PERMUTATION (this).release ();
147 SLP_TREE_LANE_PERMUTATION (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
154 void
155 vect_free_slp_tree (slp_tree node)
157 int i;
158 slp_tree child;
160 if (--SLP_TREE_REF_COUNT (node) != 0)
161 return;
163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
164 if (child)
165 vect_free_slp_tree (child);
167 /* If the node defines any SLP only patterns then those patterns are no
168 longer valid and should be removed. */
169 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
170 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
172 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
173 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
174 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
177 delete node;
180 /* Return a location suitable for dumps related to the SLP instance. */
182 dump_user_location_t
183 _slp_instance::location () const
185 if (!root_stmts.is_empty ())
186 return root_stmts[0]->stmt;
187 else
188 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
192 /* Free the memory allocated for the SLP instance. */
194 void
195 vect_free_slp_instance (slp_instance instance)
197 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
198 SLP_INSTANCE_LOADS (instance).release ();
199 SLP_INSTANCE_ROOT_STMTS (instance).release ();
200 instance->subgraph_entries.release ();
201 instance->cost_vec.release ();
202 free (instance);
206 /* Create an SLP node with NOPS children and operation code CODE. */
208 slp_tree
209 vect_create_new_slp_node (unsigned nops, tree_code code)
211 slp_tree node = new _slp_tree;
212 SLP_TREE_SCALAR_STMTS (node) = vNULL;
213 SLP_TREE_CHILDREN (node).create (nops);
214 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
215 SLP_TREE_CODE (node) = code;
216 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (slp_tree node,
222 vec<stmt_vec_info> scalar_stmts, unsigned nops)
224 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
225 SLP_TREE_CHILDREN (node).create (nops);
226 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
227 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
228 SLP_TREE_LANES (node) = scalar_stmts.length ();
229 return node;
232 /* Create an SLP node for SCALAR_STMTS. */
234 static slp_tree
235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
240 /* Create an SLP node for OPS. */
242 static slp_tree
243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
245 SLP_TREE_SCALAR_OPS (node) = ops;
246 SLP_TREE_DEF_TYPE (node) = vect_external_def;
247 SLP_TREE_LANES (node) = ops.length ();
248 return node;
251 /* Create an SLP node for OPS. */
253 static slp_tree
254 vect_create_new_slp_node (vec<tree> ops)
256 return vect_create_new_slp_node (new _slp_tree, ops);
260 /* This structure is used in creation of an SLP tree. Each instance
261 corresponds to the same operand in a group of scalar stmts in an SLP
262 node. */
263 typedef struct _slp_oprnd_info
265 /* Def-stmts for the operands. */
266 vec<stmt_vec_info> def_stmts;
267 /* Operands. */
268 vec<tree> ops;
269 /* Information about the first statement, its vector def-type, type, the
270 operand itself in case it's constant, and an indication if it's a pattern
271 stmt. */
272 tree first_op_type;
273 enum vect_def_type first_dt;
274 bool any_pattern;
275 } *slp_oprnd_info;
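/* Illustrative note (not part of the original sources): for a two-lane SLP
   group { a0 = b0 + c0;  a1 = b1 + c1; } discovery builds one slp_oprnd_info
   per operand position, each collecting that operand across all lanes:
     oprnds_info[0]: ops = { b0, b1 }, def_stmts = defs of b0 and b1
     oprnds_info[1]: ops = { c0, c1 }, def_stmts = defs of c0 and c1
   first_dt / first_op_type record the def type and type seen in lane 0 and
   the remaining lanes are checked against them.  */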
278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
279 operand. */
280 static vec<slp_oprnd_info>
281 vect_create_oprnd_info (int nops, int group_size)
283 int i;
284 slp_oprnd_info oprnd_info;
285 vec<slp_oprnd_info> oprnds_info;
287 oprnds_info.create (nops);
288 for (i = 0; i < nops; i++)
290 oprnd_info = XNEW (struct _slp_oprnd_info);
291 oprnd_info->def_stmts.create (group_size);
292 oprnd_info->ops.create (group_size);
293 oprnd_info->first_dt = vect_uninitialized_def;
294 oprnd_info->first_op_type = NULL_TREE;
295 oprnd_info->any_pattern = false;
296 oprnds_info.quick_push (oprnd_info);
299 return oprnds_info;
303 /* Free operands info. */
305 static void
306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
308 int i;
309 slp_oprnd_info oprnd_info;
311 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
313 oprnd_info->def_stmts.release ();
314 oprnd_info->ops.release ();
315 XDELETE (oprnd_info);
318 oprnds_info.release ();
321 /* Return the execution frequency of NODE (so that a higher value indicates
322 a "more important" node when optimizing for speed). */
324 static sreal
325 vect_slp_node_weight (slp_tree node)
327 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
328 basic_block bb = gimple_bb (stmt_info->stmt);
329 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
332 /* Return true if STMTS contains a pattern statement. */
334 static bool
335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
337 stmt_vec_info stmt_info;
338 unsigned int i;
339 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
340 if (is_pattern_stmt_p (stmt_info))
341 return true;
342 return false;
345 /* Return true when all lanes in the external or constant NODE have
346 the same value. */
348 static bool
349 vect_slp_tree_uniform_p (slp_tree node)
351 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
352 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
354 /* Pre-existing vectors. */
355 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
356 return false;
358 unsigned i;
359 tree op, first = NULL_TREE;
360 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
361 if (!first)
362 first = op;
363 else if (!operand_equal_p (first, op, 0))
364 return false;
366 return true;
369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
370 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
371 of the chain. */
374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
375 stmt_vec_info first_stmt_info)
377 stmt_vec_info next_stmt_info = first_stmt_info;
378 int result = 0;
380 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
381 return -1;
385 if (next_stmt_info == stmt_info)
386 return result;
387 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
388 if (next_stmt_info)
389 result += DR_GROUP_GAP (next_stmt_info);
391 while (next_stmt_info);
393 return -1;
396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
397 using the method implemented by duplicate_and_interleave. Return true
398 if so, returning the number of intermediate vectors in *NVECTORS_OUT
399 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
400 (if nonnull). */
402 bool
403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
404 tree elt_type, unsigned int *nvectors_out,
405 tree *vector_type_out,
406 tree *permutes)
408 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
409 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
410 return false;
412 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
413 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
414 unsigned int nvectors = 1;
415 for (;;)
417 scalar_int_mode int_mode;
418 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
419 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
421 /* Get the natural vector type for this SLP group size. */
422 tree int_type = build_nonstandard_integer_type
423 (GET_MODE_BITSIZE (int_mode), 1);
424 tree vector_type
425 = get_vectype_for_scalar_type (vinfo, int_type, count);
426 poly_int64 half_nelts;
427 if (vector_type
428 && VECTOR_MODE_P (TYPE_MODE (vector_type))
429 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
430 GET_MODE_SIZE (base_vector_mode))
431 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
432 2, &half_nelts))
434 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
435 together into elements of type INT_TYPE and using the result
436 to build NVECTORS vectors. */
437 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
438 vec_perm_builder sel1 (nelts, 2, 3);
439 vec_perm_builder sel2 (nelts, 2, 3);
441 for (unsigned int i = 0; i < 3; ++i)
443 sel1.quick_push (i);
444 sel1.quick_push (i + nelts);
445 sel2.quick_push (half_nelts + i);
446 sel2.quick_push (half_nelts + i + nelts);
448 vec_perm_indices indices1 (sel1, 2, nelts);
449 vec_perm_indices indices2 (sel2, 2, nelts);
450 machine_mode vmode = TYPE_MODE (vector_type);
451 if (can_vec_perm_const_p (vmode, vmode, indices1)
452 && can_vec_perm_const_p (vmode, vmode, indices2))
454 if (nvectors_out)
455 *nvectors_out = nvectors;
456 if (vector_type_out)
457 *vector_type_out = vector_type;
458 if (permutes)
460 permutes[0] = vect_gen_perm_mask_checked (vector_type,
461 indices1);
462 permutes[1] = vect_gen_perm_mask_checked (vector_type,
463 indices2);
465 return true;
469 if (!multiple_p (elt_bytes, 2, &elt_bytes))
470 return false;
471 nvectors *= 2;
475 /* Return true if DTA and DTB match. */
477 static bool
478 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
480 return (dta == dtb
481 || ((dta == vect_external_def || dta == vect_constant_def)
482 && (dtb == vect_external_def || dtb == vect_constant_def)));
485 static const int cond_expr_maps[3][5] = {
486 { 4, -1, -2, 1, 2 },
487 { 4, -2, -1, 1, 2 },
488 { 4, -1, -2, 2, 1 }
490 static const int arg1_map[] = { 1, 1 };
491 static const int arg2_map[] = { 1, 2 };
492 static const int arg1_arg4_map[] = { 2, 1, 4 };
493 static const int op1_op0_map[] = { 2, 1, 0 };
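/* Illustrative note (not part of the original sources): each map above is
   laid out as { number-of-child-nodes, arg-index, arg-index, ... } as the
   comment below for vect_get_operand_map describes.  For example,
   arg1_arg4_map = { 2, 1, 4 } says a masked gather load gets two SLP
   children, built from call arguments 1 and 4, while
   cond_expr_maps[0] = { 4, -1, -2, 1, 2 } says a COND_EXPR with an embedded
   comparison gets four children: the two comparison operands (special
   indices -1/-2) followed by statement arguments 1 and 2.  */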
495 /* For most SLP statements, there is a one-to-one mapping between
496 gimple arguments and child nodes. If that is not true for STMT,
497 return an array that contains:
499 - the number of child nodes, followed by
500 - for each child node, the index of the argument associated with that node.
501 The special index -1 is the first operand of an embedded comparison and
502 the special index -2 is the second operand of an embedded comparison.
504 SWAP is as for vect_get_and_check_slp_defs. */
506 static const int *
507 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
509 if (auto assign = dyn_cast<const gassign *> (stmt))
511 if (gimple_assign_rhs_code (assign) == COND_EXPR
512 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
513 return cond_expr_maps[swap];
514 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
515 && swap)
516 return op1_op0_map;
518 gcc_assert (!swap);
519 if (auto call = dyn_cast<const gcall *> (stmt))
521 if (gimple_call_internal_p (call))
522 switch (gimple_call_internal_fn (call))
524 case IFN_MASK_LOAD:
525 return arg2_map;
527 case IFN_GATHER_LOAD:
528 return arg1_map;
530 case IFN_MASK_GATHER_LOAD:
531 return arg1_arg4_map;
533 default:
534 break;
537 return nullptr;
540 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
541 they are of a valid type and that they match the defs of the first stmt of
542 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
543 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
544 indicates swap is required for cond_expr stmts. Specifically, SWAP
545 is 1 if STMT is cond and operands of comparison need to be swapped;
546 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
548 If there was a fatal error return -1; if the error could be corrected by
549 swapping operands of the parent of this node, return 1; if everything is
550 ok return 0. */
551 static int
552 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
553 bool *skip_args,
554 vec<stmt_vec_info> stmts, unsigned stmt_num,
555 vec<slp_oprnd_info> *oprnds_info)
557 stmt_vec_info stmt_info = stmts[stmt_num];
558 tree oprnd;
559 unsigned int i, number_of_oprnds;
560 enum vect_def_type dt = vect_uninitialized_def;
561 slp_oprnd_info oprnd_info;
562 unsigned int commutative_op = -1U;
563 bool first = stmt_num == 0;
565 if (!is_a<gcall *> (stmt_info->stmt)
566 && !is_a<gassign *> (stmt_info->stmt)
567 && !is_a<gphi *> (stmt_info->stmt))
568 return -1;
570 number_of_oprnds = gimple_num_args (stmt_info->stmt);
571 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
572 if (map)
573 number_of_oprnds = *map++;
574 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
576 if (gimple_call_internal_p (stmt))
578 internal_fn ifn = gimple_call_internal_fn (stmt);
579 commutative_op = first_commutative_argument (ifn);
582 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
584 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
585 commutative_op = 0;
588 bool swapped = (swap != 0);
589 bool backedge = false;
590 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
591 for (i = 0; i < number_of_oprnds; i++)
593 int opno = map ? map[i] : int (i);
594 if (opno < 0)
595 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
596 else
598 oprnd = gimple_arg (stmt_info->stmt, opno);
599 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
600 backedge = dominated_by_p (CDI_DOMINATORS,
601 gimple_phi_arg_edge (stmt, opno)->src,
602 gimple_bb (stmt_info->stmt));
604 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
605 oprnd = TREE_OPERAND (oprnd, 0);
607 oprnd_info = (*oprnds_info)[i];
609 stmt_vec_info def_stmt_info;
610 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
614 "Build SLP failed: can't analyze def for %T\n",
615 oprnd);
617 return -1;
620 if (skip_args[i])
622 oprnd_info->def_stmts.quick_push (NULL);
623 oprnd_info->ops.quick_push (NULL_TREE);
624 oprnd_info->first_dt = vect_uninitialized_def;
625 continue;
628 oprnd_info->def_stmts.quick_push (def_stmt_info);
629 oprnd_info->ops.quick_push (oprnd);
631 if (def_stmt_info
632 && is_pattern_stmt_p (def_stmt_info))
634 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
635 != def_stmt_info)
636 oprnd_info->any_pattern = true;
637 else
638 /* If we promote this to external use the original stmt def. */
639 oprnd_info->ops.last ()
640 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
643 /* If there's an extern def on a backedge make sure we can
644 code-generate at the region start.
645 ??? This is another case that could be fixed by adjusting
646 how we split the function but at the moment we'd have conflicting
647 goals there. */
648 if (backedge
649 && dts[i] == vect_external_def
650 && is_a <bb_vec_info> (vinfo)
651 && TREE_CODE (oprnd) == SSA_NAME
652 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
653 && !dominated_by_p (CDI_DOMINATORS,
654 as_a <bb_vec_info> (vinfo)->bbs[0],
655 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
657 if (dump_enabled_p ())
658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
659 "Build SLP failed: extern def %T only defined "
660 "on backedge\n", oprnd);
661 return -1;
664 if (first)
666 tree type = TREE_TYPE (oprnd);
667 dt = dts[i];
668 if ((dt == vect_constant_def
669 || dt == vect_external_def)
670 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
671 && (TREE_CODE (type) == BOOLEAN_TYPE
672 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
673 type)))
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Build SLP failed: invalid type of def "
678 "for variable-length SLP %T\n", oprnd);
679 return -1;
682 /* For the swapping logic below force vect_reduction_def
683 for the reduction op in a SLP reduction group. */
684 if (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
687 && def_stmt_info)
688 dts[i] = dt = vect_reduction_def;
690 /* Check the types of the definition. */
691 switch (dt)
693 case vect_external_def:
694 case vect_constant_def:
695 case vect_internal_def:
696 case vect_reduction_def:
697 case vect_induction_def:
698 case vect_nested_cycle:
699 case vect_first_order_recurrence:
700 break;
702 default:
703 /* FORNOW: Not supported. */
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
706 "Build SLP failed: illegal type of def %T\n",
707 oprnd);
708 return -1;
711 oprnd_info->first_dt = dt;
712 oprnd_info->first_op_type = type;
715 if (first)
716 return 0;
718 /* Now match the operand definition types to that of the first stmt. */
719 for (i = 0; i < number_of_oprnds;)
721 if (skip_args[i])
723 ++i;
724 continue;
727 oprnd_info = (*oprnds_info)[i];
728 dt = dts[i];
729 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
730 oprnd = oprnd_info->ops[stmt_num];
731 tree type = TREE_TYPE (oprnd);
733 if (!types_compatible_p (oprnd_info->first_op_type, type))
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "Build SLP failed: different operand types\n");
738 return 1;
741 /* Not first stmt of the group, check that the def-stmt/s match
742 the def-stmt/s of the first stmt. Allow different definition
743 types for reduction chains: the first stmt must be a
744 vect_reduction_def (a phi node), and the rest
745 end in the reduction chain. */
746 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
747 && !(oprnd_info->first_dt == vect_reduction_def
748 && !STMT_VINFO_DATA_REF (stmt_info)
749 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
750 && def_stmt_info
751 && !STMT_VINFO_DATA_REF (def_stmt_info)
752 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
753 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
754 || (!STMT_VINFO_DATA_REF (stmt_info)
755 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
756 && ((!def_stmt_info
757 || STMT_VINFO_DATA_REF (def_stmt_info)
758 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
759 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
760 != (oprnd_info->first_dt != vect_reduction_def))))
762 /* Try swapping operands if we got a mismatch. For BB
763 vectorization only in case it will clearly improve things. */
764 if (i == commutative_op && !swapped
765 && (!is_a <bb_vec_info> (vinfo)
766 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
767 dts[i+1])
768 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
769 || vect_def_types_match
770 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
772 if (dump_enabled_p ())
773 dump_printf_loc (MSG_NOTE, vect_location,
774 "trying swapped operands\n");
775 std::swap (dts[i], dts[i+1]);
776 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
777 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
778 std::swap ((*oprnds_info)[i]->ops[stmt_num],
779 (*oprnds_info)[i+1]->ops[stmt_num]);
780 swapped = true;
781 continue;
784 if (is_a <bb_vec_info> (vinfo)
785 && !oprnd_info->any_pattern)
787 /* Now for commutative ops we should see whether we can
788 make the other operand match. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "treating operand as external\n");
792 oprnd_info->first_dt = dt = vect_external_def;
794 else
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 "Build SLP failed: different types\n");
799 return 1;
803 /* Make sure to demote the overall operand to external. */
804 if (dt == vect_external_def)
805 oprnd_info->first_dt = vect_external_def;
806 /* For a SLP reduction chain we want to duplicate the reduction to
807 each of the chain members. That gets us a sane SLP graph (still
808 the stmts are not 100% correct wrt the initial values). */
809 else if ((dt == vect_internal_def
810 || dt == vect_reduction_def)
811 && oprnd_info->first_dt == vect_reduction_def
812 && !STMT_VINFO_DATA_REF (stmt_info)
813 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
814 && !STMT_VINFO_DATA_REF (def_stmt_info)
815 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
816 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
818 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
819 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
822 ++i;
825 /* Swap operands. */
826 if (swapped)
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE, vect_location,
830 "swapped operands to match def types in %G",
831 stmt_info->stmt);
834 return 0;
837 /* Return true if call statements CALL1 and CALL2 are similar enough
838 to be combined into the same SLP group. */
840 bool
841 compatible_calls_p (gcall *call1, gcall *call2)
843 unsigned int nargs = gimple_call_num_args (call1);
844 if (nargs != gimple_call_num_args (call2))
845 return false;
847 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
848 return false;
850 if (gimple_call_internal_p (call1))
852 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
853 TREE_TYPE (gimple_call_lhs (call2))))
854 return false;
855 for (unsigned int i = 0; i < nargs; ++i)
856 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
857 TREE_TYPE (gimple_call_arg (call2, i))))
858 return false;
860 else
862 if (!operand_equal_p (gimple_call_fn (call1),
863 gimple_call_fn (call2), 0))
864 return false;
866 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
867 return false;
870 /* Check that any unvectorized arguments are equal. */
871 if (const int *map = vect_get_operand_map (call1))
873 unsigned int nkept = *map++;
874 unsigned int mapi = 0;
875 for (unsigned int i = 0; i < nargs; ++i)
876 if (mapi < nkept && map[mapi] == int (i))
877 mapi += 1;
878 else if (!operand_equal_p (gimple_call_arg (call1, i),
879 gimple_call_arg (call2, i)))
880 return false;
883 return true;
886 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
887 caller's attempt to find the vector type in STMT_INFO with the narrowest
888 element type. Return true if VECTYPE is nonnull and if it is valid
889 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
890 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
891 vect_build_slp_tree. */
893 static bool
894 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
895 unsigned int group_size,
896 tree vectype, poly_uint64 *max_nunits)
898 if (!vectype)
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unsupported data-type in %G\n",
903 stmt_info->stmt);
904 /* Fatal mismatch. */
905 return false;
908 /* If populating the vector type requires unrolling then fail
909 before adjusting *max_nunits for basic-block vectorization. */
910 if (is_a <bb_vec_info> (vinfo)
911 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: unrolling required "
916 "in basic block SLP\n");
917 /* Fatal mismatch. */
918 return false;
921 /* In case of multiple types we need to detect the smallest type. */
922 vect_update_max_nunits (max_nunits, vectype);
923 return true;
926 /* Verify that the scalar stmts STMTS are isomorphic, do not require a
927 data permutation and are of supported types of operation. Return
928 true if so, otherwise return false and indicate in *MATCHES
929 which stmts are not isomorphic to the first one. If MATCHES[0]
930 is false then this indicates the comparison could not be
931 carried out or the stmts will never be vectorized by SLP.
933 Note COND_EXPR is possibly isomorphic to another one after swapping its
934 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
935 the first stmt by swapping the two operands of comparison; set SWAP[i]
936 to 2 if stmt I is isomorphic to the first stmt by inverting the code
937 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
938 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
940 static bool
941 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
942 vec<stmt_vec_info> stmts, unsigned int group_size,
943 poly_uint64 *max_nunits, bool *matches,
944 bool *two_operators, tree *node_vectype)
946 unsigned int i;
947 stmt_vec_info first_stmt_info = stmts[0];
948 code_helper first_stmt_code = ERROR_MARK;
949 code_helper alt_stmt_code = ERROR_MARK;
950 code_helper rhs_code = ERROR_MARK;
951 code_helper first_cond_code = ERROR_MARK;
952 tree lhs;
953 bool need_same_oprnds = false;
954 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
955 stmt_vec_info first_load = NULL, prev_first_load = NULL;
956 bool first_stmt_load_p = false, load_p = false;
957 bool first_stmt_phi_p = false, phi_p = false;
958 bool maybe_soft_fail = false;
959 tree soft_fail_nunits_vectype = NULL_TREE;
961 /* For every stmt in NODE find its def stmt/s. */
962 stmt_vec_info stmt_info;
963 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
965 gimple *stmt = stmt_info->stmt;
966 swap[i] = 0;
967 matches[i] = false;
969 if (dump_enabled_p ())
970 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
972 /* Fail to vectorize statements marked as unvectorizable, throw
973 or are volatile. */
974 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
975 || stmt_can_throw_internal (cfun, stmt)
976 || gimple_has_volatile_ops (stmt))
978 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "Build SLP failed: unvectorizable statement %G",
981 stmt);
982 /* ??? For BB vectorization we want to commutate operands in a way
983 to shuffle all unvectorizable defs into one operand and have
984 the other still vectorized. The following doesn't reliably
985 work for this, but it's the easiest we can do here. */
986 if (is_a <bb_vec_info> (vinfo) && i != 0)
987 continue;
988 /* Fatal mismatch. */
989 matches[0] = false;
990 return false;
993 lhs = gimple_get_lhs (stmt);
994 if (lhs == NULL_TREE)
996 if (dump_enabled_p ())
997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 "Build SLP failed: not GIMPLE_ASSIGN nor "
999 "GIMPLE_CALL %G", stmt);
1000 if (is_a <bb_vec_info> (vinfo) && i != 0)
1001 continue;
1002 /* Fatal mismatch. */
1003 matches[0] = false;
1004 return false;
1007 tree nunits_vectype;
1008 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1009 &nunits_vectype, group_size))
1011 if (is_a <bb_vec_info> (vinfo) && i != 0)
1012 continue;
1013 /* Fatal mismatch. */
1014 matches[0] = false;
1015 return false;
1017 /* Record nunits required but continue analysis, producing matches[]
1018 as if nunits was not an issue. This allows splitting of groups
1019 to happen. */
1020 if (nunits_vectype
1021 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1022 nunits_vectype, max_nunits))
1024 gcc_assert (is_a <bb_vec_info> (vinfo));
1025 maybe_soft_fail = true;
1026 soft_fail_nunits_vectype = nunits_vectype;
1029 gcc_assert (vectype);
1031 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1032 if (call_stmt)
1034 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1035 if (cfn != CFN_LAST)
1036 rhs_code = cfn;
1037 else
1038 rhs_code = CALL_EXPR;
1040 if (cfn == CFN_MASK_LOAD
1041 || cfn == CFN_GATHER_LOAD
1042 || cfn == CFN_MASK_GATHER_LOAD)
1043 load_p = true;
1044 else if ((internal_fn_p (cfn)
1045 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1046 || gimple_call_tail_p (call_stmt)
1047 || gimple_call_noreturn_p (call_stmt)
1048 || gimple_call_chain (call_stmt))
1050 if (dump_enabled_p ())
1051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052 "Build SLP failed: unsupported call type %G",
1053 (gimple *) call_stmt);
1054 if (is_a <bb_vec_info> (vinfo) && i != 0)
1055 continue;
1056 /* Fatal mismatch. */
1057 matches[0] = false;
1058 return false;
1061 else if (gimple_code (stmt) == GIMPLE_PHI)
1063 rhs_code = ERROR_MARK;
1064 phi_p = true;
1066 else
1068 rhs_code = gimple_assign_rhs_code (stmt);
1069 load_p = gimple_vuse (stmt);
1072 /* Check the operation. */
1073 if (i == 0)
1075 *node_vectype = vectype;
1076 first_stmt_code = rhs_code;
1077 first_stmt_load_p = load_p;
1078 first_stmt_phi_p = phi_p;
1080 /* Shift arguments should be equal in all the packed stmts for a
1081 vector shift with scalar shift operand. */
1082 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1083 || rhs_code == LROTATE_EXPR
1084 || rhs_code == RROTATE_EXPR)
1086 /* First see if we have a vector/vector shift. */
1087 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1089 /* No vector/vector shift, try for a vector/scalar shift. */
1090 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "Build SLP failed: "
1095 "op not supported by target.\n");
1096 if (is_a <bb_vec_info> (vinfo) && i != 0)
1097 continue;
1098 /* Fatal mismatch. */
1099 matches[0] = false;
1100 return false;
1102 need_same_oprnds = true;
1103 first_op1 = gimple_assign_rhs2 (stmt);
1106 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1108 need_same_oprnds = true;
1109 first_op1 = gimple_assign_rhs2 (stmt);
1111 else if (!load_p
1112 && rhs_code == BIT_FIELD_REF)
1114 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1115 if (!is_a <bb_vec_info> (vinfo)
1116 || TREE_CODE (vec) != SSA_NAME
1117 /* When the element types are not compatible we pun the
1118 source to the target vectype which requires equal size. */
1119 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1120 || !types_compatible_p (TREE_TYPE (vectype),
1121 TREE_TYPE (TREE_TYPE (vec))))
1122 && !operand_equal_p (TYPE_SIZE (vectype),
1123 TYPE_SIZE (TREE_TYPE (vec)))))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Build SLP failed: "
1128 "BIT_FIELD_REF not supported\n");
1129 /* Fatal mismatch. */
1130 matches[0] = false;
1131 return false;
1134 else if (rhs_code == CFN_DIV_POW2)
1136 need_same_oprnds = true;
1137 first_op1 = gimple_call_arg (call_stmt, 1);
1140 else
1142 if (first_stmt_code != rhs_code
1143 && alt_stmt_code == ERROR_MARK)
1144 alt_stmt_code = rhs_code;
1145 if ((first_stmt_code != rhs_code
1146 && (first_stmt_code != IMAGPART_EXPR
1147 || rhs_code != REALPART_EXPR)
1148 && (first_stmt_code != REALPART_EXPR
1149 || rhs_code != IMAGPART_EXPR)
1150 /* Handle mismatches in plus/minus by computing both
1151 and merging the results. */
1152 && !((first_stmt_code == PLUS_EXPR
1153 || first_stmt_code == MINUS_EXPR)
1154 && (alt_stmt_code == PLUS_EXPR
1155 || alt_stmt_code == MINUS_EXPR)
1156 && rhs_code == alt_stmt_code)
1157 && !(first_stmt_code.is_tree_code ()
1158 && rhs_code.is_tree_code ()
1159 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1160 == tcc_comparison)
1161 && (swap_tree_comparison (tree_code (first_stmt_code))
1162 == tree_code (rhs_code)))
1163 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1164 && (first_stmt_code == ARRAY_REF
1165 || first_stmt_code == BIT_FIELD_REF
1166 || first_stmt_code == INDIRECT_REF
1167 || first_stmt_code == COMPONENT_REF
1168 || first_stmt_code == MEM_REF)
1169 && (rhs_code == ARRAY_REF
1170 || rhs_code == BIT_FIELD_REF
1171 || rhs_code == INDIRECT_REF
1172 || rhs_code == COMPONENT_REF
1173 || rhs_code == MEM_REF)))
1174 || first_stmt_load_p != load_p
1175 || first_stmt_phi_p != phi_p)
1177 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: different operation "
1181 "in stmt %G", stmt);
1182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1183 "original stmt %G", first_stmt_info->stmt);
1185 /* Mismatch. */
1186 continue;
1189 if (!load_p
1190 && first_stmt_code == BIT_FIELD_REF
1191 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1192 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "Build SLP failed: different BIT_FIELD_REF "
1197 "arguments in %G", stmt);
1198 /* Mismatch. */
1199 continue;
1202 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1204 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1205 call_stmt))
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "Build SLP failed: different calls in %G",
1210 stmt);
1211 /* Mismatch. */
1212 continue;
1216 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1217 && (gimple_bb (first_stmt_info->stmt)
1218 != gimple_bb (stmt_info->stmt)))
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: different BB for PHI "
1223 "or possibly trapping operation in %G", stmt);
1224 /* Mismatch. */
1225 continue;
1228 if (need_same_oprnds)
1230 tree other_op1 = gimple_arg (stmt, 1);
1231 if (!operand_equal_p (first_op1, other_op1, 0))
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: different shift "
1236 "arguments in %G", stmt);
1237 /* Mismatch. */
1238 continue;
1242 if (!types_compatible_p (vectype, *node_vectype))
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246 "Build SLP failed: different vector type "
1247 "in %G", stmt);
1248 /* Mismatch. */
1249 continue;
1253 /* Grouped store or load. */
1254 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1256 if (REFERENCE_CLASS_P (lhs))
1258 /* Store. */
1261 else
1263 /* Load. */
1264 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1265 if (prev_first_load)
1267 /* Check that there are no loads from different interleaving
1268 chains in the same node. */
1269 if (prev_first_load != first_load)
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1273 vect_location,
1274 "Build SLP failed: different "
1275 "interleaving chains in one node %G",
1276 stmt);
1277 /* Mismatch. */
1278 continue;
1281 else
1282 prev_first_load = first_load;
1284 } /* Grouped access. */
1285 else
1287 if (load_p
1288 && rhs_code != CFN_GATHER_LOAD
1289 && rhs_code != CFN_MASK_GATHER_LOAD)
1291 /* Not grouped load. */
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1294 "Build SLP failed: not grouped load %G", stmt);
1296 /* FORNOW: Non-grouped loads are not supported. */
1297 if (is_a <bb_vec_info> (vinfo) && i != 0)
1298 continue;
1299 /* Fatal mismatch. */
1300 matches[0] = false;
1301 return false;
1304 /* Not memory operation. */
1305 if (!phi_p
1306 && rhs_code.is_tree_code ()
1307 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1308 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1309 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1310 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1311 && rhs_code != VIEW_CONVERT_EXPR
1312 && rhs_code != CALL_EXPR
1313 && rhs_code != BIT_FIELD_REF)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "Build SLP failed: operation unsupported %G",
1318 stmt);
1319 if (is_a <bb_vec_info> (vinfo) && i != 0)
1320 continue;
1321 /* Fatal mismatch. */
1322 matches[0] = false;
1323 return false;
1326 if (rhs_code == COND_EXPR)
1328 tree cond_expr = gimple_assign_rhs1 (stmt);
1329 enum tree_code cond_code = TREE_CODE (cond_expr);
1330 enum tree_code swap_code = ERROR_MARK;
1331 enum tree_code invert_code = ERROR_MARK;
1333 if (i == 0)
1334 first_cond_code = TREE_CODE (cond_expr);
1335 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1337 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1338 swap_code = swap_tree_comparison (cond_code);
1339 invert_code = invert_tree_comparison (cond_code, honor_nans);
1342 if (first_cond_code == cond_code)
1344 /* Isomorphism can be achieved by swapping. */
1345 else if (first_cond_code == swap_code)
1346 swap[i] = 1;
1347 /* Isomorphism can be achieved by inverting. */
1348 else if (first_cond_code == invert_code)
1349 swap[i] = 2;
1350 else
1352 if (dump_enabled_p ())
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "Build SLP failed: different"
1355 " operation %G", stmt);
1356 /* Mismatch. */
1357 continue;
1361 if (rhs_code.is_tree_code ()
1362 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1363 && (swap_tree_comparison ((tree_code)first_stmt_code)
1364 == (tree_code)rhs_code))
1365 swap[i] = 1;
1368 matches[i] = true;
1371 for (i = 0; i < group_size; ++i)
1372 if (!matches[i])
1373 return false;
1375 /* If we allowed a two-operation SLP node verify the target can cope
1376 with the permute we are going to use. */
1377 if (alt_stmt_code != ERROR_MARK
1378 && (!alt_stmt_code.is_tree_code ()
1379 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1380 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1382 *two_operators = true;
1385 if (maybe_soft_fail)
1387 unsigned HOST_WIDE_INT const_nunits;
1388 if (!TYPE_VECTOR_SUBPARTS
1389 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1390 || const_nunits > group_size)
1391 matches[0] = false;
1392 else
1394 /* With constant vector elements simulate a mismatch at the
1395 point we need to split. */
1396 unsigned tail = group_size & (const_nunits - 1);
1397 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1399 return false;
1402 return true;
1405 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1406 Note we never remove entries except at destruction time, so we do
1407 not need a deleted value that differs from empty. */
1408 struct bst_traits
1410 typedef vec <stmt_vec_info> value_type;
1411 typedef vec <stmt_vec_info> compare_type;
1412 static inline hashval_t hash (value_type);
1413 static inline bool equal (value_type existing, value_type candidate);
1414 static inline bool is_empty (value_type x) { return !x.exists (); }
1415 static inline bool is_deleted (value_type x) { return !x.exists (); }
1416 static const bool empty_zero_p = true;
1417 static inline void mark_empty (value_type &x) { x.release (); }
1418 static inline void mark_deleted (value_type &x) { x.release (); }
1419 static inline void remove (value_type &x) { x.release (); }
1421 inline hashval_t
1422 bst_traits::hash (value_type x)
1424 inchash::hash h;
1425 for (unsigned i = 0; i < x.length (); ++i)
1426 h.add_int (gimple_uid (x[i]->stmt));
1427 return h.end ();
1429 inline bool
1430 bst_traits::equal (value_type existing, value_type candidate)
1432 if (existing.length () != candidate.length ())
1433 return false;
1434 for (unsigned i = 0; i < existing.length (); ++i)
1435 if (existing[i] != candidate[i])
1436 return false;
1437 return true;
1440 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1441 but then vec::insert does memmove and that's not compatible with
1442 std::pair. */
1443 struct chain_op_t
1445 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1446 : code (code_), dt (dt_), op (op_) {}
1447 tree_code code;
1448 vect_def_type dt;
1449 tree op;
1452 /* Comparator for sorting associatable chains. */
1454 static int
1455 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1457 auto *op1 = (const chain_op_t *) op1_;
1458 auto *op2 = (const chain_op_t *) op2_;
1459 if (op1->dt != op2->dt)
1460 return (int)op1->dt - (int)op2->dt;
1461 return (int)op1->code - (int)op2->code;
1464 /* Linearize the associatable expression chain at START with the
1465 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1466 filling CHAIN with the result and using WORKLIST as intermediate storage.
1467 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1468 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1469 stmts, starting with START. */
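/* Illustrative note (not part of the original sources): for a lane such as
     x = (a - b) + c
   with CODE == PLUS_EXPR, and assuming the intermediate (a - b) has a
   single use so the walk below recurses into it, the resulting chain
   records (in some order) each leaf operand with its effective sign after
   folding away the intermediate plus/minus statements:
     a with PLUS_EXPR, b with MINUS_EXPR, c with PLUS_EXPR.  */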
1471 static void
1472 vect_slp_linearize_chain (vec_info *vinfo,
1473 vec<std::pair<tree_code, gimple *> > &worklist,
1474 vec<chain_op_t> &chain,
1475 enum tree_code code, gimple *start,
1476 gimple *&code_stmt, gimple *&alt_code_stmt,
1477 vec<gimple *> *chain_stmts)
1479 /* For each lane linearize the addition/subtraction (or other
1480 uniform associatable operation) expression tree. */
1481 worklist.safe_push (std::make_pair (code, start));
1482 while (!worklist.is_empty ())
1484 auto entry = worklist.pop ();
1485 gassign *stmt = as_a <gassign *> (entry.second);
1486 enum tree_code in_code = entry.first;
1487 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1488 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1489 if (!code_stmt
1490 && gimple_assign_rhs_code (stmt) == code)
1491 code_stmt = stmt;
1492 else if (!alt_code_stmt
1493 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1494 alt_code_stmt = stmt;
1495 if (chain_stmts)
1496 chain_stmts->safe_push (stmt);
1497 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1499 tree op = gimple_op (stmt, opnum);
1500 vect_def_type dt;
1501 stmt_vec_info def_stmt_info;
1502 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1503 gcc_assert (res);
1504 if (dt == vect_internal_def
1505 && is_pattern_stmt_p (def_stmt_info))
1506 op = gimple_get_lhs (def_stmt_info->stmt);
1507 gimple *use_stmt;
1508 use_operand_p use_p;
1509 if (dt == vect_internal_def
1510 && single_imm_use (op, &use_p, &use_stmt)
1511 && is_gimple_assign (def_stmt_info->stmt)
1512 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1513 || (code == PLUS_EXPR
1514 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1515 == MINUS_EXPR))))
1517 tree_code op_def_code = this_code;
1518 if (op_def_code == MINUS_EXPR && opnum == 1)
1519 op_def_code = PLUS_EXPR;
1520 if (in_code == MINUS_EXPR)
1521 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1522 worklist.safe_push (std::make_pair (op_def_code,
1523 def_stmt_info->stmt));
1525 else
1527 tree_code op_def_code = this_code;
1528 if (op_def_code == MINUS_EXPR && opnum == 1)
1529 op_def_code = PLUS_EXPR;
1530 if (in_code == MINUS_EXPR)
1531 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1532 chain.safe_push (chain_op_t (op_def_code, dt, op));
1538 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1539 simple_hashmap_traits <bst_traits, slp_tree> >
1540 scalar_stmts_to_slp_tree_map_t;
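/* Illustrative note (not part of the original sources): this map memoizes
   SLP discovery keyed on the vector of scalar stmts.  vect_build_slp_tree
   below consults it first: a previously built node is re-used with its
   reference count bumped, while a previously failed build replays the
   recorded matches[] array (stored in node->failed) instead of redoing
   the discovery work.  */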
1542 static slp_tree
1543 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1544 vec<stmt_vec_info> stmts, unsigned int group_size,
1545 poly_uint64 *max_nunits,
1546 bool *matches, unsigned *limit, unsigned *tree_size,
1547 scalar_stmts_to_slp_tree_map_t *bst_map);
1549 static slp_tree
1550 vect_build_slp_tree (vec_info *vinfo,
1551 vec<stmt_vec_info> stmts, unsigned int group_size,
1552 poly_uint64 *max_nunits,
1553 bool *matches, unsigned *limit, unsigned *tree_size,
1554 scalar_stmts_to_slp_tree_map_t *bst_map)
1556 if (slp_tree *leader = bst_map->get (stmts))
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1560 !(*leader)->failed ? "" : "failed ",
1561 (void *) *leader);
1562 if (!(*leader)->failed)
1564 SLP_TREE_REF_COUNT (*leader)++;
1565 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1566 stmts.release ();
1567 return *leader;
1569 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1570 return NULL;
1573 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1574 so we can pick up backedge destinations during discovery. */
1575 slp_tree res = new _slp_tree;
1576 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1577 SLP_TREE_SCALAR_STMTS (res) = stmts;
1578 bst_map->put (stmts.copy (), res);
1580 if (*limit == 0)
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location,
1584 "SLP discovery limit exceeded\n");
1585 /* Mark the node invalid so we can detect those when still in use
1586 as backedge destinations. */
1587 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1588 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1589 res->failed = XNEWVEC (bool, group_size);
1590 memset (res->failed, 0, sizeof (bool) * group_size);
1591 memset (matches, 0, sizeof (bool) * group_size);
1592 return NULL;
1594 --*limit;
1596 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_NOTE, vect_location,
1598 "starting SLP discovery for node %p\n", (void *) res);
1600 poly_uint64 this_max_nunits = 1;
1601 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1602 &this_max_nunits,
1603 matches, limit, tree_size, bst_map);
1604 if (!res_)
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE, vect_location,
1608 "SLP discovery for node %p failed\n", (void *) res);
1609 /* Mark the node invalid so we can detect those when still in use
1610 as backedge destinations. */
1611 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1612 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1613 res->failed = XNEWVEC (bool, group_size);
1614 if (flag_checking)
1616 unsigned i;
1617 for (i = 0; i < group_size; ++i)
1618 if (!matches[i])
1619 break;
1620 gcc_assert (i < group_size);
1622 memcpy (res->failed, matches, sizeof (bool) * group_size);
1624 else
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_NOTE, vect_location,
1628 "SLP discovery for node %p succeeded\n",
1629 (void *) res);
1630 gcc_assert (res_ == res);
1631 res->max_nunits = this_max_nunits;
1632 vect_update_max_nunits (max_nunits, this_max_nunits);
1633 /* Keep a reference for the bst_map use. */
1634 SLP_TREE_REF_COUNT (res)++;
1636 return res_;
1639 /* Helper for building an associated SLP node chain. */
1641 static void
1642 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1643 slp_tree op0, slp_tree op1,
1644 stmt_vec_info oper1, stmt_vec_info oper2,
1645 vec<std::pair<unsigned, unsigned> > lperm)
1647 unsigned group_size = SLP_TREE_LANES (op1);
1649 slp_tree child1 = new _slp_tree;
1650 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1651 SLP_TREE_VECTYPE (child1) = vectype;
1652 SLP_TREE_LANES (child1) = group_size;
1653 SLP_TREE_CHILDREN (child1).create (2);
1654 SLP_TREE_CHILDREN (child1).quick_push (op0);
1655 SLP_TREE_CHILDREN (child1).quick_push (op1);
1656 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1658 slp_tree child2 = new _slp_tree;
1659 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1660 SLP_TREE_VECTYPE (child2) = vectype;
1661 SLP_TREE_LANES (child2) = group_size;
1662 SLP_TREE_CHILDREN (child2).create (2);
1663 SLP_TREE_CHILDREN (child2).quick_push (op0);
1664 SLP_TREE_REF_COUNT (op0)++;
1665 SLP_TREE_CHILDREN (child2).quick_push (op1);
1666 SLP_TREE_REF_COUNT (op1)++;
1667 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1669 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1670 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1671 SLP_TREE_VECTYPE (perm) = vectype;
1672 SLP_TREE_LANES (perm) = group_size;
1673 /* ??? We should set this NULL but that's not expected. */
1674 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1675 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1676 SLP_TREE_CHILDREN (perm).quick_push (child1);
1677 SLP_TREE_CHILDREN (perm).quick_push (child2);
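/* Illustrative note (not part of the original sources): the helper above
   implements the "two operators" scheme.  For a group mixing operations,
   e.g. { a0 + b0, a1 - b1 }, it builds one child computing every lane with
   the first operation and one with the second, plus a VEC_PERM_EXPR node
   whose lane permutation selects, for each lane, the result from the child
   that holds the operation that lane actually uses.  */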
1680 /* Recursively build an SLP tree starting from NODE.
1681 Fail (and return NULL) if def-stmts are not isomorphic, require data
1682 permutation or are of unsupported types of operation, recording in
1683 MATCHES which lanes of the group did match. Otherwise return the
1684 built SLP node. */
1687 static slp_tree
1688 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1689 vec<stmt_vec_info> stmts, unsigned int group_size,
1690 poly_uint64 *max_nunits,
1691 bool *matches, unsigned *limit, unsigned *tree_size,
1692 scalar_stmts_to_slp_tree_map_t *bst_map)
1694 unsigned nops, i, this_tree_size = 0;
1695 poly_uint64 this_max_nunits = *max_nunits;
1697 matches[0] = false;
1699 stmt_vec_info stmt_info = stmts[0];
1700 if (!is_a<gcall *> (stmt_info->stmt)
1701 && !is_a<gassign *> (stmt_info->stmt)
1702 && !is_a<gphi *> (stmt_info->stmt))
1703 return NULL;
1705 nops = gimple_num_args (stmt_info->stmt);
1706 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1707 nops = map[0];
1709 /* If the SLP node is a PHI (induction or reduction), terminate
1710 the recursion. */
1711 bool *skip_args = XALLOCAVEC (bool, nops);
1712 memset (skip_args, 0, sizeof (bool) * nops);
1713 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1714 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1716 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1717 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1718 group_size);
1719 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1720 max_nunits))
1721 return NULL;
1723 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1724 if (def_type == vect_induction_def)
1726 /* Induction PHIs are not cycles but walk the initial
1727 value. Only for inner loops though, for outer loops
1728 we need to pick up the value from the actual PHIs
1729 to more easily support peeling and epilogue vectorization. */
1730 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1731 if (!nested_in_vect_loop_p (loop, stmt_info))
1732 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1733 else
1734 loop = loop->inner;
1735 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1737 else if (def_type == vect_reduction_def
1738 || def_type == vect_double_reduction_def
1739 || def_type == vect_nested_cycle
1740 || def_type == vect_first_order_recurrence)
1742 /* Else def types have to match. */
1743 stmt_vec_info other_info;
1744 bool all_same = true;
1745 FOR_EACH_VEC_ELT (stmts, i, other_info)
1747 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1748 return NULL;
1749 if (other_info != stmt_info)
1750 all_same = false;
1752 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1753 /* Reduction initial values are not explicitly represented. */
1754 if (def_type != vect_first_order_recurrence
1755 && !nested_in_vect_loop_p (loop, stmt_info))
1756 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1757 /* Reduction chain backedge defs are filled manually.
1758 ??? Need a better way to identify a SLP reduction chain PHI.
1759 Or a better overall way to SLP match those. */
1760 if (all_same && def_type == vect_reduction_def)
1761 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1763 else if (def_type != vect_internal_def)
1764 return NULL;
1768 bool two_operators = false;
1769 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1770 tree vectype = NULL_TREE;
1771 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1772 &this_max_nunits, matches, &two_operators,
1773 &vectype))
1774 return NULL;
1776 /* If the SLP node is a load, terminate the recursion unless masked. */
1777 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1778 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1780 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1781 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1782 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1783 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1784 else
1786 *max_nunits = this_max_nunits;
1787 (*tree_size)++;
1788 node = vect_create_new_slp_node (node, stmts, 0);
1789 SLP_TREE_VECTYPE (node) = vectype;
1790 /* And compute the load permutation. Whether it is actually
1791 a permutation depends on the unrolling factor which is
1792 decided later. */
1793 vec<unsigned> load_permutation;
1794 int j;
1795 stmt_vec_info load_info;
1796 load_permutation.create (group_size);
1797 stmt_vec_info first_stmt_info
1798 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1799 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1801 int load_place = vect_get_place_in_interleaving_chain
1802 (load_info, first_stmt_info);
1803 gcc_assert (load_place != -1);
1804 load_permutation.safe_push (load_place);
1806 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1807 return node;
1810 else if (gimple_assign_single_p (stmt_info->stmt)
1811 && !gimple_vuse (stmt_info->stmt)
1812 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1814 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1815 the same SSA name vector of a type compatible with vectype. */
1816 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1817 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1818 stmt_vec_info estmt_info;
1819 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1821 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1822 tree bfref = gimple_assign_rhs1 (estmt);
1823 HOST_WIDE_INT lane;
1824 if (!known_eq (bit_field_size (bfref),
1825 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1826 || !constant_multiple_p (bit_field_offset (bfref),
1827 bit_field_size (bfref), &lane))
1829 lperm.release ();
1830 matches[0] = false;
1831 return NULL;
1833 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1835 slp_tree vnode = vect_create_new_slp_node (vNULL);
1836 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1837 /* ??? We record vectype here but hide any punning that may
1838 eventually be necessary, instead relying on code generation to
1839 materialize VIEW_CONVERT_EXPRs as needed. We should make
1840 this explicit somehow. */
1841 SLP_TREE_VECTYPE (vnode) = vectype;
1842 else
1844 /* For different size but compatible elements we can still
1845 use VEC_PERM_EXPR without punning. */
1846 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1847 && types_compatible_p (TREE_TYPE (vectype),
1848 TREE_TYPE (TREE_TYPE (vec))));
1849 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1851 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1852 unsigned HOST_WIDE_INT const_nunits;
1853 if (nunits.is_constant (&const_nunits))
1854 SLP_TREE_LANES (vnode) = const_nunits;
1855 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1856 /* We are always building a permutation node even if it is an identity
1857 permute to shield the rest of the vectorizer from the odd node
1858 representing an actual vector without any scalar ops.
1859 ??? We could hide it completely by making the permute node
1860 external? */
1861 node = vect_create_new_slp_node (node, stmts, 1);
1862 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1863 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1864 SLP_TREE_VECTYPE (node) = vectype;
1865 SLP_TREE_CHILDREN (node).quick_push (vnode);
1866 return node;
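/* Editorial note (not in the original source): a sketch of the lane
   computation above for a hypothetical 4 x 32-bit vector V.  The stmts
   _1 = BIT_FIELD_REF <V, 32, 64> and _2 = BIT_FIELD_REF <V, 32, 96>
   extract lanes 64/32 = 2 and 96/32 = 3, so lperm becomes
   { (0,2), (0,3) } and the node is emitted as a VEC_PERM_EXPR selecting
   those lanes from the single vector child.  */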
1868 /* When discovery reaches an associatable operation see whether we can
1869 improve that to match up lanes in a way superior to the operand
1870 swapping code which at most looks at two defs.
1871 ??? For BB vectorization we cannot do the brute-force search
1872 for matching as we can succeed by means of builds from scalars
1873 and have no good way to "cost" one build against another. */
1874 else if (is_a <loop_vec_info> (vinfo)
1875 /* ??? We don't handle !vect_internal_def defs below. */
1876 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1877 && is_gimple_assign (stmt_info->stmt)
1878 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1879 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1880 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1881 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1882 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1884 /* See if we have a chain of (mixed) adds or subtracts or other
1885 associatable ops. */
1886 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1887 if (code == MINUS_EXPR)
1888 code = PLUS_EXPR;
1889 stmt_vec_info other_op_stmt_info = NULL;
1890 stmt_vec_info op_stmt_info = NULL;
1891 unsigned chain_len = 0;
1892 auto_vec<chain_op_t> chain;
1893 auto_vec<std::pair<tree_code, gimple *> > worklist;
1894 auto_vec<vec<chain_op_t> > chains (group_size);
1895 auto_vec<slp_tree, 4> children;
1896 bool hard_fail = true;
1897 for (unsigned lane = 0; lane < group_size; ++lane)
1899 /* For each lane linearize the addition/subtraction (or other
1900 uniform associatable operation) expression tree. */
1901 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1902 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1903 stmts[lane]->stmt, op_stmt, other_op_stmt,
1904 NULL);
1905 if (!op_stmt_info && op_stmt)
1906 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1907 if (!other_op_stmt_info && other_op_stmt)
1908 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1909 if (chain.length () == 2)
1911 /* In a chain of just two elements resort to the regular
1912 operand swapping scheme. If we run into a length
1913 mismatch still hard-FAIL. */
1914 if (chain_len == 0)
1915 hard_fail = false;
1916 else
1918 matches[lane] = false;
1919 /* ??? We might want to process the other lanes, but
1920 make sure to not give false matching hints to the
1921 caller for lanes we did not process. */
1922 if (lane != group_size - 1)
1923 matches[0] = false;
1925 break;
1927 else if (chain_len == 0)
1928 chain_len = chain.length ();
1929 else if (chain.length () != chain_len)
1931 /* ??? Here we could slip in magic to compensate with
1932 neutral operands. */
1933 matches[lane] = false;
1934 if (lane != group_size - 1)
1935 matches[0] = false;
1936 break;
1938 chains.quick_push (chain.copy ());
1939 chain.truncate (0);
1941 if (chains.length () == group_size)
1943 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1944 if (!op_stmt_info)
1946 hard_fail = false;
1947 goto out;
1949 /* Now we have a set of chains with the same length. */
1950 /* 1. pre-sort according to def_type and operation. */
1951 for (unsigned lane = 0; lane < group_size; ++lane)
1952 chains[lane].stablesort (dt_sort_cmp, vinfo);
1953 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "pre-sorted chains of %s\n",
1957 get_tree_code_name (code));
1958 for (unsigned lane = 0; lane < group_size; ++lane)
1960 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1961 dump_printf (MSG_NOTE, "%s %T ",
1962 get_tree_code_name (chains[lane][opnum].code),
1963 chains[lane][opnum].op);
1964 dump_printf (MSG_NOTE, "\n");
1967 /* 2. try to build children nodes, associating as necessary. */
1968 for (unsigned n = 0; n < chain_len; ++n)
1970 vect_def_type dt = chains[0][n].dt;
1971 unsigned lane;
1972 for (lane = 0; lane < group_size; ++lane)
1973 if (chains[lane][n].dt != dt)
1975 if (dt == vect_constant_def
1976 && chains[lane][n].dt == vect_external_def)
1977 dt = vect_external_def;
1978 else if (dt == vect_external_def
1979 && chains[lane][n].dt == vect_constant_def)
1981 else
1982 break;
1984 if (lane != group_size)
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "giving up on chain due to mismatched "
1989 "def types\n");
1990 matches[lane] = false;
1991 if (lane != group_size - 1)
1992 matches[0] = false;
1993 goto out;
1995 if (dt == vect_constant_def
1996 || dt == vect_external_def)
1998 /* Check whether we can build the invariant. If we can't
1999 we never will be able to. */
2000 tree type = TREE_TYPE (chains[0][n].op);
2001 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2002 && (TREE_CODE (type) == BOOLEAN_TYPE
2003 || !can_duplicate_and_interleave_p (vinfo, group_size,
2004 type)))
2006 matches[0] = false;
2007 goto out;
2009 vec<tree> ops;
2010 ops.create (group_size);
2011 for (lane = 0; lane < group_size; ++lane)
2012 ops.quick_push (chains[lane][n].op);
2013 slp_tree child = vect_create_new_slp_node (ops);
2014 SLP_TREE_DEF_TYPE (child) = dt;
2015 children.safe_push (child);
2017 else if (dt != vect_internal_def)
2019 /* Not sure, we might need sth special.
2020 gcc.dg/vect/pr96854.c,
2021 gfortran.dg/vect/fast-math-pr37021.f90
2022 and gfortran.dg/vect/pr61171.f trigger. */
2023 /* Soft-fail for now. */
2024 hard_fail = false;
2025 goto out;
2027 else
2029 vec<stmt_vec_info> op_stmts;
2030 op_stmts.create (group_size);
2031 slp_tree child = NULL;
2032 /* Brute-force our way. We have to consider a lane
2033 failing after fixing an earlier fail up in the
2034 SLP discovery recursion. So track the current
2035 permute per lane. */
2036 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2037 memset (perms, 0, sizeof (unsigned) * group_size);
2040 op_stmts.truncate (0);
2041 for (lane = 0; lane < group_size; ++lane)
2042 op_stmts.quick_push
2043 (vinfo->lookup_def (chains[lane][n].op));
2044 child = vect_build_slp_tree (vinfo, op_stmts,
2045 group_size, &this_max_nunits,
2046 matches, limit,
2047 &this_tree_size, bst_map);
2048 /* ??? We're likely getting too many fatal mismatches
2049 here so maybe we want to ignore them (but then we
2050 have no idea which lanes fatally mismatched). */
2051 if (child || !matches[0])
2052 break;
2053 /* Swap another lane we have not yet matched up into
2054 lanes that did not match. If we run out of
2055 permute possibilities for a lane terminate the
2056 search. */
2057 bool term = false;
2058 for (lane = 1; lane < group_size; ++lane)
2059 if (!matches[lane])
2061 if (n + perms[lane] + 1 == chain_len)
2063 term = true;
2064 break;
2066 std::swap (chains[lane][n],
2067 chains[lane][n + perms[lane] + 1]);
2068 perms[lane]++;
2070 if (term)
2071 break;
2073 while (1);
2074 if (!child)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "failed to match up op %d\n", n);
2079 op_stmts.release ();
2080 if (lane != group_size - 1)
2081 matches[0] = false;
2082 else
2083 matches[lane] = false;
2084 goto out;
2086 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location,
2089 "matched up op %d to\n", n);
2090 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2092 children.safe_push (child);
2095 /* 3. build SLP nodes to combine the chain. */
2096 for (unsigned lane = 0; lane < group_size; ++lane)
2097 if (chains[lane][0].code != code)
2099 /* See if there's any alternate all-PLUS entry. */
2100 unsigned n;
2101 for (n = 1; n < chain_len; ++n)
2103 for (lane = 0; lane < group_size; ++lane)
2104 if (chains[lane][n].code != code)
2105 break;
2106 if (lane == group_size)
2107 break;
2109 if (n != chain_len)
2111 /* Swap that in at first position. */
2112 std::swap (children[0], children[n]);
2113 for (lane = 0; lane < group_size; ++lane)
2114 std::swap (chains[lane][0], chains[lane][n]);
2116 else
2118 /* ??? When this triggers and we end up with two
2119 vect_constant/external_def up-front things break (ICE)
2120 spectacularly finding an insertion place for the
2121 all-constant op. We should have a fully
2122 vect_internal_def operand though(?) so we can swap
2123 that into first place and then prepend the all-zero
2124 constant. */
2125 if (dump_enabled_p ())
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "inserting constant zero to compensate "
2128 "for (partially) negated first "
2129 "operand\n");
2130 chain_len++;
2131 for (lane = 0; lane < group_size; ++lane)
2132 chains[lane].safe_insert
2133 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2134 vec<tree> zero_ops;
2135 zero_ops.create (group_size);
2136 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2137 for (lane = 1; lane < group_size; ++lane)
2138 zero_ops.quick_push (zero_ops[0]);
2139 slp_tree zero = vect_create_new_slp_node (zero_ops);
2140 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2141 children.safe_insert (0, zero);
2143 break;
2145 for (unsigned i = 1; i < children.length (); ++i)
2147 slp_tree op0 = children[i - 1];
2148 slp_tree op1 = children[i];
2149 bool this_two_op = false;
2150 for (unsigned lane = 0; lane < group_size; ++lane)
2151 if (chains[lane][i].code != chains[0][i].code)
2153 this_two_op = true;
2154 break;
2156 slp_tree child;
2157 if (i == children.length () - 1)
2158 child = vect_create_new_slp_node (node, stmts, 2);
2159 else
2160 child = vect_create_new_slp_node (2, ERROR_MARK);
2161 if (this_two_op)
2163 vec<std::pair<unsigned, unsigned> > lperm;
2164 lperm.create (group_size);
2165 for (unsigned lane = 0; lane < group_size; ++lane)
2166 lperm.quick_push (std::make_pair
2167 (chains[lane][i].code != chains[0][i].code, lane));
2168 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2169 (chains[0][i].code == code
2170 ? op_stmt_info
2171 : other_op_stmt_info),
2172 (chains[0][i].code == code
2173 ? other_op_stmt_info
2174 : op_stmt_info),
2175 lperm);
2177 else
2179 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2180 SLP_TREE_VECTYPE (child) = vectype;
2181 SLP_TREE_LANES (child) = group_size;
2182 SLP_TREE_CHILDREN (child).quick_push (op0);
2183 SLP_TREE_CHILDREN (child).quick_push (op1);
2184 SLP_TREE_REPRESENTATIVE (child)
2185 = (chains[0][i].code == code
2186 ? op_stmt_info : other_op_stmt_info);
2188 children[i] = child;
2190 *tree_size += this_tree_size + 1;
2191 *max_nunits = this_max_nunits;
2192 while (!chains.is_empty ())
2193 chains.pop ().release ();
2194 return node;
2196 out:
2197 while (!children.is_empty ())
2198 vect_free_slp_tree (children.pop ());
2199 while (!chains.is_empty ())
2200 chains.pop ().release ();
2201 /* Hard-fail, otherwise we might run into quadratic processing of the
2202 chains starting one stmt into the chain again. */
2203 if (hard_fail)
2204 return NULL;
2205 /* Fall thru to normal processing. */
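/* Editorial note (not in the original source): the chain matching above
   can be illustrated with a hypothetical two-lane group
     x0 + y0 + z0
     y1 + x1 + z1
   Each lane is linearized into a chain of three PLUS operands.  When
   building the SLP child for operand 0, the defs of x0 and y1 may fail to
   match (say they come from different operations); matches[1] then goes
   false and the brute-force loop swaps chains[1][0] with chains[1][1],
   retrying with { x0, x1 }.  If discovery succeeds for all operands the
   children { x0, x1 }, { y0, y1 }, { z0, z1 } are recombined with new
   PLUS nodes, independently of the original source order per lane.  */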
2208 /* Get at the operands, verifying they are compatible. */
2209 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2210 slp_oprnd_info oprnd_info;
2211 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2213 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2214 stmts, i, &oprnds_info);
2215 if (res != 0)
2216 matches[(res == -1) ? 0 : i] = false;
2217 if (!matches[0])
2218 break;
2220 for (i = 0; i < group_size; ++i)
2221 if (!matches[i])
2223 vect_free_oprnd_info (oprnds_info);
2224 return NULL;
2226 swap = NULL;
2228 auto_vec<slp_tree, 4> children;
2230 stmt_info = stmts[0];
2232 /* Create SLP_TREE nodes for the definition node/s. */
2233 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2235 slp_tree child;
2236 unsigned int j;
2238 /* We're skipping certain operands from processing, for example
2239 outer loop reduction initial defs. */
2240 if (skip_args[i])
2242 children.safe_push (NULL);
2243 continue;
2246 if (oprnd_info->first_dt == vect_uninitialized_def)
2248 /* COND_EXPRs eventually have one operand too many if the condition
2249 is an SSA name. */
2250 gcc_assert (i == 3 && nops == 4);
2251 continue;
2254 if (is_a <bb_vec_info> (vinfo)
2255 && oprnd_info->first_dt == vect_internal_def
2256 && !oprnd_info->any_pattern)
2258 /* For BB vectorization, if all defs are the same do not
2259 bother to continue the build along the single-lane
2260 graph but use a splat of the scalar value. */
2261 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2262 for (j = 1; j < group_size; ++j)
2263 if (oprnd_info->def_stmts[j] != first_def)
2264 break;
2265 if (j == group_size
2266 /* But avoid doing this for loads where we may be
2267 able to CSE things, unless the stmt is not
2268 vectorizable. */
2269 && (!STMT_VINFO_VECTORIZABLE (first_def)
2270 || !gimple_vuse (first_def->stmt)))
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "Using a splat of the uniform operand %G",
2275 first_def->stmt);
2276 oprnd_info->first_dt = vect_external_def;
2280 if (oprnd_info->first_dt == vect_external_def
2281 || oprnd_info->first_dt == vect_constant_def)
2283 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2284 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2285 oprnd_info->ops = vNULL;
2286 children.safe_push (invnode);
2287 continue;
2290 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2291 group_size, &this_max_nunits,
2292 matches, limit,
2293 &this_tree_size, bst_map)) != NULL)
2295 oprnd_info->def_stmts = vNULL;
2296 children.safe_push (child);
2297 continue;
2300 /* If the SLP build for operand zero failed and operand zero
2301 and one can be commutated try that for the scalar stmts
2302 that failed the match. */
2303 if (i == 0
2304 /* A first scalar stmt mismatch signals a fatal mismatch. */
2305 && matches[0]
2306 /* ??? For COND_EXPRs we can swap the comparison operands
2307 as well as the arms under some constraints. */
2308 && nops == 2
2309 && oprnds_info[1]->first_dt == vect_internal_def
2310 && is_gimple_assign (stmt_info->stmt)
2311 /* Swapping operands for reductions breaks assumptions later on. */
2312 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2313 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2315 /* See whether we can swap the matching or the non-matching
2316 stmt operands. */
2317 bool swap_not_matching = true;
2320 for (j = 0; j < group_size; ++j)
2322 if (matches[j] != !swap_not_matching)
2323 continue;
2324 stmt_vec_info stmt_info = stmts[j];
2325 /* Verify if we can swap operands of this stmt. */
2326 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2327 if (!stmt
2328 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2330 if (!swap_not_matching)
2331 goto fail;
2332 swap_not_matching = false;
2333 break;
2337 while (j != group_size);
2339 /* Swap mismatched definition stmts. */
2340 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "Re-trying with swapped operands of stmts ");
2343 for (j = 0; j < group_size; ++j)
2344 if (matches[j] == !swap_not_matching)
2346 std::swap (oprnds_info[0]->def_stmts[j],
2347 oprnds_info[1]->def_stmts[j]);
2348 std::swap (oprnds_info[0]->ops[j],
2349 oprnds_info[1]->ops[j]);
2350 if (dump_enabled_p ())
2351 dump_printf (MSG_NOTE, "%d ", j);
2353 if (dump_enabled_p ())
2354 dump_printf (MSG_NOTE, "\n");
2355 /* After swapping some operands we lost track whether an
2356 operand has any pattern defs so be conservative here. */
2357 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2358 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2359 /* And try again with scratch 'matches' ... */
2360 bool *tem = XALLOCAVEC (bool, group_size);
2361 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2362 group_size, &this_max_nunits,
2363 tem, limit,
2364 &this_tree_size, bst_map)) != NULL)
2366 oprnd_info->def_stmts = vNULL;
2367 children.safe_push (child);
2368 continue;
2371 fail:
2373 /* If the SLP build failed and we analyze a basic-block
2374 simply treat nodes we fail to build as externally defined
2375 (and thus build vectors from the scalar defs).
2376 The cost model will reject outright expensive cases.
2377 ??? This doesn't treat cases where permutation ultimately
2378 fails (or we don't try permutation below). Ideally we'd
2379 even compute a permutation that will end up with the maximum
2380 SLP tree size... */
2381 if (is_a <bb_vec_info> (vinfo)
2382 /* ??? Rejecting patterns this way doesn't work. We'd have to
2383 do extra work to cancel the pattern so the uses see the
2384 scalar version. */
2385 && !is_pattern_stmt_p (stmt_info)
2386 && !oprnd_info->any_pattern)
2388 /* But if there's a leading vector-sized set of matching stmts,
2389 fail here so we can split the group. This matches the condition
2390 vect_analyze_slp_instance uses. */
2391 /* ??? We might want to split here and combine the results to support
2392 multiple vector sizes better. */
2393 for (j = 0; j < group_size; ++j)
2394 if (!matches[j])
2395 break;
2396 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "Building vector operands from scalars\n");
2401 this_tree_size++;
2402 child = vect_create_new_slp_node (oprnd_info->ops);
2403 children.safe_push (child);
2404 oprnd_info->ops = vNULL;
2405 continue;
2409 gcc_assert (child == NULL);
2410 FOR_EACH_VEC_ELT (children, j, child)
2411 if (child)
2412 vect_free_slp_tree (child);
2413 vect_free_oprnd_info (oprnds_info);
2414 return NULL;
2417 vect_free_oprnd_info (oprnds_info);
2419 /* If all children of a node are built up from uniform scalars, or if
2420 building it requires more than one possibly expensive vector
2421 construction, just throw the node away so it is built up from scalars.
2422 The exception is the SLP node for the vector store. */
2423 if (is_a <bb_vec_info> (vinfo)
2424 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2425 /* ??? Rejecting patterns this way doesn't work. We'd have to
2426 do extra work to cancel the pattern so the uses see the
2427 scalar version. */
2428 && !is_pattern_stmt_p (stmt_info))
2430 slp_tree child;
2431 unsigned j;
2432 bool all_uniform_p = true;
2433 unsigned n_vector_builds = 0;
2434 FOR_EACH_VEC_ELT (children, j, child)
2436 if (!child)
2438 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2439 all_uniform_p = false;
2440 else if (!vect_slp_tree_uniform_p (child))
2442 all_uniform_p = false;
2443 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2444 n_vector_builds++;
2447 if (all_uniform_p
2448 || n_vector_builds > 1
2449 || (n_vector_builds == children.length ()
2450 && is_a <gphi *> (stmt_info->stmt)))
2452 /* Roll back. */
2453 matches[0] = false;
2454 FOR_EACH_VEC_ELT (children, j, child)
2455 if (child)
2456 vect_free_slp_tree (child);
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "Building parent vector operands from "
2461 "scalars instead\n");
2462 return NULL;
2466 *tree_size += this_tree_size + 1;
2467 *max_nunits = this_max_nunits;
2469 if (two_operators)
2471 /* ??? We'd likely want to either cache in bst_map sth like
2472 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2473 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2474 explicit stmts to put in so the keying on 'stmts' doesn't
2475 work (but we have the same issue with nodes that use 'ops'). */
2476 slp_tree one = new _slp_tree;
2477 slp_tree two = new _slp_tree;
2478 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2479 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2480 SLP_TREE_VECTYPE (one) = vectype;
2481 SLP_TREE_VECTYPE (two) = vectype;
2482 SLP_TREE_CHILDREN (one).safe_splice (children);
2483 SLP_TREE_CHILDREN (two).safe_splice (children);
2484 slp_tree child;
2485 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2486 SLP_TREE_REF_COUNT (child)++;
2488 /* Here we record the original defs since this
2489 node represents the final lane configuration. */
2490 node = vect_create_new_slp_node (node, stmts, 2);
2491 SLP_TREE_VECTYPE (node) = vectype;
2492 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2493 SLP_TREE_CHILDREN (node).quick_push (one);
2494 SLP_TREE_CHILDREN (node).quick_push (two);
2495 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2496 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2497 enum tree_code ocode = ERROR_MARK;
2498 stmt_vec_info ostmt_info;
2499 unsigned j = 0;
2500 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2502 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2503 if (gimple_assign_rhs_code (ostmt) != code0)
2505 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2506 ocode = gimple_assign_rhs_code (ostmt);
2507 j = i;
2509 else
2510 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2512 SLP_TREE_CODE (one) = code0;
2513 SLP_TREE_CODE (two) = ocode;
2514 SLP_TREE_LANES (one) = stmts.length ();
2515 SLP_TREE_LANES (two) = stmts.length ();
2516 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2517 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2518 return node;
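/* Editorial note (not in the original source): for a hypothetical
   addsub-style group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the code
   above sets code0 = PLUS_EXPR and ocode = MINUS_EXPR, gives node "one"
   code PLUS_EXPR and node "two" code MINUS_EXPR, and records the lane
   permutation { 0[0], 1[1], 0[2], 1[3] }, i.e. even lanes are taken from
   the add node and odd lanes from the subtract node.  */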
2521 node = vect_create_new_slp_node (node, stmts, nops);
2522 SLP_TREE_VECTYPE (node) = vectype;
2523 SLP_TREE_CHILDREN (node).splice (children);
2524 return node;
2527 /* Dump a single SLP tree NODE. */
2529 static void
2530 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2531 slp_tree node)
2533 unsigned i, j;
2534 slp_tree child;
2535 stmt_vec_info stmt_info;
2536 tree op;
2538 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2539 dump_user_location_t user_loc = loc.get_user_location ();
2540 dump_printf_loc (metadata, user_loc,
2541 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2542 ", refcnt=%u)",
2543 SLP_TREE_DEF_TYPE (node) == vect_external_def
2544 ? " (external)"
2545 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2546 ? " (constant)"
2547 : ""), (void *) node,
2548 estimated_poly_value (node->max_nunits),
2549 SLP_TREE_REF_COUNT (node));
2550 if (SLP_TREE_VECTYPE (node))
2551 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2552 dump_printf (metadata, "\n");
2553 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2555 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2556 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2557 else
2558 dump_printf_loc (metadata, user_loc, "op template: %G",
2559 SLP_TREE_REPRESENTATIVE (node)->stmt);
2561 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2562 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2563 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2564 else
2566 dump_printf_loc (metadata, user_loc, "\t{ ");
2567 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2568 dump_printf (metadata, "%T%s ", op,
2569 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2570 dump_printf (metadata, "}\n");
2572 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2574 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2575 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2576 dump_printf (dump_kind, " %u", j);
2577 dump_printf (dump_kind, " }\n");
2579 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2581 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2582 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2583 dump_printf (dump_kind, " %u[%u]",
2584 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2585 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2586 dump_printf (dump_kind, " }\n");
2588 if (SLP_TREE_CHILDREN (node).is_empty ())
2589 return;
2590 dump_printf_loc (metadata, user_loc, "\tchildren");
2591 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2592 dump_printf (dump_kind, " %p", (void *)child);
2593 dump_printf (dump_kind, "\n");
2596 DEBUG_FUNCTION void
2597 debug (slp_tree node)
2599 debug_dump_context ctx;
2600 vect_print_slp_tree (MSG_NOTE,
2601 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2602 node);
2605 /* Recursive helper for the dot producer below. */
2607 static void
2608 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2610 if (visited.add (node))
2611 return;
2613 fprintf (f, "\"%p\" [label=\"", (void *)node);
2614 vect_print_slp_tree (MSG_NOTE,
2615 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2616 node);
2617 fprintf (f, "\"];\n");
2620 for (slp_tree child : SLP_TREE_CHILDREN (node))
2621 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2623 for (slp_tree child : SLP_TREE_CHILDREN (node))
2624 if (child)
2625 dot_slp_tree (f, child, visited);
2628 DEBUG_FUNCTION void
2629 dot_slp_tree (const char *fname, slp_tree node)
2631 FILE *f = fopen (fname, "w");
2632 fprintf (f, "digraph {\n");
2633 fflush (f);
2635 debug_dump_context ctx (f);
2636 hash_set<slp_tree> visited;
2637 dot_slp_tree (f, node, visited);
2639 fflush (f);
2640 fprintf (f, "}\n");
2641 fclose (f);
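/* Editorial note (not in the original source): both debug () and
   dot_slp_tree () above are DEBUG_FUNCTIONs meant to be called from a
   debugger rather than from the vectorizer itself, e.g. from gdb:
     (gdb) call debug (node)
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   The resulting file can then be rendered with Graphviz, for example
   "dot -Tpng /tmp/slp.dot -o slp.png".  */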
2644 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2646 static void
2647 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2648 slp_tree node, hash_set<slp_tree> &visited)
2650 unsigned i;
2651 slp_tree child;
2653 if (visited.add (node))
2654 return;
2656 vect_print_slp_tree (dump_kind, loc, node);
2658 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2659 if (child)
2660 vect_print_slp_graph (dump_kind, loc, child, visited);
2663 static void
2664 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2665 slp_tree entry)
2667 hash_set<slp_tree> visited;
2668 vect_print_slp_graph (dump_kind, loc, entry, visited);
2671 /* Mark the tree rooted at NODE with PURE_SLP. */
2673 static void
2674 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2676 int i;
2677 stmt_vec_info stmt_info;
2678 slp_tree child;
2680 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2681 return;
2683 if (visited.add (node))
2684 return;
2686 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2687 STMT_SLP_TYPE (stmt_info) = pure_slp;
2689 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2690 if (child)
2691 vect_mark_slp_stmts (child, visited);
2694 static void
2695 vect_mark_slp_stmts (slp_tree node)
2697 hash_set<slp_tree> visited;
2698 vect_mark_slp_stmts (node, visited);
2701 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2703 static void
2704 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2706 int i;
2707 stmt_vec_info stmt_info;
2708 slp_tree child;
2710 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2711 return;
2713 if (visited.add (node))
2714 return;
2716 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2718 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2719 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2720 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2724 if (child)
2725 vect_mark_slp_stmts_relevant (child, visited);
2728 static void
2729 vect_mark_slp_stmts_relevant (slp_tree node)
2731 hash_set<slp_tree> visited;
2732 vect_mark_slp_stmts_relevant (node, visited);
2736 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2738 static void
2739 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2740 hash_set<slp_tree> &visited)
2742 if (!node || visited.add (node))
2743 return;
2745 if (SLP_TREE_CHILDREN (node).length () == 0)
2747 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2748 return;
2749 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2750 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2751 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2752 loads.safe_push (node);
2754 else
2756 unsigned i;
2757 slp_tree child;
2758 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2759 vect_gather_slp_loads (loads, child, visited);
2764 /* Find the last scalar stmt in NODE. */
2766 stmt_vec_info
2767 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2769 stmt_vec_info last = NULL;
2770 stmt_vec_info stmt_vinfo;
2772 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2774 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2775 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2778 return last;
2781 /* Find the first stmt in NODE. */
2783 stmt_vec_info
2784 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2786 stmt_vec_info first = NULL;
2787 stmt_vec_info stmt_vinfo;
2789 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2791 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2792 if (!first
2793 || get_later_stmt (stmt_vinfo, first) == first)
2794 first = stmt_vinfo;
2797 return first;
2800 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2801 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2802 (also containing the first GROUP1_SIZE stmts, since stores are
2803 consecutive), the second containing the remainder.
2804 Return the first stmt in the second group. */
2806 static stmt_vec_info
2807 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2809 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2810 gcc_assert (group1_size > 0);
2811 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2812 gcc_assert (group2_size > 0);
2813 DR_GROUP_SIZE (first_vinfo) = group1_size;
2815 stmt_vec_info stmt_info = first_vinfo;
2816 for (unsigned i = group1_size; i > 1; i--)
2818 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2819 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2821 /* STMT is now the last element of the first group. */
2822 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2823 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2825 DR_GROUP_SIZE (group2) = group2_size;
2826 for (stmt_info = group2; stmt_info;
2827 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2829 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2830 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2833 /* For the second group, the DR_GROUP_GAP is that before the original group,
2834 plus skipping over the first vector. */
2835 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2837 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2838 DR_GROUP_GAP (first_vinfo) += group2_size;
2840 if (dump_enabled_p ())
2841 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2842 group1_size, group2_size);
2844 return group2;
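/* Editorial note (not in the original source): for example, splitting a
   store group of 6 stmts with original DR_GROUP_GAP G into GROUP1_SIZE = 4
   plus a remainder of 2 gives the second group DR_GROUP_GAP G + 4 (it has
   to skip over the first four stmts) while the first group ends up with
   DR_GROUP_GAP G + 2 (it has to skip over the second group).  */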
2847 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2848 statements and a vector of NUNITS elements. */
2850 static poly_uint64
2851 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2853 return exact_div (common_multiple (nunits, group_size), group_size);
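/* Editorial note (not in the original source): e.g. with NUNITS = 4 and
   GROUP_SIZE = 6 the least common multiple is 12, so the unrolling factor
   is 12 / 6 = 2: two copies of the group fill exactly three 4-lane
   vectors.  When GROUP_SIZE divides NUNITS, say NUNITS = 8 and
   GROUP_SIZE = 4, the factor is 8 / 4 = 2 and two copies of the group
   fill a single vector.  */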
2856 /* Helper that checks to see if a node is a load node. */
2858 static inline bool
2859 vect_is_slp_load_node (slp_tree root)
2861 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2862 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2863 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2867 /* Helper function of optimize_load_redistribution that performs the operation
2868 recursively. */
2870 static slp_tree
2871 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2872 vec_info *vinfo, unsigned int group_size,
2873 hash_map<slp_tree, slp_tree> *load_map,
2874 slp_tree root)
2876 if (slp_tree *leader = load_map->get (root))
2877 return *leader;
2879 slp_tree node;
2880 unsigned i;
2882 /* For now, we don't know anything about externals so do not do anything. */
2883 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2884 return NULL;
2885 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2887 /* First convert this node into a load node and add it to the leaves
2888 list and flatten the permute from a lane to a load one. If it's
2889 unneeded it will be elided later. */
2890 vec<stmt_vec_info> stmts;
2891 stmts.create (SLP_TREE_LANES (root));
2892 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2893 for (unsigned j = 0; j < lane_perm.length (); j++)
2895 std::pair<unsigned, unsigned> perm = lane_perm[j];
2896 node = SLP_TREE_CHILDREN (root)[perm.first];
2898 if (!vect_is_slp_load_node (node)
2899 || SLP_TREE_CHILDREN (node).exists ())
2901 stmts.release ();
2902 goto next;
2905 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE, vect_location,
2910 "converting stmts on permute node %p\n",
2911 (void *) root);
2913 bool *matches = XALLOCAVEC (bool, group_size);
2914 poly_uint64 max_nunits = 1;
2915 unsigned tree_size = 0, limit = 1;
2916 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2917 matches, &limit, &tree_size, bst_map);
2918 if (!node)
2919 stmts.release ();
2921 load_map->put (root, node);
2922 return node;
2925 next:
2926 load_map->put (root, NULL);
2928 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2930 slp_tree value
2931 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2932 node);
2933 if (value)
2935 SLP_TREE_REF_COUNT (value)++;
2936 SLP_TREE_CHILDREN (root)[i] = value;
2937 /* ??? We know the original leaves of the replaced nodes will
2938 be referenced by bst_map, only the permutes created by
2939 pattern matching are not. */
2940 if (SLP_TREE_REF_COUNT (node) == 1)
2941 load_map->remove (node);
2942 vect_free_slp_tree (node);
2946 return NULL;
2949 /* Temporary workaround for loads not being CSEd during SLP build. This
2950 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2951 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2952 same DR such that the final operation is equal to a permuted load. Such
2953 NODES are then directly converted into LOADS themselves. The nodes are
2954 CSEd using BST_MAP. */
2956 static void
2957 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2958 vec_info *vinfo, unsigned int group_size,
2959 hash_map<slp_tree, slp_tree> *load_map,
2960 slp_tree root)
2962 slp_tree node;
2963 unsigned i;
2965 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2967 slp_tree value
2968 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2969 node);
2970 if (value)
2972 SLP_TREE_REF_COUNT (value)++;
2973 SLP_TREE_CHILDREN (root)[i] = value;
2974 /* ??? We know the original leaves of the replaced nodes will
2975 be referenced by bst_map, only the permutes created by
2976 pattern matching are not. */
2977 if (SLP_TREE_REF_COUNT (node) == 1)
2978 load_map->remove (node);
2979 vect_free_slp_tree (node);
2984 /* Helper function of vect_match_slp_patterns.
2986 Attempts to match patterns against the slp tree rooted in REF_NODE using
2987 VINFO. Patterns are matched in post-order traversal.
2989 If matching is successful the value in REF_NODE is updated and returned;
2990 if not, it is returned unchanged. */
2992 static bool
2993 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2994 slp_tree_to_load_perm_map_t *perm_cache,
2995 slp_compat_nodes_map_t *compat_cache,
2996 hash_set<slp_tree> *visited)
2998 unsigned i;
2999 slp_tree node = *ref_node;
3000 bool found_p = false;
3001 if (!node || visited->add (node))
3002 return false;
3004 slp_tree child;
3005 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3006 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3007 vinfo, perm_cache, compat_cache,
3008 visited);
3010 for (unsigned x = 0; x < num__slp_patterns; x++)
3012 vect_pattern *pattern
3013 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3014 if (pattern)
3016 pattern->build (vinfo);
3017 delete pattern;
3018 found_p = true;
3022 return found_p;
3025 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3026 vec_info VINFO.
3028 The modified tree is returned. Patterns are tried in order and multiple
3029 patterns may match. */
3031 static bool
3032 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3033 hash_set<slp_tree> *visited,
3034 slp_tree_to_load_perm_map_t *perm_cache,
3035 slp_compat_nodes_map_t *compat_cache)
3037 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3038 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "Analyzing SLP tree %p for patterns\n",
3043 (void *) SLP_INSTANCE_TREE (instance));
3045 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3046 visited);
3049 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3050 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3051 Return true if we could use IFN_STORE_LANES instead and if that appears
3052 to be the better approach. */
3054 static bool
3055 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3056 unsigned int group_size,
3057 unsigned int new_group_size)
3059 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3060 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3061 if (!vectype)
3062 return false;
3063 /* Allow the split if one of the two new groups would operate on full
3064 vectors *within* rather than across one scalar loop iteration.
3065 This is purely a heuristic, but it should work well for group
3066 sizes of 3 and 4, where the possible splits are:
3068 3->2+1: OK if the vector has exactly two elements
3069 4->2+2: Likewise
3070 4->3+1: Less clear-cut. */
3071 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3072 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3073 return false;
3074 return vect_store_lanes_supported (vectype, group_size, false);
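/* Editorial note (not in the original source): with 2-element vectors a
   4 -> 2+2 split returns false above (NEW_GROUP_SIZE is a whole number of
   vectors, so the plain split is fine), whereas a 4 -> 3+1 split falls
   through and prefers IFN_STORE_LANES whenever the target supports a
   store-lanes instruction (e.g. AArch64 ST4) for this vectype.  */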
3077 /* Analyze an SLP instance starting from a group of grouped stores. Call
3078 vect_build_slp_tree to build a tree of packed stmts if possible.
3079 Return FALSE if it's impossible to SLP any stmt in the loop. */
3081 static bool
3082 vect_analyze_slp_instance (vec_info *vinfo,
3083 scalar_stmts_to_slp_tree_map_t *bst_map,
3084 stmt_vec_info stmt_info, slp_instance_kind kind,
3085 unsigned max_tree_size, unsigned *limit);
3087 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3088 of KIND. Return true if successful. */
3090 static bool
3091 vect_build_slp_instance (vec_info *vinfo,
3092 slp_instance_kind kind,
3093 vec<stmt_vec_info> &scalar_stmts,
3094 vec<stmt_vec_info> &root_stmt_infos,
3095 unsigned max_tree_size, unsigned *limit,
3096 scalar_stmts_to_slp_tree_map_t *bst_map,
3097 /* ??? We need stmt_info for group splitting. */
3098 stmt_vec_info stmt_info_)
3100 if (dump_enabled_p ())
3102 dump_printf_loc (MSG_NOTE, vect_location,
3103 "Starting SLP discovery for\n");
3104 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 " %G", scalar_stmts[i]->stmt);
3109 /* Build the tree for the SLP instance. */
3110 unsigned int group_size = scalar_stmts.length ();
3111 bool *matches = XALLOCAVEC (bool, group_size);
3112 poly_uint64 max_nunits = 1;
3113 unsigned tree_size = 0;
3114 unsigned i;
3115 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3116 &max_nunits, matches, limit,
3117 &tree_size, bst_map);
3118 if (node != NULL)
3120 /* Calculate the unrolling factor based on the smallest type. */
3121 poly_uint64 unrolling_factor
3122 = calculate_unrolling_factor (max_nunits, group_size);
3124 if (maybe_ne (unrolling_factor, 1U)
3125 && is_a <bb_vec_info> (vinfo))
3127 unsigned HOST_WIDE_INT const_max_nunits;
3128 if (!max_nunits.is_constant (&const_max_nunits)
3129 || const_max_nunits > group_size)
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "Build SLP failed: store group "
3134 "size not a multiple of the vector size "
3135 "in basic block SLP\n");
3136 vect_free_slp_tree (node);
3137 return false;
3139 /* Fatal mismatch. */
3140 if (dump_enabled_p ())
3141 dump_printf_loc (MSG_NOTE, vect_location,
3142 "SLP discovery succeeded but node needs "
3143 "splitting\n");
3144 memset (matches, true, group_size);
3145 matches[group_size / const_max_nunits * const_max_nunits] = false;
3146 vect_free_slp_tree (node);
3148 else
3150 /* Create a new SLP instance. */
3151 slp_instance new_instance = XNEW (class _slp_instance);
3152 SLP_INSTANCE_TREE (new_instance) = node;
3153 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3154 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3155 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3156 SLP_INSTANCE_KIND (new_instance) = kind;
3157 new_instance->reduc_phis = NULL;
3158 new_instance->cost_vec = vNULL;
3159 new_instance->subgraph_entries = vNULL;
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE, vect_location,
3163 "SLP size %u vs. limit %u.\n",
3164 tree_size, max_tree_size);
3166 /* Fixup SLP reduction chains. */
3167 if (kind == slp_inst_kind_reduc_chain)
3169 /* If this is a reduction chain with a conversion in front
3170 amend the SLP tree with a node for that. */
3171 gimple *scalar_def
3172 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3173 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3175 /* Get at the conversion stmt - we know it's the single use
3176 of the last stmt of the reduction chain. */
3177 use_operand_p use_p;
3178 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3179 &use_p, &scalar_def);
3180 gcc_assert (r);
3181 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3182 next_info = vect_stmt_to_vectorize (next_info);
3183 scalar_stmts = vNULL;
3184 scalar_stmts.create (group_size);
3185 for (unsigned i = 0; i < group_size; ++i)
3186 scalar_stmts.quick_push (next_info);
3187 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3188 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3189 SLP_TREE_CHILDREN (conv).quick_push (node);
3190 SLP_INSTANCE_TREE (new_instance) = conv;
3191 /* We also have to fake this conversion stmt as SLP reduction
3192 group so we don't have to mess with too much code
3193 elsewhere. */
3194 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3195 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3197 /* Fill the backedge child of the PHI SLP node. The
3198 general matching code cannot find it because the
3199 scalar code does not reflect how we vectorize the
3200 reduction. */
3201 use_operand_p use_p;
3202 imm_use_iterator imm_iter;
3203 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3204 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3205 gimple_get_lhs (scalar_def))
3206 /* There are exactly two non-debug uses, the reduction
3207 PHI and the loop-closed PHI node. */
3208 if (!is_gimple_debug (USE_STMT (use_p))
3209 && gimple_bb (USE_STMT (use_p)) == loop->header)
3211 auto_vec<stmt_vec_info, 64> phis (group_size);
3212 stmt_vec_info phi_info
3213 = vinfo->lookup_stmt (USE_STMT (use_p));
3214 for (unsigned i = 0; i < group_size; ++i)
3215 phis.quick_push (phi_info);
3216 slp_tree *phi_node = bst_map->get (phis);
3217 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3218 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3219 = SLP_INSTANCE_TREE (new_instance);
3220 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3224 vinfo->slp_instances.safe_push (new_instance);
3226 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3227 the number of scalar stmts in the root in a few places.
3228 Verify that assumption holds. */
3229 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3230 .length () == group_size);
3232 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_NOTE, vect_location,
3235 "Final SLP tree for instance %p:\n",
3236 (void *) new_instance);
3237 vect_print_slp_graph (MSG_NOTE, vect_location,
3238 SLP_INSTANCE_TREE (new_instance));
3241 return true;
3244 else
3246 /* Failed to SLP. */
3247 /* Free the allocated memory. */
3248 scalar_stmts.release ();
3251 stmt_vec_info stmt_info = stmt_info_;
3252 /* Try to break the group up into pieces. */
3253 if (kind == slp_inst_kind_store)
3255 /* ??? We could delay all the actual splitting of store-groups
3256 until after SLP discovery of the original group completed.
3257 Then we can recurse to vect_build_slp_instance directly. */
3258 for (i = 0; i < group_size; i++)
3259 if (!matches[i])
3260 break;
3262 /* For basic block SLP, try to break the group up into multiples of
3263 a vector size. */
3264 if (is_a <bb_vec_info> (vinfo)
3265 && (i > 1 && i < group_size))
3267 tree scalar_type
3268 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3269 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3270 1 << floor_log2 (i));
3271 unsigned HOST_WIDE_INT const_nunits;
3272 if (vectype
3273 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3275 /* Split into two groups at the first vector boundary. */
3276 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3277 unsigned group1_size = i & ~(const_nunits - 1);
3279 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE, vect_location,
3281 "Splitting SLP group at stmt %u\n", i);
3282 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3283 group1_size);
3284 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3285 kind, max_tree_size,
3286 limit);
3287 /* Split the rest at the failure point and possibly
3288 re-analyze the remaining matching part if it has
3289 at least two lanes. */
3290 if (group1_size < i
3291 && (i + 1 < group_size
3292 || i - group1_size > 1))
3294 stmt_vec_info rest2 = rest;
3295 rest = vect_split_slp_store_group (rest, i - group1_size);
3296 if (i - group1_size > 1)
3297 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3298 kind, max_tree_size,
3299 limit);
3301 /* Re-analyze the non-matching tail if it has at least
3302 two lanes. */
3303 if (i + 1 < group_size)
3304 res |= vect_analyze_slp_instance (vinfo, bst_map,
3305 rest, kind, max_tree_size,
3306 limit);
3307 return res;
3311 /* For loop vectorization split into arbitrary pieces of size > 1. */
3312 if (is_a <loop_vec_info> (vinfo)
3313 && (i > 1 && i < group_size)
3314 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3316 unsigned group1_size = i;
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_NOTE, vect_location,
3320 "Splitting SLP group at stmt %u\n", i);
3322 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3323 group1_size);
3324 /* Loop vectorization cannot handle gaps in stores, make sure
3325 the split group appears as strided. */
3326 STMT_VINFO_STRIDED_P (rest) = 1;
3327 DR_GROUP_GAP (rest) = 0;
3328 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3329 DR_GROUP_GAP (stmt_info) = 0;
3331 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3332 kind, max_tree_size, limit);
3333 if (i + 1 < group_size)
3334 res |= vect_analyze_slp_instance (vinfo, bst_map,
3335 rest, kind, max_tree_size, limit);
3337 return res;
3340 /* Even though the first vector did not all match, we might be able to SLP
3341 (some) of the remainder. FORNOW ignore this possibility. */
3344 /* Failed to SLP. */
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3347 return false;
3351 /* Analyze an SLP instance starting from a group of grouped stores. Call
3352 vect_build_slp_tree to build a tree of packed stmts if possible.
3353 Return FALSE if it's impossible to SLP any stmt in the loop. */
3355 static bool
3356 vect_analyze_slp_instance (vec_info *vinfo,
3357 scalar_stmts_to_slp_tree_map_t *bst_map,
3358 stmt_vec_info stmt_info,
3359 slp_instance_kind kind,
3360 unsigned max_tree_size, unsigned *limit)
3362 unsigned int i;
3363 vec<stmt_vec_info> scalar_stmts;
3365 if (is_a <bb_vec_info> (vinfo))
3366 vect_location = stmt_info->stmt;
3368 stmt_vec_info next_info = stmt_info;
3369 if (kind == slp_inst_kind_store)
3371 /* Collect the stores and store them in scalar_stmts. */
3372 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3373 while (next_info)
3375 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3376 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3379 else if (kind == slp_inst_kind_reduc_chain)
3381 /* Collect the reduction stmts and store them in scalar_stmts. */
3382 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3383 while (next_info)
3385 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3386 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3388 /* Mark the first element of the reduction chain as reduction to properly
3389 transform the node. In the reduction analysis phase only the last
3390 element of the chain is marked as reduction. */
3391 STMT_VINFO_DEF_TYPE (stmt_info)
3392 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3393 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3394 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3396 else if (kind == slp_inst_kind_ctor)
3398 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3399 tree val;
3400 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3401 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3403 stmt_vec_info def_info = vinfo->lookup_def (val);
3404 def_info = vect_stmt_to_vectorize (def_info);
3405 scalar_stmts.quick_push (def_info);
3407 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_NOTE, vect_location,
3409 "Analyzing vectorizable constructor: %G\n",
3410 stmt_info->stmt);
3412 else if (kind == slp_inst_kind_reduc_group)
3414 /* Collect reduction statements. */
3415 const vec<stmt_vec_info> &reductions
3416 = as_a <loop_vec_info> (vinfo)->reductions;
3417 scalar_stmts.create (reductions.length ());
3418 for (i = 0; reductions.iterate (i, &next_info); i++)
3419 if ((STMT_VINFO_RELEVANT_P (next_info)
3420 || STMT_VINFO_LIVE_P (next_info))
3421 /* ??? Make sure we didn't skip a conversion around a reduction
3422 path. In that case we'd have to reverse engineer that conversion
3423 stmt following the chain using reduc_idx and from the PHI
3424 using reduc_def. */
3425 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3426 scalar_stmts.quick_push (next_info);
3427 /* If less than two were relevant/live there's nothing to SLP. */
3428 if (scalar_stmts.length () < 2)
3429 return false;
3431 else
3432 gcc_unreachable ();
3434 vec<stmt_vec_info> roots = vNULL;
3435 if (kind == slp_inst_kind_ctor)
3437 roots.create (1);
3438 roots.quick_push (stmt_info);
3440 /* Build the tree for the SLP instance. */
3441 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3442 roots,
3443 max_tree_size, limit, bst_map,
3444 kind == slp_inst_kind_store
3445 ? stmt_info : NULL);
3446 if (!res)
3447 roots.release ();
3449 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3450 where we should do store group splitting. */
3452 return res;
3455 /* Check if there are stmts in the loop that can be vectorized using SLP.
3456 Build SLP trees of packed scalar stmts if SLP is possible. */
3458 opt_result
3459 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3461 unsigned int i;
3462 stmt_vec_info first_element;
3463 slp_instance instance;
3465 DUMP_VECT_SCOPE ("vect_analyze_slp");
3467 unsigned limit = max_tree_size;
3469 scalar_stmts_to_slp_tree_map_t *bst_map
3470 = new scalar_stmts_to_slp_tree_map_t ();
3472 /* Find SLP sequences starting from groups of grouped stores. */
3473 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3474 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3475 STMT_VINFO_GROUPED_ACCESS (first_element)
3476 ? slp_inst_kind_store : slp_inst_kind_ctor,
3477 max_tree_size, &limit);
3479 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3481 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3483 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3484 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3485 bb_vinfo->roots[i].stmts,
3486 bb_vinfo->roots[i].roots,
3487 max_tree_size, &limit, bst_map, NULL))
3489 bb_vinfo->roots[i].stmts = vNULL;
3490 bb_vinfo->roots[i].roots = vNULL;
3495 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3497 /* Find SLP sequences starting from reduction chains. */
3498 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3499 if (! STMT_VINFO_RELEVANT_P (first_element)
3500 && ! STMT_VINFO_LIVE_P (first_element))
3502 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3503 slp_inst_kind_reduc_chain,
3504 max_tree_size, &limit))
3506 /* Dissolve reduction chain group. */
3507 stmt_vec_info vinfo = first_element;
3508 stmt_vec_info last = NULL;
3509 while (vinfo)
3511 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3512 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3513 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3514 last = vinfo;
3515 vinfo = next;
3517 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3518 /* It can be still vectorized as part of an SLP reduction. */
3519 loop_vinfo->reductions.safe_push (last);
3522 /* Find SLP sequences starting from groups of reductions. */
3523 if (loop_vinfo->reductions.length () > 1)
3524 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3525 slp_inst_kind_reduc_group, max_tree_size,
3526 &limit);
3529 hash_set<slp_tree> visited_patterns;
3530 slp_tree_to_load_perm_map_t perm_cache;
3531 slp_compat_nodes_map_t compat_cache;
3533 /* See if any patterns can be found in the SLP tree. */
3534 bool pattern_found = false;
3535 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3536 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3537 &visited_patterns, &perm_cache,
3538 &compat_cache);
3540 /* If any were found optimize permutations of loads. */
3541 if (pattern_found)
3543 hash_map<slp_tree, slp_tree> load_map;
3544 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3546 slp_tree root = SLP_INSTANCE_TREE (instance);
3547 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3548 &load_map, root);
3554 /* The map keeps a reference on SLP nodes built, release that. */
3555 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3556 it != bst_map->end (); ++it)
3557 if ((*it).second)
3558 vect_free_slp_tree ((*it).second);
3559 delete bst_map;
3561 if (pattern_found && dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "Pattern matched SLP tree\n");
3565 hash_set<slp_tree> visited;
3566 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3567 vect_print_slp_graph (MSG_NOTE, vect_location,
3568 SLP_INSTANCE_TREE (instance), visited);
3571 return opt_result::success ();
3574 /* Estimates the cost of inserting layout changes into the SLP graph.
3575 It can also say that the insertion is impossible. */
3577 struct slpg_layout_cost
3579 slpg_layout_cost () = default;
3580 slpg_layout_cost (sreal, bool);
3582 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3583 bool is_possible () const { return depth != sreal::max (); }
3585 bool operator== (const slpg_layout_cost &) const;
3586 bool operator!= (const slpg_layout_cost &) const;
3588 bool is_better_than (const slpg_layout_cost &, bool) const;
3590 void add_parallel_cost (const slpg_layout_cost &);
3591 void add_serial_cost (const slpg_layout_cost &);
3592 void split (unsigned int);
3594 /* The longest sequence of layout changes needed during any traversal
3595 of the partition dag, weighted by execution frequency.
3597 This is the most important metric when optimizing for speed, since
3598 it helps to ensure that we keep the number of operations on
3599 critical paths to a minimum. */
3600 sreal depth = 0;
3602 /* An estimate of the total number of operations needed. It is weighted by
3603 execution frequency when optimizing for speed but not when optimizing for
3604 size. In order to avoid double-counting, a node with a fanout of N will
3605 distribute 1/N of its total cost to each successor.
3607 This is the most important metric when optimizing for size, since
3608 it helps to keep the total number of operations to a minimum. */
3609 sreal total = 0;
3612 /* Construct costs for a node with weight WEIGHT. A higher weight
3613 indicates more frequent execution. IS_FOR_SIZE is true if we are
3614 optimizing for size rather than speed. */
3616 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3617 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3621 bool
3622 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3624 return depth == other.depth && total == other.total;
3627 bool
3628 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3630 return !operator== (other);
3633 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3634 true if we are optimizing for size rather than speed. */
3636 bool
3637 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3638 bool is_for_size) const
3640 if (is_for_size)
3642 if (total != other.total)
3643 return total < other.total;
3644 return depth < other.depth;
3646 else
3648 if (depth != other.depth)
3649 return depth < other.depth;
3650 return total < other.total;
3654 /* Increase the costs to account for something with cost INPUT_COST
3655 happening in parallel with the current costs. */
3657 void
3658 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3660 depth = std::max (depth, input_cost.depth);
3661 total += input_cost.total;
3664 /* Increase the costs to account for something with cost INPUT_COST
3665 happening in series with the current costs. */
3667 void
3668 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3670 depth += other.depth;
3671 total += other.total;
3674 /* Split the total cost among TIMES successors or predecessors. */
3676 void
3677 slpg_layout_cost::split (unsigned int times)
3679 if (times > 1)
3680 total /= times;
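/* A small worked example of how these costs combine (values are arbitrary,
   for illustration only): with A = { depth 2, total 3 } and
   B = { depth 1, total 1 },

     A.add_serial_cost (B)   gives { depth 3, total 4 }  (2 + 1, 3 + 1)
     A.add_parallel_cost (B) gives { depth 2, total 4 }  (max (2, 1), 3 + 1)

   and calling split (2) on { depth 3, total 4 } halves only the total,
   giving { depth 3, total 2 }.  */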
3683 /* Information about one node in the SLP graph, for use during
3684 vect_optimize_slp_pass. */
3686 struct slpg_vertex
3688 slpg_vertex (slp_tree node_) : node (node_) {}
3690 /* The node itself. */
3691 slp_tree node;
3693 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3694 partitions are flexible; they can have whichever layout consumers
3695 want them to have. */
3696 int partition = -1;
3698 /* The number of nodes that directly use the result of this one
3699 (i.e. the number of nodes that count this one as a child). */
3700 unsigned int out_degree = 0;
3702 /* The execution frequency of the node. */
3703 sreal weight = 0;
3705 /* The total execution frequency of all nodes that directly use the
3706 result of this one. */
3707 sreal out_weight = 0;
3710 /* Information about one partition of the SLP graph, for use during
3711 vect_optimize_slp_pass. */
3713 struct slpg_partition_info
3715 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3716 of m_partitioned_nodes. */
3717 unsigned int node_begin = 0;
3718 unsigned int node_end = 0;
3720 /* Which layout we've chosen to use for this partition, or -1 if
3721 we haven't picked one yet. */
3722 int layout = -1;
3724 /* The number of predecessors and successors in the partition dag.
3725 The predecessors always have lower partition numbers and the
3726 successors always have higher partition numbers.
3728 Note that the directions of these edges are not necessarily the
3729 same as in the data flow graph. For example, if an SCC has separate
3730 partitions for an inner loop and an outer loop, the inner loop's
3731 partition will have at least two incoming edges from the outer loop's
3732 partition: one for a live-in value and one for a live-out value.
3733 In data flow terms, one of these edges would also be from the outer loop
3734 to the inner loop, but the other would be in the opposite direction. */
3735 unsigned int in_degree = 0;
3736 unsigned int out_degree = 0;
3739 /* Information about the costs of using a particular layout for a
3740 particular partition. It can also say that the combination is
3741 impossible. */
3743 struct slpg_partition_layout_costs
3745 bool is_possible () const { return internal_cost.is_possible (); }
3746 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3748 /* The costs inherited from predecessor partitions. */
3749 slpg_layout_cost in_cost;
3751 /* The inherent cost of the layout within the node itself. For example,
3752 this is nonzero for a load if choosing a particular layout would require
3753 the load to permute the loaded elements. It is nonzero for a
3754 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3755 to full-vector moves. */
3756 slpg_layout_cost internal_cost;
3758 /* The costs inherited from successor partitions. */
3759 slpg_layout_cost out_cost;
3762 /* This class tries to optimize the layout of vectors in order to avoid
3763 unnecessary shuffling. At the moment, the set of possible layouts is
3764 restricted to bijective permutations.
3766 The goal of the pass depends on whether we're optimizing for size or
3767 for speed. When optimizing for size, the goal is to reduce the overall
3768 number of layout changes (including layout changes implied by things
3769 like load permutations). When optimizing for speed, the goal is to
3770 reduce the maximum latency attributable to layout changes on any
3771 non-cyclical path through the data flow graph.
3773 For example, when optimizing a loop nest for speed, we will prefer
3774 to make layout changes outside of a loop rather than inside of a loop,
3775 and will prefer to make layout changes in parallel rather than serially,
3776 even if that increases the overall number of layout changes.
3778 The high-level procedure is:
3780 (1) Build a graph in which edges go from uses (parents) to definitions
3781 (children).
3783 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3785 (3) When optimizing for speed, partition the nodes in each SCC based
3786 on their containing cfg loop. When optimizing for size, treat
3787 each SCC as a single partition.
3789 This gives us a dag of partitions. The goal is now to assign a
3790 layout to each partition.
3792 (4) Construct a set of vector layouts that are worth considering.
3793 Record which nodes must keep their current layout.
3795 (5) Perform a forward walk over the partition dag (from loads to stores)
3796 accumulating the "forward" cost of using each layout. When visiting
3797 each partition, assign a tentative choice of layout to the partition
3798 and use that choice when calculating the cost of using a different
3799 layout in successor partitions.
3801 (6) Perform a backward walk over the partition dag (from stores to loads),
3802 accumulating the "backward" cost of using each layout. When visiting
3803 each partition, make a final choice of layout for that partition based
3804 on the accumulated forward costs (from (5)) and backward costs
3805 (from (6)).
3807 (7) Apply the chosen layouts to the SLP graph.
3809 For example, consider the SLP statements:
3811 S1: a_1 = load
3812 loop:
3813 S2: a_2 = PHI<a_1, a_3>
3814 S3: b_1 = load
3815 S4: a_3 = a_2 + b_1
3816 exit:
3817 S5: a_4 = PHI<a_3>
3818 S6: store a_4
3820 S2 and S4 form an SCC and are part of the same loop. Every other
3821 statement is in a singleton SCC. In this example there is a one-to-one
3822 mapping between SCCs and partitions and the partition dag looks like this:
3824       S1     S3
3825        \     /
3826         S2+S4
3827           |
3828          S5
3829           |
3830          S6
3832 S2, S3 and S4 will have a higher execution frequency than the other
3833 statements, so when optimizing for speed, the goal is to avoid any
3834 layout changes:
3836 - within S3
3837 - within S2+S4
3838 - on the S3->S2+S4 edge
3840 For example, if S3 was originally a reversing load, the goal of the
3841 pass is to make it an unreversed load and change the layout on the
3842 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3843 on S1->S2+S4 and S5->S6 would also be acceptable.)
3845 The difference between SCCs and partitions becomes important if we
3846 add an outer loop:
3848 S1: a_1 = ...
3849 loop1:
3850 S2: a_2 = PHI<a_1, a_6>
3851 S3: b_1 = load
3852 S4: a_3 = a_2 + b_1
3853 loop2:
3854 S5: a_4 = PHI<a_3, a_5>
3855 S6: c_1 = load
3856 S7: a_5 = a_4 + c_1
3857 exit2:
3858 S8: a_6 = PHI<a_5>
3859 S9: store a_6
3860 exit1:
3862 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3863 for speed, we usually do not want restrictions in the outer loop to "infect"
3864 the decision for the inner loop. For example, if an outer-loop node
3865 in the SCC contains a statement with a fixed layout, that should not
3866 prevent the inner loop from using a different layout. Conversely,
3867 the inner loop should not dictate a layout to the outer loop: if the
3868 outer loop does a lot of computation, then it may not be efficient to
3869 do all of that computation in the inner loop's preferred layout.
3871 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3872 and S5+S7 (inner). We also try to arrange partitions so that:
3874 - the partition for an outer loop comes before the partition for
3875 an inner loop
3877 - if a sibling loop A dominates a sibling loop B, A's partition
3878 comes before B's
3880 This gives the following partition dag for the example above:
3882       S1        S3
3883        \        /
3884         S2+S4+S8   S6
3885          |   \\    /
3886          |    S5+S7
3887          |
3888         S9
3890 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3891 one for a reversal of the edge S7->S8.
3893 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3894 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3895 preferred layout against the cost of changing the layout on entry to the
3896 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3898 Although this works well when optimizing for speed, it has the downside
3899 when optimizing for size that the choice of layout for S5+S7 is completely
3900 independent of S9, which lessens the chance of reducing the overall number
3901 of permutations. We therefore do not partition SCCs when optimizing
3902 for size.
3904 To give a concrete example of the difference between optimizing
3905 for size and speed, consider:
3907 a[0] = (b[1] << c[3]) - d[1];
3908 a[1] = (b[0] << c[2]) - d[0];
3909 a[2] = (b[3] << c[1]) - d[3];
3910 a[3] = (b[2] << c[0]) - d[2];
3912 There are three different layouts here: one for a, one for b and d,
3913 and one for c. When optimizing for speed it is better to permute each
3914 of b, c and d into the order required by a, since those permutations
3915 happen in parallel. But when optimizing for size, it is better to:
3917 - permute c into the same order as b
3918 - do the arithmetic
3919 - permute the result into the order required by a
3921 This gives 2 permutations rather than 3. */
3923 class vect_optimize_slp_pass
3925 public:
3926 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3927 void run ();
3929 private:
3930 /* Graph building. */
3931 struct loop *containing_loop (slp_tree);
3932 bool is_cfg_latch_edge (graph_edge *);
3933 void build_vertices (hash_set<slp_tree> &, slp_tree);
3934 void build_vertices ();
3935 void build_graph ();
3937 /* Partitioning. */
3938 void create_partitions ();
3939 template<typename T> void for_each_partition_edge (unsigned int, T);
3941 /* Layout selection. */
3942 bool is_compatible_layout (slp_tree, unsigned int);
3943 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3944 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3945 unsigned int);
3946 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3947 int, unsigned int);
3948 int internal_node_cost (slp_tree, int, unsigned int);
3949 void start_choosing_layouts ();
3951 /* Cost propagation. */
3952 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3953 unsigned int, unsigned int);
3954 slpg_layout_cost total_in_cost (unsigned int);
3955 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3956 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3957 void forward_pass ();
3958 void backward_pass ();
3960 /* Rematerialization. */
3961 slp_tree get_result_with_layout (slp_tree, unsigned int);
3962 void materialize ();
3964 /* Clean-up. */
3965 void remove_redundant_permutations ();
3967 void dump ();
3969 vec_info *m_vinfo;
3971 /* True if we should optimize the graph for size, false if we should
3972 optimize it for speed. (It wouldn't be easy to make this decision
3973 more locally.) */
3974 bool m_optimize_size;
3976 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3977 In other words, a node's predecessors are its slp_tree parents and
3978 a node's successors are its slp_tree children. */
3979 graph *m_slpg = nullptr;
3981 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
3982 auto_vec<slpg_vertex> m_vertices;
3984 /* The list of all leaves of M_SLPG, such as external definitions, constants,
3985 and loads. */
3986 auto_vec<int> m_leafs;
3988 /* This array has one entry for every vector layout that we're considering.
3989 Element 0 is null and indicates "no change". Other entries describe
3990 permutations that are inherent in the current graph and that we would
3991 like to reverse if possible.
3993 For example, a permutation { 1, 2, 3, 0 } means that something has
3994 effectively been permuted in that way, such as a load group
3995 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3996 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3997 in order to put things "back" in order. */
3998 auto_vec<vec<unsigned> > m_perms;
4000 /* A partitioning of the nodes for which a layout must be chosen.
4001 Each partition represents an <SCC, cfg loop> pair; that is,
4002 nodes in different SCCs belong to different partitions, and nodes
4003 within an SCC can be further partitioned according to a containing
4004 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4006 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4007 from leaves (such as loads) to roots (such as stores).
4009 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4010 auto_vec<slpg_partition_info> m_partitions;
4012 /* The list of all nodes for which a layout must be chosen. Nodes for
4013 partition P come before the nodes for partition P+1. Nodes within a
4014 partition are in reverse postorder. */
4015 auto_vec<unsigned int> m_partitioned_nodes;
4017 /* Index P * num-layouts + L contains the cost of using layout L
4018 for partition P. */
4019 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4021 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4022 original output of node N adjusted to have layout L. */
4023 auto_vec<slp_tree> m_node_layouts;
4026 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4027 Also record whether we should optimize anything for speed rather
4028 than size. */
4030 void
4031 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4032 slp_tree node)
4034 unsigned i;
4035 slp_tree child;
4037 if (visited.add (node))
4038 return;
4040 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4042 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4043 if (optimize_bb_for_speed_p (bb))
4044 m_optimize_size = false;
4047 node->vertex = m_vertices.length ();
4048 m_vertices.safe_push (slpg_vertex (node));
4050 bool leaf = true;
4051 bool force_leaf = false;
4052 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4053 if (child)
4055 leaf = false;
4056 build_vertices (visited, child);
4058 else
4059 force_leaf = true;
4060 /* Since SLP discovery works along use-def edges all cycles have an
4061 entry - but there's the exception of cycles where we do not handle
4062 the entry explicitly (but with a NULL SLP node), like some reductions
4063 and inductions. Force those SLP PHIs to act as leafs to make them
4064 backwards reachable. */
4065 if (leaf || force_leaf)
4066 m_leafs.safe_push (node->vertex);
4069 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4071 void
4072 vect_optimize_slp_pass::build_vertices ()
4074 hash_set<slp_tree> visited;
4075 unsigned i;
4076 slp_instance instance;
4077 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4078 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4081 /* Apply (reverse) bijective PERM to VEC. */
4083 template <class T>
4084 static void
4085 vect_slp_permute (vec<unsigned> perm,
4086 vec<T> &vec, bool reverse)
4088 auto_vec<T, 64> saved;
4089 saved.create (vec.length ());
4090 for (unsigned i = 0; i < vec.length (); ++i)
4091 saved.quick_push (vec[i]);
4093 if (reverse)
4095 for (unsigned i = 0; i < vec.length (); ++i)
4096 vec[perm[i]] = saved[i];
4097 for (unsigned i = 0; i < vec.length (); ++i)
4098 gcc_assert (vec[perm[i]] == saved[i]);
4100 else
4102 for (unsigned i = 0; i < vec.length (); ++i)
4103 vec[i] = saved[perm[i]];
4104 for (unsigned i = 0; i < vec.length (); ++i)
4105 gcc_assert (vec[i] == saved[perm[i]]);
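/* For illustration (not part of the implementation): with
   perm = { 1, 2, 3, 0 } and vec = { a, b, c, d },

     vect_slp_permute (perm, vec, false) turns vec into { b, c, d, a }
       (element I of the result is the old element PERM[I]), whereas
     vect_slp_permute (perm, vec, true)  turns vec into { d, a, b, c }
       (old element I moves to position PERM[I], undoing the above).  */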
4109 /* Return the cfg loop that contains NODE. */
4111 struct loop *
4112 vect_optimize_slp_pass::containing_loop (slp_tree node)
4114 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4115 if (!rep)
4116 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4117 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4120 /* Return true if UD (an edge from a use to a definition) is associated
4121 with a loop latch edge in the cfg. */
4123 bool
4124 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4126 slp_tree use = m_vertices[ud->src].node;
4127 slp_tree def = m_vertices[ud->dest].node;
4128 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4129 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4130 return false;
4132 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4133 return (is_a<gphi *> (use_rep->stmt)
4134 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4135 && containing_loop (def) == containing_loop (use));
4138 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4139 a nonnull data field. */
4141 void
4142 vect_optimize_slp_pass::build_graph ()
4144 m_optimize_size = true;
4145 build_vertices ();
4147 m_slpg = new_graph (m_vertices.length ());
4148 for (slpg_vertex &v : m_vertices)
4149 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4150 if (child)
4152 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4153 if (is_cfg_latch_edge (ud))
4154 ud->data = this;
4158 /* Return true if E corresponds to a loop latch edge in the cfg. */
4160 static bool
4161 skip_cfg_latch_edges (graph_edge *e)
4163 return e->data;
4166 /* Create the node partitions. */
4168 void
4169 vect_optimize_slp_pass::create_partitions ()
4171 /* Calculate a postorder of the graph, ignoring edges that correspond
4172 to natural latch edges in the cfg. Reading the vector from the end
4173 to the beginning gives the reverse postorder. */
4174 auto_vec<int> initial_rpo;
4175 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4176 false, NULL, skip_cfg_latch_edges);
4177 gcc_assert (initial_rpo.length () == m_vertices.length ());
4179 /* Calculate the strongly connected components of the graph. */
4180 auto_vec<int> scc_grouping;
4181 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4183 /* Create a new index order in which all nodes from the same SCC are
4184 consecutive. Use scc_pos to record the index of the first node in
4185 each SCC. */
4186 auto_vec<unsigned int> scc_pos (num_sccs);
4187 int last_component = -1;
4188 unsigned int node_count = 0;
4189 for (unsigned int node_i : scc_grouping)
4191 if (last_component != m_slpg->vertices[node_i].component)
4193 last_component = m_slpg->vertices[node_i].component;
4194 gcc_assert (last_component == int (scc_pos.length ()));
4195 scc_pos.quick_push (node_count);
4197 node_count += 1;
4199 gcc_assert (node_count == initial_rpo.length ()
4200 && last_component + 1 == int (num_sccs));
4202 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4203 inside each SCC following the RPO we calculated above. The fact that
4204 we ignored natural latch edges when calculating the RPO should ensure
4205 that, for natural loop nests:
4207 - the first node that we encounter in a cfg loop is the loop header phi
4208 - the loop header phis are in dominance order
4210 Arranging for this is an optimization (see below) rather than a
4211 correctness issue. Unnatural loops with a tangled mess of backedges
4212 will still work correctly, but might give poorer results.
4214 Also update scc_pos so that it gives 1 + the index of the last node
4215 in the SCC. */
4216 m_partitioned_nodes.safe_grow (node_count);
4217 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4219 unsigned int node_i = initial_rpo[old_i];
4220 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4221 m_partitioned_nodes[new_i] = node_i;
4224 /* When optimizing for speed, partition each SCC based on the containing
4225 cfg loop. The order we constructed above should ensure that, for natural
4226 cfg loops, we'll create sub-SCC partitions for outer loops before
4227 the corresponding sub-SCC partitions for inner loops. Similarly,
4228 when one sibling loop A dominates another sibling loop B, we should
4229 create a sub-SCC partition for A before a sub-SCC partition for B.
4231 As above, nothing depends for correctness on whether this achieves
4232 a natural nesting, but we should get better results when it does. */
4233 m_partitions.reserve (m_vertices.length ());
4234 unsigned int next_partition_i = 0;
4235 hash_map<struct loop *, int> loop_partitions;
4236 unsigned int rpo_begin = 0;
4237 unsigned int num_partitioned_nodes = 0;
4238 for (unsigned int rpo_end : scc_pos)
4240 loop_partitions.empty ();
4241 unsigned int partition_i = next_partition_i;
4242 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4244 /* Handle externals and constants optimistically throughout.
4245 But treat existing vectors as fixed since we do not handle
4246 permuting them. */
4247 unsigned int node_i = m_partitioned_nodes[rpo_i];
4248 auto &vertex = m_vertices[node_i];
4249 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4250 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4251 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4252 vertex.partition = -1;
4253 else
4255 bool existed;
4256 if (m_optimize_size)
4257 existed = next_partition_i > partition_i;
4258 else
4260 struct loop *loop = containing_loop (vertex.node);
4261 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4262 if (!existed)
4263 entry = next_partition_i;
4264 partition_i = entry;
4266 if (!existed)
4268 m_partitions.quick_push (slpg_partition_info ());
4269 next_partition_i += 1;
4271 vertex.partition = partition_i;
4272 num_partitioned_nodes += 1;
4273 m_partitions[partition_i].node_end += 1;
4276 rpo_begin = rpo_end;
4279 /* Assign ranges of consecutive node indices to each partition,
4280 in partition order. Start with node_end being the same as
4281 node_begin so that the next loop can use it as a counter. */
4282 unsigned int node_begin = 0;
4283 for (auto &partition : m_partitions)
4285 partition.node_begin = node_begin;
4286 node_begin += partition.node_end;
4287 partition.node_end = partition.node_begin;
4289 gcc_assert (node_begin == num_partitioned_nodes);
4291 /* Finally build the list of nodes in partition order. */
4292 m_partitioned_nodes.truncate (num_partitioned_nodes);
4293 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4295 int partition_i = m_vertices[node_i].partition;
4296 if (partition_i >= 0)
4298 unsigned int order_i = m_partitions[partition_i].node_end++;
4299 m_partitioned_nodes[order_i] = node_i;
4304 /* Look for edges from earlier partitions into node NODE_I and edges from
4305 node NODE_I into later partitions. Call:
4307 FN (ud, other_node_i)
4309 for each such use-to-def edge ud, where other_node_i is the node at the
4310 other end of the edge. */
4312 template<typename T>
4313 void
4314 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4316 int partition_i = m_vertices[node_i].partition;
4317 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4318 pred; pred = pred->pred_next)
4320 int src_partition_i = m_vertices[pred->src].partition;
4321 if (src_partition_i >= 0 && src_partition_i != partition_i)
4322 fn (pred, pred->src);
4324 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4325 succ; succ = succ->succ_next)
4327 int dest_partition_i = m_vertices[succ->dest].partition;
4328 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4329 fn (succ, succ->dest);
4333 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4334 that NODE would operate on. This test is independent of NODE's actual
4335 operation. */
4337 bool
4338 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4339 unsigned int layout_i)
4341 if (layout_i == 0)
4342 return true;
4344 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4345 return false;
4347 return true;
4350 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4351 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4352 layouts is incompatible with NODE or if the change is not possible for
4353 some other reason.
4355 The properties taken from NODE include the number of lanes and the
4356 vector type. The actual operation doesn't matter. */
4359 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4360 unsigned int from_layout_i,
4361 unsigned int to_layout_i)
4363 if (!is_compatible_layout (node, from_layout_i)
4364 || !is_compatible_layout (node, to_layout_i))
4365 return -1;
4367 if (from_layout_i == to_layout_i)
4368 return 0;
4370 auto_vec<slp_tree, 1> children (1);
4371 children.quick_push (node);
4372 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4373 if (from_layout_i > 0)
4374 for (unsigned int i : m_perms[from_layout_i])
4375 perm.quick_push ({ 0, i });
4376 else
4377 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4378 perm.quick_push ({ 0, i });
4379 if (to_layout_i > 0)
4380 vect_slp_permute (m_perms[to_layout_i], perm, true);
4381 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4382 children, false);
4383 if (count >= 0)
4384 return MAX (count, 1);
4386 /* ??? In principle we could try changing via layout 0, giving two
4387 layout changes rather than 1. Doing that would require
4388 corresponding support in get_result_with_layout. */
4389 return -1;
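/* For illustration (not part of the implementation): if
   m_perms[FROM_LAYOUT_I] = { 1, 0, 3, 2 } and
   m_perms[TO_LAYOUT_I] = { 2, 3, 0, 1 }, the single-input lane
   permutation built above is { 3, 2, 1, 0 }, and the returned cost is
   the number of vector permutes that vectorizable_slp_permutation_1
   estimates for it (clamped to at least 1).  */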
4392 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4394 inline slpg_partition_layout_costs &
4395 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4396 unsigned int layout_i)
4398 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4401 /* Change PERM in one of two ways:
4403 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4404 chosen for child I of NODE.
4406 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4408 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4410 void
4411 vect_optimize_slp_pass::
4412 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4413 int in_layout_i, unsigned int out_layout_i)
4415 for (auto &entry : perm)
4417 int this_in_layout_i = in_layout_i;
4418 if (this_in_layout_i < 0)
4420 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4421 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4422 this_in_layout_i = m_partitions[in_partition_i].layout;
4424 if (this_in_layout_i > 0)
4425 entry.second = m_perms[this_in_layout_i][entry.second];
4427 if (out_layout_i > 0)
4428 vect_slp_permute (m_perms[out_layout_i], perm, true);
4431 /* Check whether the target allows NODE to be rearranged so that the node's
4432 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4433 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4435 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4436 NODE can adapt to the layout changes that have (perhaps provisionally)
4437 been chosen for NODE's children, so that no extra permutations are
4438 needed on either the input or the output of NODE.
4440 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4441 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4443 IN_LAYOUT_I has no meaning for other types of node.
4445 Keeping the node as-is is always valid. If the target doesn't appear
4446 to support the node as-is, but might realistically support other layouts,
4447 then layout 0 instead has the cost of a worst-case permutation. On the
4448 one hand, this ensures that every node has at least one valid layout,
4449 avoiding what would otherwise be an awkward special case. On the other,
4450 it still encourages the pass to change an invalid pre-existing layout
4451 choice into a valid one. */
4454 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4455 unsigned int out_layout_i)
4457 const int fallback_cost = 1;
4459 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4461 auto_lane_permutation_t tmp_perm;
4462 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4464 /* Check that the child nodes support the chosen layout. Checking
4465 the first child is enough, since any second child would have the
4466 same shape. */
4467 auto first_child = SLP_TREE_CHILDREN (node)[0];
4468 if (in_layout_i > 0
4469 && !is_compatible_layout (first_child, in_layout_i))
4470 return -1;
4472 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4473 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4474 node, tmp_perm,
4475 SLP_TREE_CHILDREN (node),
4476 false);
4477 if (count < 0)
4479 if (in_layout_i == 0 && out_layout_i == 0)
4481 /* Use the fallback cost if the node could in principle support
4482 some nonzero layout for both the inputs and the outputs.
4483 Otherwise assume that the node will be rejected later
4484 and rebuilt from scalars. */
4485 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4486 return fallback_cost;
4487 return 0;
4489 return -1;
4492 /* We currently have no way of telling whether the new layout is cheaper
4493 or more expensive than the old one. But at least in principle,
4494 it should be worth making zero permutations (whole-vector shuffles)
4495 cheaper than real permutations, in case the pass is able to remove
4496 the latter. */
4497 return count == 0 ? 0 : 1;
4500 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4501 if (rep
4502 && STMT_VINFO_DATA_REF (rep)
4503 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4504 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4506 auto_load_permutation_t tmp_perm;
4507 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4508 if (out_layout_i > 0)
4509 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4511 poly_uint64 vf = 1;
4512 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4513 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4514 unsigned int n_perms;
4515 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4516 nullptr, vf, true, false, &n_perms))
4518 auto rep = SLP_TREE_REPRESENTATIVE (node);
4519 if (out_layout_i == 0)
4521 /* Use the fallback cost if the load is an N-to-N permutation.
4522 Otherwise assume that the node will be rejected later
4523 and rebuilt from scalars. */
4524 if (STMT_VINFO_GROUPED_ACCESS (rep)
4525 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4526 == SLP_TREE_LANES (node)))
4527 return fallback_cost;
4528 return 0;
4530 return -1;
4533 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4534 return n_perms == 0 ? 0 : 1;
4537 return 0;
4540 /* Decide which element layouts we should consider using. Calculate the
4541 weights associated with inserting layout changes on partition edges.
4542 Also mark partitions that cannot change layout, by setting their
4543 layout to zero. */
4545 void
4546 vect_optimize_slp_pass::start_choosing_layouts ()
4548 /* Used to assign unique permutation indices. */
4549 using perm_hash = unbounded_hashmap_traits<
4550 vec_free_hash_base<int_hash_base<unsigned>>,
4551 int_hash<int, -1, -2>
4553 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4555 /* Layout 0 is "no change". */
4556 m_perms.safe_push (vNULL);
4558 /* Create layouts from existing permutations. */
4559 auto_load_permutation_t tmp_perm;
4560 for (unsigned int node_i : m_partitioned_nodes)
4562 /* Leafs also double as entries to the reverse graph. Allow the
4563 layout of those to be changed. */
4564 auto &vertex = m_vertices[node_i];
4565 auto &partition = m_partitions[vertex.partition];
4566 if (!m_slpg->vertices[node_i].succ)
4567 partition.layout = 0;
4569 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4570 slp_tree node = vertex.node;
4571 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4572 slp_tree child;
4573 unsigned HOST_WIDE_INT imin, imax = 0;
4574 bool any_permute = false;
4575 tmp_perm.truncate (0);
4576 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4578 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4579 unpermuted, record a layout that reverses this permutation.
4581 We would need more work to cope with loads that are internally
4582 permuted and also have inputs (such as masks for
4583 IFN_MASK_LOADs). */
4584 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4585 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4586 continue;
4587 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4588 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4589 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4591 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4592 && SLP_TREE_CHILDREN (node).length () == 1
4593 && (child = SLP_TREE_CHILDREN (node)[0])
4594 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4595 .is_constant (&imin)))
4597 /* If the child has the same vector size as this node,
4598 reversing the permutation can make the permutation a no-op.
4599 In other cases it can change a true permutation into a
4600 full-vector extract. */
4601 tmp_perm.reserve (SLP_TREE_LANES (node));
4602 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4603 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4605 else
4606 continue;
4608 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4610 unsigned idx = tmp_perm[j];
4611 imin = MIN (imin, idx);
4612 imax = MAX (imax, idx);
4613 if (idx - tmp_perm[0] != j)
4614 any_permute = true;
4616 /* If the span doesn't match we'd disrupt VF computation; avoid
4617 that for now. */
4618 if (imax - imin + 1 != SLP_TREE_LANES (node))
4619 continue;
4620 /* If there's no permute no need to split one out. In this case
4621 we can consider turning a load into a permuted load, if that
4622 turns out to be cheaper than alternatives. */
4623 if (!any_permute)
4625 partition.layout = -1;
4626 continue;
4629 /* For now only handle true permutes, like
4630 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4631 when permuting constants and invariants, keeping the permute
4632 bijective. */
4633 auto_sbitmap load_index (SLP_TREE_LANES (node));
4634 bitmap_clear (load_index);
4635 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4636 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4637 unsigned j;
4638 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4639 if (!bitmap_bit_p (load_index, j))
4640 break;
4641 if (j != SLP_TREE_LANES (node))
4642 continue;
4644 vec<unsigned> perm = vNULL;
4645 perm.safe_grow (SLP_TREE_LANES (node), true);
4646 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4647 perm[j] = tmp_perm[j] - imin;
4649 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4651 /* Continue to use existing layouts, but don't add any more. */
4652 int *entry = layout_ids.get (perm);
4653 partition.layout = entry ? *entry : 0;
4654 perm.release ();
4656 else
4658 bool existed;
4659 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4660 if (existed)
4661 perm.release ();
4662 else
4664 layout_i = m_perms.length ();
4665 m_perms.safe_push (perm);
4667 partition.layout = layout_i;
4671 /* Initially assume that every layout is possible and has zero cost
4672 in every partition. */
4673 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4674 * m_perms.length ());
4676 /* We have to mark as to-be-materialized the outgoing permutations facing
4677 non-reduction graph entries that are not explicitly represented. */
4678 for (slp_instance instance : m_vinfo->slp_instances)
4679 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4681 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4682 m_partitions[m_vertices[node_i].partition].layout = 0;
4685 /* Check which layouts each node and partition can handle. Calculate the
4686 weights associated with inserting layout changes on edges. */
4687 for (unsigned int node_i : m_partitioned_nodes)
4689 auto &vertex = m_vertices[node_i];
4690 auto &partition = m_partitions[vertex.partition];
4691 slp_tree node = vertex.node;
4693 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4695 vertex.weight = vect_slp_node_weight (node);
4697 /* We do not handle stores with a permutation, so all
4698 incoming permutations must have been materialized.
4700 We also don't handle masked grouped loads, which lack a
4701 permutation vector. In this case the memory locations
4702 form an implicit second input to the loads, on top of the
4703 explicit mask input, and the memory input's layout cannot
4704 be changed.
4706 On the other hand, we do support permuting gather loads and
4707 masked gather loads, where each scalar load is independent
4708 of the others. This can be useful if the address/index input
4709 benefits from permutation. */
4710 if (STMT_VINFO_DATA_REF (rep)
4711 && STMT_VINFO_GROUPED_ACCESS (rep)
4712 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4713 partition.layout = 0;
4715 /* We cannot change the layout of an operation whose lanes are
4716 not independent of each other. Note this is an explicit
4717 negative list since that's much shorter than the respective
4718 positive one, but it's critical to keep maintaining it. */
4719 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4720 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4722 case CFN_COMPLEX_ADD_ROT90:
4723 case CFN_COMPLEX_ADD_ROT270:
4724 case CFN_COMPLEX_MUL:
4725 case CFN_COMPLEX_MUL_CONJ:
4726 case CFN_VEC_ADDSUB:
4727 case CFN_VEC_FMADDSUB:
4728 case CFN_VEC_FMSUBADD:
4729 partition.layout = 0;
4730 default:;
4734 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4736 auto &other_vertex = m_vertices[other_node_i];
4738 /* Count the number of edges from earlier partitions and the number
4739 of edges to later partitions. */
4740 if (other_vertex.partition < vertex.partition)
4741 partition.in_degree += 1;
4742 else
4743 partition.out_degree += 1;
4745 /* If the current node uses the result of OTHER_NODE_I, accumulate
4746 the effects of that. */
4747 if (ud->src == int (node_i))
4749 other_vertex.out_weight += vertex.weight;
4750 other_vertex.out_degree += 1;
4753 for_each_partition_edge (node_i, process_edge);
4757 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4758 its current (provisional) choice of layout. The inputs do not necessarily
4759 have the same layout as each other. */
4761 slpg_layout_cost
4762 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4764 auto &vertex = m_vertices[node_i];
4765 slpg_layout_cost cost;
4766 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4768 auto &other_vertex = m_vertices[other_node_i];
4769 if (other_vertex.partition < vertex.partition)
4771 auto &other_partition = m_partitions[other_vertex.partition];
4772 auto &other_costs = partition_layout_costs (other_vertex.partition,
4773 other_partition.layout);
4774 slpg_layout_cost this_cost = other_costs.in_cost;
4775 this_cost.add_serial_cost (other_costs.internal_cost);
4776 this_cost.split (other_partition.out_degree);
4777 cost.add_parallel_cost (this_cost);
4780 for_each_partition_edge (node_i, add_cost);
4781 return cost;
4784 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4785 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4786 slpg_layout_cost::impossible () if the change isn't possible. */
4788 slpg_layout_cost
4789 vect_optimize_slp_pass::
4790 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4791 unsigned int layout2_i)
4793 auto &def_vertex = m_vertices[ud->dest];
4794 auto &use_vertex = m_vertices[ud->src];
4795 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4796 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4797 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4798 use_layout_i);
4799 if (factor < 0)
4800 return slpg_layout_cost::impossible ();
4802 /* We have a choice of putting the layout change at the site of the
4803 definition or at the site of the use. Prefer the former when
4804 optimizing for size or when the execution frequency of the
4805 definition is no greater than the combined execution frequencies of
4806 the uses. When putting the layout change at the site of the definition,
4807 divvy up the cost among all consumers. */
4808 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4810 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4811 cost.split (def_vertex.out_degree);
4812 return cost;
4814 return { use_vertex.weight * factor, m_optimize_size };
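/* For illustration (not part of the implementation): when optimizing for
   speed with FACTOR = 1, a definition of weight 1 whose three uses have a
   combined weight of 30 gets the change at the definition site, costing
   { depth 1, total 1 } split three ways, i.e. { depth 1, total 1/3 }.
   If instead the definition has weight 10 and its single use has weight 1,
   the change is placed at the use, costing { depth 1, total 1 }.  */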
4817 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4818 partition; FROM_NODE_I could be the definition node or the use node.
4819 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4820 Return the cost of any necessary fix-ups on edge UD, or return
4821 slpg_layout_cost::impossible () if the change isn't possible.
4823 At this point, FROM_NODE_I's partition has chosen the cheapest
4824 layout based on the information available so far, but this choice
4825 is only provisional. */
4827 slpg_layout_cost
4828 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4829 unsigned int to_layout_i)
4831 auto &from_vertex = m_vertices[from_node_i];
4832 unsigned int from_partition_i = from_vertex.partition;
4833 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4834 gcc_assert (from_partition.layout >= 0);
4836 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4837 with its current layout preference. */
4838 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4839 auto edge_cost = edge_layout_cost (ud, from_node_i,
4840 from_partition.layout, to_layout_i);
4841 if (edge_cost.is_possible ())
4843 auto &from_costs = partition_layout_costs (from_partition_i,
4844 from_partition.layout);
4845 cost = from_costs.in_cost;
4846 cost.add_serial_cost (from_costs.internal_cost);
4847 cost.split (from_partition.out_degree);
4848 cost.add_serial_cost (edge_cost);
4851 /* Take the minimum of that cost and the cost that applies if
4852 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4853 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4854 to_layout_i);
4855 if (direct_layout_costs.is_possible ())
4857 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4858 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4859 direct_cost.split (from_partition.out_degree);
4860 if (!cost.is_possible ()
4861 || direct_cost.is_better_than (cost, m_optimize_size))
4862 cost = direct_cost;
4865 return cost;
4868 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4869 partition; TO_NODE_I could be the definition node or the use node.
4870 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4871 return the cost of any necessary fix-ups on edge UD, or
4872 slpg_layout_cost::impossible () if the choice cannot be made.
4874 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4876 slpg_layout_cost
4877 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4878 unsigned int from_layout_i)
4880 auto &to_vertex = m_vertices[to_node_i];
4881 unsigned int to_partition_i = to_vertex.partition;
4882 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4883 gcc_assert (to_partition.layout >= 0);
4885 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4886 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4887 any other inputs keep their current choice of layout. */
4888 auto &to_costs = partition_layout_costs (to_partition_i,
4889 to_partition.layout);
4890 if (ud->src == int (to_node_i)
4891 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4893 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4894 auto old_layout = from_partition.layout;
4895 from_partition.layout = from_layout_i;
4896 int factor = internal_node_cost (to_vertex.node, -1,
4897 to_partition.layout);
4898 from_partition.layout = old_layout;
4899 if (factor >= 0)
4901 slpg_layout_cost cost = to_costs.out_cost;
4902 cost.add_serial_cost ({ to_vertex.weight * factor,
4903 m_optimize_size });
4904 cost.split (to_partition.in_degree);
4905 return cost;
4909 /* Compute the cost if we insert any necessary layout change on edge UD. */
4910 auto edge_cost = edge_layout_cost (ud, to_node_i,
4911 to_partition.layout, from_layout_i);
4912 if (edge_cost.is_possible ())
4914 slpg_layout_cost cost = to_costs.out_cost;
4915 cost.add_serial_cost (to_costs.internal_cost);
4916 cost.split (to_partition.in_degree);
4917 cost.add_serial_cost (edge_cost);
4918 return cost;
4921 return slpg_layout_cost::impossible ();
4924 /* Make a forward pass through the partitions, accumulating input costs.
4925 Make a tentative (provisional) choice of layout for each partition,
4926 ensuring that this choice still allows later partitions to keep
4927 their original layout. */
4929 void
4930 vect_optimize_slp_pass::forward_pass ()
4932 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4933 ++partition_i)
4935 auto &partition = m_partitions[partition_i];
4937 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4938 the incoming cost that would apply if every predecessor partition
4939 keeps its current layout. This is used within the loop below. */
4940 slpg_layout_cost in_cost;
4941 slp_tree single_node = nullptr;
4942 if (partition.node_end == partition.node_begin + 1)
4944 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4945 single_node = m_vertices[node_i].node;
4946 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4947 in_cost = total_in_cost (node_i);
4950 /* Go through the possible layouts. Decide which ones are valid
4951 for this partition and record which of the valid layouts has
4952 the lowest cost. */
4953 unsigned int min_layout_i = 0;
4954 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4955 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4957 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4958 if (!layout_costs.is_possible ())
4959 continue;
4961 /* If the recorded layout is already 0 then the layout cannot
4962 change. */
4963 if (partition.layout == 0 && layout_i != 0)
4965 layout_costs.mark_impossible ();
4966 continue;
4969 bool is_possible = true;
4970 for (unsigned int order_i = partition.node_begin;
4971 order_i < partition.node_end; ++order_i)
4973 unsigned int node_i = m_partitioned_nodes[order_i];
4974 auto &vertex = m_vertices[node_i];
4976 /* Reject the layout if it is individually incompatible
4977 with any node in the partition. */
4978 if (!is_compatible_layout (vertex.node, layout_i))
4980 is_possible = false;
4981 break;
4984 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4986 auto &other_vertex = m_vertices[other_node_i];
4987 if (other_vertex.partition < vertex.partition)
4989 /* Accumulate the incoming costs from earlier
4990 partitions, plus the cost of any layout changes
4991 on UD itself. */
4992 auto cost = forward_cost (ud, other_node_i, layout_i);
4993 if (!cost.is_possible ())
4994 is_possible = false;
4995 else
4996 layout_costs.in_cost.add_parallel_cost (cost);
4998 else
4999 /* Reject the layout if it would make layout 0 impossible
5000 for later partitions. This amounts to testing that the
5001 target supports reversing the layout change on edges
5002 to later partitions.
5004 In principle, it might be possible to push a layout
5005 change all the way down a graph, so that it never
5006 needs to be reversed and so that the target doesn't
5007 need to support the reverse operation. But it would
5008 be awkward to bail out if we hit a partition that
5009 does not support the new layout, especially since
5010 we are not dealing with a lattice. */
5011 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5012 layout_i).is_possible ();
5014 for_each_partition_edge (node_i, add_cost);
5016 /* Accumulate the cost of using LAYOUT_I within NODE,
5017 both for the inputs and the outputs. */
5018 int factor = internal_node_cost (vertex.node, layout_i,
5019 layout_i);
5020 if (factor < 0)
5022 is_possible = false;
5023 break;
5025 else if (factor)
5026 layout_costs.internal_cost.add_serial_cost
5027 ({ vertex.weight * factor, m_optimize_size });
5029 if (!is_possible)
5031 layout_costs.mark_impossible ();
5032 continue;
5035 /* Combine the incoming and partition-internal costs. */
5036 slpg_layout_cost combined_cost = layout_costs.in_cost;
5037 combined_cost.add_serial_cost (layout_costs.internal_cost);
5039 /* If this partition consists of a single VEC_PERM_EXPR, see
5040 if the VEC_PERM_EXPR can be changed to support output layout
5041 LAYOUT_I while keeping all the provisional choices of input
5042 layout. */
5043 if (single_node
5044 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5046 int factor = internal_node_cost (single_node, -1, layout_i);
5047 if (factor >= 0)
5049 auto weight = m_vertices[single_node->vertex].weight;
5050 slpg_layout_cost internal_cost
5051 = { weight * factor, m_optimize_size };
5053 slpg_layout_cost alt_cost = in_cost;
5054 alt_cost.add_serial_cost (internal_cost);
5055 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5057 combined_cost = alt_cost;
5058 layout_costs.in_cost = in_cost;
5059 layout_costs.internal_cost = internal_cost;
5064 /* Record the layout with the lowest cost. Prefer layout 0 in
5065 the event of a tie between it and another layout. */
5066 if (!min_layout_cost.is_possible ()
5067 || combined_cost.is_better_than (min_layout_cost,
5068 m_optimize_size))
5070 min_layout_i = layout_i;
5071 min_layout_cost = combined_cost;
5075 /* This loop's handling of earlier partitions should ensure that
5076 choosing the original layout for the current partition is no
5077 less valid than it was in the original graph, even with the
5078 provisional layout choices for those earlier partitions. */
5079 gcc_assert (min_layout_cost.is_possible ());
5080 partition.layout = min_layout_i;
5084 /* Make a backward pass through the partitions, accumulating output costs.
5085 Make a final choice of layout for each partition. */
5087 void
5088 vect_optimize_slp_pass::backward_pass ()
5090 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5092 auto &partition = m_partitions[partition_i];
5094 unsigned int min_layout_i = 0;
5095 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5096 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5098 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5099 if (!layout_costs.is_possible ())
5100 continue;
5102 /* Accumulate the costs from successor partitions. */
5103 bool is_possible = true;
5104 for (unsigned int order_i = partition.node_begin;
5105 order_i < partition.node_end; ++order_i)
5107 unsigned int node_i = m_partitioned_nodes[order_i];
5108 auto &vertex = m_vertices[node_i];
5109 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5111 auto &other_vertex = m_vertices[other_node_i];
5112 auto &other_partition = m_partitions[other_vertex.partition];
5113 if (other_vertex.partition > vertex.partition)
5115 /* Accumulate the incoming costs from later
5116 partitions, plus the cost of any layout changes
5117 on UD itself. */
5118 auto cost = backward_cost (ud, other_node_i, layout_i);
5119 if (!cost.is_possible ())
5120 is_possible = false;
5121 else
5122 layout_costs.out_cost.add_parallel_cost (cost);
5124 else
5125 /* Make sure that earlier partitions can (if necessary
5126 or beneficial) keep the layout that they chose in
5127 the forward pass. This ensures that there is at
5128 least one valid choice of layout. */
5129 is_possible &= edge_layout_cost (ud, other_node_i,
5130 other_partition.layout,
5131 layout_i).is_possible ();
5133 for_each_partition_edge (node_i, add_cost);
5135 if (!is_possible)
5137 layout_costs.mark_impossible ();
5138 continue;
5141 /* Locally combine the costs from the forward and backward passes.
5142 (This combined cost is not passed on, since that would lead
5143 to double counting.) */
5144 slpg_layout_cost combined_cost = layout_costs.in_cost;
5145 combined_cost.add_serial_cost (layout_costs.internal_cost);
5146 combined_cost.add_serial_cost (layout_costs.out_cost);
5148 /* Record the layout with the lowest cost. Prefer layout 0 in
5149 the event of a tie between it and another layout. */
5150 if (!min_layout_cost.is_possible ()
5151 || combined_cost.is_better_than (min_layout_cost,
5152 m_optimize_size))
5154 min_layout_i = layout_i;
5155 min_layout_cost = combined_cost;
5159 gcc_assert (min_layout_cost.is_possible ());
5160 partition.layout = min_layout_i;
5164 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5165 NODE already has the layout that was selected for its partition. */
5167 slp_tree
5168 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5169 unsigned int to_layout_i)
5171 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5172 slp_tree result = m_node_layouts[result_i];
5173 if (result)
5174 return result;
5176 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5177 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5179 /* If the vector is uniform or unchanged, there's nothing to do. */
5180 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5181 result = node;
5182 else
5184 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5185 result = vect_create_new_slp_node (scalar_ops);
5186 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5189 else
5191 unsigned int partition_i = m_vertices[node->vertex].partition;
5192 unsigned int from_layout_i = m_partitions[partition_i].layout;
5193 if (from_layout_i == to_layout_i)
5194 return node;
5196 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5197 permutation instead of a serial one. Leave the new permutation
5198 in TMP_PERM on success. */
5199 auto_lane_permutation_t tmp_perm;
5200 unsigned int num_inputs = 1;
5201 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5203 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5204 if (from_layout_i != 0)
5205 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5206 if (to_layout_i != 0)
5207 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5208 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5209 tmp_perm,
5210 SLP_TREE_CHILDREN (node),
5211 false) >= 0)
5212 num_inputs = SLP_TREE_CHILDREN (node).length ();
5213 else
5214 tmp_perm.truncate (0);
5217 if (dump_enabled_p ())
5219 if (tmp_perm.length () > 0)
5220 dump_printf_loc (MSG_NOTE, vect_location,
5221 "duplicating permutation node %p with"
5222 " layout %d\n",
5223 (void *) node, to_layout_i);
5224 else
5225 dump_printf_loc (MSG_NOTE, vect_location,
5226 "inserting permutation node in place of %p\n",
5227 (void *) node);
5230 unsigned int num_lanes = SLP_TREE_LANES (node);
5231 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5232 if (SLP_TREE_SCALAR_STMTS (node).length ())
5234 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5235 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5236 if (from_layout_i != 0)
5237 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5238 if (to_layout_i != 0)
5239 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5241 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5242 SLP_TREE_LANES (result) = num_lanes;
5243 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5244 result->vertex = -1;
5246 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5247 if (tmp_perm.length ())
5249 lane_perm.safe_splice (tmp_perm);
5250 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5252 else
5254 lane_perm.create (num_lanes);
5255 for (unsigned j = 0; j < num_lanes; ++j)
5256 lane_perm.quick_push ({ 0, j });
5257 if (from_layout_i != 0)
5258 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5259 if (to_layout_i != 0)
5260 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5261 SLP_TREE_CHILDREN (result).safe_push (node);
5263 for (slp_tree child : SLP_TREE_CHILDREN (result))
5264 child->refcnt++;
5266 m_node_layouts[result_i] = result;
5267 return result;
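/* Note on the cache used above (illustrative arithmetic only): M_NODE_LAYOUTS
   is a flattened N x P matrix, indexed as node->vertex * m_perms.length ()
   + to_layout_i.  For example, with P == 3 layouts, asking for layout 2 of
   the node at vertex 5 uses slot 5 * 3 + 2 == 17; materialize () below grows
   the array to N * P entries before the lookups happen.  */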
5270 /* Apply the chosen vector layouts to the SLP graph. */
5272 void
5273 vect_optimize_slp_pass::materialize ()
5275 /* We no longer need the costs, so avoid having two O(N * P) arrays
5276 live at the same time. */
5277 m_partition_layout_costs.release ();
5278 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5280 auto_sbitmap fully_folded (m_vertices.length ());
5281 bitmap_clear (fully_folded);
5282 for (unsigned int node_i : m_partitioned_nodes)
5284 auto &vertex = m_vertices[node_i];
5285 slp_tree node = vertex.node;
5286 int layout_i = m_partitions[vertex.partition].layout;
5287 gcc_assert (layout_i >= 0);
5289 /* Rearrange the scalar statements to match the chosen layout. */
5290 if (layout_i > 0)
5291 vect_slp_permute (m_perms[layout_i],
5292 SLP_TREE_SCALAR_STMTS (node), true);
5294 /* Update load and lane permutations. */
5295 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5297 /* First try to absorb the input vector layouts. If that fails,
5298 force the inputs to have layout LAYOUT_I too. We checked that
5299 that was possible before deciding to use nonzero output layouts.
5300 (Note that at this stage we don't really have any guarantee that
5301 the target supports the original VEC_PERM_EXPR.) */
5302 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5303 auto_lane_permutation_t tmp_perm;
5304 tmp_perm.safe_splice (perm);
5305 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5306 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5307 tmp_perm,
5308 SLP_TREE_CHILDREN (node),
5309 false) >= 0)
5311 if (dump_enabled_p ()
5312 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5313 perm.begin ()))
5314 dump_printf_loc (MSG_NOTE, vect_location,
5315 "absorbing input layouts into %p\n",
5316 (void *) node);
5317 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5318 bitmap_set_bit (fully_folded, node_i);
5320 else
5322 /* Not MSG_MISSED because it would make no sense to users. */
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_NOTE, vect_location,
5325 "failed to absorb input layouts into %p\n",
5326 (void *) node);
5327 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5330 else
5332 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5333 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5334 if (layout_i > 0)
5335 /* ??? When we handle non-bijective permutes the idea
5336 is that we can force the load-permutation to be
5337 { min, min + 1, min + 2, ... max }. But then the
5338 scalar defs might no longer match the lane content
5339 which means wrong-code with live lane vectorization.
5340 So we possibly have to have NULL entries for those. */
5341 vect_slp_permute (m_perms[layout_i], load_perm, true);
5345 /* Do this before any nodes disappear, since it involves a walk
5346 over the leaves. */
5347 remove_redundant_permutations ();
5349 /* Replace each child with a correctly laid-out version. */
5350 for (unsigned int node_i : m_partitioned_nodes)
5352 /* Skip nodes that have already been handled above. */
5353 if (bitmap_bit_p (fully_folded, node_i))
5354 continue;
5356 auto &vertex = m_vertices[node_i];
5357 int in_layout_i = m_partitions[vertex.partition].layout;
5358 gcc_assert (in_layout_i >= 0);
5360 unsigned j;
5361 slp_tree child;
5362 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5364 if (!child)
5365 continue;
5367 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5368 if (new_child != child)
5370 vect_free_slp_tree (child);
5371 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5372 new_child->refcnt += 1;
5378 /* Elide load permutations that are not necessary. Such permutations might
5379 be pre-existing, rather than created by the layout optimizations. */
5381 void
5382 vect_optimize_slp_pass::remove_redundant_permutations ()
5384 for (unsigned int node_i : m_leafs)
5386 slp_tree node = m_vertices[node_i].node;
5387 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5388 continue;
5390 /* In basic block vectorization we allow any subchain of an interleaving
5391 chain.
5392 FORNOW: not in loop SLP because of realignment complications. */
5393 if (is_a <bb_vec_info> (m_vinfo))
5395 bool subchain_p = true;
5396 stmt_vec_info next_load_info = NULL;
5397 stmt_vec_info load_info;
5398 unsigned j;
5399 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5401 if (j != 0
5402 && (next_load_info != load_info
5403 || DR_GROUP_GAP (load_info) != 1))
5405 subchain_p = false;
5406 break;
5408 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5410 if (subchain_p)
5412 SLP_TREE_LOAD_PERMUTATION (node).release ();
5413 continue;
5416 else
5418 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5419 stmt_vec_info load_info;
5420 bool this_load_permuted = false;
5421 unsigned j;
5422 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5423 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5425 this_load_permuted = true;
5426 break;
5428 stmt_vec_info first_stmt_info
5429 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5430 if (!this_load_permuted
5431 /* The load requires permutation when unrolling exposes
5432 a gap either because the group is larger than the SLP
5433 group-size or because there is a gap between the groups. */
5434 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5435 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5436 && DR_GROUP_GAP (first_stmt_info) == 0)))
5438 SLP_TREE_LOAD_PERMUTATION (node).release ();
5439 continue;
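/* Example (illustrative): in a basic block, a node whose scalar stmts load
   a[1] and a[2] out of an interleaving group a[0..3] forms a subchain of
   consecutive group members with gap 1, so its load permutation can be
   dropped.  In a loop, a load permutation that is already the identity is
   only dropped when unrolling exposes no gap, i.e. when the vectorization
   factor is 1 or the node covers the whole group and the group has no gap.  */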
5445 /* Print the partition graph and layout information to the dump file. */
5447 void
5448 vect_optimize_slp_pass::dump ()
5450 dump_printf_loc (MSG_NOTE, vect_location,
5451 "SLP optimize permutations:\n");
5452 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5454 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5455 const char *sep = "";
5456 for (unsigned int idx : m_perms[layout_i])
5458 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5459 sep = ", ";
5461 dump_printf (MSG_NOTE, " }\n");
5463 dump_printf_loc (MSG_NOTE, vect_location,
5464 "SLP optimize partitions:\n");
5465 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5466 ++partition_i)
5468 auto &partition = m_partitions[partition_i];
5469 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5470 dump_printf_loc (MSG_NOTE, vect_location,
5471 " partition %d (layout %d):\n",
5472 partition_i, partition.layout);
5473 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5474 for (unsigned int order_i = partition.node_begin;
5475 order_i < partition.node_end; ++order_i)
5477 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5478 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5479 (void *) vertex.node);
5480 dump_printf_loc (MSG_NOTE, vect_location,
5481 " weight: %f\n",
5482 vertex.weight.to_double ());
5483 if (vertex.out_degree)
5484 dump_printf_loc (MSG_NOTE, vect_location,
5485 " out weight: %f (degree %d)\n",
5486 vertex.out_weight.to_double (),
5487 vertex.out_degree);
5488 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5489 dump_printf_loc (MSG_NOTE, vect_location,
5490 " op: VEC_PERM_EXPR\n");
5491 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5492 dump_printf_loc (MSG_NOTE, vect_location,
5493 " op template: %G", rep->stmt);
5495 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5496 for (unsigned int order_i = partition.node_begin;
5497 order_i < partition.node_end; ++order_i)
5499 unsigned int node_i = m_partitioned_nodes[order_i];
5500 auto &vertex = m_vertices[node_i];
5501 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5503 auto &other_vertex = m_vertices[other_node_i];
5504 if (other_vertex.partition < vertex.partition)
5505 dump_printf_loc (MSG_NOTE, vect_location,
5506 " - %p [%d] --> %p\n",
5507 (void *) other_vertex.node,
5508 other_vertex.partition,
5509 (void *) vertex.node);
5510 else
5511 dump_printf_loc (MSG_NOTE, vect_location,
5512 " - %p --> [%d] %p\n",
5513 (void *) vertex.node,
5514 other_vertex.partition,
5515 (void *) other_vertex.node);
5517 for_each_partition_edge (node_i, print_edge);
5520 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5522 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5523 if (layout_costs.is_possible ())
5525 dump_printf_loc (MSG_NOTE, vect_location,
5526 " layout %d:%s\n", layout_i,
5527 partition.layout == int (layout_i)
5528 ? " (*)" : "");
5529 slpg_layout_cost combined_cost = layout_costs.in_cost;
5530 combined_cost.add_serial_cost (layout_costs.internal_cost);
5531 combined_cost.add_serial_cost (layout_costs.out_cost);
5532 #define TEMPLATE "{depth: %f, total: %f}"
5533 dump_printf_loc (MSG_NOTE, vect_location,
5534 " " TEMPLATE "\n",
5535 layout_costs.in_cost.depth.to_double (),
5536 layout_costs.in_cost.total.to_double ());
5537 dump_printf_loc (MSG_NOTE, vect_location,
5538 " + " TEMPLATE "\n",
5539 layout_costs.internal_cost.depth.to_double (),
5540 layout_costs.internal_cost.total.to_double ());
5541 dump_printf_loc (MSG_NOTE, vect_location,
5542 " + " TEMPLATE "\n",
5543 layout_costs.out_cost.depth.to_double (),
5544 layout_costs.out_cost.total.to_double ());
5545 dump_printf_loc (MSG_NOTE, vect_location,
5546 " = " TEMPLATE "\n",
5547 combined_cost.depth.to_double (),
5548 combined_cost.total.to_double ());
5549 #undef TEMPLATE
5551 else
5552 dump_printf_loc (MSG_NOTE, vect_location,
5553 " layout %d: rejected\n", layout_i);
5558 /* Main entry point for the SLP graph optimization pass. */
5560 void
5561 vect_optimize_slp_pass::run ()
5563 build_graph ();
5564 create_partitions ();
5565 start_choosing_layouts ();
5566 if (m_perms.length () > 1)
5568 forward_pass ();
5569 backward_pass ();
5570 if (dump_enabled_p ())
5571 dump ();
5572 materialize ();
5573 while (!m_perms.is_empty ())
5574 m_perms.pop ().release ();
5576 else
5577 remove_redundant_permutations ();
5578 free_graph (m_slpg);
5581 /* Optimize the SLP graph of VINFO. */
5583 void
5584 vect_optimize_slp (vec_info *vinfo)
5586 if (vinfo->slp_instances.is_empty ())
5587 return;
5588 vect_optimize_slp_pass (vinfo).run ();
5591 /* Gather loads reachable from the individual SLP graph entries. */
5593 void
5594 vect_gather_slp_loads (vec_info *vinfo)
5596 unsigned i;
5597 slp_instance instance;
5598 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5600 hash_set<slp_tree> visited;
5601 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5602 SLP_INSTANCE_TREE (instance), visited);
5607 /* For each possible SLP instance decide whether to SLP it and calculate the
5608 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
5609 to SLP at least one instance. */
5611 bool
5612 vect_make_slp_decision (loop_vec_info loop_vinfo)
5614 unsigned int i;
5615 poly_uint64 unrolling_factor = 1;
5616 const vec<slp_instance> &slp_instances
5617 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5618 slp_instance instance;
5619 int decided_to_slp = 0;
5621 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5623 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5625 /* FORNOW: SLP if you can. */
5626 /* All unroll factors have the form:
5628 GET_MODE_SIZE (vinfo->vector_mode) * X
5630 for some rational X, so they must have a common multiple. */
5631 unrolling_factor
5632 = force_common_multiple (unrolling_factor,
5633 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5635 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5636 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5637 loop-based vectorization. Such stmts will be marked as HYBRID. */
5638 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5639 decided_to_slp++;
5642 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5644 if (decided_to_slp && dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "Decided to SLP %d instances. Unrolling factor ",
5648 decided_to_slp);
5649 dump_dec (MSG_NOTE, unrolling_factor);
5650 dump_printf (MSG_NOTE, "\n");
5653 return (decided_to_slp > 0);
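/* Example (illustrative, hypothetical numbers): if one SLP instance needs an
   unrolling factor of 2 and another needs 8, force_common_multiple yields 8
   and LOOP_VINFO_SLP_UNROLLING_FACTOR becomes 8.  Because every factor has
   the form GET_MODE_SIZE (vector_mode) * X for some rational X, such a
   common multiple exists even for variable-length vector modes.  */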
5656 /* Private data for vect_detect_hybrid_slp. */
5657 struct vdhs_data
5659 loop_vec_info loop_vinfo;
5660 vec<stmt_vec_info> *worklist;
5663 /* Walker for walk_gimple_op. */
5665 static tree
5666 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5668 walk_stmt_info *wi = (walk_stmt_info *)data;
5669 vdhs_data *dat = (vdhs_data *)wi->info;
5671 if (wi->is_lhs)
5672 return NULL_TREE;
5674 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5675 if (!def_stmt_info)
5676 return NULL_TREE;
5677 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5678 if (PURE_SLP_STMT (def_stmt_info))
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5682 def_stmt_info->stmt);
5683 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5684 dat->worklist->safe_push (def_stmt_info);
5687 return NULL_TREE;
5690 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5691 if so, otherwise push it to WORKLIST. */
5693 static void
5694 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5695 vec<stmt_vec_info> &worklist,
5696 stmt_vec_info stmt_info)
5698 if (dump_enabled_p ())
5699 dump_printf_loc (MSG_NOTE, vect_location,
5700 "Processing hybrid candidate : %G", stmt_info->stmt);
5701 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5702 imm_use_iterator iter2;
5703 ssa_op_iter iter1;
5704 use_operand_p use_p;
5705 def_operand_p def_p;
5706 bool any_def = false;
5707 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5709 any_def = true;
5710 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5712 if (is_gimple_debug (USE_STMT (use_p)))
5713 continue;
5714 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5715 /* An out-of-loop use means this is a loop_vect sink. */
5716 if (!use_info)
5718 if (dump_enabled_p ())
5719 dump_printf_loc (MSG_NOTE, vect_location,
5720 "Found loop_vect sink: %G", stmt_info->stmt);
5721 worklist.safe_push (stmt_info);
5722 return;
5724 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_NOTE, vect_location,
5728 "Found loop_vect use: %G", use_info->stmt);
5729 worklist.safe_push (stmt_info);
5730 return;
5734 /* No def means this is a loop_vect sink. */
5735 if (!any_def)
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_NOTE, vect_location,
5739 "Found loop_vect sink: %G", stmt_info->stmt);
5740 worklist.safe_push (stmt_info);
5741 return;
5743 if (dump_enabled_p ())
5744 dump_printf_loc (MSG_NOTE, vect_location,
5745 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5746 STMT_SLP_TYPE (stmt_info) = pure_slp;
5749 /* Find stmts that must be both vectorized and SLPed. */
5751 void
5752 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5754 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5756 /* All stmts participating in SLP are marked pure_slp, all other
5757 stmts are loop_vect.
5758 First collect all loop_vect stmts into a worklist.
5759 SLP patterns cause not all original scalar stmts to appear in
5760 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5761 Rectify this here and do a backward walk over the IL, considering
5762 stmts as loop_vect only when they are used by a loop_vect stmt, and
5763 otherwise marking them as pure_slp. */
5764 auto_vec<stmt_vec_info> worklist;
5765 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5767 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5768 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5769 gsi_next (&gsi))
5771 gphi *phi = gsi.phi ();
5772 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5773 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5774 maybe_push_to_hybrid_worklist (loop_vinfo,
5775 worklist, stmt_info);
5777 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5778 gsi_prev (&gsi))
5780 gimple *stmt = gsi_stmt (gsi);
5781 if (is_gimple_debug (stmt))
5782 continue;
5783 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5784 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5786 for (gimple_stmt_iterator gsi2
5787 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5788 !gsi_end_p (gsi2); gsi_next (&gsi2))
5790 stmt_vec_info patt_info
5791 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5792 if (!STMT_SLP_TYPE (patt_info)
5793 && STMT_VINFO_RELEVANT (patt_info))
5794 maybe_push_to_hybrid_worklist (loop_vinfo,
5795 worklist, patt_info);
5797 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5799 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5800 maybe_push_to_hybrid_worklist (loop_vinfo,
5801 worklist, stmt_info);
5805 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5806 mark any SLP vectorized stmt as hybrid.
5807 ??? We're visiting def stmts N times (once for each non-SLP and
5808 once for each hybrid-SLP use). */
5809 walk_stmt_info wi;
5810 vdhs_data dat;
5811 dat.worklist = &worklist;
5812 dat.loop_vinfo = loop_vinfo;
5813 memset (&wi, 0, sizeof (wi));
5814 wi.info = (void *)&dat;
5815 while (!worklist.is_empty ())
5817 stmt_vec_info stmt_info = worklist.pop ();
5818 /* Since SSA operands are not set up for pattern stmts we need
5819 to use walk_gimple_op. */
5820 wi.is_lhs = 0;
5821 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5822 /* For gather/scatter make sure to walk the offset operand, that
5823 can be a scaling and conversion away. */
5824 gather_scatter_info gs_info;
5825 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5826 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5828 int dummy;
5829 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
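/* Example (illustrative GIMPLE): assume _1 = a[i_2] * 3 is part of an SLP
   instance (pure_slp) while the reduction sum_5 = _1 + sum_4 is only handled
   by loop-based vectorization.  The reduction stmt is collected as loop_vect
   by the walk above, and following its operands reaches the pure_slp
   definition of _1, which is marked hybrid and pushed to the worklist:
   it must be vectorized both as part of the SLP graph and by the loop
   vectorizer so that the non-SLP use has a vector definition.  */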
5835 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5837 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5838 : vec_info (vec_info::bb, shared),
5839 bbs (_bbs),
5840 roots (vNULL)
5842 for (unsigned i = 0; i < bbs.length (); ++i)
5844 if (i != 0)
5845 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5846 gsi_next (&si))
5848 gphi *phi = si.phi ();
5849 gimple_set_uid (phi, 0);
5850 add_stmt (phi);
5852 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5853 !gsi_end_p (gsi); gsi_next (&gsi))
5855 gimple *stmt = gsi_stmt (gsi);
5856 gimple_set_uid (stmt, 0);
5857 if (is_gimple_debug (stmt))
5858 continue;
5859 add_stmt (stmt);
5865 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5866 stmts in the basic block. */
5868 _bb_vec_info::~_bb_vec_info ()
5870 /* Reset region marker. */
5871 for (unsigned i = 0; i < bbs.length (); ++i)
5873 if (i != 0)
5874 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5875 gsi_next (&si))
5877 gphi *phi = si.phi ();
5878 gimple_set_uid (phi, -1);
5880 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5881 !gsi_end_p (gsi); gsi_next (&gsi))
5883 gimple *stmt = gsi_stmt (gsi);
5884 gimple_set_uid (stmt, -1);
5888 for (unsigned i = 0; i < roots.length (); ++i)
5890 roots[i].stmts.release ();
5891 roots[i].roots.release ();
5893 roots.release ();
5896 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5897 given that child nodes have already been processed, and that
5898 their def types currently match their SLP node's def type. */
5900 static bool
5901 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5902 slp_instance node_instance,
5903 stmt_vector_for_cost *cost_vec)
5905 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5907 /* Calculate the number of vector statements to be created for the
5908 scalar stmts in this node. For SLP reductions it is equal to the
5909 number of vector statements in the children (which has already been
5910 calculated by the recursive call). Otherwise it is the number of
5911 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5912 VF divided by the number of elements in a vector. */
5913 if (!STMT_VINFO_DATA_REF (stmt_info)
5914 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5916 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5917 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5919 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5920 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5921 break;
5924 else
5926 poly_uint64 vf;
5927 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5928 vf = loop_vinfo->vectorization_factor;
5929 else
5930 vf = 1;
5931 unsigned int group_size = SLP_TREE_LANES (node);
5932 tree vectype = SLP_TREE_VECTYPE (node);
5933 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5934 = vect_get_num_vectors (vf * group_size, vectype);
5937 /* Handle purely internal nodes. */
5938 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5940 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5941 return false;
5943 stmt_vec_info slp_stmt_info;
5944 unsigned int i;
5945 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5947 if (STMT_VINFO_LIVE_P (slp_stmt_info)
5948 && !vectorizable_live_operation (vinfo,
5949 slp_stmt_info, NULL, node,
5950 node_instance, i,
5951 false, cost_vec))
5952 return false;
5954 return true;
5957 bool dummy;
5958 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5959 node, node_instance, cost_vec);
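/* Worked example for the non-reduction path above (hypothetical numbers):
   with a loop vectorization factor of 4, an SLP node of 2 lanes and a V4SI
   vector type, vect_get_num_vectors (4 * 2, V4SI) gives 8 / 4 == 2, so two
   vector statements are created for the node per vectorized loop iteration.  */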
5962 /* Try to build NODE from scalars, returning true on success.
5963 NODE_INSTANCE is the SLP instance that contains NODE. */
5965 static bool
5966 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5967 slp_instance node_instance)
5969 stmt_vec_info stmt_info;
5970 unsigned int i;
5972 if (!is_a <bb_vec_info> (vinfo)
5973 || node == SLP_INSTANCE_TREE (node_instance)
5974 || !SLP_TREE_SCALAR_STMTS (node).exists ()
5975 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5976 /* Force the mask use to be built from scalars instead. */
5977 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5978 return false;
5980 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE, vect_location,
5982 "Building vector operands of %p from scalars instead\n",
5983 (void *) node);
5985 /* Don't remove and free the child nodes here, since they could be
5986 referenced by other structures. The analysis and scheduling phases
5987 (need to) ignore child nodes of anything that isn't vect_internal_def. */
5988 unsigned int group_size = SLP_TREE_LANES (node);
5989 SLP_TREE_DEF_TYPE (node) = vect_external_def;
5990 /* Invariants get their vector type from the uses. */
5991 SLP_TREE_VECTYPE (node) = NULL_TREE;
5992 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5993 SLP_TREE_LOAD_PERMUTATION (node).release ();
5994 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5996 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5997 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5999 return true;
6002 /* Return true if all elements of the slice are the same. */
6003 bool
6004 vect_scalar_ops_slice::all_same_p () const
6006 for (unsigned int i = 1; i < length; ++i)
6007 if (!operand_equal_p (op (0), op (i)))
6008 return false;
6009 return true;
6012 hashval_t
6013 vect_scalar_ops_slice_hash::hash (const value_type &s)
6015 hashval_t hash = 0;
6016 for (unsigned i = 0; i < s.length; ++i)
6017 hash = iterative_hash_expr (s.op (i), hash);
6018 return hash;
6021 bool
6022 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6023 const compare_type &s2)
6025 if (s1.length != s2.length)
6026 return false;
6027 for (unsigned i = 0; i < s1.length; ++i)
6028 if (!operand_equal_p (s1.op (i), s2.op (i)))
6029 return false;
6030 return true;
6033 /* Compute the prologue cost for invariant or constant operands represented
6034 by NODE. */
6036 static void
6037 vect_prologue_cost_for_slp (slp_tree node,
6038 stmt_vector_for_cost *cost_vec)
6040 /* There's a special case of an existing vector, that costs nothing. */
6041 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6042 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6043 return;
6044 /* Without looking at the actual initializer a vector of
6045 constants can be implemented as a load from the constant pool.
6046 When all elements are the same we can use a splat. */
6047 tree vectype = SLP_TREE_VECTYPE (node);
6048 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6049 unsigned HOST_WIDE_INT const_nunits;
6050 unsigned nelt_limit;
6051 auto ops = &SLP_TREE_SCALAR_OPS (node);
6052 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6053 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6054 && ! multiple_p (const_nunits, group_size))
6056 nelt_limit = const_nunits;
6057 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6058 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6059 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6060 starts.quick_push (i * const_nunits);
6062 else
6064 /* If either the vector has variable length or the vectors
6065 are composed of repeated whole groups we only need to
6066 cost construction once. All vectors will be the same. */
6067 nelt_limit = group_size;
6068 starts.quick_push (0);
6070 /* ??? We're just tracking whether vectors in a single node are the same.
6071 Ideally we'd do something more global. */
6072 bool passed = false;
6073 for (unsigned int start : starts)
6075 vect_cost_for_stmt kind;
6076 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6077 kind = vector_load;
6078 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6079 kind = scalar_to_vec;
6080 else
6081 kind = vec_construct;
6082 /* The target cost hook has no idea which part of the SLP node
6083 we are costing so avoid passing it down more than once. Pass
6084 it to the first vec_construct or scalar_to_vec part since for those
6085 the x86 backend tries to account for GPR to XMM register moves. */
6086 record_stmt_cost (cost_vec, 1, kind,
6087 (kind != vector_load && !passed) ? node : nullptr,
6088 vectype, 0, vect_prologue);
6089 if (kind != vector_load)
6090 passed = true;
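/* Example (illustrative): with group_size == 2, scalar ops {x, y} and a V4SI
   vector type, nunits (4) is a multiple of the group size, so every vector
   built is {x, y, x, y} and a single construction is costed.  For an
   external node with x == y the slice is all_same_p () and the cheaper
   scalar_to_vec (splat) is used; constant nodes are costed as a vector_load
   from the constant pool instead.  When nunits is not a multiple of the
   group size, each distinct slice of nunits elements is hashed so identical
   vectors are still only costed once.  */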
6094 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6095 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6097 Return true if the operations are supported. */
6099 static bool
6100 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6101 slp_instance node_instance,
6102 hash_set<slp_tree> &visited_set,
6103 vec<slp_tree> &visited_vec,
6104 stmt_vector_for_cost *cost_vec)
6106 int i, j;
6107 slp_tree child;
6109 /* Assume we can code-generate all invariants. */
6110 if (!node
6111 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6112 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6113 return true;
6115 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6117 if (dump_enabled_p ())
6118 dump_printf_loc (MSG_NOTE, vect_location,
6119 "Failed cyclic SLP reference in %p\n", (void *) node);
6120 return false;
6122 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6124 /* If we already analyzed the exact same set of scalar stmts we're done.
6125 We share the generated vector stmts for those. */
6126 if (visited_set.add (node))
6127 return true;
6128 visited_vec.safe_push (node);
6130 bool res = true;
6131 unsigned visited_rec_start = visited_vec.length ();
6132 unsigned cost_vec_rec_start = cost_vec->length ();
6133 bool seen_non_constant_child = false;
6134 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6136 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6137 visited_set, visited_vec,
6138 cost_vec);
6139 if (!res)
6140 break;
6141 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6142 seen_non_constant_child = true;
6144 /* We're having difficulties scheduling nodes with just constant
6145 operands and no scalar stmts since we then cannot compute a stmt
6146 insertion place. */
6147 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_NOTE, vect_location,
6151 "Cannot vectorize all-constant op node %p\n",
6152 (void *) node);
6153 res = false;
6156 if (res)
6157 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6158 cost_vec);
6159 /* If analysis failed we have to pop all recursive visited nodes
6160 plus ourselves. */
6161 if (!res)
6163 while (visited_vec.length () >= visited_rec_start)
6164 visited_set.remove (visited_vec.pop ());
6165 cost_vec->truncate (cost_vec_rec_start);
6168 /* When the node can be vectorized, cost invariant nodes it references.
6169 This is not done in DFS order to allow the referring node's
6170 vectorizable_* calls to nail down the invariant nodes' vector type
6171 and possibly unshare it if it needs a different vector type than
6172 other referrers. */
6173 if (res)
6174 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6175 if (child
6176 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6177 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6178 /* Perform usual caching, note code-generation still
6179 code-gens these nodes multiple times but we expect
6180 to CSE them later. */
6181 && !visited_set.add (child))
6183 visited_vec.safe_push (child);
6184 /* ??? After auditing more code paths make a "default"
6185 and push the vector type from NODE to all children
6186 if it is not already set. */
6187 /* Compute the number of vectors to be generated. */
6188 tree vector_type = SLP_TREE_VECTYPE (child);
6189 if (!vector_type)
6191 /* For shifts with a scalar argument we don't need
6192 to cost or code-generate anything.
6193 ??? Represent this more explicitly. */
6194 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6195 == shift_vec_info_type)
6196 && j == 1);
6197 continue;
6199 unsigned group_size = SLP_TREE_LANES (child);
6200 poly_uint64 vf = 1;
6201 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6202 vf = loop_vinfo->vectorization_factor;
6203 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6204 = vect_get_num_vectors (vf * group_size, vector_type);
6205 /* And cost them. */
6206 vect_prologue_cost_for_slp (child, cost_vec);
6209 /* If this node or any of its children can't be vectorized, try pruning
6210 the tree here rather than felling the whole thing. */
6211 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6213 /* We'll need to revisit this for invariant costing and number
6214 of vectorized stmt setting. */
6215 res = true;
6218 return res;
6221 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6222 region and that can be vectorized using vectorizable_live_operation
6223 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
6224 scalar code computing it to be retained. */
6226 static void
6227 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6228 slp_instance instance,
6229 stmt_vector_for_cost *cost_vec,
6230 hash_set<stmt_vec_info> &svisited,
6231 hash_set<slp_tree> &visited)
6233 if (visited.add (node))
6234 return;
6236 unsigned i;
6237 stmt_vec_info stmt_info;
6238 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6239 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6241 if (svisited.contains (stmt_info))
6242 continue;
6243 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6244 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6245 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6246 /* Only the pattern root stmt computes the original scalar value. */
6247 continue;
6248 bool mark_visited = true;
6249 gimple *orig_stmt = orig_stmt_info->stmt;
6250 ssa_op_iter op_iter;
6251 def_operand_p def_p;
6252 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6254 imm_use_iterator use_iter;
6255 gimple *use_stmt;
6256 stmt_vec_info use_stmt_info;
6257 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6258 if (!is_gimple_debug (use_stmt))
6260 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6261 if (!use_stmt_info
6262 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6264 STMT_VINFO_LIVE_P (stmt_info) = true;
6265 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6266 NULL, node, instance, i,
6267 false, cost_vec))
6268 /* ??? So we know we can vectorize the live stmt
6269 from one SLP node. If we cannot do so from all
6270 or none consistently we'd have to record which
6271 SLP node (and lane) we want to use for the live
6272 operation. So make sure we can code-generate
6273 from all nodes. */
6274 mark_visited = false;
6275 else
6276 STMT_VINFO_LIVE_P (stmt_info) = false;
6277 break;
6280 /* We have to verify whether we can insert the lane extract
6281 before all uses. The following is a conservative approximation.
6282 We cannot put this into vectorizable_live_operation because
6283 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6284 doesn't work.
6285 Note that while the fact that we emit code for loads at the
6286 first load should make this a non-problem, leafs we construct
6287 from scalars are vectorized after the last scalar def.
6288 ??? If we'd actually compute the insert location during
6289 analysis we could use something less conservative than the last
6290 scalar stmt in the node for the dominance check. */
6291 /* ??? What remains is "live" uses in vector CTORs in the same
6292 SLP graph which is where those uses can end up code-generated
6293 right after their definition instead of close to their original
6294 use. But that would restrict us to code-generate lane-extracts
6295 from the latest stmt in a node. So we compensate for this
6296 during code-generation, simply not replacing uses for those
6297 hopefully rare cases. */
6298 if (STMT_VINFO_LIVE_P (stmt_info))
6299 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6300 if (!is_gimple_debug (use_stmt)
6301 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6302 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6303 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6307 "Cannot determine insertion place for "
6308 "lane extract\n");
6309 STMT_VINFO_LIVE_P (stmt_info) = false;
6310 mark_visited = true;
6313 if (mark_visited)
6314 svisited.add (stmt_info);
6317 slp_tree child;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6319 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6320 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6321 cost_vec, svisited, visited);
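/* Example (illustrative): if lane 1 of an SLP node computes _7 and _7 is
   also used by a scalar stmt that is not part of the vectorized region, the
   lane is marked STMT_VINFO_LIVE_P and vectorizable_live_operation costs a
   BIT_FIELD_REF-style lane extract.  If no insertion place dominating all
   such uses can be found, the flag is cleared again and the scalar
   computation of _7 is simply retained.  */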
6324 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6326 static bool
6327 vectorizable_bb_reduc_epilogue (slp_instance instance,
6328 stmt_vector_for_cost *cost_vec)
6330 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6331 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6332 if (reduc_code == MINUS_EXPR)
6333 reduc_code = PLUS_EXPR;
6334 internal_fn reduc_fn;
6335 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6336 if (!vectype
6337 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6338 || reduc_fn == IFN_LAST
6339 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6340 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6341 TREE_TYPE (vectype)))
6342 return false;
6344 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6345 cost log2 vector operations plus shuffles and one extraction. */
6346 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6347 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6348 vectype, 0, vect_body);
6349 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6350 vectype, 0, vect_body);
6351 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6352 vectype, 0, vect_body);
6353 return true;
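/* Worked example for the costing above (hypothetical vector type): for a
   V8SI reduction, vect_nunits_for_cost gives 8 and floor_log2 (8) == 3, so
   the epilogue is costed as 3 vector_stmt plus 3 vec_perm operations plus
   one vec_to_scalar extraction of the final result.  */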
6356 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6357 and recurse to children. */
6359 static void
6360 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6361 hash_set<slp_tree> &visited)
6363 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6364 || visited.add (node))
6365 return;
6367 stmt_vec_info stmt;
6368 unsigned i;
6369 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6370 roots.remove (vect_orig_stmt (stmt));
6372 slp_tree child;
6373 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6374 if (child)
6375 vect_slp_prune_covered_roots (child, roots, visited);
6378 /* Analyze statements in SLP instances of VINFO. Return true if the
6379 operations are supported. */
6381 bool
6382 vect_slp_analyze_operations (vec_info *vinfo)
6384 slp_instance instance;
6385 int i;
6387 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6389 hash_set<slp_tree> visited;
6390 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6392 auto_vec<slp_tree> visited_vec;
6393 stmt_vector_for_cost cost_vec;
6394 cost_vec.create (2);
6395 if (is_a <bb_vec_info> (vinfo))
6396 vect_location = instance->location ();
6397 if (!vect_slp_analyze_node_operations (vinfo,
6398 SLP_INSTANCE_TREE (instance),
6399 instance, visited, visited_vec,
6400 &cost_vec)
6401 /* CTOR instances require vectorized defs for the SLP tree root. */
6402 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6403 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6404 != vect_internal_def
6405 /* Make sure we vectorized with the expected type. */
6406 || !useless_type_conversion_p
6407 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6408 (instance->root_stmts[0]->stmt))),
6409 TREE_TYPE (SLP_TREE_VECTYPE
6410 (SLP_INSTANCE_TREE (instance))))))
6411 /* Check we can vectorize the reduction. */
6412 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6413 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6415 slp_tree node = SLP_INSTANCE_TREE (instance);
6416 stmt_vec_info stmt_info;
6417 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6418 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6419 else
6420 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6421 if (dump_enabled_p ())
6422 dump_printf_loc (MSG_NOTE, vect_location,
6423 "removing SLP instance operations starting from: %G",
6424 stmt_info->stmt);
6425 vect_free_slp_instance (instance);
6426 vinfo->slp_instances.ordered_remove (i);
6427 cost_vec.release ();
6428 while (!visited_vec.is_empty ())
6429 visited.remove (visited_vec.pop ());
6431 else
6433 i++;
6434 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6436 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6437 cost_vec.release ();
6439 else
6440 /* For BB vectorization remember the SLP graph entry
6441 cost for later. */
6442 instance->cost_vec = cost_vec;
6446 /* Now look for SLP instances with a root that are covered by other
6447 instances and remove them. */
6448 hash_set<stmt_vec_info> roots;
6449 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6450 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6451 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6452 if (!roots.is_empty ())
6454 visited.empty ();
6455 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6456 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6457 visited);
6458 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6459 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6460 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6462 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6463 if (dump_enabled_p ())
6464 dump_printf_loc (MSG_NOTE, vect_location,
6465 "removing SLP instance operations starting "
6466 "from: %G", root->stmt);
6467 vect_free_slp_instance (instance);
6468 vinfo->slp_instances.ordered_remove (i);
6470 else
6471 ++i;
6474 /* Compute vectorizable live stmts. */
6475 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6477 hash_set<stmt_vec_info> svisited;
6478 hash_set<slp_tree> visited;
6479 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6481 vect_location = instance->location ();
6482 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6483 instance, &instance->cost_vec, svisited,
6484 visited);
6488 return !vinfo->slp_instances.is_empty ();
6491 /* Get the ultimate SLP instance leader from INSTANCE_LEADER, transitively
6492 compressing any leader chain encountered along the way. */
6494 static slp_instance
6495 get_ultimate_leader (slp_instance instance,
6496 hash_map<slp_instance, slp_instance> &instance_leader)
6498 auto_vec<slp_instance *, 8> chain;
6499 slp_instance *tem;
6500 while (*(tem = instance_leader.get (instance)) != instance)
6502 chain.safe_push (tem);
6503 instance = *tem;
6505 while (!chain.is_empty ())
6506 *chain.pop () = instance;
6507 return instance;
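/* Illustrative sketch, not part of the pass: the function above is the
   "find" half of a union-find structure with path compression, keyed by
   slp_instance.  The same idea over plain integers (hypothetical helper,
   assumes every element was first mapped to itself, needs <unordered_map>
   and <vector>):

     static int
     find_leader (int x, std::unordered_map<int, int> &leader)
     {
       std::vector<int> chain;
       while (leader[x] != x)
         {
           chain.push_back (x);
           x = leader[x];
         }
       for (int y : chain)
         leader[y] = x;   // Path compression.
       return x;
     }

   vect_map_to_instance below provides the corresponding "union" step by
   redirecting an old ultimate leader to the current instance.  */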
6510 namespace {
6511 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6512 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6513 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6515 INSTANCE_LEADER is as for get_ultimate_leader. */
6517 template<typename T>
6518 bool
6519 vect_map_to_instance (slp_instance instance, T key,
6520 hash_map<T, slp_instance> &key_to_instance,
6521 hash_map<slp_instance, slp_instance> &instance_leader)
6523 bool existed_p;
6524 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6525 if (!existed_p)
6527 else if (key_instance != instance)
6529 /* If we're running into a previously marked key make us the
6530 leader of the current ultimate leader. This keeps the
6531 leader chain acyclic and works even when the current instance
6532 connects two previously independent graph parts. */
6533 slp_instance key_leader
6534 = get_ultimate_leader (key_instance, instance_leader);
6535 if (key_leader != instance)
6536 instance_leader.put (key_leader, instance);
6538 key_instance = instance;
6539 return existed_p;
6543 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6545 static void
6546 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6547 slp_instance instance, slp_tree node,
6548 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6549 hash_map<slp_tree, slp_instance> &node_to_instance,
6550 hash_map<slp_instance, slp_instance> &instance_leader)
6552 stmt_vec_info stmt_info;
6553 unsigned i;
6555 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6556 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6557 instance_leader);
6559 if (vect_map_to_instance (instance, node, node_to_instance,
6560 instance_leader))
6561 return;
6563 slp_tree child;
6564 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6565 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6566 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6567 node_to_instance, instance_leader);
6570 /* Partition the SLP graph into pieces that can be costed independently. */
6572 static void
6573 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6575 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6577 /* First walk the SLP graph assigning each involved scalar stmt a
6578 corresponding SLP graph entry and upon visiting a previously
6579 marked stmt, make the stmt's leader the current SLP graph entry. */
6580 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6581 hash_map<slp_tree, slp_instance> node_to_instance;
6582 hash_map<slp_instance, slp_instance> instance_leader;
6583 slp_instance instance;
6584 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6586 instance_leader.put (instance, instance);
6587 vect_bb_partition_graph_r (bb_vinfo,
6588 instance, SLP_INSTANCE_TREE (instance),
6589 stmt_to_instance, node_to_instance,
6590 instance_leader);
6593 /* Then collect entries to each independent subgraph. */
6594 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6596 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6597 leader->subgraph_entries.safe_push (instance);
6598 if (dump_enabled_p ()
6599 && leader != instance)
6600 dump_printf_loc (MSG_NOTE, vect_location,
6601 "instance %p is leader of %p\n",
6602 (void *) leader, (void *) instance);
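/* Example (illustrative): if instances I1 and I2 both cover scalar stmt S
   and I1 is walked first, S is mapped to I1.  When I2 reaches S, I1's
   ultimate leader is redirected to I2, so the second loop pushes both I1 and
   I2 into I2->subgraph_entries and the whole subgraph is costed as one unit
   by vect_bb_vectorization_profitable_p.  */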
6606 /* Compute the set of scalar stmts participating in internal and external
6607 nodes. */
6609 static void
6610 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6611 hash_set<slp_tree> &visited,
6612 hash_set<stmt_vec_info> &vstmts,
6613 hash_set<stmt_vec_info> &estmts)
6615 int i;
6616 stmt_vec_info stmt_info;
6617 slp_tree child;
6619 if (visited.add (node))
6620 return;
6622 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6624 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6625 vstmts.add (stmt_info);
6627 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6628 if (child)
6629 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6630 vstmts, estmts);
6632 else
6633 for (tree def : SLP_TREE_SCALAR_OPS (node))
6635 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6636 if (def_stmt)
6637 estmts.add (def_stmt);
6642 /* Compute the scalar cost of the SLP node NODE and its children
6643 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6644 update LIFE according to uses of NODE. */
6646 static void
6647 vect_bb_slp_scalar_cost (vec_info *vinfo,
6648 slp_tree node, vec<bool, va_heap> *life,
6649 stmt_vector_for_cost *cost_vec,
6650 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6651 hash_set<slp_tree> &visited)
6653 unsigned i;
6654 stmt_vec_info stmt_info;
6655 slp_tree child;
6657 if (visited.add (node))
6658 return;
6660 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6662 ssa_op_iter op_iter;
6663 def_operand_p def_p;
6665 if ((*life)[i])
6666 continue;
6668 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6669 gimple *orig_stmt = orig_stmt_info->stmt;
6671 /* If there is a non-vectorized use of the defs then the scalar
6672 stmt is kept live in which case we do not account it or any
6673 required defs in the SLP children in the scalar cost. This
6674 way we make the vectorization more costly when compared to
6675 the scalar cost. */
6676 if (!STMT_VINFO_LIVE_P (stmt_info))
6678 auto_vec<gimple *, 8> worklist;
6679 hash_set<gimple *> *worklist_visited = NULL;
6680 worklist.quick_push (orig_stmt);
6683 gimple *work_stmt = worklist.pop ();
6684 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6686 imm_use_iterator use_iter;
6687 gimple *use_stmt;
6688 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6689 DEF_FROM_PTR (def_p))
6690 if (!is_gimple_debug (use_stmt))
6692 stmt_vec_info use_stmt_info
6693 = vinfo->lookup_stmt (use_stmt);
6694 if (!use_stmt_info
6695 || !vectorized_scalar_stmts.contains (use_stmt_info))
6697 if (use_stmt_info
6698 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6700 /* For stmts participating in patterns we have
6701 to check its uses recursively. */
6702 if (!worklist_visited)
6703 worklist_visited = new hash_set<gimple *> ();
6704 if (!worklist_visited->add (use_stmt))
6705 worklist.safe_push (use_stmt);
6706 continue;
6708 (*life)[i] = true;
6709 goto next_lane;
6714 while (!worklist.is_empty ());
6715 next_lane:
6716 if (worklist_visited)
6717 delete worklist_visited;
6718 if ((*life)[i])
6719 continue;
6722 /* Count scalar stmts only once. */
6723 if (gimple_visited_p (orig_stmt))
6724 continue;
6725 gimple_set_visited (orig_stmt, true);
6727 vect_cost_for_stmt kind;
6728 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6730 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6731 kind = scalar_load;
6732 else
6733 kind = scalar_store;
6735 else if (vect_nop_conversion_p (orig_stmt_info))
6736 continue;
6737 /* For single-argument PHIs assume coalescing which means zero cost
6738 for the scalar and the vector PHIs. This avoids artificially
6739 favoring the vector path (but may pessimize it in some cases). */
6740 else if (is_a <gphi *> (orig_stmt_info->stmt)
6741 && gimple_phi_num_args
6742 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6743 continue;
6744 else
6745 kind = scalar_stmt;
6746 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6747 SLP_TREE_VECTYPE (node), 0, vect_body);
6750 auto_vec<bool, 20> subtree_life;
6751 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6753 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6755 /* Do not directly pass LIFE to the recursive call, copy it to
6756 confine changes in the callee to the current child/subtree. */
6757 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6759 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6760 for (unsigned j = 0;
6761 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6763 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6764 if (perm.first == i)
6765 subtree_life[perm.second] = (*life)[j];
6768 else
6770 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6771 subtree_life.safe_splice (*life);
6773 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6774 vectorized_scalar_stmts, visited);
6775 subtree_life.truncate (0);
6780 /* Comparator for the loop-index sorted cost vectors. */
6782 static int
6783 li_cost_vec_cmp (const void *a_, const void *b_)
6785 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6786 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6787 if (a->first < b->first)
6788 return -1;
6789 else if (a->first == b->first)
6790 return 0;
6791 return 1;
6794 /* Check if vectorization of the basic block is profitable for the
6795 subgraph denoted by SLP_INSTANCES. */
6797 static bool
6798 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6799 vec<slp_instance> slp_instances,
6800 loop_p orig_loop)
6802 slp_instance instance;
6803 int i;
6804 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6805 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6807 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6810 hash_set<slp_tree> visited;
6811 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6812 vect_print_slp_graph (MSG_NOTE, vect_location,
6813 SLP_INSTANCE_TREE (instance), visited);
6816 /* Compute the set of scalar stmts we know will go away 'locally' when
6817 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6818 not accurate for nodes promoted extern late or for scalar stmts that
6819 are used both in extern defs and in vectorized defs. */
6820 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6821 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6822 hash_set<slp_tree> visited;
6823 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6825 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6826 SLP_INSTANCE_TREE (instance),
6827 visited,
6828 vectorized_scalar_stmts,
6829 scalar_stmts_in_externs);
6830 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6831 vectorized_scalar_stmts.add (rstmt);
6833 /* Scalar stmts used as defs in external nodes need to be preserved, so
6834 remove them from vectorized_scalar_stmts. */
6835 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6836 vectorized_scalar_stmts.remove (stmt);
6838 /* Calculate scalar cost and sum the cost for the vector stmts
6839 previously collected. */
6840 stmt_vector_for_cost scalar_costs = vNULL;
6841 stmt_vector_for_cost vector_costs = vNULL;
6842 visited.empty ();
6843 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6845 auto_vec<bool, 20> life;
6846 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6847 true);
6848 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6849 record_stmt_cost (&scalar_costs,
6850 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6851 scalar_stmt,
6852 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6853 vect_bb_slp_scalar_cost (bb_vinfo,
6854 SLP_INSTANCE_TREE (instance),
6855 &life, &scalar_costs, vectorized_scalar_stmts,
6856 visited);
6857 vector_costs.safe_splice (instance->cost_vec);
6858 instance->cost_vec.release ();
6861 if (dump_enabled_p ())
6862 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6864 /* When costing non-loop vectorization we need to consider each covered
6865 loop independently and make sure vectorization is profitable. For
6866 now we assume a loop may be not entered or executed an arbitrary
6867 number of iterations (??? static information can provide more
6868 precise info here) which means we can simply cost each containing
6869 loop's stmts separately. */
6871 /* First produce cost vectors sorted by loop index. */
6872 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6873 li_scalar_costs (scalar_costs.length ());
6874 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6875 li_vector_costs (vector_costs.length ());
6876 stmt_info_for_cost *cost;
6877 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6879 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6880 li_scalar_costs.quick_push (std::make_pair (l, cost));
6882 /* Use an arbitrary used loop as fallback in case the first vector_costs
6883 entry does not have a stmt_info associated with it. */
6884 unsigned l = li_scalar_costs[0].first;
6885 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6887 /* We inherit from the previous COST, invariants, externals and
6888 extracts immediately follow the cost for the related stmt. */
6889 if (cost->stmt_info)
6890 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6891 li_vector_costs.quick_push (std::make_pair (l, cost));
6893 li_scalar_costs.qsort (li_cost_vec_cmp);
6894 li_vector_costs.qsort (li_cost_vec_cmp);
6896 /* Now cost the portions individually. */
6897 unsigned vi = 0;
6898 unsigned si = 0;
6899 bool profitable = true;
6900 while (si < li_scalar_costs.length ()
6901 && vi < li_vector_costs.length ())
6903 unsigned sl = li_scalar_costs[si].first;
6904 unsigned vl = li_vector_costs[vi].first;
6905 if (sl != vl)
6907 if (dump_enabled_p ())
6908 dump_printf_loc (MSG_NOTE, vect_location,
6909 "Scalar %d and vector %d loop part do not "
6910 "match up, skipping scalar part\n", sl, vl);
6911 /* Skip the scalar part, assuming zero cost on the vector side. */
6914 si++;
6916 while (si < li_scalar_costs.length ()
6917 && li_scalar_costs[si].first == sl);
6918 continue;
6921 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6924 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6925 si++;
6927 while (si < li_scalar_costs.length ()
6928 && li_scalar_costs[si].first == sl);
6929 unsigned dummy;
6930 finish_cost (scalar_target_cost_data, nullptr,
6931 &dummy, &scalar_cost, &dummy);
6933 /* Complete the target-specific vector cost calculation. */
6934 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6937 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6938 vi++;
6940 while (vi < li_vector_costs.length ()
6941 && li_vector_costs[vi].first == vl);
6942 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6943 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
6944 delete scalar_target_cost_data;
6945 delete vect_target_cost_data;
6947 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
6949 if (dump_enabled_p ())
6951 dump_printf_loc (MSG_NOTE, vect_location,
6952 "Cost model analysis for part in loop %d:\n", sl);
6953 dump_printf (MSG_NOTE, " Vector cost: %d\n",
6954 vec_inside_cost + vec_outside_cost);
6955 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
6958 /* Vectorization is profitable if its cost is less than the cost of the
6959 scalar version. Note that we err on the vector side for equal cost because
6960 the cost estimate is otherwise quite pessimistic (constant uses are
6961 free on the scalar side but cost a load on the vector side for
6962 example). */
6963 if (vec_outside_cost + vec_inside_cost > scalar_cost)
6965 profitable = false;
6966 break;
6969 if (profitable && vi < li_vector_costs.length ())
6971 if (dump_enabled_p ())
6972 dump_printf_loc (MSG_NOTE, vect_location,
6973 "Excess vector cost for part in loop %d:\n",
6974 li_vector_costs[vi].first);
6975 profitable = false;
6978 /* Unset visited flag. This is delayed when the subgraph is profitable
6979 and we process the loop for remaining unvectorized if-converted code. */
6980 if (!orig_loop || !profitable)
6981 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6982 gimple_set_visited (cost->stmt_info->stmt, false);
6984 scalar_costs.release ();
6985 vector_costs.release ();
6987 return profitable;
6990 /* qsort comparator for lane defs. */
6992 static int
6993 vld_cmp (const void *a_, const void *b_)
6995 auto *a = (const std::pair<unsigned, tree> *)a_;
6996 auto *b = (const std::pair<unsigned, tree> *)b_;
6997 return a->first - b->first;
7000 /* Return true if USE_STMT is a vector lane insert into VEC and set
7001 *THIS_LANE to the lane number that is set. */
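/* E.g. a BIT_INSERT_EXPR of a 32-bit value at bit position 64 into a
   vector of 32-bit elements is a lane insert with *THIS_LANE == 2. */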
7003 static bool
7004 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7006 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7007 if (!use_ass
7008 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7009 || (vec
7010 ? gimple_assign_rhs1 (use_ass) != vec
7011 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7012 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7013 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7014 || !constant_multiple_p
7015 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7016 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7017 this_lane))
7018 return false;
7019 return true;
7022 /* Find any vectorizable constructors and add them to the grouped_store
7023 array. */
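/* Illustrative shapes matched below: a vector CONSTRUCTOR whose elements
   are all SSA defs from the region, e.g. v_5 = {a_1, b_2, c_3, d_4};
   a chain of BIT_INSERT_EXPRs that together fill every lane of a vector;
   and an associatable scalar op chain recorded as a BB reduction root.
   The latter two are recorded in bb_vinfo->roots rather than as grouped
   stores. */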
7025 static void
7026 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
7028 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7029 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7030 !gsi_end_p (gsi); gsi_next (&gsi))
7032 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7033 if (!assign)
7034 continue;
7036 tree rhs = gimple_assign_rhs1 (assign);
7037 enum tree_code code = gimple_assign_rhs_code (assign);
7038 use_operand_p use_p;
7039 gimple *use_stmt;
7040 if (code == CONSTRUCTOR)
7042 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7043 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7044 CONSTRUCTOR_NELTS (rhs))
7045 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7046 || uniform_vector_p (rhs))
7047 continue;
7049 unsigned j;
7050 tree val;
7051 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7052 if (TREE_CODE (val) != SSA_NAME
7053 || !bb_vinfo->lookup_def (val))
7054 break;
7055 if (j != CONSTRUCTOR_NELTS (rhs))
7056 continue;
7058 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
7059 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
7061 else if (code == BIT_INSERT_EXPR
7062 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7063 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7064 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7065 && integer_zerop (gimple_assign_rhs3 (assign))
7066 && useless_type_conversion_p
7067 (TREE_TYPE (TREE_TYPE (rhs)),
7068 TREE_TYPE (gimple_assign_rhs2 (assign)))
7069 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7071 /* We start to match on insert to lane zero but since the
7072 inserts need not be ordered we'd have to search both
7073 the def and the use chains. */
7074 tree vectype = TREE_TYPE (rhs);
7075 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7076 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7077 auto_sbitmap lanes (nlanes);
7078 bitmap_clear (lanes);
7079 bitmap_set_bit (lanes, 0);
7080 tree def = gimple_assign_lhs (assign);
7081 lane_defs.quick_push
7082 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7083 unsigned lanes_found = 1;
7084 /* Start with the use chains; the last stmt will be the root. */
7085 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7086 vec<stmt_vec_info> roots = vNULL;
7087 roots.safe_push (last);
7090 use_operand_p use_p;
7091 gimple *use_stmt;
7092 if (!single_imm_use (def, &use_p, &use_stmt))
7093 break;
7094 unsigned this_lane;
7095 if (!bb_vinfo->lookup_stmt (use_stmt)
7096 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7097 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7098 break;
7099 if (bitmap_bit_p (lanes, this_lane))
7100 break;
7101 lanes_found++;
7102 bitmap_set_bit (lanes, this_lane);
7103 gassign *use_ass = as_a <gassign *> (use_stmt);
7104 lane_defs.quick_push (std::make_pair
7105 (this_lane, gimple_assign_rhs2 (use_ass)));
7106 last = bb_vinfo->lookup_stmt (use_ass);
7107 roots.safe_push (last);
7108 def = gimple_assign_lhs (use_ass);
7110 while (lanes_found < nlanes);
7111 if (roots.length () > 1)
7112 std::swap(roots[0], roots[roots.length () - 1]);
7113 if (lanes_found < nlanes)
7115 /* Now search the def chain. */
7116 def = gimple_assign_rhs1 (assign);
7119 if (TREE_CODE (def) != SSA_NAME
7120 || !has_single_use (def))
7121 break;
7122 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7123 unsigned this_lane;
7124 if (!bb_vinfo->lookup_stmt (def_stmt)
7125 || !vect_slp_is_lane_insert (def_stmt,
7126 NULL_TREE, &this_lane)
7127 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7128 break;
7129 if (bitmap_bit_p (lanes, this_lane))
7130 break;
7131 lanes_found++;
7132 bitmap_set_bit (lanes, this_lane);
7133 lane_defs.quick_push (std::make_pair
7134 (this_lane,
7135 gimple_assign_rhs2 (def_stmt)));
7136 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7137 def = gimple_assign_rhs1 (def_stmt);
7139 while (lanes_found < nlanes);
7141 if (lanes_found == nlanes)
7143 /* Sort lane_defs by lane index and register the root. */
7144 lane_defs.qsort (vld_cmp);
7145 vec<stmt_vec_info> stmts;
7146 stmts.create (nlanes);
7147 for (unsigned i = 0; i < nlanes; ++i)
7148 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7149 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7150 stmts, roots));
7152 else
7153 roots.release ();
7155 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7156 && (associative_tree_code (code) || code == MINUS_EXPR)
7157 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
7158 checks pessimize a two-element reduction. PR54400.
7159 ??? In-order reduction could be handled if we only
7160 traverse one operand chain in vect_slp_linearize_chain. */
7161 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
7162 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
7163 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
7164 /* Ops with constants at the tail can be stripped here. */
7165 && TREE_CODE (rhs) == SSA_NAME
7166 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7167 /* Should be the chain end. */
7168 && (!single_imm_use (gimple_assign_lhs (assign),
7169 &use_p, &use_stmt)
7170 || !is_gimple_assign (use_stmt)
7171 || (gimple_assign_rhs_code (use_stmt) != code
7172 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7173 || (gimple_assign_rhs_code (use_stmt)
7174 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7176 /* We start the match at the end of a possible association
7177 chain. */
7178 auto_vec<chain_op_t> chain;
7179 auto_vec<std::pair<tree_code, gimple *> > worklist;
7180 auto_vec<gimple *> chain_stmts;
7181 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7182 if (code == MINUS_EXPR)
7183 code = PLUS_EXPR;
7184 internal_fn reduc_fn;
7185 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7186 || reduc_fn == IFN_LAST)
7187 continue;
7188 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7189 /* ??? */
7190 code_stmt, alt_code_stmt, &chain_stmts);
7191 if (chain.length () > 1)
7193 /* Sort the chain according to def_type and operation. */
7194 chain.sort (dt_sort_cmp, bb_vinfo);
7195 /* ??? Now we'd want to strip externals and constants
7196 but record those to be handled in the epilogue. */
7197 /* ??? For now do not allow mixing ops or externs/constants. */
7198 bool invalid = false;
7199 for (unsigned i = 0; i < chain.length (); ++i)
7200 if (chain[i].dt != vect_internal_def
7201 || chain[i].code != code)
7202 invalid = true;
7203 if (!invalid)
7205 vec<stmt_vec_info> stmts;
7206 stmts.create (chain.length ());
7207 for (unsigned i = 0; i < chain.length (); ++i)
7208 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7209 vec<stmt_vec_info> roots;
7210 roots.create (chain_stmts.length ());
7211 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7212 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7213 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7214 stmts, roots));
7221 /* Walk the grouped store chains and replace entries with their
7222 pattern variant if any. */
7224 static void
7225 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7227 stmt_vec_info first_element;
7228 unsigned i;
7230 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7232 /* We also have CTORs in this array. */
7233 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7234 continue;
7235 if (STMT_VINFO_IN_PATTERN_P (first_element))
7237 stmt_vec_info orig = first_element;
7238 first_element = STMT_VINFO_RELATED_STMT (first_element);
7239 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7240 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7241 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7242 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7243 vinfo->grouped_stores[i] = first_element;
7245 stmt_vec_info prev = first_element;
7246 while (DR_GROUP_NEXT_ELEMENT (prev))
7248 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7249 if (STMT_VINFO_IN_PATTERN_P (elt))
7251 stmt_vec_info orig = elt;
7252 elt = STMT_VINFO_RELATED_STMT (elt);
7253 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7254 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7255 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7257 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7258 prev = elt;
7263 /* Check if the region described by BB_VINFO can be vectorized, returning
7264 true if so. When returning false, set FATAL to true if the same failure
7265 would prevent vectorization at other vector sizes, false if it is still
7266 worth trying other sizes. N_STMTS is the number of statements in the
7267 region. */
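/* Roughly: analyze data references and accesses, detect vectorizable
   constructors, run pattern recognition, build and optimize the SLP
   trees, verify alignment and dependences per instance, analyze the
   operations and finally partition the SLP graph for costing. */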
7269 static bool
7270 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7271 vec<int> *dataref_groups)
7273 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7275 slp_instance instance;
7276 int i;
7277 poly_uint64 min_vf = 2;
7279 /* The first group of checks is independent of the vector size. */
7280 fatal = true;
7282 /* Analyze the data references. */
7284 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7286 if (dump_enabled_p ())
7287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7288 "not vectorized: unhandled data-ref in basic "
7289 "block.\n");
7290 return false;
7293 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7295 if (dump_enabled_p ())
7296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7297 "not vectorized: unhandled data access in "
7298 "basic block.\n");
7299 return false;
7302 vect_slp_check_for_constructors (bb_vinfo);
7304 /* If there are no grouped stores and no constructors in the region
7305 there is no need to continue with pattern recog as vect_analyze_slp
7306 will fail anyway. */
7307 if (bb_vinfo->grouped_stores.is_empty ()
7308 && bb_vinfo->roots.is_empty ())
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "not vectorized: no grouped stores in "
7313 "basic block.\n");
7314 return false;
7317 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer necessarily fatal. */
7318 fatal = false;
7320 vect_pattern_recog (bb_vinfo);
7322 /* Update store groups from pattern processing. */
7323 vect_fixup_store_groups_with_patterns (bb_vinfo);
7325 /* Check the SLP opportunities in the basic block, analyze and build SLP
7326 trees. */
7327 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7329 if (dump_enabled_p ())
7331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7332 "Failed to SLP the basic block.\n");
7333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7334 "not vectorized: failed to find SLP opportunities "
7335 "in basic block.\n");
7337 return false;
7340 /* Optimize permutations. */
7341 vect_optimize_slp (bb_vinfo);
7343 /* Gather the loads reachable from the SLP graph entries. */
7344 vect_gather_slp_loads (bb_vinfo);
7346 vect_record_base_alignments (bb_vinfo);
7348 /* Analyze and verify the alignment of data references and the
7349 dependence in the SLP instances. */
7350 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7352 vect_location = instance->location ();
7353 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7354 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7356 slp_tree node = SLP_INSTANCE_TREE (instance);
7357 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7358 if (dump_enabled_p ())
7359 dump_printf_loc (MSG_NOTE, vect_location,
7360 "removing SLP instance operations starting from: %G",
7361 stmt_info->stmt);
7362 vect_free_slp_instance (instance);
7363 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7364 continue;
7367 /* Mark all the statements that we want to vectorize as pure SLP and
7368 relevant. */
7369 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7370 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7371 unsigned j;
7372 stmt_vec_info root;
7373 /* Likewise consider instance root stmts as vectorized. */
7374 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7375 STMT_SLP_TYPE (root) = pure_slp;
7377 i++;
7379 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7380 return false;
7382 if (!vect_slp_analyze_operations (bb_vinfo))
7384 if (dump_enabled_p ())
7385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7386 "not vectorized: bad operation in basic block.\n");
7387 return false;
7390 vect_bb_partition_graph (bb_vinfo);
7392 return true;
7395 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7396 basic blocks in BBS, returning true on success.
7397 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
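/* The analysis below is redone for each vector mode the target suggests
   until one succeeds, the failure is known to be independent of the
   vector size (FATAL), or a mode would merely repeat a previous result. */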
7399 static bool
7400 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7401 vec<int> *dataref_groups, unsigned int n_stmts,
7402 loop_p orig_loop)
7404 bb_vec_info bb_vinfo;
7405 auto_vector_modes vector_modes;
7407 /* Autodetect first vector size we try. */
7408 machine_mode next_vector_mode = VOIDmode;
7409 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7410 unsigned int mode_i = 0;
7412 vec_info_shared shared;
7414 machine_mode autodetected_vector_mode = VOIDmode;
7415 while (1)
7417 bool vectorized = false;
7418 bool fatal = false;
7419 bb_vinfo = new _bb_vec_info (bbs, &shared);
7421 bool first_time_p = shared.datarefs.is_empty ();
7422 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7423 if (first_time_p)
7424 bb_vinfo->shared->save_datarefs ();
7425 else
7426 bb_vinfo->shared->check_datarefs ();
7427 bb_vinfo->vector_mode = next_vector_mode;
7429 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7431 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_NOTE, vect_location,
7434 "***** Analysis succeeded with vector mode"
7435 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7436 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7439 bb_vinfo->shared->check_datarefs ();
7441 auto_vec<slp_instance> profitable_subgraphs;
7442 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7444 if (instance->subgraph_entries.is_empty ())
7445 continue;
7447 vect_location = instance->location ();
7448 if (!unlimited_cost_model (NULL)
7449 && !vect_bb_vectorization_profitable_p
7450 (bb_vinfo, instance->subgraph_entries, orig_loop))
7452 if (dump_enabled_p ())
7453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7454 "not vectorized: vectorization is not "
7455 "profitable.\n");
7456 continue;
7459 if (!dbg_cnt (vect_slp))
7460 continue;
7462 profitable_subgraphs.safe_push (instance);
7465 /* When we're vectorizing an if-converted loop body make sure
7466 we vectorized all if-converted code. */
7467 if (!profitable_subgraphs.is_empty ()
7468 && orig_loop)
7470 gcc_assert (bb_vinfo->bbs.length () == 1);
7471 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7472 !gsi_end_p (gsi); gsi_next (&gsi))
7474 /* The costing above left us with DCEable vectorized scalar
7475 stmts having the visited flag set on profitable
7476 subgraphs. Do the delayed clearing of the flag here. */
7477 if (gimple_visited_p (gsi_stmt (gsi)))
7479 gimple_set_visited (gsi_stmt (gsi), false);
7480 continue;
7482 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7483 continue;
7485 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7486 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7488 if (!profitable_subgraphs.is_empty ()
7489 && dump_enabled_p ())
7490 dump_printf_loc (MSG_NOTE, vect_location,
7491 "not profitable because of "
7492 "unprofitable if-converted scalar "
7493 "code\n");
7494 profitable_subgraphs.truncate (0);
7499 /* Finally schedule the profitable subgraphs. */
7500 for (slp_instance instance : profitable_subgraphs)
7502 if (!vectorized && dump_enabled_p ())
7503 dump_printf_loc (MSG_NOTE, vect_location,
7504 "Basic block will be vectorized "
7505 "using SLP\n");
7506 vectorized = true;
7508 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7510 unsigned HOST_WIDE_INT bytes;
7511 if (dump_enabled_p ())
7513 if (GET_MODE_SIZE
7514 (bb_vinfo->vector_mode).is_constant (&bytes))
7515 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7516 "basic block part vectorized using %wu "
7517 "byte vectors\n", bytes);
7518 else
7519 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7520 "basic block part vectorized using "
7521 "variable length vectors\n");
7525 else
7527 if (dump_enabled_p ())
7528 dump_printf_loc (MSG_NOTE, vect_location,
7529 "***** Analysis failed with vector mode %s\n",
7530 GET_MODE_NAME (bb_vinfo->vector_mode));
7533 if (mode_i == 0)
7534 autodetected_vector_mode = bb_vinfo->vector_mode;
7536 if (!fatal)
7537 while (mode_i < vector_modes.length ()
7538 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7540 if (dump_enabled_p ())
7541 dump_printf_loc (MSG_NOTE, vect_location,
7542 "***** The result for vector mode %s would"
7543 " be the same\n",
7544 GET_MODE_NAME (vector_modes[mode_i]));
7545 mode_i += 1;
7548 delete bb_vinfo;
7550 if (mode_i < vector_modes.length ()
7551 && VECTOR_MODE_P (autodetected_vector_mode)
7552 && (related_vector_mode (vector_modes[mode_i],
7553 GET_MODE_INNER (autodetected_vector_mode))
7554 == autodetected_vector_mode)
7555 && (related_vector_mode (autodetected_vector_mode,
7556 GET_MODE_INNER (vector_modes[mode_i]))
7557 == vector_modes[mode_i]))
7559 if (dump_enabled_p ())
7560 dump_printf_loc (MSG_NOTE, vect_location,
7561 "***** Skipping vector mode %s, which would"
7562 " repeat the analysis for %s\n",
7563 GET_MODE_NAME (vector_modes[mode_i]),
7564 GET_MODE_NAME (autodetected_vector_mode));
7565 mode_i += 1;
7568 if (vectorized
7569 || mode_i == vector_modes.length ()
7570 || autodetected_vector_mode == VOIDmode
7571 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7572 vector sizes will fail do not bother iterating. */
7573 || fatal)
7574 return vectorized;
7576 /* Try the next biggest vector size. */
7577 next_vector_mode = vector_modes[mode_i++];
7578 if (dump_enabled_p ())
7579 dump_printf_loc (MSG_NOTE, vect_location,
7580 "***** Re-trying analysis with vector mode %s\n",
7581 GET_MODE_NAME (next_vector_mode));
7586 /* Worker for the BB vectorizer entry points. Analyze and transform the
7587 basic blocks in BBS, returning true if anything was vectorized. */
7589 static bool
7590 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7592 vec<data_reference_p> datarefs = vNULL;
7593 auto_vec<int> dataref_groups;
7594 int insns = 0;
7595 int current_group = 0;
7597 for (unsigned i = 0; i < bbs.length (); i++)
7599 basic_block bb = bbs[i];
7600 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7601 gsi_next (&gsi))
7603 gimple *stmt = gsi_stmt (gsi);
7604 if (is_gimple_debug (stmt))
7605 continue;
7607 insns++;
7609 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7610 vect_location = stmt;
7612 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7613 &dataref_groups, current_group))
7614 ++current_group;
7616 /* New BBs always start a new DR group. */
7617 ++current_group;
7620 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7623 /* Special entry for the BB vectorizer. Analyze and transform a single
7624 if-converted BB, with ORIG_LOOP's body being the non-if-converted
7625 representation. Returns true if anything in the basic-block was
7626 vectorized. */
7628 bool
7629 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7631 auto_vec<basic_block> bbs;
7632 bbs.safe_push (bb);
7633 return vect_slp_bbs (bbs, orig_loop);
7636 /* Main entry for the BB vectorizer. Analyze and transform the function
7637 FUN, returning true if anything in it was vectorized. */
7639 bool
7640 vect_slp_function (function *fun)
7642 bool r = false;
7643 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7644 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7646 /* For the moment split the function into pieces to avoid making
7647 the iteration on the vector mode moot. Split at points we know
7648 we do not handle well, which are CFG merges (SLP discovery doesn't
7649 handle non-loop-header PHIs) and loop exits. Since pattern
7650 recog requires reverse iteration to visit uses before defs,
7651 simply chop the RPO into pieces. */
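/* Concretely, a region is ended before a block not dominated by the
   region entry or outside the entry block's loop, and after a
   control-altering stmt that defines a value; a block starting with a
   returns-twice call cannot start a region. */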
7652 auto_vec<basic_block> bbs;
7653 for (unsigned i = 0; i < n; i++)
7655 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7656 bool split = false;
7658 /* Split when a BB is not dominated by the first block. */
7659 if (!bbs.is_empty ()
7660 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7662 if (dump_enabled_p ())
7663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664 "splitting region at dominance boundary bb%d\n",
7665 bb->index);
7666 split = true;
7668 /* Split when the loop determined by the first block
7669 is exited. This is because we eventually insert
7670 invariants at region begin. */
7671 else if (!bbs.is_empty ()
7672 && bbs[0]->loop_father != bb->loop_father
7673 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7675 if (dump_enabled_p ())
7676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7677 "splitting region at loop %d exit at bb%d\n",
7678 bbs[0]->loop_father->num, bb->index);
7679 split = true;
7682 if (split && !bbs.is_empty ())
7684 r |= vect_slp_bbs (bbs, NULL);
7685 bbs.truncate (0);
7688 /* We need to be able to insert at the head of the region, which
7689 we cannot do for a region starting with a returns-twice call. */
7690 if (bbs.is_empty ())
7691 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7692 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7694 if (dump_enabled_p ())
7695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7696 "skipping bb%d as start of region as it "
7697 "starts with returns-twice call\n",
7698 bb->index);
7699 continue;
7702 bbs.safe_push (bb);
7704 /* When a stmt ends this block and defines a value, inserting
7705 after it for a vector containing its definition would require
7706 inserting on edges. Avoid this for now. */
7707 if (gimple *last = *gsi_last_bb (bb))
7708 if (gimple_get_lhs (last)
7709 && is_ctrl_altering_stmt (last))
7711 if (dump_enabled_p ())
7712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7713 "splitting region at control altering "
7714 "definition %G", last);
7715 r |= vect_slp_bbs (bbs, NULL);
7716 bbs.truncate (0);
7720 if (!bbs.is_empty ())
7721 r |= vect_slp_bbs (bbs, NULL);
7723 free (rpo);
7725 return r;
7728 /* Build a variable-length vector in which the elements in ELTS are repeated
7729 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7730 RESULTS and add any new instructions to SEQ.
7732 The approach we use is:
7734 (1) Find a vector mode VM with integer elements of mode IM.
7736 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7737 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7738 from small vectors to IM.
7740 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7742 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7743 correct byte contents.
7745 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7747 We try to find the largest IM for which this sequence works, in order
7748 to cut down on the number of interleaves. */
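/* E.g. in the simplest case, with ELTS = { a, b } of 32-bit elements and
   NVECTORS == 1, step (2) packs the pair into one 64-bit element, step (3)
   duplicates it across a vector of 64-bit elements and step (5)
   view-converts the result back, giving { a, b, a, b, ... } with no
   interleaving required. */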
7750 void
7751 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7752 const vec<tree> &elts, unsigned int nresults,
7753 vec<tree> &results)
7755 unsigned int nelts = elts.length ();
7756 tree element_type = TREE_TYPE (vector_type);
7758 /* (1) Find a vector mode VM with integer elements of mode IM. */
7759 unsigned int nvectors = 1;
7760 tree new_vector_type;
7761 tree permutes[2];
7762 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7763 &nvectors, &new_vector_type,
7764 permutes))
7765 gcc_unreachable ();
7767 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7768 unsigned int partial_nelts = nelts / nvectors;
7769 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7771 tree_vector_builder partial_elts;
7772 auto_vec<tree, 32> pieces (nvectors * 2);
7773 pieces.quick_grow_cleared (nvectors * 2);
7774 for (unsigned int i = 0; i < nvectors; ++i)
7776 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7777 ELTS' has mode IM. */
7778 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7779 for (unsigned int j = 0; j < partial_nelts; ++j)
7780 partial_elts.quick_push (elts[i * partial_nelts + j]);
7781 tree t = gimple_build_vector (seq, &partial_elts);
7782 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7783 TREE_TYPE (new_vector_type), t);
7785 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7786 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7789 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7790 correct byte contents.
7792 Conceptually, we need to repeat the following operation log2(nvectors)
7793 times, where hi_start = nvectors / 2:
7795 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7796 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7798 However, if each input repeats every N elements and the VF is
7799 a multiple of N * 2, the HI result is the same as the LO result.
7800 This will be true for the first N1 iterations of the outer loop,
7801 followed by N2 iterations for which both the LO and HI results
7802 are needed. I.e.:
7804 N1 + N2 = log2(nvectors)
7806 Each "N1 iteration" doubles the number of redundant vectors and the
7807 effect of the process as a whole is to have a sequence of nvectors/2**N1
7808 vectors that repeats 2**N1 times. Rather than generate these redundant
7809 vectors, we halve the number of vectors for each N1 iteration. */
7810 unsigned int in_start = 0;
7811 unsigned int out_start = nvectors;
7812 unsigned int new_nvectors = nvectors;
7813 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7815 unsigned int hi_start = new_nvectors / 2;
7816 unsigned int out_i = 0;
7817 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7819 if ((in_i & 1) != 0
7820 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7821 2 * in_repeat))
7822 continue;
7824 tree output = make_ssa_name (new_vector_type);
7825 tree input1 = pieces[in_start + (in_i / 2)];
7826 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7827 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7828 input1, input2,
7829 permutes[in_i & 1]);
7830 gimple_seq_add_stmt (seq, stmt);
7831 pieces[out_start + out_i] = output;
7832 out_i += 1;
7834 std::swap (in_start, out_start);
7835 new_nvectors = out_i;
7838 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7839 results.reserve (nresults);
7840 for (unsigned int i = 0; i < nresults; ++i)
7841 if (i < new_nvectors)
7842 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7843 pieces[in_start + i]));
7844 else
7845 results.quick_push (results[i - new_nvectors]);
7849 /* For constant and loop invariant defs in OP_NODE this function creates
7850 vector defs that will be used in the vectorized stmts and stores them
7851 to SLP_TREE_VEC_DEFS of OP_NODE. */
7853 static void
7854 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7856 unsigned HOST_WIDE_INT nunits;
7857 tree vec_cst;
7858 unsigned j, number_of_places_left_in_vector;
7859 tree vector_type;
7860 tree vop;
7861 int group_size = op_node->ops.length ();
7862 unsigned int vec_num, i;
7863 unsigned number_of_copies = 1;
7864 bool constant_p;
7865 gimple_seq ctor_seq = NULL;
7866 auto_vec<tree, 16> permute_results;
7868 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7869 vector_type = SLP_TREE_VECTYPE (op_node);
7871 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7872 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7873 auto_vec<tree> voprnds (number_of_vectors);
7875 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7876 created vectors. It is greater than 1 if unrolling is performed.
7878 For example, we have two scalar operands, s1 and s2 (e.g., group of
7879 strided accesses of size two), while NUNITS is four (i.e., four scalars
7880 of this type can be packed in a vector). The output vector will contain
7881 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7882 will be 2).
7884 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7885 containing the operands.
7887 For example, NUNITS is four as before, and the group size is 8
7888 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7889 {s5, s6, s7, s8}. */
7891 /* When using duplicate_and_interleave, we just need one element for
7892 each scalar statement. */
7893 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7894 nunits = group_size;
7896 number_of_copies = nunits * number_of_vectors / group_size;
7898 number_of_places_left_in_vector = nunits;
7899 constant_p = true;
7900 tree_vector_builder elts (vector_type, nunits, 1);
7901 elts.quick_grow (nunits);
7902 stmt_vec_info insert_after = NULL;
7903 for (j = 0; j < number_of_copies; j++)
7905 tree op;
7906 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7908 /* Create 'vect_ = {op0,op1,...,opn}'. */
7909 number_of_places_left_in_vector--;
7910 tree orig_op = op;
7911 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7913 if (CONSTANT_CLASS_P (op))
7915 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7917 /* Can't use VIEW_CONVERT_EXPR for booleans because
7918 of possibly different sizes of scalar value and
7919 vector element. */
7920 if (integer_zerop (op))
7921 op = build_int_cst (TREE_TYPE (vector_type), 0);
7922 else if (integer_onep (op))
7923 op = build_all_ones_cst (TREE_TYPE (vector_type));
7924 else
7925 gcc_unreachable ();
7927 else
7928 op = fold_unary (VIEW_CONVERT_EXPR,
7929 TREE_TYPE (vector_type), op);
7930 gcc_assert (op && CONSTANT_CLASS_P (op));
7932 else
7934 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7935 gimple *init_stmt;
7936 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7938 tree true_val
7939 = build_all_ones_cst (TREE_TYPE (vector_type));
7940 tree false_val
7941 = build_zero_cst (TREE_TYPE (vector_type));
7942 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7943 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7944 op, true_val,
7945 false_val);
7947 else
7949 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7950 op);
7951 init_stmt
7952 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7953 op);
7955 gimple_seq_add_stmt (&ctor_seq, init_stmt);
7956 op = new_temp;
7959 elts[number_of_places_left_in_vector] = op;
7960 if (!CONSTANT_CLASS_P (op))
7961 constant_p = false;
7962 /* For BB vectorization we have to compute an insert location
7963 when a def is inside the analyzed region since we cannot
7964 simply insert at the BB start in this case. */
7965 stmt_vec_info opdef;
7966 if (TREE_CODE (orig_op) == SSA_NAME
7967 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7968 && is_a <bb_vec_info> (vinfo)
7969 && (opdef = vinfo->lookup_def (orig_op)))
7971 if (!insert_after)
7972 insert_after = opdef;
7973 else
7974 insert_after = get_later_stmt (insert_after, opdef);
7977 if (number_of_places_left_in_vector == 0)
7979 if (constant_p
7980 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7981 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7982 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7983 else
7985 if (permute_results.is_empty ())
7986 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7987 elts, number_of_vectors,
7988 permute_results);
7989 vec_cst = permute_results[number_of_vectors - j - 1];
7991 if (!gimple_seq_empty_p (ctor_seq))
7993 if (insert_after)
7995 gimple_stmt_iterator gsi;
7996 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7998 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7999 gsi_insert_seq_before (&gsi, ctor_seq,
8000 GSI_CONTINUE_LINKING);
8002 else if (!stmt_ends_bb_p (insert_after->stmt))
8004 gsi = gsi_for_stmt (insert_after->stmt);
8005 gsi_insert_seq_after (&gsi, ctor_seq,
8006 GSI_CONTINUE_LINKING);
8008 else
8010 /* When we want to insert after a def whose
8011 defining stmt throws, insert on the fallthru
8012 edge. */
8013 edge e = find_fallthru_edge
8014 (gimple_bb (insert_after->stmt)->succs);
8015 basic_block new_bb
8016 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8017 gcc_assert (!new_bb);
8020 else
8021 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8022 ctor_seq = NULL;
8024 voprnds.quick_push (vec_cst);
8025 insert_after = NULL;
8026 number_of_places_left_in_vector = nunits;
8027 constant_p = true;
8028 elts.new_vector (vector_type, nunits, 1);
8029 elts.quick_grow (nunits);
8034 /* Since the vectors are created in the reverse order, we should invert
8035 them. */
8036 vec_num = voprnds.length ();
8037 for (j = vec_num; j != 0; j--)
8039 vop = voprnds[j - 1];
8040 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8043 /* In case that VF is greater than the unrolling factor needed for the SLP
8044 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8045 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8046 to replicate the vectors. */
8047 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8048 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8049 i++)
8050 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8053 /* Get the Ith vectorized definition from SLP_NODE. */
8055 tree
8056 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8058 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
8059 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8060 else
8061 return SLP_TREE_VEC_DEFS (slp_node)[i];
8064 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8066 void
8067 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8069 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8070 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
8072 unsigned j;
8073 gimple *vec_def_stmt;
8074 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
8075 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
8077 else
8078 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8081 /* Get N vectorized definitions for SLP_NODE. */
8083 void
8084 vect_get_slp_defs (vec_info *,
8085 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8087 if (n == -1U)
8088 n = SLP_TREE_CHILDREN (slp_node).length ();
8090 for (unsigned i = 0; i < n; ++i)
8092 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8093 vec<tree> vec_defs = vNULL;
8094 vect_get_slp_defs (child, &vec_defs);
8095 vec_oprnds->quick_push (vec_defs);
8099 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8100 - PERM gives the permutation that the caller wants to use for NODE,
8101 which might be different from SLP_LOAD_PERMUTATION.
8102 - DUMP_P controls whether the function dumps information. */
8104 static bool
8105 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8106 load_permutation_t &perm,
8107 const vec<tree> &dr_chain,
8108 gimple_stmt_iterator *gsi, poly_uint64 vf,
8109 bool analyze_only, bool dump_p,
8110 unsigned *n_perms, unsigned int *n_loads,
8111 bool dce_chain)
8113 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8114 int vec_index = 0;
8115 tree vectype = SLP_TREE_VECTYPE (node);
8116 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8117 unsigned int mask_element;
8118 machine_mode mode;
8120 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8121 return false;
8123 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8125 mode = TYPE_MODE (vectype);
8126 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8127 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8129 /* Initialize the vect stmts of NODE to properly insert the generated
8130 stmts later. */
8131 if (! analyze_only)
8132 for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); i < nstmts; i++)
8133 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8135 /* Generate permutation masks for every NODE. Number of masks for each NODE
8136 is equal to GROUP_SIZE.
8137 E.g., we have a group of three nodes with three loads from the same
8138 location in each node, and the vector size is 4. I.e., we have an
8139 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8140 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8141 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8144 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8145 The last mask is illegal since we assume two operands for permute
8146 operation, and the mask element values can't be outside that range.
8147 Hence, the last mask must be converted into {2,5,5,5}.
8148 For the first two permutations we need the first and the second input
8149 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8150 we need the second and the third vectors: {b1,c1,a2,b2} and
8151 {c2,a3,b3,c3}. */
8153 int vect_stmts_counter = 0;
8154 unsigned int index = 0;
8155 int first_vec_index = -1;
8156 int second_vec_index = -1;
8157 bool noop_p = true;
8158 *n_perms = 0;
8160 vec_perm_builder mask;
8161 unsigned int nelts_to_build;
8162 unsigned int nvectors_per_build;
8163 unsigned int in_nlanes;
8164 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8165 && multiple_p (nunits, group_size));
8166 if (repeating_p)
8168 /* A single vector contains a whole number of copies of the node, so:
8169 (a) all permutes can use the same mask; and
8170 (b) the permutes only need a single vector input. */
8171 mask.new_vector (nunits, group_size, 3);
8172 nelts_to_build = mask.encoded_nelts ();
8173 /* It's possible to obtain zero nstmts during analyze_only, so make
8174 it at least one to ensure the later computation for n_perms
8175 proceeds. */
8176 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8177 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8179 else
8181 /* We need to construct a separate mask for each vector statement. */
8182 unsigned HOST_WIDE_INT const_nunits, const_vf;
8183 if (!nunits.is_constant (&const_nunits)
8184 || !vf.is_constant (&const_vf))
8185 return false;
8186 mask.new_vector (const_nunits, const_nunits, 1);
8187 nelts_to_build = const_vf * group_size;
8188 nvectors_per_build = 1;
8189 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8191 auto_sbitmap used_in_lanes (in_nlanes);
8192 bitmap_clear (used_in_lanes);
8193 auto_bitmap used_defs;
8195 unsigned int count = mask.encoded_nelts ();
8196 mask.quick_grow (count);
8197 vec_perm_indices indices;
8199 for (unsigned int j = 0; j < nelts_to_build; j++)
8201 unsigned int iter_num = j / group_size;
8202 unsigned int stmt_num = j % group_size;
8203 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8204 bitmap_set_bit (used_in_lanes, i);
8205 if (repeating_p)
8207 first_vec_index = 0;
8208 mask_element = i;
8210 else
8212 /* Enforced before the loop when !repeating_p. */
8213 unsigned int const_nunits = nunits.to_constant ();
8214 vec_index = i / const_nunits;
8215 mask_element = i % const_nunits;
8216 if (vec_index == first_vec_index
8217 || first_vec_index == -1)
8219 first_vec_index = vec_index;
8221 else if (vec_index == second_vec_index
8222 || second_vec_index == -1)
8224 second_vec_index = vec_index;
8225 mask_element += const_nunits;
8227 else
8229 if (dump_p)
8230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8231 "permutation requires at "
8232 "least three vectors %G",
8233 stmt_info->stmt);
8234 gcc_assert (analyze_only);
8235 return false;
8238 gcc_assert (mask_element < 2 * const_nunits);
8241 if (mask_element != index)
8242 noop_p = false;
8243 mask[index++] = mask_element;
8245 if (index == count)
8247 if (!noop_p)
8249 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8250 if (!can_vec_perm_const_p (mode, mode, indices))
8252 if (dump_p)
8254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8255 "unsupported vect permute { ");
8256 for (i = 0; i < count; ++i)
8258 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8259 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8261 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8263 gcc_assert (analyze_only);
8264 return false;
8267 tree mask_vec = NULL_TREE;
8268 if (!analyze_only)
8269 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8271 if (second_vec_index == -1)
8272 second_vec_index = first_vec_index;
8274 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8276 ++*n_perms;
8277 if (analyze_only)
8278 continue;
8279 /* Generate the permute statement if necessary. */
8280 tree first_vec = dr_chain[first_vec_index + ri];
8281 tree second_vec = dr_chain[second_vec_index + ri];
8282 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8283 tree perm_dest
8284 = vect_create_destination_var (gimple_assign_lhs (stmt),
8285 vectype);
8286 perm_dest = make_ssa_name (perm_dest);
8287 gimple *perm_stmt
8288 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8289 second_vec, mask_vec);
8290 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8291 gsi);
8292 if (dce_chain)
8294 bitmap_set_bit (used_defs, first_vec_index + ri);
8295 bitmap_set_bit (used_defs, second_vec_index + ri);
8298 /* Store the vector statement in NODE. */
8299 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8302 else if (!analyze_only)
8304 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8306 tree first_vec = dr_chain[first_vec_index + ri];
8307 /* If mask was NULL_TREE generate the requested
8308 identity transform. */
8309 gimple *perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8310 if (dce_chain)
8311 bitmap_set_bit (used_defs, first_vec_index + ri);
8313 /* Store the vector statement in NODE. */
8314 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8318 index = 0;
8319 first_vec_index = -1;
8320 second_vec_index = -1;
8321 noop_p = true;
8325 if (n_loads)
8327 if (repeating_p)
8328 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8329 else
8331 /* Enforced above when !repeating_p. */
8332 unsigned int const_nunits = nunits.to_constant ();
8333 *n_loads = 0;
8334 bool load_seen = false;
8335 for (unsigned i = 0; i < in_nlanes; ++i)
8337 if (i % const_nunits == 0)
8339 if (load_seen)
8340 *n_loads += 1;
8341 load_seen = false;
8343 if (bitmap_bit_p (used_in_lanes, i))
8344 load_seen = true;
8346 if (load_seen)
8347 *n_loads += 1;
8351 if (dce_chain)
8352 for (unsigned i = 0; i < dr_chain.length (); ++i)
8353 if (!bitmap_bit_p (used_defs, i))
8355 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8356 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8357 gsi_remove (&rgsi, true);
8358 release_defs (stmt);
8361 return true;
8364 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8365 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8366 permute statements for the SLP node NODE. Store the number of vector
8367 permute instructions in *N_PERMS and the number of vector load
8368 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8369 that were not needed. */
8371 bool
8372 vect_transform_slp_perm_load (vec_info *vinfo,
8373 slp_tree node, const vec<tree> &dr_chain,
8374 gimple_stmt_iterator *gsi, poly_uint64 vf,
8375 bool analyze_only, unsigned *n_perms,
8376 unsigned int *n_loads, bool dce_chain)
8378 return vect_transform_slp_perm_load_1 (vinfo, node,
8379 SLP_TREE_LOAD_PERMUTATION (node),
8380 dr_chain, gsi, vf, analyze_only,
8381 dump_enabled_p (), n_perms, n_loads,
8382 dce_chain);
8385 /* Produce the next vector result for SLP permutation NODE by adding a vector
8386 statement at GSI. If MASK_VEC is nonnull, add:
8388 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8390 otherwise add:
8392 <new SSA name> = FIRST_DEF. */
8394 static void
8395 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8396 slp_tree node, tree first_def, tree second_def,
8397 tree mask_vec)
8399 tree vectype = SLP_TREE_VECTYPE (node);
8401 /* ??? We SLP match existing vector element extracts but
8402 allow punning which we need to re-instantiate at uses
8403 but have no good way of explicitly representing. */
8404 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8405 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8407 gassign *conv_stmt
8408 = gimple_build_assign (make_ssa_name (vectype),
8409 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8410 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8411 first_def = gimple_assign_lhs (conv_stmt);
8413 gassign *perm_stmt;
8414 tree perm_dest = make_ssa_name (vectype);
8415 if (mask_vec)
8417 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8418 TYPE_SIZE (vectype))
8419 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8421 gassign *conv_stmt
8422 = gimple_build_assign (make_ssa_name (vectype),
8423 build1 (VIEW_CONVERT_EXPR,
8424 vectype, second_def));
8425 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8426 second_def = gimple_assign_lhs (conv_stmt);
8428 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8429 first_def, second_def,
8430 mask_vec);
8432 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8434 /* For identity permutes we still need to handle the case
8435 of lowpart extracts or concats. */
8436 unsigned HOST_WIDE_INT c;
8437 auto first_def_nunits
8438 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8439 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8441 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8442 TYPE_SIZE (vectype), bitsize_zero_node);
8443 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8445 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8446 first_def_nunits, &c) && c == 2)
8448 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8449 NULL_TREE, second_def);
8450 perm_stmt = gimple_build_assign (perm_dest, ctor);
8452 else
8453 gcc_unreachable ();
8455 else
8457 /* We need a copy here in case the def was external. */
8458 perm_stmt = gimple_build_assign (perm_dest, first_def);
8460 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8461 /* Store the vector statement in NODE. */
8462 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8465 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8466 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8467 If GSI is nonnull, emit the permutation there.
8469 When GSI is null, the only purpose of NODE is to give properties
8470 of the result, such as the vector type and number of SLP lanes.
8471 The node does not need to be a VEC_PERM_EXPR.
8473 If the target supports the operation, return the number of individual
8474 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8475 dump file if DUMP_P is true. */
8477 static int
8478 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8479 slp_tree node, lane_permutation_t &perm,
8480 vec<slp_tree> &children, bool dump_p)
8482 tree vectype = SLP_TREE_VECTYPE (node);
8484 /* ??? We currently only support all same vector input types
8485 while the SLP IL should really do a concat + select and thus accept
8486 arbitrary mismatches. */
8487 slp_tree child;
8488 unsigned i;
8489 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8490 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8491 tree op_vectype = NULL_TREE;
8492 FOR_EACH_VEC_ELT (children, i, child)
8493 if (SLP_TREE_VECTYPE (child))
8495 op_vectype = SLP_TREE_VECTYPE (child);
8496 break;
8498 if (!op_vectype)
8499 op_vectype = vectype;
8500 FOR_EACH_VEC_ELT (children, i, child)
8502 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8503 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8504 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8505 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8507 if (dump_p)
8508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8509 "Unsupported vector types in lane permutation\n");
8510 return -1;
8512 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8513 repeating_p = false;
8516 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8517 if (dump_p)
8519 dump_printf_loc (MSG_NOTE, vect_location,
8520 "vectorizing permutation");
8521 for (unsigned i = 0; i < perm.length (); ++i)
8522 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8523 if (repeating_p)
8524 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8525 dump_printf (MSG_NOTE, "\n");
8528 /* REPEATING_P is true if every output vector is guaranteed to use the
8529 same permute vector. We can handle that case for both variable-length
8530 and constant-length vectors, but we only handle other cases for
8531 constant-length vectors.
8533 Set:
8535 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8536 mask vector that we want to build.
8538 - NCOPIES to the number of copies of PERM that we need in order
8539 to build the necessary permute mask vectors.
8541 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8542 for each permute mask vector. This is only relevant when GSI is
8543 nonnull. */
8544 uint64_t npatterns;
8545 unsigned nelts_per_pattern;
8546 uint64_t ncopies;
8547 unsigned noutputs_per_mask;
8548 if (repeating_p)
8550 /* We need a single permute mask vector that has the form:
8552 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8554 In other words, the original n-element permute in PERM is
8555 "unrolled" to fill a full vector. The stepped vector encoding
8556 that we use for permutes requires 3n elements. */
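/* E.g. a two-lane permute { X1, X2 } uses NPATTERNS == 2 and
   NELTS_PER_PATTERN == 3, encoding the elements
   { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 } from which the rest of
   the mask follows with a step of 2 in each lane position. */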
8557 npatterns = SLP_TREE_LANES (node);
8558 nelts_per_pattern = ncopies = 3;
8559 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8561 else
8563 /* Calculate every element of every permute mask vector explicitly,
8564 instead of relying on the pattern described above. */
8565 if (!nunits.is_constant (&npatterns))
8566 return -1;
8567 nelts_per_pattern = ncopies = 1;
8568 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8569 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8570 return -1;
8571 noutputs_per_mask = 1;
8573 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8574 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8576 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
8577 from the { SLP operand, scalar lane } permutation as recorded in the
8578 SLP node as an intermediate step. This part should already work
8579 with SLP children with an arbitrary number of lanes. */
8580 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8581 auto_vec<unsigned> active_lane;
8582 vperm.create (olanes);
8583 active_lane.safe_grow_cleared (children.length (), true);
8584 for (unsigned i = 0; i < ncopies; ++i)
8586 for (unsigned pi = 0; pi < perm.length (); ++pi)
8588 std::pair<unsigned, unsigned> p = perm[pi];
8589 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8590 if (repeating_p)
8591 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8592 else
8594 /* We checked above that the vectors are constant-length. */
8595 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8596 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8597 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8598 vperm.quick_push ({{p.first, vi}, vl});
8601 /* Advance to the next group. */
8602 for (unsigned j = 0; j < children.length (); ++j)
8603 active_lane[j] += SLP_TREE_LANES (children[j]);
8606 if (dump_p)
8608 dump_printf_loc (MSG_NOTE, vect_location,
8609 "vectorizing permutation");
8610 for (unsigned i = 0; i < perm.length (); ++i)
8611 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8612 if (repeating_p)
8613 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8614 dump_printf (MSG_NOTE, "\n");
8615 dump_printf_loc (MSG_NOTE, vect_location, "as");
8616 for (unsigned i = 0; i < vperm.length (); ++i)
8618 if (i != 0
8619 && (repeating_p
8620 ? multiple_p (i, npatterns)
8621 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8622 dump_printf (MSG_NOTE, ",");
8623 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8624 vperm[i].first.first, vperm[i].first.second,
8625 vperm[i].second);
8627 dump_printf (MSG_NOTE, "\n");
8630 /* We can only handle two-vector permutes; everything else should
8631 be lowered on the SLP level. The following is closely inspired
8632 by vect_transform_slp_perm_load and is supposed to eventually
8633 replace it.
8634 ??? As intermediate step do code-gen in the SLP tree representation
8635 somehow? */
8636 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8637 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8638 unsigned int index = 0;
8639 poly_uint64 mask_element;
8640 vec_perm_builder mask;
8641 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8642 unsigned int count = mask.encoded_nelts ();
8643 mask.quick_grow (count);
8644 vec_perm_indices indices;
8645 unsigned nperms = 0;
8646 for (unsigned i = 0; i < vperm.length (); ++i)
8648 mask_element = vperm[i].second;
8649 if (first_vec.first == -1U
8650 || first_vec == vperm[i].first)
8651 first_vec = vperm[i].first;
8652 else if (second_vec.first == -1U
8653 || second_vec == vperm[i].first)
8655 second_vec = vperm[i].first;
8656 mask_element += nunits;
8658 else
8660 if (dump_p)
8661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8662 "permutation requires at "
8663 "least three vectors\n");
8664 gcc_assert (!gsi);
8665 return -1;
8668 mask[index++] = mask_element;
8670 if (index == count)
8672 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8673 TYPE_VECTOR_SUBPARTS (op_vectype));
8674 bool identity_p = indices.series_p (0, 1, 0, 1);
8675 machine_mode vmode = TYPE_MODE (vectype);
8676 machine_mode op_vmode = TYPE_MODE (op_vectype);
8677 unsigned HOST_WIDE_INT c;
8678 if ((!identity_p
8679 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8680 || (identity_p
8681 && !known_le (nunits,
8682 TYPE_VECTOR_SUBPARTS (op_vectype))
8683 && (!constant_multiple_p (nunits,
8684 TYPE_VECTOR_SUBPARTS (op_vectype),
8685 &c) || c != 2)))
8687 if (dump_p)
8689 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8690 vect_location,
8691 "unsupported vect permute { ");
8692 for (i = 0; i < count; ++i)
8694 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8695 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8697 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8699 gcc_assert (!gsi);
8700 return -1;
8703 if (!identity_p)
8704 nperms++;
8705 if (gsi)
8707 if (second_vec.first == -1U)
8708 second_vec = first_vec;
8710 slp_tree
8711 first_node = children[first_vec.first],
8712 second_node = children[second_vec.first];
8714 tree mask_vec = NULL_TREE;
8715 if (!identity_p)
8716 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8718 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8720 tree first_def
8721 = vect_get_slp_vect_def (first_node,
8722 first_vec.second + vi);
8723 tree second_def
8724 = vect_get_slp_vect_def (second_node,
8725 second_vec.second + vi);
8726 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8727 second_def, mask_vec);
8731 index = 0;
8732 first_vec = std::make_pair (-1U, -1U);
8733 second_vec = std::make_pair (-1U, -1U);
8737 return nperms;
8740 /* Vectorize the SLP permutations in NODE as specified
8741 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8742 child number and lane number.
8743 Interleaving of two two-lane two-child SLP subtrees (not supported):
8744 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8745 A blend of two four-lane two-child SLP subtrees:
8746 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8747 Highpart of a four-lane one-child SLP subtree (not supported):
8748 [ { 0, 2 }, { 0, 3 } ]
8749 Of these, currently only a subset is supported by the code generation below. */
8751 static bool
8752 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8753 slp_tree node, stmt_vector_for_cost *cost_vec)
8755 tree vectype = SLP_TREE_VECTYPE (node);
8756 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8757 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8758 SLP_TREE_CHILDREN (node),
8759 dump_enabled_p ());
8760 if (nperms < 0)
8761 return false;
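/* Without an insertion iterator we are only analyzing; just record the
   cost of the permutes that will be generated. */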
8763 if (!gsi)
8764 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8766 return true;
8769 /* Vectorize SLP NODE. */
8771 static void
8772 vect_schedule_slp_node (vec_info *vinfo,
8773 slp_tree node, slp_instance instance)
8775 gimple_stmt_iterator si;
8776 int i;
8777 slp_tree child;
8779 /* For existing vectors there's nothing to do. */
8780 if (SLP_TREE_VEC_DEFS (node).exists ())
8781 return;
8783 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8785 /* Vectorize externals and constants. */
8786 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8787 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8789 /* ??? vectorizable_shift can end up using a scalar operand which is
8790 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8791 node in this case. */
8792 if (!SLP_TREE_VECTYPE (node))
8793 return;
8795 vect_create_constant_vectors (vinfo, node);
8796 return;
8799 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8801 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8802 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8804 if (dump_enabled_p ())
8805 dump_printf_loc (MSG_NOTE, vect_location,
8806 "------>vectorizing SLP node starting from: %G",
8807 stmt_info->stmt);
8809 if (STMT_VINFO_DATA_REF (stmt_info)
8810 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8812 /* Vectorized loads go before the first scalar load to make it
8813 ready early, vectorized stores go before the last scalar
8814 stmt which is where all uses are ready. */
8815 stmt_vec_info last_stmt_info = NULL;
8816 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8817 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8818 else /* DR_IS_WRITE */
8819 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8820 si = gsi_for_stmt (last_stmt_info->stmt);
8822 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8823 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8824 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8825 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8827 /* For PHI node vectorization we do not use the insertion iterator. */
8828 si = gsi_none ();
8830 else
8832 /* Emit other stmts right after the children's vectorized defs, which is
8833 the earliest possible insertion point. */
8834 gimple *last_stmt = NULL;
8835 bool seen_vector_def = false;
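/* Find the statement that comes last among the defs of all children;
   that is the earliest point at which all operands of NODE are
   available. */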
8836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8837 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8839 /* For fold-left reductions we are retaining the scalar
8840 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
8841 set, so the representation isn't perfect. Resort to the
8842 last scalar def here. */
8843 if (SLP_TREE_VEC_STMTS (child).is_empty ())
8845 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8846 == cycle_phi_info_type);
8847 gphi *phi = as_a <gphi *>
8848 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8849 if (!last_stmt
8850 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8851 last_stmt = phi;
8853 /* We are emitting all vectorized stmts in the same place, so the
8854 stmt emitted last is the last in the IL.
8855 ??? Unless we have a load permutation applied and that happens
8856 to re-use an earlier generated load. */
8857 unsigned j;
8858 gimple *vstmt;
8859 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8860 if (!last_stmt
8861 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8862 last_stmt = vstmt;
8864 else if (!SLP_TREE_VECTYPE (child))
8866 /* For externals used unvectorized (no vectype) look at all the scalar defs. */
8867 unsigned j;
8868 tree def;
8869 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8870 if (TREE_CODE (def) == SSA_NAME
8871 && !SSA_NAME_IS_DEFAULT_DEF (def))
8873 gimple *stmt = SSA_NAME_DEF_STMT (def);
8874 if (!last_stmt
8875 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8876 last_stmt = stmt;
8879 else
8881 /* For externals we have to look at all defs since their
8882 insertion place is decided per vector. But beware
8883 of pre-existing vectors where we need to make sure
8884 we do not insert before the region boundary. */
8885 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8886 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8887 seen_vector_def = true;
8888 else
8890 unsigned j;
8891 tree vdef;
8892 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8893 if (TREE_CODE (vdef) == SSA_NAME
8894 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8896 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8897 if (!last_stmt
8898 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8899 last_stmt = vstmt;
8903 /* This can happen when all children are pre-existing vectors or
8904 constants. */
8905 if (!last_stmt)
8906 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8907 if (!last_stmt)
8909 gcc_assert (seen_vector_def);
8910 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8912 else if (is_ctrl_altering_stmt (last_stmt))
8914 /* We split regions to vectorize at control altering stmts
8915 with a definition so this must be an external which
8916 we can insert at the start of the region. */
8917 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8919 else if (is_a <bb_vec_info> (vinfo)
8920 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8921 && gimple_could_trap_p (stmt_info->stmt))
8923 /* We've constrained possibly trapping operations to all come
8924 from the same basic-block; if vectorized defs would allow earlier
8925 scheduling, still force the vectorized stmts into the original block.
8926 This is only necessary for BB vectorization since for loop vect
8927 all operations are in a single BB and scalar stmt based
8928 placement doesn't play well with epilogue vectorization. */
8929 gcc_assert (dominated_by_p (CDI_DOMINATORS,
8930 gimple_bb (stmt_info->stmt),
8931 gimple_bb (last_stmt)));
8932 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
8934 else if (is_a <gphi *> (last_stmt))
8935 si = gsi_after_labels (gimple_bb (last_stmt));
8936 else
8938 si = gsi_for_stmt (last_stmt);
8939 gsi_next (&si);
8943 /* Handle purely internal nodes. */
8944 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8946 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
8947 be shared with different SLP nodes (but usually it's the same
8948 operation, apart from the case where the stmt is only there to denote
8949 the actual scalar lane defs ...). So do not call vect_transform_stmt
8950 but open-code it here (partly). */
8951 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8952 gcc_assert (done);
8953 stmt_vec_info slp_stmt_info;
8954 unsigned int i;
8955 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8956 if (STMT_VINFO_LIVE_P (slp_stmt_info))
8958 done = vectorizable_live_operation (vinfo,
8959 slp_stmt_info, &si, node,
8960 instance, i, true, NULL);
8961 gcc_assert (done);
8964 else
8965 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8968 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
8969 For loop vectorization this is done in vectorizable_call, but for SLP
8970 it needs to be deferred until the end of vect_schedule_slp, because multiple
8971 SLP instances may refer to the same scalar stmt. */
8973 static void
8974 vect_remove_slp_scalar_calls (vec_info *vinfo,
8975 slp_tree node, hash_set<slp_tree> &visited)
8977 gimple *new_stmt;
8978 gimple_stmt_iterator gsi;
8979 int i;
8980 slp_tree child;
8981 tree lhs;
8982 stmt_vec_info stmt_info;
8984 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8985 return;
8987 if (visited.add (node))
8988 return;
8990 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8991 vect_remove_slp_scalar_calls (vinfo, child, visited);
8993 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8995 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8996 if (!stmt || gimple_bb (stmt) == NULL)
8997 continue;
8998 if (is_pattern_stmt_p (stmt_info)
8999 || !PURE_SLP_STMT (stmt_info))
9000 continue;
9001 lhs = gimple_call_lhs (stmt);
9002 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9003 gsi = gsi_for_stmt (stmt);
9004 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9005 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
9009 static void
9010 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9012 hash_set<slp_tree> visited;
9013 vect_remove_slp_scalar_calls (vinfo, node, visited);
9016 /* Vectorize the instance root. */
9018 void
9019 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9021 gassign *rstmt = NULL;
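/* For a constructor root replace the scalar CONSTRUCTOR either directly
   with the single vectorized def or with a CONSTRUCTOR combining the
   multiple vector defs. */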
9023 if (instance->kind == slp_inst_kind_ctor)
9025 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9027 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
9028 tree vect_lhs = gimple_get_lhs (child_stmt);
9029 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9030 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9031 TREE_TYPE (vect_lhs)))
9032 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9033 vect_lhs);
9034 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9036 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9038 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9039 gimple *child_stmt;
9040 int j;
9041 vec<constructor_elt, va_gc> *v;
9042 vec_alloc (v, nelts);
9044 /* A CTOR can handle V16HI composition from VNx8HI so we
9045 do not need to convert vector elements if the types
9046 do not match. */
9047 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
9048 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
9049 gimple_get_lhs (child_stmt));
9050 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9051 tree rtype
9052 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9053 tree r_constructor = build_constructor (rtype, v);
9054 rstmt = gimple_build_assign (lhs, r_constructor);
9057 else if (instance->kind == slp_inst_kind_bb_reduc)
9059 /* Largely inspired by reduction chain epilogue handling in
9060 vect_create_epilog_for_reduction. */
9061 vec<tree> vec_defs = vNULL;
9062 vect_get_slp_defs (node, &vec_defs);
9063 enum tree_code reduc_code
9064 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9065 /* ??? We actually have to reflect signs somewhere. */
9066 if (reduc_code == MINUS_EXPR)
9067 reduc_code = PLUS_EXPR;
9068 gimple_seq epilogue = NULL;
9069 /* We may end up with more than one vector result; reduce them
9070 to a single vector. */
9071 tree vec_def = vec_defs[0];
9072 for (unsigned i = 1; i < vec_defs.length (); ++i)
9073 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9074 vec_def, vec_defs[i]);
9075 vec_defs.release ();
9076 /* ??? Support schemes other than a direct internal fn. */
9077 internal_fn reduc_fn;
9078 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9079 || reduc_fn == IFN_LAST)
9080 gcc_unreachable ();
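/* Reduce the remaining vector to a scalar with the reduction internal
   function, insert the epilogue before the root stmt and rewrite the
   root to use the scalar result. */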
9081 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9082 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
9084 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9085 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9086 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9087 update_stmt (gsi_stmt (rgsi));
9088 return;
9090 else
9091 gcc_unreachable ();
9093 gcc_assert (rstmt);
9095 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9096 gsi_replace (&rgsi, rstmt, true);
9099 struct slp_scc_info
9101 bool on_stack;
9102 int dfs;
9103 int lowlink;
9106 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9108 static void
9109 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9110 hash_map<slp_tree, slp_scc_info> &scc_info,
9111 int &maxdfs, vec<slp_tree> &stack)
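/* This is essentially Tarjan's SCC algorithm: nodes get DFS and lowlink
   numbers, stay on STACK while their SCC is open, and a complete SCC is
   popped and scheduled once its root (lowlink == dfs) is reached. */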
9113 bool existed_p;
9114 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9115 gcc_assert (!existed_p);
9116 info->dfs = maxdfs;
9117 info->lowlink = maxdfs;
9118 maxdfs++;
9120 /* Leaf. */
9121 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9123 info->on_stack = false;
9124 vect_schedule_slp_node (vinfo, node, instance);
9125 return;
9128 info->on_stack = true;
9129 stack.safe_push (node);
9131 unsigned i;
9132 slp_tree child;
9133 /* DFS recurse. */
9134 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9136 if (!child)
9137 continue;
9138 slp_scc_info *child_info = scc_info.get (child);
9139 if (!child_info)
9141 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9142 /* Recursion might have grown the scc_info hash map and invalidated the pointers; re-fetch them. */
9143 info = scc_info.get (node);
9144 child_info = scc_info.get (child);
9145 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9147 else if (child_info->on_stack)
9148 info->lowlink = MIN (info->lowlink, child_info->dfs);
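/* If this node is not an SCC root leave it on the stack; the root
   higher up will pop and schedule the whole SCC. */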
9150 if (info->lowlink != info->dfs)
9151 return;
9153 auto_vec<slp_tree, 4> phis_to_fixup;
9155 /* Singleton. */
9156 if (stack.last () == node)
9158 stack.pop ();
9159 info->on_stack = false;
9160 vect_schedule_slp_node (vinfo, node, instance);
9161 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9162 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9163 phis_to_fixup.quick_push (node);
9165 else
9167 /* SCC. */
9168 int last_idx = stack.length () - 1;
9169 while (stack[last_idx] != node)
9170 last_idx--;
9171 /* We can break the cycle at PHIs that have at least one child
9172 code generated. Then we could re-start the DFS walk until
9173 all nodes in the SCC are covered (we might have new entries
9174 for only back-reachable nodes). But it's simpler to just
9175 iterate and schedule those that are ready. */
9176 unsigned todo = stack.length () - last_idx;
do
9179 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9181 slp_tree entry = stack[idx];
9182 if (!entry)
9183 continue;
9184 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9185 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9186 bool ready = !phi;
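/* A non-PHI is ready only when none of its children are still on the
   stack; a PHI is ready as soon as one child is already scheduled (or
   is a missing backedge arg), its backedge defs are fixed up afterwards. */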
9187 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9188 if (!child)
9190 gcc_assert (phi);
9191 ready = true;
9192 break;
9194 else if (scc_info.get (child)->on_stack)
9196 if (!phi)
9198 ready = false;
9199 break;
9202 else
9204 if (phi)
9206 ready = true;
9207 break;
9210 if (ready)
9212 vect_schedule_slp_node (vinfo, entry, instance);
9213 scc_info.get (entry)->on_stack = false;
9214 stack[idx] = NULL;
9215 todo--;
9216 if (phi)
9217 phis_to_fixup.safe_push (entry);
9221 while (todo != 0);
9223 /* Pop the SCC. */
9224 stack.truncate (last_idx);
9227 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9228 slp_tree phi_node;
9229 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9231 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9232 edge_iterator ei;
9233 edge e;
9234 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9236 unsigned dest_idx = e->dest_idx;
9237 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9238 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9239 continue;
9240 unsigned n = SLP_TREE_VEC_STMTS (phi_node).length ();
9241 /* Simply fill all args. */
9242 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9243 != vect_first_order_recurrence)
9244 for (unsigned i = 0; i < n; ++i)
9245 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9246 vect_get_slp_vect_def (child, i),
9247 e, gimple_phi_arg_location (phi, dest_idx));
9248 else
9250 /* Unless it is a first order recurrence which needs
9251 args filled in for both the PHI node and the permutes. */
9252 gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[0];
9253 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9254 add_phi_arg (as_a <gphi *> (rphi),
9255 vect_get_slp_vect_def (child, n - 1),
9256 e, gimple_phi_arg_location (phi, dest_idx));
9257 for (unsigned i = 0; i < n; ++i)
9259 gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[i];
9260 if (i > 0)
9261 gimple_assign_set_rhs1 (perm,
9262 vect_get_slp_vect_def (child, i - 1));
9263 gimple_assign_set_rhs2 (perm,
9264 vect_get_slp_vect_def (child, i));
9265 update_stmt (perm);
9272 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9274 void
9275 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9277 slp_instance instance;
9278 unsigned int i;
9280 hash_map<slp_tree, slp_scc_info> scc_info;
9281 int maxdfs = 0;
9282 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9284 slp_tree node = SLP_INSTANCE_TREE (instance);
9285 if (dump_enabled_p ())
9287 dump_printf_loc (MSG_NOTE, vect_location,
9288 "Vectorizing SLP tree:\n");
9289 /* ??? Dump all? */
9290 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9291 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9292 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9293 vect_print_slp_graph (MSG_NOTE, vect_location,
9294 SLP_INSTANCE_TREE (instance));
9296 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
9297 is the node breaking the cycle. */
9298 auto_vec<slp_tree> stack;
9299 if (!scc_info.get (node))
9300 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9302 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9303 vectorize_slp_instance_root_stmt (node, instance);
9305 if (dump_enabled_p ())
9306 dump_printf_loc (MSG_NOTE, vect_location,
9307 "vectorizing stmts using SLP.\n");
9310 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9312 slp_tree root = SLP_INSTANCE_TREE (instance);
9313 stmt_vec_info store_info;
9314 unsigned int j;
9316 /* Remove scalar call stmts. Do not do this for basic-block
9317 vectorization as not all uses may be vectorized.
9318 ??? Why should this be necessary? DCE should be able to
9319 remove the stmts itself.
9320 ??? For BB vectorization we can as well remove scalar
9321 stmts starting from the SLP tree root if they have no
9322 uses. */
9323 if (is_a <loop_vec_info> (vinfo))
9324 vect_remove_slp_scalar_calls (vinfo, root);
9326 /* Remove the vectorized stores' original scalar stmts. */
9327 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9329 if (!STMT_VINFO_DATA_REF (store_info)
9330 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9331 break;
9333 store_info = vect_orig_stmt (store_info);
9334 /* Free the attached stmt_vec_info and remove the stmt. */
9335 vinfo->remove_stmt (store_info);
9337 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9338 so that vect_free_slp_tree does not crash later. */
9339 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9340 SLP_TREE_REPRESENTATIVE (root) = NULL;