gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
60 void
61 vect_slp_init (void)
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
66 void
67 vect_slp_fini (void)
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
75 void *
76 _slp_tree::operator new (size_t n)
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
82 void
83 _slp_tree::operator delete (void *node, size_t n)
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
90 /* Initialize a SLP node. */
92 _slp_tree::_slp_tree ()
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
117 /* Tear down a SLP node. */
119 _slp_tree::~_slp_tree ()
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
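/* Illustrative sketch, not part of the original file: the constructor and
   destructor above keep every live SLP node on an intrusive doubly-linked
   list headed by slp_first_node, which is what lets vect_slp_fini reclaim
   any nodes still alive at the end of the pass.  A minimal stand-alone
   version of the same pattern (hypothetical names) would be:

     struct node
     {
       static node *first;
       node *prev = nullptr, *next = nullptr;
       node () { if (first) first->prev = this; next = first; first = this; }
       ~node ()
       {
         if (prev) prev->next = next; else first = next;
         if (next) next->prev = prev;
       }
     };
     node *node::first = nullptr;  */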
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
140 void
141 vect_free_slp_tree (slp_tree node)
143 int i;
144 slp_tree child;
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
153 /* If the node defines any SLP only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
163 delete node;
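/* Worked example, illustrative only: SLP nodes are shared between parents,
   so the recursion above only frees a node once its reference count drops
   to zero.  For a diamond-shaped graph

       A           initial SLP_TREE_REF_COUNT: A = 1, B = 1, C = 1, D = 2
      / \          (D is a child of both B and C)
     B   C
      \ /
       D

   vect_free_slp_tree (A) recurses into B and C; each of them decrements D
   once, and D itself is deleted only on the second visit, when its count
   finally reaches zero.  */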
166 /* Return a location suitable for dumps related to the SLP instance. */
168 dump_user_location_t
169 _slp_instance::location () const
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
178 /* Free the memory allocated for the SLP instance. */
180 void
181 vect_free_slp_instance (slp_instance instance)
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
192 /* Create an SLP node with room for NOPS children and operation CODE. */
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
204 /* Create an SLP node for SCALAR_STMTS. */
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
226 /* Create an SLP node for OPS. */
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
237 /* Create an SLP node for OPS. */
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
242 return vect_create_new_slp_node (new _slp_tree, ops);
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
285 return oprnds_info;
289 /* Free operands info. */
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
294 int i;
295 slp_oprnd_info oprnd_info;
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
304 oprnds_info.release ();
308 /* Return true if STMTS contains a pattern statement. */
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
330 /* Pre-existing vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
342 return true;
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
367 while (next_stmt_info);
369 return -1;
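/* Worked example, illustrative only and assuming DR_GROUP_GAP of a
   non-leading member is its distance, in elements, from the previous member
   of the chain: for a group loading a[0], a[1] and a[3] (a gap at a[2]) the
   places returned are

     member   DR_GROUP_GAP   place
     a[0]          -           0
     a[1]          1           1
     a[3]          2           3

   i.e. the place is the element offset within the (gappy) access pattern,
   not the ordinal position in the statement chain.  */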
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull). */
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
414 for (unsigned int i = 0; i < 3; ++i)
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 machine_mode vmode = TYPE_MODE (vector_type);
424 if (can_vec_perm_const_p (vmode, vmode, indices1)
425 && can_vec_perm_const_p (vmode, vmode, indices2))
427 if (nvectors_out)
428 *nvectors_out = nvectors;
429 if (vector_type_out)
430 *vector_type_out = vector_type;
431 if (permutes)
433 permutes[0] = vect_gen_perm_mask_checked (vector_type,
434 indices1);
435 permutes[1] = vect_gen_perm_mask_checked (vector_type,
436 indices2);
438 return true;
442 if (!multiple_p (elt_bytes, 2, &elt_bytes))
443 return false;
444 nvectors *= 2;
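/* Worked example, illustrative only: with COUNT == 4 and ELT_TYPE == char
   on a target with 128-bit vectors, base_vector_type is effectively V16QI
   and elt_bytes starts at 4.  The first iteration therefore tries 32-bit
   integer elements: vector_type becomes V4SI (the same 16-byte size) and
   the two permutes are the interleave-low selector {0,4,1,5,...} and the
   interleave-high selector {2,6,3,7,...} on a pair of V4SI inputs.  If the
   integer mode or either permute is not supported, elt_bytes is halved to
   2, NVECTORS doubled to 2, and the loop retries with 16-bit elements.  */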
448 /* Return true if DTA and DTB match. */
450 static bool
451 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
453 return (dta == dtb
454 || ((dta == vect_external_def || dta == vect_constant_def)
455 && (dtb == vect_external_def || dtb == vect_constant_def)));
458 static const int cond_expr_maps[3][5] = {
459 { 4, -1, -2, 1, 2 },
460 { 4, -2, -1, 1, 2 },
461 { 4, -1, -2, 2, 1 }
463 static const int arg1_map[] = { 1, 1 };
464 static const int arg2_map[] = { 1, 2 };
465 static const int arg1_arg4_map[] = { 2, 1, 4 };
466 static const int op1_op0_map[] = { 2, 1, 0 };
468 /* For most SLP statements, there is a one-to-one mapping between
469 gimple arguments and child nodes. If that is not true for STMT,
470 return an array that contains:
472 - the number of child nodes, followed by
473 - for each child node, the index of the argument associated with that node.
474 The special index -1 is the first operand of an embedded comparison and
475 the special index -2 is the second operand of an embedded comparison.
477 SWAP is as for vect_get_and_check_slp_defs. */
479 static const int *
480 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
482 if (auto assign = dyn_cast<const gassign *> (stmt))
484 if (gimple_assign_rhs_code (assign) == COND_EXPR
485 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
486 return cond_expr_maps[swap];
487 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
488 && swap)
489 return op1_op0_map;
491 gcc_assert (!swap);
492 if (auto call = dyn_cast<const gcall *> (stmt))
494 if (gimple_call_internal_p (call))
495 switch (gimple_call_internal_fn (call))
497 case IFN_MASK_LOAD:
498 return arg2_map;
500 case IFN_GATHER_LOAD:
501 return arg1_map;
503 case IFN_MASK_GATHER_LOAD:
504 return arg1_arg4_map;
506 default:
507 break;
510 return nullptr;
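/* Worked example, illustrative only: each map starts with the number of SLP
   children followed by the gimple argument feeding each child.  arg2_map
   == { 1, 2 } says an IFN_MASK_LOAD call contributes a single child built
   from call argument 2 (its mask); arg1_arg4_map == { 2, 1, 4 } gives
   IFN_MASK_GATHER_LOAD two children, from arguments 1 and 4.  For a
   COND_EXPR with an embedded comparison, cond_expr_maps[0]
   == { 4, -1, -2, 1, 2 } expands to four children: the two operands of the
   comparison (the negative indices) followed by gimple arguments 1 and 2 of
   the assignment, i.e. the then and else values.  */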
513 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
514 they are of a valid type and that they match the defs of the first stmt of
515 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
516 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
517 indicates a swap is required for cond_expr stmts. Specifically, SWAP
518 is 1 if STMT is a COND_EXPR and the operands of its comparison need to be swapped;
519 SWAP is 2 if STMT is a COND_EXPR and the code of its comparison needs to be inverted.
521 If there was a fatal error return -1; if the error could be corrected by
522 swapping operands of the parent node of this one, return 1; if everything is
523 ok return 0. */
524 static int
525 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
526 bool *skip_args,
527 vec<stmt_vec_info> stmts, unsigned stmt_num,
528 vec<slp_oprnd_info> *oprnds_info)
530 stmt_vec_info stmt_info = stmts[stmt_num];
531 tree oprnd;
532 unsigned int i, number_of_oprnds;
533 enum vect_def_type dt = vect_uninitialized_def;
534 slp_oprnd_info oprnd_info;
535 unsigned int commutative_op = -1U;
536 bool first = stmt_num == 0;
538 if (!is_a<gcall *> (stmt_info->stmt)
539 && !is_a<gassign *> (stmt_info->stmt)
540 && !is_a<gphi *> (stmt_info->stmt))
541 return -1;
543 number_of_oprnds = gimple_num_args (stmt_info->stmt);
544 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
545 if (map)
546 number_of_oprnds = *map++;
547 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
549 if (gimple_call_internal_p (stmt))
551 internal_fn ifn = gimple_call_internal_fn (stmt);
552 commutative_op = first_commutative_argument (ifn);
555 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
557 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
558 commutative_op = 0;
561 bool swapped = (swap != 0);
562 bool backedge = false;
563 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
564 for (i = 0; i < number_of_oprnds; i++)
566 int opno = map ? map[i] : int (i);
567 if (opno < 0)
568 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
569 else
571 oprnd = gimple_arg (stmt_info->stmt, opno);
572 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
573 backedge = dominated_by_p (CDI_DOMINATORS,
574 gimple_phi_arg_edge (stmt, opno)->src,
575 gimple_bb (stmt_info->stmt));
577 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
578 oprnd = TREE_OPERAND (oprnd, 0);
580 oprnd_info = (*oprnds_info)[i];
582 stmt_vec_info def_stmt_info;
583 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: can't analyze def for %T\n",
588 oprnd);
590 return -1;
593 if (skip_args[i])
595 oprnd_info->def_stmts.quick_push (NULL);
596 oprnd_info->ops.quick_push (NULL_TREE);
597 oprnd_info->first_dt = vect_uninitialized_def;
598 continue;
601 oprnd_info->def_stmts.quick_push (def_stmt_info);
602 oprnd_info->ops.quick_push (oprnd);
604 if (def_stmt_info
605 && is_pattern_stmt_p (def_stmt_info))
607 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
608 != def_stmt_info)
609 oprnd_info->any_pattern = true;
610 else
611 /* If we promote this to external use the original stmt def. */
612 oprnd_info->ops.last ()
613 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
616 /* If there's an extern def on a backedge make sure we can
617 code-generate at the region start.
618 ??? This is another case that could be fixed by adjusting
619 how we split the function but at the moment we'd have conflicting
620 goals there. */
621 if (backedge
622 && dts[i] == vect_external_def
623 && is_a <bb_vec_info> (vinfo)
624 && TREE_CODE (oprnd) == SSA_NAME
625 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
626 && !dominated_by_p (CDI_DOMINATORS,
627 as_a <bb_vec_info> (vinfo)->bbs[0],
628 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
630 if (dump_enabled_p ())
631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
632 "Build SLP failed: extern def %T only defined "
633 "on backedge\n", oprnd);
634 return -1;
637 if (first)
639 tree type = TREE_TYPE (oprnd);
640 dt = dts[i];
641 if ((dt == vect_constant_def
642 || dt == vect_external_def)
643 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
644 && (TREE_CODE (type) == BOOLEAN_TYPE
645 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
646 type)))
648 if (dump_enabled_p ())
649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
650 "Build SLP failed: invalid type of def "
651 "for variable-length SLP %T\n", oprnd);
652 return -1;
655 /* For the swapping logic below force vect_reduction_def
656 for the reduction op in a SLP reduction group. */
657 if (!STMT_VINFO_DATA_REF (stmt_info)
658 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
659 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
660 && def_stmt_info)
661 dts[i] = dt = vect_reduction_def;
663 /* Check the types of the definition. */
664 switch (dt)
666 case vect_external_def:
667 case vect_constant_def:
668 case vect_internal_def:
669 case vect_reduction_def:
670 case vect_induction_def:
671 case vect_nested_cycle:
672 break;
674 default:
675 /* FORNOW: Not supported. */
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
678 "Build SLP failed: illegal type of def %T\n",
679 oprnd);
680 return -1;
683 oprnd_info->first_dt = dt;
684 oprnd_info->first_op_type = type;
687 if (first)
688 return 0;
690 /* Now match the operand definition types to that of the first stmt. */
691 for (i = 0; i < number_of_oprnds;)
693 if (skip_args[i])
695 ++i;
696 continue;
699 oprnd_info = (*oprnds_info)[i];
700 dt = dts[i];
701 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
702 oprnd = oprnd_info->ops[stmt_num];
703 tree type = TREE_TYPE (oprnd);
705 if (!types_compatible_p (oprnd_info->first_op_type, type))
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 "Build SLP failed: different operand types\n");
710 return 1;
713 /* Not first stmt of the group, check that the def-stmt/s match
714 the def-stmt/s of the first stmt. Allow different definition
715 types for reduction chains: the first stmt must be a
716 vect_reduction_def (a phi node), and the rest
717 end in the reduction chain. */
718 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
719 && !(oprnd_info->first_dt == vect_reduction_def
720 && !STMT_VINFO_DATA_REF (stmt_info)
721 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
722 && def_stmt_info
723 && !STMT_VINFO_DATA_REF (def_stmt_info)
724 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
725 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
726 || (!STMT_VINFO_DATA_REF (stmt_info)
727 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
728 && ((!def_stmt_info
729 || STMT_VINFO_DATA_REF (def_stmt_info)
730 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
731 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
732 != (oprnd_info->first_dt != vect_reduction_def))))
734 /* Try swapping operands if we got a mismatch. For BB
735 vectorization only in case it will clearly improve things. */
736 if (i == commutative_op && !swapped
737 && (!is_a <bb_vec_info> (vinfo)
738 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
739 dts[i+1])
740 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
741 || vect_def_types_match
742 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "trying swapped operands\n");
747 std::swap (dts[i], dts[i+1]);
748 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
749 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
750 std::swap ((*oprnds_info)[i]->ops[stmt_num],
751 (*oprnds_info)[i+1]->ops[stmt_num]);
752 swapped = true;
753 continue;
756 if (is_a <bb_vec_info> (vinfo)
757 && !oprnd_info->any_pattern)
759 /* Now for commutative ops we should see whether we can
760 make the other operand match. */
761 if (dump_enabled_p ())
762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
763 "treating operand as external\n");
764 oprnd_info->first_dt = dt = vect_external_def;
766 else
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
770 "Build SLP failed: different types\n");
771 return 1;
775 /* Make sure to demote the overall operand to external. */
776 if (dt == vect_external_def)
777 oprnd_info->first_dt = vect_external_def;
778 /* For a SLP reduction chain we want to duplicate the reduction to
779 each of the chain members. That gets us a sane SLP graph (though
780 the stmts are not 100% correct wrt the initial values). */
781 else if ((dt == vect_internal_def
782 || dt == vect_reduction_def)
783 && oprnd_info->first_dt == vect_reduction_def
784 && !STMT_VINFO_DATA_REF (stmt_info)
785 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
786 && !STMT_VINFO_DATA_REF (def_stmt_info)
787 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
788 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
790 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
791 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
794 ++i;
797 /* Swap operands. */
798 if (swapped)
800 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location,
802 "swapped operands to match def types in %G",
803 stmt_info->stmt);
806 return 0;
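/* Worked example, illustrative only: for a commutative operation the
   mismatch handling above can repair a group such as

     lane 0:  x0 = a0 * 3     operand 0: internal def, operand 1: constant
     lane 1:  x1 = 4 * a1     operand 0: constant, operand 1: internal def

   Lane 1's operand def types do not match lane 0's, but because the
   operation is commutative the code swaps lane 1's dts, def_stmts and ops
   between the two operand slots and retries, after which both slots have
   consistent def types across the group (the conditions are stricter for
   BB vectorization).  */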
809 /* Return true if call statements CALL1 and CALL2 are similar enough
810 to be combined into the same SLP group. */
812 bool
813 compatible_calls_p (gcall *call1, gcall *call2)
815 unsigned int nargs = gimple_call_num_args (call1);
816 if (nargs != gimple_call_num_args (call2))
817 return false;
819 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
820 return false;
822 if (gimple_call_internal_p (call1))
824 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
825 TREE_TYPE (gimple_call_lhs (call2))))
826 return false;
827 for (unsigned int i = 0; i < nargs; ++i)
828 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
829 TREE_TYPE (gimple_call_arg (call2, i))))
830 return false;
832 else
834 if (!operand_equal_p (gimple_call_fn (call1),
835 gimple_call_fn (call2), 0))
836 return false;
838 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
839 return false;
842 /* Check that any unvectorized arguments are equal. */
843 if (const int *map = vect_get_operand_map (call1))
845 unsigned int nkept = *map++;
846 unsigned int mapi = 0;
847 for (unsigned int i = 0; i < nargs; ++i)
848 if (mapi < nkept && map[mapi] == int (i))
849 mapi += 1;
850 else if (!operand_equal_p (gimple_call_arg (call1, i),
851 gimple_call_arg (call2, i)))
852 return false;
855 return true;
858 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
859 caller's attempt to find the vector type in STMT_INFO with the narrowest
860 element type. Return true if VECTYPE is nonnull and if it is valid
861 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
862 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
863 vect_build_slp_tree. */
865 static bool
866 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
867 unsigned int group_size,
868 tree vectype, poly_uint64 *max_nunits)
870 if (!vectype)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
874 "Build SLP failed: unsupported data-type in %G\n",
875 stmt_info->stmt);
876 /* Fatal mismatch. */
877 return false;
880 /* If populating the vector type requires unrolling then fail
881 before adjusting *max_nunits for basic-block vectorization. */
882 if (is_a <bb_vec_info> (vinfo)
883 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
887 "Build SLP failed: unrolling required "
888 "in basic block SLP\n");
889 /* Fatal mismatch. */
890 return false;
893 /* In case of multiple types we need to detect the smallest type. */
894 vect_update_max_nunits (max_nunits, vectype);
895 return true;
898 /* Verify whether the scalar stmts STMTS are isomorphic, do not require
899 data permutation and do not use unsupported operations. Return
900 true if they are, otherwise return false and indicate in *MATCHES
901 which stmts are not isomorphic to the first one. If MATCHES[0]
902 is false then this indicates the comparison could not be
903 carried out or the stmts will never be vectorized by SLP.
905 Note a COND_EXPR is possibly isomorphic to another one after swapping its
906 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
907 the first stmt by swapping the two operands of its comparison; set SWAP[i]
908 to 2 if stmt I is isomorphic to the first stmt by inverting the code
909 of the comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
910 to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
912 static bool
913 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
914 vec<stmt_vec_info> stmts, unsigned int group_size,
915 poly_uint64 *max_nunits, bool *matches,
916 bool *two_operators, tree *node_vectype)
918 unsigned int i;
919 stmt_vec_info first_stmt_info = stmts[0];
920 code_helper first_stmt_code = ERROR_MARK;
921 code_helper alt_stmt_code = ERROR_MARK;
922 code_helper rhs_code = ERROR_MARK;
923 code_helper first_cond_code = ERROR_MARK;
924 tree lhs;
925 bool need_same_oprnds = false;
926 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
927 stmt_vec_info first_load = NULL, prev_first_load = NULL;
928 bool first_stmt_load_p = false, load_p = false;
929 bool first_stmt_phi_p = false, phi_p = false;
930 bool maybe_soft_fail = false;
931 tree soft_fail_nunits_vectype = NULL_TREE;
933 /* For every stmt in NODE find its def stmt/s. */
934 stmt_vec_info stmt_info;
935 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
937 gimple *stmt = stmt_info->stmt;
938 swap[i] = 0;
939 matches[i] = false;
941 if (dump_enabled_p ())
942 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
944 /* Fail to vectorize statements marked as unvectorizable, statements
945 that can throw or statements with volatile operands. */
946 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
947 || stmt_can_throw_internal (cfun, stmt)
948 || gimple_has_volatile_ops (stmt))
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "Build SLP failed: unvectorizable statement %G",
953 stmt);
954 /* ??? For BB vectorization we want to commute operands in a way
955 that shuffles all unvectorizable defs into one operand and keeps
956 the other still vectorizable. The following doesn't reliably
957 work for this, but it's the easiest we can do here. */
958 if (is_a <bb_vec_info> (vinfo) && i != 0)
959 continue;
960 /* Fatal mismatch. */
961 matches[0] = false;
962 return false;
965 lhs = gimple_get_lhs (stmt);
966 if (lhs == NULL_TREE)
968 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "Build SLP failed: not GIMPLE_ASSIGN nor "
971 "GIMPLE_CALL %G", stmt);
972 if (is_a <bb_vec_info> (vinfo) && i != 0)
973 continue;
974 /* Fatal mismatch. */
975 matches[0] = false;
976 return false;
979 tree nunits_vectype;
980 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
981 &nunits_vectype, group_size))
983 if (is_a <bb_vec_info> (vinfo) && i != 0)
984 continue;
985 /* Fatal mismatch. */
986 matches[0] = false;
987 return false;
989 /* Record nunits required but continue analysis, producing matches[]
990 as if nunits was not an issue. This allows splitting of groups
991 to happen. */
992 if (nunits_vectype
993 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
994 nunits_vectype, max_nunits))
996 gcc_assert (is_a <bb_vec_info> (vinfo));
997 maybe_soft_fail = true;
998 soft_fail_nunits_vectype = nunits_vectype;
1001 gcc_assert (vectype);
1003 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1004 if (call_stmt)
1006 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1007 if (cfn != CFN_LAST)
1008 rhs_code = cfn;
1009 else
1010 rhs_code = CALL_EXPR;
1012 if (cfn == CFN_MASK_LOAD
1013 || cfn == CFN_GATHER_LOAD
1014 || cfn == CFN_MASK_GATHER_LOAD)
1015 load_p = true;
1016 else if ((internal_fn_p (cfn)
1017 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1018 || gimple_call_tail_p (call_stmt)
1019 || gimple_call_noreturn_p (call_stmt)
1020 || gimple_call_chain (call_stmt))
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 "Build SLP failed: unsupported call type %G",
1025 call_stmt);
1026 if (is_a <bb_vec_info> (vinfo) && i != 0)
1027 continue;
1028 /* Fatal mismatch. */
1029 matches[0] = false;
1030 return false;
1033 else if (gimple_code (stmt) == GIMPLE_PHI)
1035 rhs_code = ERROR_MARK;
1036 phi_p = true;
1038 else
1040 rhs_code = gimple_assign_rhs_code (stmt);
1041 load_p = gimple_vuse (stmt);
1044 /* Check the operation. */
1045 if (i == 0)
1047 *node_vectype = vectype;
1048 first_stmt_code = rhs_code;
1049 first_stmt_load_p = load_p;
1050 first_stmt_phi_p = phi_p;
1052 /* Shift arguments should be equal in all the packed stmts for a
1053 vector shift with a scalar shift operand. */
1054 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1055 || rhs_code == LROTATE_EXPR
1056 || rhs_code == RROTATE_EXPR)
1058 /* First see if we have a vector/vector shift. */
1059 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1061 /* No vector/vector shift, try for a vector/scalar shift. */
1062 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "Build SLP failed: "
1067 "op not supported by target.\n");
1068 if (is_a <bb_vec_info> (vinfo) && i != 0)
1069 continue;
1070 /* Fatal mismatch. */
1071 matches[0] = false;
1072 return false;
1074 need_same_oprnds = true;
1075 first_op1 = gimple_assign_rhs2 (stmt);
1078 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1080 need_same_oprnds = true;
1081 first_op1 = gimple_assign_rhs2 (stmt);
1083 else if (!load_p
1084 && rhs_code == BIT_FIELD_REF)
1086 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1087 if (!is_a <bb_vec_info> (vinfo)
1088 || TREE_CODE (vec) != SSA_NAME
1089 /* When the element types are not compatible we pun the
1090 source to the target vectype which requires equal size. */
1091 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1092 || !types_compatible_p (TREE_TYPE (vectype),
1093 TREE_TYPE (TREE_TYPE (vec))))
1094 && !operand_equal_p (TYPE_SIZE (vectype),
1095 TYPE_SIZE (TREE_TYPE (vec)))))
1097 if (dump_enabled_p ())
1098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1099 "Build SLP failed: "
1100 "BIT_FIELD_REF not supported\n");
1101 /* Fatal mismatch. */
1102 matches[0] = false;
1103 return false;
1106 else if (rhs_code == CFN_DIV_POW2)
1108 need_same_oprnds = true;
1109 first_op1 = gimple_call_arg (call_stmt, 1);
1112 else
1114 if (first_stmt_code != rhs_code
1115 && alt_stmt_code == ERROR_MARK)
1116 alt_stmt_code = rhs_code;
1117 if ((first_stmt_code != rhs_code
1118 && (first_stmt_code != IMAGPART_EXPR
1119 || rhs_code != REALPART_EXPR)
1120 && (first_stmt_code != REALPART_EXPR
1121 || rhs_code != IMAGPART_EXPR)
1122 /* Handle mismatches in plus/minus by computing both
1123 and merging the results. */
1124 && !((first_stmt_code == PLUS_EXPR
1125 || first_stmt_code == MINUS_EXPR)
1126 && (alt_stmt_code == PLUS_EXPR
1127 || alt_stmt_code == MINUS_EXPR)
1128 && rhs_code == alt_stmt_code)
1129 && !(first_stmt_code.is_tree_code ()
1130 && rhs_code.is_tree_code ()
1131 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1132 == tcc_comparison)
1133 && (swap_tree_comparison (tree_code (first_stmt_code))
1134 == tree_code (rhs_code)))
1135 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1136 && (first_stmt_code == ARRAY_REF
1137 || first_stmt_code == BIT_FIELD_REF
1138 || first_stmt_code == INDIRECT_REF
1139 || first_stmt_code == COMPONENT_REF
1140 || first_stmt_code == MEM_REF)
1141 && (rhs_code == ARRAY_REF
1142 || rhs_code == BIT_FIELD_REF
1143 || rhs_code == INDIRECT_REF
1144 || rhs_code == COMPONENT_REF
1145 || rhs_code == MEM_REF)))
1146 || first_stmt_load_p != load_p
1147 || first_stmt_phi_p != phi_p)
1149 if (dump_enabled_p ())
1151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 "Build SLP failed: different operation "
1153 "in stmt %G", stmt);
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "original stmt %G", first_stmt_info->stmt);
1157 /* Mismatch. */
1158 continue;
1161 if (!load_p
1162 && first_stmt_code == BIT_FIELD_REF
1163 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1164 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1166 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1168 "Build SLP failed: different BIT_FIELD_REF "
1169 "arguments in %G", stmt);
1170 /* Mismatch. */
1171 continue;
1174 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1176 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1177 call_stmt))
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Build SLP failed: different calls in %G",
1182 stmt);
1183 /* Mismatch. */
1184 continue;
1188 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1189 && (gimple_bb (first_stmt_info->stmt)
1190 != gimple_bb (stmt_info->stmt)))
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1194 "Build SLP failed: different BB for PHI "
1195 "or possibly trapping operation in %G", stmt);
1196 /* Mismatch. */
1197 continue;
1200 if (need_same_oprnds)
1202 tree other_op1 = gimple_arg (stmt, 1);
1203 if (!operand_equal_p (first_op1, other_op1, 0))
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207 "Build SLP failed: different shift "
1208 "arguments in %G", stmt);
1209 /* Mismatch. */
1210 continue;
1214 if (!types_compatible_p (vectype, *node_vectype))
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "Build SLP failed: different vector type "
1219 "in %G", stmt);
1220 /* Mismatch. */
1221 continue;
1225 /* Grouped store or load. */
1226 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1228 if (REFERENCE_CLASS_P (lhs))
1230 /* Store. */
1233 else
1235 /* Load. */
1236 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1237 if (prev_first_load)
1239 /* Check that there are no loads from different interleaving
1240 chains in the same node. */
1241 if (prev_first_load != first_load)
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1245 vect_location,
1246 "Build SLP failed: different "
1247 "interleaving chains in one node %G",
1248 stmt);
1249 /* Mismatch. */
1250 continue;
1253 else
1254 prev_first_load = first_load;
1256 } /* Grouped access. */
1257 else
1259 if (load_p
1260 && rhs_code != CFN_GATHER_LOAD
1261 && rhs_code != CFN_MASK_GATHER_LOAD)
1263 /* Non-grouped load. */
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1266 "Build SLP failed: not grouped load %G", stmt);
1268 /* FORNOW: Non-grouped loads are not supported. */
1269 if (is_a <bb_vec_info> (vinfo) && i != 0)
1270 continue;
1271 /* Fatal mismatch. */
1272 matches[0] = false;
1273 return false;
1276 /* Not a memory operation. */
1277 if (!phi_p
1278 && rhs_code.is_tree_code ()
1279 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1280 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1281 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1282 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1283 && rhs_code != VIEW_CONVERT_EXPR
1284 && rhs_code != CALL_EXPR
1285 && rhs_code != BIT_FIELD_REF)
1287 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "Build SLP failed: operation unsupported %G",
1290 stmt);
1291 if (is_a <bb_vec_info> (vinfo) && i != 0)
1292 continue;
1293 /* Fatal mismatch. */
1294 matches[0] = false;
1295 return false;
1298 if (rhs_code == COND_EXPR)
1300 tree cond_expr = gimple_assign_rhs1 (stmt);
1301 enum tree_code cond_code = TREE_CODE (cond_expr);
1302 enum tree_code swap_code = ERROR_MARK;
1303 enum tree_code invert_code = ERROR_MARK;
1305 if (i == 0)
1306 first_cond_code = TREE_CODE (cond_expr);
1307 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1309 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1310 swap_code = swap_tree_comparison (cond_code);
1311 invert_code = invert_tree_comparison (cond_code, honor_nans);
1314 if (first_cond_code == cond_code)
1316 /* Isomorphism can be achieved by swapping. */
1317 else if (first_cond_code == swap_code)
1318 swap[i] = 1;
1319 /* Isomorphism can be achieved by inverting. */
1320 else if (first_cond_code == invert_code)
1321 swap[i] = 2;
1322 else
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "Build SLP failed: different"
1327 " operation %G", stmt);
1328 /* Mismatch. */
1329 continue;
1333 if (rhs_code.is_tree_code ()
1334 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1335 && (swap_tree_comparison ((tree_code)first_stmt_code)
1336 == (tree_code)rhs_code))
1337 swap[i] = 1;
1340 matches[i] = true;
1343 for (i = 0; i < group_size; ++i)
1344 if (!matches[i])
1345 return false;
1347 /* If we allowed a two-operation SLP node verify the target can cope
1348 with the permute we are going to use. */
1349 if (alt_stmt_code != ERROR_MARK
1350 && (!alt_stmt_code.is_tree_code ()
1351 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1352 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1354 *two_operators = true;
1357 if (maybe_soft_fail)
1359 unsigned HOST_WIDE_INT const_nunits;
1360 if (!TYPE_VECTOR_SUBPARTS
1361 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1362 || const_nunits > group_size)
1363 matches[0] = false;
1364 else
1366 /* With a constant number of vector elements simulate a mismatch at the
1367 point where we need to split. */
1368 unsigned tail = group_size & (const_nunits - 1);
1369 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1371 return false;
1374 return true;
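/* Worked example, illustrative only: the soft-fail path above fakes a
   mismatch so the caller splits the group instead of giving up entirely.
   With GROUP_SIZE == 6 and a required vector of 4 elements (const_nunits
   == 4, a power of two, so 6 & 3 == 2), matches[4] and matches[5] are
   cleared; the caller can then split off the first four lanes, which fill
   a vector without unrolling, and retry the remaining two separately.  */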
1377 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1378 Note we never remove entries except at destruction time, so we do not
1379 need a special value for deleted that differs from empty. */
1380 struct bst_traits
1382 typedef vec <stmt_vec_info> value_type;
1383 typedef vec <stmt_vec_info> compare_type;
1384 static inline hashval_t hash (value_type);
1385 static inline bool equal (value_type existing, value_type candidate);
1386 static inline bool is_empty (value_type x) { return !x.exists (); }
1387 static inline bool is_deleted (value_type x) { return !x.exists (); }
1388 static const bool empty_zero_p = true;
1389 static inline void mark_empty (value_type &x) { x.release (); }
1390 static inline void mark_deleted (value_type &x) { x.release (); }
1391 static inline void remove (value_type &x) { x.release (); }
1393 inline hashval_t
1394 bst_traits::hash (value_type x)
1396 inchash::hash h;
1397 for (unsigned i = 0; i < x.length (); ++i)
1398 h.add_int (gimple_uid (x[i]->stmt));
1399 return h.end ();
1401 inline bool
1402 bst_traits::equal (value_type existing, value_type candidate)
1404 if (existing.length () != candidate.length ())
1405 return false;
1406 for (unsigned i = 0; i < existing.length (); ++i)
1407 if (existing[i] != candidate[i])
1408 return false;
1409 return true;
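/* Illustrative note, not part of the original file: these traits let a
   vec<stmt_vec_info> act as a hash-table key.  Given two vectors holding
   the same statements S1, S2, S3 (hypothetical), hash () mixes
   gimple_uid (S1), gimple_uid (S2) and gimple_uid (S3) via inchash while
   equal () compares the elements pairwise, so both keys hash to the same
   bucket and compare equal.  That is what allows the bst_map below to
   memoize SLP discovery per statement set.  */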
1412 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1413 but then vec::insert does memmove and that's not compatible with
1414 std::pair. */
1415 struct chain_op_t
1417 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1418 : code (code_), dt (dt_), op (op_) {}
1419 tree_code code;
1420 vect_def_type dt;
1421 tree op;
1424 /* Comparator for sorting associatable chains. */
1426 static int
1427 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1429 auto *op1 = (const chain_op_t *) op1_;
1430 auto *op2 = (const chain_op_t *) op2_;
1431 if (op1->dt != op2->dt)
1432 return (int)op1->dt - (int)op2->dt;
1433 return (int)op1->code - (int)op2->code;
1436 /* Linearize the associatable expression chain at START with the
1437 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1438 filling CHAIN with the result and using WORKLIST as intermediate storage.
1439 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1440 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1441 stmts, starting with START. */
1443 static void
1444 vect_slp_linearize_chain (vec_info *vinfo,
1445 vec<std::pair<tree_code, gimple *> > &worklist,
1446 vec<chain_op_t> &chain,
1447 enum tree_code code, gimple *start,
1448 gimple *&code_stmt, gimple *&alt_code_stmt,
1449 vec<gimple *> *chain_stmts)
1451 /* For each lane linearize the addition/subtraction (or other
1452 uniform associatable operation) expression tree. */
1453 worklist.safe_push (std::make_pair (code, start));
1454 while (!worklist.is_empty ())
1456 auto entry = worklist.pop ();
1457 gassign *stmt = as_a <gassign *> (entry.second);
1458 enum tree_code in_code = entry.first;
1459 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1460 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1461 if (!code_stmt
1462 && gimple_assign_rhs_code (stmt) == code)
1463 code_stmt = stmt;
1464 else if (!alt_code_stmt
1465 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1466 alt_code_stmt = stmt;
1467 if (chain_stmts)
1468 chain_stmts->safe_push (stmt);
1469 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1471 tree op = gimple_op (stmt, opnum);
1472 vect_def_type dt;
1473 stmt_vec_info def_stmt_info;
1474 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1475 gcc_assert (res);
1476 if (dt == vect_internal_def
1477 && is_pattern_stmt_p (def_stmt_info))
1478 op = gimple_get_lhs (def_stmt_info->stmt);
1479 gimple *use_stmt;
1480 use_operand_p use_p;
1481 if (dt == vect_internal_def
1482 && single_imm_use (op, &use_p, &use_stmt)
1483 && is_gimple_assign (def_stmt_info->stmt)
1484 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1485 || (code == PLUS_EXPR
1486 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1487 == MINUS_EXPR))))
1489 tree_code op_def_code = this_code;
1490 if (op_def_code == MINUS_EXPR && opnum == 1)
1491 op_def_code = PLUS_EXPR;
1492 if (in_code == MINUS_EXPR)
1493 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1494 worklist.safe_push (std::make_pair (op_def_code,
1495 def_stmt_info->stmt));
1497 else
1499 tree_code op_def_code = this_code;
1500 if (op_def_code == MINUS_EXPR && opnum == 1)
1501 op_def_code = PLUS_EXPR;
1502 if (in_code == MINUS_EXPR)
1503 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1504 chain.safe_push (chain_op_t (op_def_code, dt, op));
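/* Worked example, illustrative only: linearizing x = (a - b) + c with
   CODE == PLUS_EXPR follows the single-use chain of assignments and emits
   one chain_op_t per leaf, negating the effective code for operands that
   enter through the subtracted side of a MINUS_EXPR:

     leaf   effective code
      c       PLUS_EXPR
      a       PLUS_EXPR
      b       MINUS_EXPR

   so the expression is flattened to +a, -b, +c (modulo the order in which
   the worklist visits the leaves), which is what the sorting and matching
   code below operates on.  */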
1510 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1511 simple_hashmap_traits <bst_traits, slp_tree> >
1512 scalar_stmts_to_slp_tree_map_t;
1514 static slp_tree
1515 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1516 vec<stmt_vec_info> stmts, unsigned int group_size,
1517 poly_uint64 *max_nunits,
1518 bool *matches, unsigned *limit, unsigned *tree_size,
1519 scalar_stmts_to_slp_tree_map_t *bst_map);
1521 static slp_tree
1522 vect_build_slp_tree (vec_info *vinfo,
1523 vec<stmt_vec_info> stmts, unsigned int group_size,
1524 poly_uint64 *max_nunits,
1525 bool *matches, unsigned *limit, unsigned *tree_size,
1526 scalar_stmts_to_slp_tree_map_t *bst_map)
1528 if (slp_tree *leader = bst_map->get (stmts))
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1532 !(*leader)->failed ? "" : "failed ", *leader);
1533 if (!(*leader)->failed)
1535 SLP_TREE_REF_COUNT (*leader)++;
1536 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1537 stmts.release ();
1538 return *leader;
1540 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1541 return NULL;
1544 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1545 so we can pick up backedge destinations during discovery. */
1546 slp_tree res = new _slp_tree;
1547 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1548 SLP_TREE_SCALAR_STMTS (res) = stmts;
1549 bst_map->put (stmts.copy (), res);
1551 if (*limit == 0)
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_NOTE, vect_location,
1555 "SLP discovery limit exceeded\n");
1556 /* Mark the node invalid so we can detect those when still in use
1557 as backedge destinations. */
1558 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1559 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1560 res->failed = XNEWVEC (bool, group_size);
1561 memset (res->failed, 0, sizeof (bool) * group_size);
1562 memset (matches, 0, sizeof (bool) * group_size);
1563 return NULL;
1565 --*limit;
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_NOTE, vect_location,
1569 "starting SLP discovery for node %p\n", res);
1571 poly_uint64 this_max_nunits = 1;
1572 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1573 &this_max_nunits,
1574 matches, limit, tree_size, bst_map);
1575 if (!res_)
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_NOTE, vect_location,
1579 "SLP discovery for node %p failed\n", res);
1580 /* Mark the node invalid so we can detect those when still in use
1581 as backedge destinations. */
1582 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1583 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1584 res->failed = XNEWVEC (bool, group_size);
1585 if (flag_checking)
1587 unsigned i;
1588 for (i = 0; i < group_size; ++i)
1589 if (!matches[i])
1590 break;
1591 gcc_assert (i < group_size);
1593 memcpy (res->failed, matches, sizeof (bool) * group_size);
1595 else
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_NOTE, vect_location,
1599 "SLP discovery for node %p succeeded\n", res);
1600 gcc_assert (res_ == res);
1601 res->max_nunits = this_max_nunits;
1602 vect_update_max_nunits (max_nunits, this_max_nunits);
1603 /* Keep a reference for the bst_map use. */
1604 SLP_TREE_REF_COUNT (res)++;
1606 return res_;
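/* Illustrative summary, not part of the original file: the wrapper above
   memoizes discovery through bst_map in both directions.  On a cache hit
   for a successful build it bumps SLP_TREE_REF_COUNT and returns the shared
   node; on a hit for a failed build it replays the recorded per-lane
   matches[] from the node's failed array and returns NULL without
   re-analyzing.  The stub node inserted before recursing is what lets
   backedges in cycles find their destination node while it is still being
   built.  */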
1609 /* Helper for building an associated SLP node chain. */
1611 static void
1612 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1613 slp_tree op0, slp_tree op1,
1614 stmt_vec_info oper1, stmt_vec_info oper2,
1615 vec<std::pair<unsigned, unsigned> > lperm)
1617 unsigned group_size = SLP_TREE_LANES (op1);
1619 slp_tree child1 = new _slp_tree;
1620 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1621 SLP_TREE_VECTYPE (child1) = vectype;
1622 SLP_TREE_LANES (child1) = group_size;
1623 SLP_TREE_CHILDREN (child1).create (2);
1624 SLP_TREE_CHILDREN (child1).quick_push (op0);
1625 SLP_TREE_CHILDREN (child1).quick_push (op1);
1626 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1628 slp_tree child2 = new _slp_tree;
1629 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1630 SLP_TREE_VECTYPE (child2) = vectype;
1631 SLP_TREE_LANES (child2) = group_size;
1632 SLP_TREE_CHILDREN (child2).create (2);
1633 SLP_TREE_CHILDREN (child2).quick_push (op0);
1634 SLP_TREE_REF_COUNT (op0)++;
1635 SLP_TREE_CHILDREN (child2).quick_push (op1);
1636 SLP_TREE_REF_COUNT (op1)++;
1637 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1639 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1640 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1641 SLP_TREE_VECTYPE (perm) = vectype;
1642 SLP_TREE_LANES (perm) = group_size;
1643 /* ??? We should set this NULL but that's not expected. */
1644 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1645 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1646 SLP_TREE_CHILDREN (perm).quick_push (child1);
1647 SLP_TREE_CHILDREN (perm).quick_push (child2);
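/* Worked example, illustrative only: for a group mixing additions and
   subtractions such as

     lane 0:  a0 + b0       lane 2:  a2 + b2
     lane 1:  a1 - b1       lane 3:  a3 - b3

   this helper builds CHILD1 computing a + b in every lane, CHILD2 computing
   a - b in every lane, and a VEC_PERM_EXPR node whose lane permutation
   LPERM selects (child 0, lane 0), (child 1, lane 1), (child 0, lane 2),
   (child 1, lane 3), i.e. the classic addsub blend.  */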
1650 /* Recursively build an SLP tree starting from NODE.
1651 Fail (and return NULL) if the def-stmts are not
1652 isomorphic, require data permutation or are of unsupported types of
1653 operation. In that case MATCHES records which stmts failed to match
1654 the first one. Otherwise return the built SLP node for the
1655 statement group STMTS. */
1657 static slp_tree
1658 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1659 vec<stmt_vec_info> stmts, unsigned int group_size,
1660 poly_uint64 *max_nunits,
1661 bool *matches, unsigned *limit, unsigned *tree_size,
1662 scalar_stmts_to_slp_tree_map_t *bst_map)
1664 unsigned nops, i, this_tree_size = 0;
1665 poly_uint64 this_max_nunits = *max_nunits;
1667 matches[0] = false;
1669 stmt_vec_info stmt_info = stmts[0];
1670 if (!is_a<gcall *> (stmt_info->stmt)
1671 && !is_a<gassign *> (stmt_info->stmt)
1672 && !is_a<gphi *> (stmt_info->stmt))
1673 return NULL;
1675 nops = gimple_num_args (stmt_info->stmt);
1676 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1677 nops = map[0];
1679 /* If the SLP node is a PHI (induction or reduction), terminate
1680 the recursion. */
1681 bool *skip_args = XALLOCAVEC (bool, nops);
1682 memset (skip_args, 0, sizeof (bool) * nops);
1683 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1684 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1686 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1687 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1688 group_size);
1689 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1690 max_nunits))
1691 return NULL;
1693 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1694 if (def_type == vect_induction_def)
1696 /* Induction PHIs are not cycles but walk the initial
1697 value. Only for inner loops though; for outer loops
1698 we need to pick up the value from the actual PHIs
1699 to more easily support peeling and epilogue vectorization. */
1700 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1701 if (!nested_in_vect_loop_p (loop, stmt_info))
1702 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1703 else
1704 loop = loop->inner;
1705 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1707 else if (def_type == vect_reduction_def
1708 || def_type == vect_double_reduction_def
1709 || def_type == vect_nested_cycle)
1711 /* Else def types have to match. */
1712 stmt_vec_info other_info;
1713 bool all_same = true;
1714 FOR_EACH_VEC_ELT (stmts, i, other_info)
1716 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1717 return NULL;
1718 if (other_info != stmt_info)
1719 all_same = false;
1721 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1722 /* Reduction initial values are not explicitly represented. */
1723 if (!nested_in_vect_loop_p (loop, stmt_info))
1724 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1725 /* Reduction chain backedge defs are filled manually.
1726 ??? Need a better way to identify a SLP reduction chain PHI.
1727 Or a better overall way to SLP match those. */
1728 if (all_same && def_type == vect_reduction_def)
1729 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1731 else if (def_type != vect_internal_def)
1732 return NULL;
1736 bool two_operators = false;
1737 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1738 tree vectype = NULL_TREE;
1739 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1740 &this_max_nunits, matches, &two_operators,
1741 &vectype))
1742 return NULL;
1744 /* If the SLP node is a load, terminate the recursion unless masked. */
1745 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1746 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1748 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1749 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1750 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1751 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1752 else
1754 *max_nunits = this_max_nunits;
1755 (*tree_size)++;
1756 node = vect_create_new_slp_node (node, stmts, 0);
1757 SLP_TREE_VECTYPE (node) = vectype;
1758 /* And compute the load permutation. Whether it is actually
1759 a permutation depends on the unrolling factor which is
1760 decided later. */
1761 vec<unsigned> load_permutation;
1762 int j;
1763 stmt_vec_info load_info;
1764 load_permutation.create (group_size);
1765 stmt_vec_info first_stmt_info
1766 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1767 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1769 int load_place = vect_get_place_in_interleaving_chain
1770 (load_info, first_stmt_info);
1771 gcc_assert (load_place != -1);
1772 load_permutation.safe_push (load_place);
1774 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1775 return node;
1778 else if (gimple_assign_single_p (stmt_info->stmt)
1779 && !gimple_vuse (stmt_info->stmt)
1780 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1782 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1783 the same SSA name vector of a type compatible with vectype. */
1784 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1785 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1786 stmt_vec_info estmt_info;
1787 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1789 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1790 tree bfref = gimple_assign_rhs1 (estmt);
1791 HOST_WIDE_INT lane;
1792 if (!known_eq (bit_field_size (bfref),
1793 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1794 || !constant_multiple_p (bit_field_offset (bfref),
1795 bit_field_size (bfref), &lane))
1797 lperm.release ();
1798 matches[0] = false;
1799 return NULL;
1801 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1803 slp_tree vnode = vect_create_new_slp_node (vNULL);
1804 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1805 /* ??? We record vectype here but we hide eventually necessary
1806 punning and instead rely on code generation to materialize
1807 VIEW_CONVERT_EXPRs as necessary. We instead should make
1808 this explicit somehow. */
1809 SLP_TREE_VECTYPE (vnode) = vectype;
1810 else
1812 /* For different size but compatible elements we can still
1813 use VEC_PERM_EXPR without punning. */
1814 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1815 && types_compatible_p (TREE_TYPE (vectype),
1816 TREE_TYPE (TREE_TYPE (vec))));
1817 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1819 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1820 /* We are always building a permutation node even if it is an identity
1821 permute to shield the rest of the vectorizer from the odd node
1822 representing an actual vector without any scalar ops.
1823 ??? We could hide it completely by making the permute node
1824 external? */
1825 node = vect_create_new_slp_node (node, stmts, 1);
1826 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1827 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1828 SLP_TREE_VECTYPE (node) = vectype;
1829 SLP_TREE_CHILDREN (node).quick_push (vnode);
1830 return node;
1832 /* When discovery reaches an associatable operation see whether we can
1833 improve that to match up lanes in a way superior to the operand
1834 swapping code which at most looks at two defs.
1835 ??? For BB vectorization we cannot do the brute-force search
1836 for matching as we can succeed by means of builds from scalars
1837 and have no good way to "cost" one build against another. */
1838 else if (is_a <loop_vec_info> (vinfo)
1839 /* ??? We don't handle !vect_internal_def defs below. */
1840 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1841 && is_gimple_assign (stmt_info->stmt)
1842 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1843 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1844 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1845 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1846 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1848 /* See if we have a chain of (mixed) adds or subtracts or other
1849 associatable ops. */
1850 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1851 if (code == MINUS_EXPR)
1852 code = PLUS_EXPR;
1853 stmt_vec_info other_op_stmt_info = NULL;
1854 stmt_vec_info op_stmt_info = NULL;
1855 unsigned chain_len = 0;
1856 auto_vec<chain_op_t> chain;
1857 auto_vec<std::pair<tree_code, gimple *> > worklist;
1858 auto_vec<vec<chain_op_t> > chains (group_size);
1859 auto_vec<slp_tree, 4> children;
1860 bool hard_fail = true;
1861 for (unsigned lane = 0; lane < group_size; ++lane)
1863 /* For each lane linearize the addition/subtraction (or other
1864 uniform associatable operation) expression tree. */
1865 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1866 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1867 stmts[lane]->stmt, op_stmt, other_op_stmt,
1868 NULL);
1869 if (!op_stmt_info && op_stmt)
1870 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1871 if (!other_op_stmt_info && other_op_stmt)
1872 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1873 if (chain.length () == 2)
1875 /* In a chain of just two elements resort to the regular
1876 operand swapping scheme. If we run into a length
1877 mismatch still hard-FAIL. */
1878 if (chain_len == 0)
1879 hard_fail = false;
1880 else
1882 matches[lane] = false;
1883 /* ??? We might want to process the other lanes, but
1884 make sure to not give false matching hints to the
1885 caller for lanes we did not process. */
1886 if (lane != group_size - 1)
1887 matches[0] = false;
1889 break;
1891 else if (chain_len == 0)
1892 chain_len = chain.length ();
1893 else if (chain.length () != chain_len)
1895 /* ??? Here we could slip in magic to compensate with
1896 neutral operands. */
1897 matches[lane] = false;
1898 if (lane != group_size - 1)
1899 matches[0] = false;
1900 break;
1902 chains.quick_push (chain.copy ());
1903 chain.truncate (0);
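/* Proceed only if every lane was linearized into a chain of
   (code, def-type, operand) triples of the same length; otherwise
   MATCHES has been set up above to reflect the mismatching lane.  */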
1905 if (chains.length () == group_size)
1907 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1908 if (!op_stmt_info)
1910 hard_fail = false;
1911 goto out;
1913 /* Now we have a set of chains with the same length. */
1914 /* 1. pre-sort according to def_type and operation. */
1915 for (unsigned lane = 0; lane < group_size; ++lane)
1916 chains[lane].stablesort (dt_sort_cmp, vinfo);
1917 if (dump_enabled_p ())
1919 dump_printf_loc (MSG_NOTE, vect_location,
1920 "pre-sorted chains of %s\n",
1921 get_tree_code_name (code));
1922 for (unsigned lane = 0; lane < group_size; ++lane)
1924 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1925 dump_printf (MSG_NOTE, "%s %T ",
1926 get_tree_code_name (chains[lane][opnum].code),
1927 chains[lane][opnum].op);
1928 dump_printf (MSG_NOTE, "\n");
1931 /* 2. try to build children nodes, associating as necessary. */
1932 for (unsigned n = 0; n < chain_len; ++n)
1934 vect_def_type dt = chains[0][n].dt;
1935 unsigned lane;
1936 for (lane = 0; lane < group_size; ++lane)
1937 if (chains[lane][n].dt != dt)
1939 if (dt == vect_constant_def
1940 && chains[lane][n].dt == vect_external_def)
1941 dt = vect_external_def;
1942 else if (dt == vect_external_def
1943 && chains[lane][n].dt == vect_constant_def)
1945 else
1946 break;
1948 if (lane != group_size)
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_NOTE, vect_location,
1952 "giving up on chain due to mismatched "
1953 "def types\n");
1954 matches[lane] = false;
1955 if (lane != group_size - 1)
1956 matches[0] = false;
1957 goto out;
1959 if (dt == vect_constant_def
1960 || dt == vect_external_def)
1962 /* Check whether we can build the invariant. If we can't
1963 we never will be able to. */
1964 tree type = TREE_TYPE (chains[0][n].op);
1965 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1966 && (TREE_CODE (type) == BOOLEAN_TYPE
1967 || !can_duplicate_and_interleave_p (vinfo, group_size,
1968 type)))
1970 matches[0] = false;
1971 goto out;
1973 vec<tree> ops;
1974 ops.create (group_size);
1975 for (lane = 0; lane < group_size; ++lane)
1976 ops.quick_push (chains[lane][n].op);
1977 slp_tree child = vect_create_new_slp_node (ops);
1978 SLP_TREE_DEF_TYPE (child) = dt;
1979 children.safe_push (child);
1981 else if (dt != vect_internal_def)
1983 /* Not sure, we might need something special.
1984 gcc.dg/vect/pr96854.c,
1985 gfortran.dg/vect/fast-math-pr37021.f90
1986 and gfortran.dg/vect/pr61171.f trigger. */
1987 /* Soft-fail for now. */
1988 hard_fail = false;
1989 goto out;
1991 else
1993 vec<stmt_vec_info> op_stmts;
1994 op_stmts.create (group_size);
1995 slp_tree child = NULL;
1996 /* Brute-force our way. We have to consider a lane
1997 failing after fixing an earlier fail up in the
1998 SLP discovery recursion. So track the current
1999 permute per lane. */
2000 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2001 memset (perms, 0, sizeof (unsigned) * group_size);
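/* Repeatedly attempt SLP discovery on the current column of operands;
   whenever some lanes fail to match, swap the next not-yet-tried chain
   entry of each failing lane into this column and retry until we
   either succeed or run out of entries to try.  */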
2004 op_stmts.truncate (0);
2005 for (lane = 0; lane < group_size; ++lane)
2006 op_stmts.quick_push
2007 (vinfo->lookup_def (chains[lane][n].op));
2008 child = vect_build_slp_tree (vinfo, op_stmts,
2009 group_size, &this_max_nunits,
2010 matches, limit,
2011 &this_tree_size, bst_map);
2012 /* ??? We're likely getting too many fatal mismatches
2013 here so maybe we want to ignore them (but then we
2014 have no idea which lanes fatally mismatched). */
2015 if (child || !matches[0])
2016 break;
2017 /* Swap another lane we have not yet matched up into
2018 lanes that did not match. If we run out of
2019 permute possibilities for a lane terminate the
2020 search. */
2021 bool term = false;
2022 for (lane = 1; lane < group_size; ++lane)
2023 if (!matches[lane])
2025 if (n + perms[lane] + 1 == chain_len)
2027 term = true;
2028 break;
2030 std::swap (chains[lane][n],
2031 chains[lane][n + perms[lane] + 1]);
2032 perms[lane]++;
2034 if (term)
2035 break;
2037 while (1);
2038 if (!child)
2040 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "failed to match up op %d\n", n);
2043 op_stmts.release ();
2044 if (lane != group_size - 1)
2045 matches[0] = false;
2046 else
2047 matches[lane] = false;
2048 goto out;
2050 if (dump_enabled_p ())
2052 dump_printf_loc (MSG_NOTE, vect_location,
2053 "matched up op %d to\n", n);
2054 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2056 children.safe_push (child);
2059 /* 3. build SLP nodes to combine the chain. */
2060 for (unsigned lane = 0; lane < group_size; ++lane)
2061 if (chains[lane][0].code != code)
2063 /* See if there's any alternate all-PLUS entry. */
2064 unsigned n;
2065 for (n = 1; n < chain_len; ++n)
2067 for (lane = 0; lane < group_size; ++lane)
2068 if (chains[lane][n].code != code)
2069 break;
2070 if (lane == group_size)
2071 break;
2073 if (n != chain_len)
2075 /* Swap that in at first position. */
2076 std::swap (children[0], children[n]);
2077 for (lane = 0; lane < group_size; ++lane)
2078 std::swap (chains[lane][0], chains[lane][n]);
2080 else
2082 /* ??? When this triggers and we end up with two
2083 vect_constant/external_def up-front things break (ICE)
2084 spectacularly finding an insertion place for the
2085 all-constant op. We should have a fully
2086 vect_internal_def operand though(?) so we can swap
2087 that into first place and then prepend the all-zero
2088 constant. */
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_NOTE, vect_location,
2091 "inserting constant zero to compensate "
2092 "for (partially) negated first "
2093 "operand\n");
2094 chain_len++;
2095 for (lane = 0; lane < group_size; ++lane)
2096 chains[lane].safe_insert
2097 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2098 vec<tree> zero_ops;
2099 zero_ops.create (group_size);
2100 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2101 for (lane = 1; lane < group_size; ++lane)
2102 zero_ops.quick_push (zero_ops[0]);
2103 slp_tree zero = vect_create_new_slp_node (zero_ops);
2104 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2105 children.safe_insert (0, zero);
2107 break;
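/* The children are combined left to right below.  When the lanes of a
   column do not agree on the operation, a two-operator node is built
   with a lane permutation selecting between the two alternative
   results.  */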
2109 for (unsigned i = 1; i < children.length (); ++i)
2111 slp_tree op0 = children[i - 1];
2112 slp_tree op1 = children[i];
2113 bool this_two_op = false;
2114 for (unsigned lane = 0; lane < group_size; ++lane)
2115 if (chains[lane][i].code != chains[0][i].code)
2117 this_two_op = true;
2118 break;
2120 slp_tree child;
2121 if (i == children.length () - 1)
2122 child = vect_create_new_slp_node (node, stmts, 2);
2123 else
2124 child = vect_create_new_slp_node (2, ERROR_MARK);
2125 if (this_two_op)
2127 vec<std::pair<unsigned, unsigned> > lperm;
2128 lperm.create (group_size);
2129 for (unsigned lane = 0; lane < group_size; ++lane)
2130 lperm.quick_push (std::make_pair
2131 (chains[lane][i].code != chains[0][i].code, lane));
2132 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2133 (chains[0][i].code == code
2134 ? op_stmt_info
2135 : other_op_stmt_info),
2136 (chains[0][i].code == code
2137 ? other_op_stmt_info
2138 : op_stmt_info),
2139 lperm);
2141 else
2143 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2144 SLP_TREE_VECTYPE (child) = vectype;
2145 SLP_TREE_LANES (child) = group_size;
2146 SLP_TREE_CHILDREN (child).quick_push (op0);
2147 SLP_TREE_CHILDREN (child).quick_push (op1);
2148 SLP_TREE_REPRESENTATIVE (child)
2149 = (chains[0][i].code == code
2150 ? op_stmt_info : other_op_stmt_info);
2152 children[i] = child;
2154 *tree_size += this_tree_size + 1;
2155 *max_nunits = this_max_nunits;
2156 while (!chains.is_empty ())
2157 chains.pop ().release ();
2158 return node;
2160 out:
2161 while (!children.is_empty ())
2162 vect_free_slp_tree (children.pop ());
2163 while (!chains.is_empty ())
2164 chains.pop ().release ();
2165 /* Hard-fail, otherwise we might run into quadratic processing of the
2166 chains starting one stmt into the chain again. */
2167 if (hard_fail)
2168 return NULL;
2169 /* Fall thru to normal processing. */
2172 /* Get at the operands, verifying they are compatible. */
2173 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2174 slp_oprnd_info oprnd_info;
2175 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2177 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2178 stmts, i, &oprnds_info);
2179 if (res != 0)
2180 matches[(res == -1) ? 0 : i] = false;
2181 if (!matches[0])
2182 break;
2184 for (i = 0; i < group_size; ++i)
2185 if (!matches[i])
2187 vect_free_oprnd_info (oprnds_info);
2188 return NULL;
2190 swap = NULL;
2192 auto_vec<slp_tree, 4> children;
2194 stmt_info = stmts[0];
2196 /* Create SLP_TREE nodes for the definition node/s. */
2197 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2199 slp_tree child;
2200 unsigned int j;
2202 /* We're skipping certain operands from processing, for example
2203 outer loop reduction initial defs. */
2204 if (skip_args[i])
2206 children.safe_push (NULL);
2207 continue;
2210 if (oprnd_info->first_dt == vect_uninitialized_def)
2212 /* COND_EXPRs eventually have one operand too many if the condition
2213 is an SSA name. */
2214 gcc_assert (i == 3 && nops == 4);
2215 continue;
2218 if (is_a <bb_vec_info> (vinfo)
2219 && oprnd_info->first_dt == vect_internal_def
2220 && !oprnd_info->any_pattern)
2222 /* For BB vectorization, if all defs are the same do not
2223 bother to continue the build along the single-lane
2224 graph but use a splat of the scalar value. */
2225 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2226 for (j = 1; j < group_size; ++j)
2227 if (oprnd_info->def_stmts[j] != first_def)
2228 break;
2229 if (j == group_size
2230 /* But avoid doing this for loads where we may be
2231 able to CSE things, unless the stmt is not
2232 vectorizable. */
2233 && (!STMT_VINFO_VECTORIZABLE (first_def)
2234 || !gimple_vuse (first_def->stmt)))
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_NOTE, vect_location,
2238 "Using a splat of the uniform operand %G",
2239 first_def->stmt);
2240 oprnd_info->first_dt = vect_external_def;
2244 if (oprnd_info->first_dt == vect_external_def
2245 || oprnd_info->first_dt == vect_constant_def)
2247 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2248 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2249 oprnd_info->ops = vNULL;
2250 children.safe_push (invnode);
2251 continue;
2254 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2255 group_size, &this_max_nunits,
2256 matches, limit,
2257 &this_tree_size, bst_map)) != NULL)
2259 oprnd_info->def_stmts = vNULL;
2260 children.safe_push (child);
2261 continue;
2264 /* If the SLP build for operand zero failed and operands zero
2265 and one can be commuted, try that for the scalar stmts
2266 that failed the match. */
2267 if (i == 0
2268 /* A first scalar stmt mismatch signals a fatal mismatch. */
2269 && matches[0]
2270 /* ??? For COND_EXPRs we can swap the comparison operands
2271 as well as the arms under some constraints. */
2272 && nops == 2
2273 && oprnds_info[1]->first_dt == vect_internal_def
2274 && is_gimple_assign (stmt_info->stmt)
2275 /* Swapping operands for reductions breaks assumptions later on. */
2276 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2277 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2279 /* See whether we can swap the matching or the non-matching
2280 stmt operands. */
2281 bool swap_not_matching = true;
2284 for (j = 0; j < group_size; ++j)
2286 if (matches[j] != !swap_not_matching)
2287 continue;
2288 stmt_vec_info stmt_info = stmts[j];
2289 /* Verify if we can swap operands of this stmt. */
2290 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2291 if (!stmt
2292 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2294 if (!swap_not_matching)
2295 goto fail;
2296 swap_not_matching = false;
2297 break;
2301 while (j != group_size);
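/* SWAP_NOT_MATCHING now tells whether it is the mismatched lanes (true)
   or the matched lanes (false) whose operands are all commutative and
   thus get swapped below.  */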
2303 /* Swap mismatched definition stmts. */
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "Re-trying with swapped operands of stmts ");
2307 for (j = 0; j < group_size; ++j)
2308 if (matches[j] == !swap_not_matching)
2310 std::swap (oprnds_info[0]->def_stmts[j],
2311 oprnds_info[1]->def_stmts[j]);
2312 std::swap (oprnds_info[0]->ops[j],
2313 oprnds_info[1]->ops[j]);
2314 if (dump_enabled_p ())
2315 dump_printf (MSG_NOTE, "%d ", j);
2317 if (dump_enabled_p ())
2318 dump_printf (MSG_NOTE, "\n");
2319 /* After swapping some operands we lost track whether an
2320 operand has any pattern defs so be conservative here. */
2321 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2322 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2323 /* And try again with scratch 'matches' ... */
2324 bool *tem = XALLOCAVEC (bool, group_size);
2325 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2326 group_size, &this_max_nunits,
2327 tem, limit,
2328 &this_tree_size, bst_map)) != NULL)
2330 oprnd_info->def_stmts = vNULL;
2331 children.safe_push (child);
2332 continue;
2335 fail:
2337 /* If the SLP build failed and we analyze a basic-block
2338 simply treat nodes we fail to build as externally defined
2339 (and thus build vectors from the scalar defs).
2340 The cost model will reject outright expensive cases.
2341 ??? This doesn't treat cases where permutation ultimately
2342 fails (or we don't try permutation below). Ideally we'd
2343 even compute a permutation that will end up with the maximum
2344 SLP tree size... */
2345 if (is_a <bb_vec_info> (vinfo)
2346 /* ??? Rejecting patterns this way doesn't work. We'd have to
2347 do extra work to cancel the pattern so the uses see the
2348 scalar version. */
2349 && !is_pattern_stmt_p (stmt_info)
2350 && !oprnd_info->any_pattern)
2352 /* But if there's a leading vector sized set of matching stmts
2353 fail here so we can split the group. This matches the condition
2354 vect_analyze_slp_instance uses. */
2355 /* ??? We might want to split here and combine the results to support
2356 multiple vector sizes better. */
2357 for (j = 0; j < group_size; ++j)
2358 if (!matches[j])
2359 break;
2360 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_NOTE, vect_location,
2364 "Building vector operands from scalars\n");
2365 this_tree_size++;
2366 child = vect_create_new_slp_node (oprnd_info->ops);
2367 children.safe_push (child);
2368 oprnd_info->ops = vNULL;
2369 continue;
2373 gcc_assert (child == NULL);
2374 FOR_EACH_VEC_ELT (children, j, child)
2375 if (child)
2376 vect_free_slp_tree (child);
2377 vect_free_oprnd_info (oprnds_info);
2378 return NULL;
2381 vect_free_oprnd_info (oprnds_info);
2383 /* If all children of a node are built up from uniform scalars, or
2384 building them requires more than one possibly expensive vector
2385 construction, throw the node away so it is built up from scalars.
2386 The exception is the SLP node for the vector store. */
2387 if (is_a <bb_vec_info> (vinfo)
2388 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2389 /* ??? Rejecting patterns this way doesn't work. We'd have to
2390 do extra work to cancel the pattern so the uses see the
2391 scalar version. */
2392 && !is_pattern_stmt_p (stmt_info))
2394 slp_tree child;
2395 unsigned j;
2396 bool all_uniform_p = true;
2397 unsigned n_vector_builds = 0;
2398 FOR_EACH_VEC_ELT (children, j, child)
2400 if (!child)
2402 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2403 all_uniform_p = false;
2404 else if (!vect_slp_tree_uniform_p (child))
2406 all_uniform_p = false;
2407 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2408 n_vector_builds++;
2411 if (all_uniform_p
2412 || n_vector_builds > 1
2413 || (n_vector_builds == children.length ()
2414 && is_a <gphi *> (stmt_info->stmt)))
2416 /* Roll back. */
2417 matches[0] = false;
2418 FOR_EACH_VEC_ELT (children, j, child)
2419 if (child)
2420 vect_free_slp_tree (child);
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE, vect_location,
2424 "Building parent vector operands from "
2425 "scalars instead\n");
2426 return NULL;
2430 *tree_size += this_tree_size + 1;
2431 *max_nunits = this_max_nunits;
2433 if (two_operators)
2435 /* ??? We'd likely want to either cache in bst_map something like
2436 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2437 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2438 explicit stmts to put in so the keying on 'stmts' doesn't
2439 work (but we have the same issue with nodes that use 'ops'). */
2440 slp_tree one = new _slp_tree;
2441 slp_tree two = new _slp_tree;
2442 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2443 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2444 SLP_TREE_VECTYPE (one) = vectype;
2445 SLP_TREE_VECTYPE (two) = vectype;
2446 SLP_TREE_CHILDREN (one).safe_splice (children);
2447 SLP_TREE_CHILDREN (two).safe_splice (children);
2448 slp_tree child;
2449 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2450 SLP_TREE_REF_COUNT (child)++;
2452 /* Here we record the original defs since this
2453 node represents the final lane configuration. */
2454 node = vect_create_new_slp_node (node, stmts, 2);
2455 SLP_TREE_VECTYPE (node) = vectype;
2456 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2457 SLP_TREE_CHILDREN (node).quick_push (one);
2458 SLP_TREE_CHILDREN (node).quick_push (two);
2459 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2460 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2461 enum tree_code ocode = ERROR_MARK;
2462 stmt_vec_info ostmt_info;
2463 unsigned j = 0;
2464 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2466 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2467 if (gimple_assign_rhs_code (ostmt) != code0)
2469 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2470 ocode = gimple_assign_rhs_code (ostmt);
2471 j = i;
2473 else
2474 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2476 SLP_TREE_CODE (one) = code0;
2477 SLP_TREE_CODE (two) = ocode;
2478 SLP_TREE_LANES (one) = stmts.length ();
2479 SLP_TREE_LANES (two) = stmts.length ();
2480 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2481 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2482 return node;
2485 node = vect_create_new_slp_node (node, stmts, nops);
2486 SLP_TREE_VECTYPE (node) = vectype;
2487 SLP_TREE_CHILDREN (node).splice (children);
2488 return node;
2491 /* Dump a single SLP tree NODE. */
2493 static void
2494 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2495 slp_tree node)
2497 unsigned i, j;
2498 slp_tree child;
2499 stmt_vec_info stmt_info;
2500 tree op;
2502 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2503 dump_user_location_t user_loc = loc.get_user_location ();
2504 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)",
2505 SLP_TREE_DEF_TYPE (node) == vect_external_def
2506 ? " (external)"
2507 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2508 ? " (constant)"
2509 : ""), node,
2510 estimated_poly_value (node->max_nunits),
2511 SLP_TREE_REF_COUNT (node));
2512 if (SLP_TREE_VECTYPE (node))
2513 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2514 dump_printf (metadata, "\n");
2515 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2517 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2518 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2519 else
2520 dump_printf_loc (metadata, user_loc, "op template: %G",
2521 SLP_TREE_REPRESENTATIVE (node)->stmt);
2523 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2524 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2525 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2526 else
2528 dump_printf_loc (metadata, user_loc, "\t{ ");
2529 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2530 dump_printf (metadata, "%T%s ", op,
2531 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2532 dump_printf (metadata, "}\n");
2534 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2536 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2537 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2538 dump_printf (dump_kind, " %u", j);
2539 dump_printf (dump_kind, " }\n");
2541 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2543 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2544 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2545 dump_printf (dump_kind, " %u[%u]",
2546 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2547 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2548 dump_printf (dump_kind, " }\n");
2550 if (SLP_TREE_CHILDREN (node).is_empty ())
2551 return;
2552 dump_printf_loc (metadata, user_loc, "\tchildren");
2553 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2554 dump_printf (dump_kind, " %p", (void *)child);
2555 dump_printf (dump_kind, "\n");
2558 DEBUG_FUNCTION void
2559 debug (slp_tree node)
2561 debug_dump_context ctx;
2562 vect_print_slp_tree (MSG_NOTE,
2563 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2564 node);
2567 /* Recursive helper for the dot producer below. */
2569 static void
2570 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2572 if (visited.add (node))
2573 return;
2575 fprintf (f, "\"%p\" [label=\"", (void *)node);
2576 vect_print_slp_tree (MSG_NOTE,
2577 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2578 node);
2579 fprintf (f, "\"];\n");
2582 for (slp_tree child : SLP_TREE_CHILDREN (node))
2583 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2585 for (slp_tree child : SLP_TREE_CHILDREN (node))
2586 if (child)
2587 dot_slp_tree (f, child, visited);
2590 DEBUG_FUNCTION void
2591 dot_slp_tree (const char *fname, slp_tree node)
2593 FILE *f = fopen (fname, "w");
2594 fprintf (f, "digraph {\n");
2595 fflush (f);
2597 debug_dump_context ctx (f);
2598 hash_set<slp_tree> visited;
2599 dot_slp_tree (f, node, visited);
2601 fflush (f);
2602 fprintf (f, "}\n");
2603 fclose (f);
2606 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2608 static void
2609 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2610 slp_tree node, hash_set<slp_tree> &visited)
2612 unsigned i;
2613 slp_tree child;
2615 if (visited.add (node))
2616 return;
2618 vect_print_slp_tree (dump_kind, loc, node);
2620 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2621 if (child)
2622 vect_print_slp_graph (dump_kind, loc, child, visited);
2625 static void
2626 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2627 slp_tree entry)
2629 hash_set<slp_tree> visited;
2630 vect_print_slp_graph (dump_kind, loc, entry, visited);
2633 /* Mark the tree rooted at NODE with PURE_SLP. */
2635 static void
2636 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2638 int i;
2639 stmt_vec_info stmt_info;
2640 slp_tree child;
2642 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2643 return;
2645 if (visited.add (node))
2646 return;
2648 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2649 STMT_SLP_TYPE (stmt_info) = pure_slp;
2651 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2652 if (child)
2653 vect_mark_slp_stmts (child, visited);
2656 static void
2657 vect_mark_slp_stmts (slp_tree node)
2659 hash_set<slp_tree> visited;
2660 vect_mark_slp_stmts (node, visited);
2663 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2665 static void
2666 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2668 int i;
2669 stmt_vec_info stmt_info;
2670 slp_tree child;
2672 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2673 return;
2675 if (visited.add (node))
2676 return;
2678 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2680 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2681 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2682 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2685 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2686 if (child)
2687 vect_mark_slp_stmts_relevant (child, visited);
2690 static void
2691 vect_mark_slp_stmts_relevant (slp_tree node)
2693 hash_set<slp_tree> visited;
2694 vect_mark_slp_stmts_relevant (node, visited);
2698 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2700 static void
2701 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2702 hash_set<slp_tree> &visited)
2704 if (!node || visited.add (node))
2705 return;
2707 if (SLP_TREE_CHILDREN (node).length () == 0)
2709 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2710 return;
2711 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2712 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2713 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2714 loads.safe_push (node);
2716 else
2718 unsigned i;
2719 slp_tree child;
2720 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2721 vect_gather_slp_loads (loads, child, visited);
2726 /* Find the last scalar stmt in NODE. */
2728 stmt_vec_info
2729 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2731 stmt_vec_info last = NULL;
2732 stmt_vec_info stmt_vinfo;
2734 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2736 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2737 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2740 return last;
2743 /* Find the first stmt in NODE. */
2745 stmt_vec_info
2746 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2748 stmt_vec_info first = NULL;
2749 stmt_vec_info stmt_vinfo;
2751 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2753 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2754 if (!first
2755 || get_later_stmt (stmt_vinfo, first) == first)
2756 first = stmt_vinfo;
2759 return first;
2762 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2763 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2764 (also containing the first GROUP1_SIZE stmts, since stores are
2765 consecutive), the second containing the remainder.
2766 Return the first stmt in the second group. */
2768 static stmt_vec_info
2769 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2771 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2772 gcc_assert (group1_size > 0);
2773 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2774 gcc_assert (group2_size > 0);
2775 DR_GROUP_SIZE (first_vinfo) = group1_size;
2777 stmt_vec_info stmt_info = first_vinfo;
2778 for (unsigned i = group1_size; i > 1; i--)
2780 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2781 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2783 /* STMT is now the last element of the first group. */
2784 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2785 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2787 DR_GROUP_SIZE (group2) = group2_size;
2788 for (stmt_info = group2; stmt_info;
2789 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2791 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2792 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2795 /* For the second group, the DR_GROUP_GAP is that before the original group,
2796 plus skipping over the first vector. */
2797 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2799 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2800 DR_GROUP_GAP (first_vinfo) += group2_size;
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2804 group1_size, group2_size);
2806 return group2;
2809 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2810 statements and a vector of NUNITS elements. */
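/* For example, a group of 6 stores with 4-element vectors gives
   common_multiple (4, 6) / 6 = 12 / 6 = 2.  */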
2812 static poly_uint64
2813 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2815 return exact_div (common_multiple (nunits, group_size), group_size);
2818 /* Helper that checks to see if a node is a load node. */
2820 static inline bool
2821 vect_is_slp_load_node (slp_tree root)
2823 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2824 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2825 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2829 /* Helper function of optimize_load_redistribution that performs the operation
2830 recursively. */
2832 static slp_tree
2833 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2834 vec_info *vinfo, unsigned int group_size,
2835 hash_map<slp_tree, slp_tree> *load_map,
2836 slp_tree root)
2838 if (slp_tree *leader = load_map->get (root))
2839 return *leader;
2841 slp_tree node;
2842 unsigned i;
2844 /* For now, we don't know anything about externals so do not do anything. */
2845 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2846 return NULL;
2847 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2849 /* First convert this node into a load node and add it to the leaves
2850 list and flatten the permute from a lane to a load one. If it's
2851 unneeded it will be elided later. */
2852 vec<stmt_vec_info> stmts;
2853 stmts.create (SLP_TREE_LANES (root));
2854 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2855 for (unsigned j = 0; j < lane_perm.length (); j++)
2857 std::pair<unsigned, unsigned> perm = lane_perm[j];
2858 node = SLP_TREE_CHILDREN (root)[perm.first];
2860 if (!vect_is_slp_load_node (node)
2861 || SLP_TREE_CHILDREN (node).exists ())
2863 stmts.release ();
2864 goto next;
2867 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_NOTE, vect_location,
2872 "converting stmts on permute node %p\n", root);
2874 bool *matches = XALLOCAVEC (bool, group_size);
2875 poly_uint64 max_nunits = 1;
2876 unsigned tree_size = 0, limit = 1;
2877 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2878 matches, &limit, &tree_size, bst_map);
2879 if (!node)
2880 stmts.release ();
2882 load_map->put (root, node);
2883 return node;
2886 next:
2887 load_map->put (root, NULL);
2889 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2891 slp_tree value
2892 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2893 node);
2894 if (value)
2896 SLP_TREE_REF_COUNT (value)++;
2897 SLP_TREE_CHILDREN (root)[i] = value;
2898 /* ??? We know the original leafs of the replaced nodes will
2899 be referenced by bst_map, only the permutes created by
2900 pattern matching are not. */
2901 if (SLP_TREE_REF_COUNT (node) == 1)
2902 load_map->remove (node);
2903 vect_free_slp_tree (node);
2907 return NULL;
2910 /* Temporary workaround for loads not being CSEd during SLP build. This
2911 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2912 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2913 same DR such that the final operation is equal to a permuted load. Such
2914 NODES are then directly converted into LOADS themselves. The nodes are
2915 CSEd using BST_MAP. */
2917 static void
2918 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2919 vec_info *vinfo, unsigned int group_size,
2920 hash_map<slp_tree, slp_tree> *load_map,
2921 slp_tree root)
2923 slp_tree node;
2924 unsigned i;
2926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2928 slp_tree value
2929 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2930 node);
2931 if (value)
2933 SLP_TREE_REF_COUNT (value)++;
2934 SLP_TREE_CHILDREN (root)[i] = value;
2935 /* ??? We know the original leafs of the replaced nodes will
2936 be referenced by bst_map, only the permutes created by
2937 pattern matching are not. */
2938 if (SLP_TREE_REF_COUNT (node) == 1)
2939 load_map->remove (node);
2940 vect_free_slp_tree (node);
2945 /* Helper function of vect_match_slp_patterns.
2947 Attempts to match patterns against the slp tree rooted in REF_NODE using
2948 VINFO. Patterns are matched in post-order traversal.
2950 If matching is successful the value in REF_NODE is updated and true is
2951 returned, otherwise false is returned. */
2953 static bool
2954 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2955 slp_tree_to_load_perm_map_t *perm_cache,
2956 slp_compat_nodes_map_t *compat_cache,
2957 hash_set<slp_tree> *visited)
2959 unsigned i;
2960 slp_tree node = *ref_node;
2961 bool found_p = false;
2962 if (!node || visited->add (node))
2963 return false;
2965 slp_tree child;
2966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2967 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2968 vinfo, perm_cache, compat_cache,
2969 visited);
2971 for (unsigned x = 0; x < num__slp_patterns; x++)
2973 vect_pattern *pattern
2974 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
2975 if (pattern)
2977 pattern->build (vinfo);
2978 delete pattern;
2979 found_p = true;
2983 return found_p;
2986 /* Applies pattern matching to the SLP tree of INSTANCE using
2987 vec_info VINFO.
2989 Returns true if any pattern was matched. Patterns are tried in order
2990 and multiple patterns may match. */
2992 static bool
2993 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2994 hash_set<slp_tree> *visited,
2995 slp_tree_to_load_perm_map_t *perm_cache,
2996 slp_compat_nodes_map_t *compat_cache)
2998 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2999 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE, vect_location,
3003 "Analyzing SLP tree %p for patterns\n",
3004 SLP_INSTANCE_TREE (instance));
3006 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3007 visited);
3010 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3011 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3012 Return true if we could use IFN_STORE_LANES instead and if that appears
3013 to be the better approach. */
3015 static bool
3016 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3017 unsigned int group_size,
3018 unsigned int new_group_size)
3020 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3021 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3022 if (!vectype)
3023 return false;
3024 /* Allow the split if one of the two new groups would operate on full
3025 vectors *within* rather than across one scalar loop iteration.
3026 This is purely a heuristic, but it should work well for group
3027 sizes of 3 and 4, where the possible splits are:
3029 3->2+1: OK if the vector has exactly two elements
3030 4->2+2: Likewise
3031 4->3+1: Less clear-cut. */
3032 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3033 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3034 return false;
3035 return vect_store_lanes_supported (vectype, group_size, false);
3038 /* Analyze an SLP instance starting from a group of grouped stores. Call
3039 vect_build_slp_tree to build a tree of packed stmts if possible.
3040 Return FALSE if it's impossible to SLP any stmt in the loop. */
3042 static bool
3043 vect_analyze_slp_instance (vec_info *vinfo,
3044 scalar_stmts_to_slp_tree_map_t *bst_map,
3045 stmt_vec_info stmt_info, slp_instance_kind kind,
3046 unsigned max_tree_size, unsigned *limit);
3048 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3049 of KIND. Return true if successful. */
3051 static bool
3052 vect_build_slp_instance (vec_info *vinfo,
3053 slp_instance_kind kind,
3054 vec<stmt_vec_info> &scalar_stmts,
3055 vec<stmt_vec_info> &root_stmt_infos,
3056 unsigned max_tree_size, unsigned *limit,
3057 scalar_stmts_to_slp_tree_map_t *bst_map,
3058 /* ??? We need stmt_info for group splitting. */
3059 stmt_vec_info stmt_info_)
3061 if (dump_enabled_p ())
3063 dump_printf_loc (MSG_NOTE, vect_location,
3064 "Starting SLP discovery for\n");
3065 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 " %G", scalar_stmts[i]->stmt);
3070 /* Build the tree for the SLP instance. */
3071 unsigned int group_size = scalar_stmts.length ();
3072 bool *matches = XALLOCAVEC (bool, group_size);
3073 poly_uint64 max_nunits = 1;
3074 unsigned tree_size = 0;
3075 unsigned i;
3076 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3077 &max_nunits, matches, limit,
3078 &tree_size, bst_map);
3079 if (node != NULL)
3081 /* Calculate the unrolling factor based on the smallest type. */
3082 poly_uint64 unrolling_factor
3083 = calculate_unrolling_factor (max_nunits, group_size);
3085 if (maybe_ne (unrolling_factor, 1U)
3086 && is_a <bb_vec_info> (vinfo))
3088 unsigned HOST_WIDE_INT const_max_nunits;
3089 if (!max_nunits.is_constant (&const_max_nunits)
3090 || const_max_nunits > group_size)
3092 if (dump_enabled_p ())
3093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3094 "Build SLP failed: store group "
3095 "size not a multiple of the vector size "
3096 "in basic block SLP\n");
3097 vect_free_slp_tree (node);
3098 return false;
3100 /* Fatal mismatch. */
3101 if (dump_enabled_p ())
3102 dump_printf_loc (MSG_NOTE, vect_location,
3103 "SLP discovery succeeded but node needs "
3104 "splitting\n");
3105 memset (matches, true, group_size);
3106 matches[group_size / const_max_nunits * const_max_nunits] = false;
3107 vect_free_slp_tree (node);
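/* With MATCHES faked up this way the splitting code below cuts the
   group at a whole-vector boundary.  */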
3109 else
3111 /* Create a new SLP instance. */
3112 slp_instance new_instance = XNEW (class _slp_instance);
3113 SLP_INSTANCE_TREE (new_instance) = node;
3114 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3115 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3116 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3117 SLP_INSTANCE_KIND (new_instance) = kind;
3118 new_instance->reduc_phis = NULL;
3119 new_instance->cost_vec = vNULL;
3120 new_instance->subgraph_entries = vNULL;
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE, vect_location,
3124 "SLP size %u vs. limit %u.\n",
3125 tree_size, max_tree_size);
3127 /* Fixup SLP reduction chains. */
3128 if (kind == slp_inst_kind_reduc_chain)
3130 /* If this is a reduction chain with a conversion in front
3131 amend the SLP tree with a node for that. */
3132 gimple *scalar_def
3133 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3134 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3136 /* Get at the conversion stmt - we know it's the single use
3137 of the last stmt of the reduction chain. */
3138 use_operand_p use_p;
3139 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3140 &use_p, &scalar_def);
3141 gcc_assert (r);
3142 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3143 next_info = vect_stmt_to_vectorize (next_info);
3144 scalar_stmts = vNULL;
3145 scalar_stmts.create (group_size);
3146 for (unsigned i = 0; i < group_size; ++i)
3147 scalar_stmts.quick_push (next_info);
3148 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3149 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3150 SLP_TREE_CHILDREN (conv).quick_push (node);
3151 SLP_INSTANCE_TREE (new_instance) = conv;
3152 /* We also have to fake this conversion stmt as SLP reduction
3153 group so we don't have to mess with too much code
3154 elsewhere. */
3155 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3156 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3158 /* Fill the backedge child of the PHI SLP node. The
3159 general matching code cannot find it because the
3160 scalar code does not reflect how we vectorize the
3161 reduction. */
3162 use_operand_p use_p;
3163 imm_use_iterator imm_iter;
3164 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3165 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3166 gimple_get_lhs (scalar_def))
3167 /* There are exactly two non-debug uses, the reduction
3168 PHI and the loop-closed PHI node. */
3169 if (!is_gimple_debug (USE_STMT (use_p))
3170 && gimple_bb (USE_STMT (use_p)) == loop->header)
3172 auto_vec<stmt_vec_info, 64> phis (group_size);
3173 stmt_vec_info phi_info
3174 = vinfo->lookup_stmt (USE_STMT (use_p));
3175 for (unsigned i = 0; i < group_size; ++i)
3176 phis.quick_push (phi_info);
3177 slp_tree *phi_node = bst_map->get (phis);
3178 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3179 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3180 = SLP_INSTANCE_TREE (new_instance);
3181 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3185 vinfo->slp_instances.safe_push (new_instance);
3187 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3188 the number of scalar stmts in the root in a few places.
3189 Verify that assumption holds. */
3190 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3191 .length () == group_size);
3193 if (dump_enabled_p ())
3195 dump_printf_loc (MSG_NOTE, vect_location,
3196 "Final SLP tree for instance %p:\n", new_instance);
3197 vect_print_slp_graph (MSG_NOTE, vect_location,
3198 SLP_INSTANCE_TREE (new_instance));
3201 return true;
3204 else
3206 /* Failed to SLP. */
3207 /* Free the allocated memory. */
3208 scalar_stmts.release ();
3211 stmt_vec_info stmt_info = stmt_info_;
3212 /* Try to break the group up into pieces. */
3213 if (kind == slp_inst_kind_store)
3215 /* ??? We could delay all the actual splitting of store-groups
3216 until after SLP discovery of the original group has completed.
3217 Then we can recurse to vect_build_slp_instance directly. */
3218 for (i = 0; i < group_size; i++)
3219 if (!matches[i])
3220 break;
3222 /* For basic block SLP, try to break the group up into multiples of
3223 a vector size. */
3224 if (is_a <bb_vec_info> (vinfo)
3225 && (i > 1 && i < group_size))
3227 tree scalar_type
3228 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3229 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3230 1 << floor_log2 (i));
3231 unsigned HOST_WIDE_INT const_nunits;
3232 if (vectype
3233 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3235 /* Split into two groups at the first vector boundary. */
3236 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3237 unsigned group1_size = i & ~(const_nunits - 1);
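/* I is rounded down to a multiple of CONST_NUNITS; the assert above
   guarantees CONST_NUNITS is a power of two so the mask is valid.  */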
3239 if (dump_enabled_p ())
3240 dump_printf_loc (MSG_NOTE, vect_location,
3241 "Splitting SLP group at stmt %u\n", i);
3242 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3243 group1_size);
3244 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3245 kind, max_tree_size,
3246 limit);
3247 /* Split the rest at the failure point and possibly
3248 re-analyze the remaining matching part if it has
3249 at least two lanes. */
3250 if (group1_size < i
3251 && (i + 1 < group_size
3252 || i - group1_size > 1))
3254 stmt_vec_info rest2 = rest;
3255 rest = vect_split_slp_store_group (rest, i - group1_size);
3256 if (i - group1_size > 1)
3257 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3258 kind, max_tree_size,
3259 limit);
3261 /* Re-analyze the non-matching tail if it has at least
3262 two lanes. */
3263 if (i + 1 < group_size)
3264 res |= vect_analyze_slp_instance (vinfo, bst_map,
3265 rest, kind, max_tree_size,
3266 limit);
3267 return res;
3271 /* For loop vectorization split into arbitrary pieces of size > 1. */
3272 if (is_a <loop_vec_info> (vinfo)
3273 && (i > 1 && i < group_size)
3274 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3276 unsigned group1_size = i;
3278 if (dump_enabled_p ())
3279 dump_printf_loc (MSG_NOTE, vect_location,
3280 "Splitting SLP group at stmt %u\n", i);
3282 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3283 group1_size);
3284 /* Loop vectorization cannot handle gaps in stores; make sure
3285 the split group appears as strided. */
3286 STMT_VINFO_STRIDED_P (rest) = 1;
3287 DR_GROUP_GAP (rest) = 0;
3288 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3289 DR_GROUP_GAP (stmt_info) = 0;
3291 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3292 kind, max_tree_size, limit);
3293 if (i + 1 < group_size)
3294 res |= vect_analyze_slp_instance (vinfo, bst_map,
3295 rest, kind, max_tree_size, limit);
3297 return res;
3300 /* Even though the first vector did not all match, we might be able to SLP
3301 (some) of the remainder. FORNOW ignore this possibility. */
3304 /* Failed to SLP. */
3305 if (dump_enabled_p ())
3306 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3307 return false;
3311 /* Analyze an SLP instance starting from a group of grouped stores. Call
3312 vect_build_slp_tree to build a tree of packed stmts if possible.
3313 Return FALSE if it's impossible to SLP any stmt in the loop. */
3315 static bool
3316 vect_analyze_slp_instance (vec_info *vinfo,
3317 scalar_stmts_to_slp_tree_map_t *bst_map,
3318 stmt_vec_info stmt_info,
3319 slp_instance_kind kind,
3320 unsigned max_tree_size, unsigned *limit)
3322 unsigned int i;
3323 vec<stmt_vec_info> scalar_stmts;
3325 if (is_a <bb_vec_info> (vinfo))
3326 vect_location = stmt_info->stmt;
3328 stmt_vec_info next_info = stmt_info;
3329 if (kind == slp_inst_kind_store)
3331 /* Collect the stores and store them in scalar_stmts. */
3332 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3333 while (next_info)
3335 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3336 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3339 else if (kind == slp_inst_kind_reduc_chain)
3341 /* Collect the reduction stmts and store them in scalar_stmts. */
3342 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3343 while (next_info)
3345 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3346 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3348 /* Mark the first element of the reduction chain as reduction to properly
3349 transform the node. In the reduction analysis phase only the last
3350 element of the chain is marked as reduction. */
3351 STMT_VINFO_DEF_TYPE (stmt_info)
3352 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3353 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3354 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3356 else if (kind == slp_inst_kind_ctor)
3358 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3359 tree val;
3360 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3361 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3363 stmt_vec_info def_info = vinfo->lookup_def (val);
3364 def_info = vect_stmt_to_vectorize (def_info);
3365 scalar_stmts.quick_push (def_info);
3367 if (dump_enabled_p ())
3368 dump_printf_loc (MSG_NOTE, vect_location,
3369 "Analyzing vectorizable constructor: %G\n",
3370 stmt_info->stmt);
3372 else if (kind == slp_inst_kind_reduc_group)
3374 /* Collect reduction statements. */
3375 const vec<stmt_vec_info> &reductions
3376 = as_a <loop_vec_info> (vinfo)->reductions;
3377 scalar_stmts.create (reductions.length ());
3378 for (i = 0; reductions.iterate (i, &next_info); i++)
3379 if ((STMT_VINFO_RELEVANT_P (next_info)
3380 || STMT_VINFO_LIVE_P (next_info))
3381 /* ??? Make sure we didn't skip a conversion around a reduction
3382 path. In that case we'd have to reverse engineer that conversion
3383 stmt following the chain using reduc_idx and from the PHI
3384 using reduc_def. */
3385 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3386 scalar_stmts.quick_push (next_info);
3387 /* If fewer than two were relevant/live there's nothing to SLP. */
3388 if (scalar_stmts.length () < 2)
3389 return false;
3391 else
3392 gcc_unreachable ();
3394 vec<stmt_vec_info> roots = vNULL;
3395 if (kind == slp_inst_kind_ctor)
3397 roots.create (1);
3398 roots.quick_push (stmt_info);
3400 /* Build the tree for the SLP instance. */
3401 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3402 roots,
3403 max_tree_size, limit, bst_map,
3404 kind == slp_inst_kind_store
3405 ? stmt_info : NULL);
3406 if (!res)
3407 roots.release ();
3409 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3410 where we should do store group splitting. */
3412 return res;
3415 /* Check if there are stmts in the loop that can be vectorized using SLP.
3416 Build SLP trees of packed scalar stmts if SLP is possible. */
3418 opt_result
3419 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3421 unsigned int i;
3422 stmt_vec_info first_element;
3423 slp_instance instance;
3425 DUMP_VECT_SCOPE ("vect_analyze_slp");
3427 unsigned limit = max_tree_size;
3429 scalar_stmts_to_slp_tree_map_t *bst_map
3430 = new scalar_stmts_to_slp_tree_map_t ();
3432 /* Find SLP sequences starting from groups of grouped stores. */
3433 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3434 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3435 STMT_VINFO_GROUPED_ACCESS (first_element)
3436 ? slp_inst_kind_store : slp_inst_kind_ctor,
3437 max_tree_size, &limit);
3439 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3441 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3443 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3444 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3445 bb_vinfo->roots[i].stmts,
3446 bb_vinfo->roots[i].roots,
3447 max_tree_size, &limit, bst_map, NULL))
3449 bb_vinfo->roots[i].stmts = vNULL;
3450 bb_vinfo->roots[i].roots = vNULL;
3455 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3457 /* Find SLP sequences starting from reduction chains. */
3458 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3459 if (! STMT_VINFO_RELEVANT_P (first_element)
3460 && ! STMT_VINFO_LIVE_P (first_element))
3462 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3463 slp_inst_kind_reduc_chain,
3464 max_tree_size, &limit))
3466 /* Dissolve reduction chain group. */
3467 stmt_vec_info vinfo = first_element;
3468 stmt_vec_info last = NULL;
3469 while (vinfo)
3471 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3472 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3473 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3474 last = vinfo;
3475 vinfo = next;
3477 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3478 /* It can be still vectorized as part of an SLP reduction. */
3479 loop_vinfo->reductions.safe_push (last);
3482 /* Find SLP sequences starting from groups of reductions. */
3483 if (loop_vinfo->reductions.length () > 1)
3484 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3485 slp_inst_kind_reduc_group, max_tree_size,
3486 &limit);
3489 hash_set<slp_tree> visited_patterns;
3490 slp_tree_to_load_perm_map_t perm_cache;
3491 slp_compat_nodes_map_t compat_cache;
3493 /* See if any patterns can be found in the SLP tree. */
3494 bool pattern_found = false;
3495 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3496 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3497 &visited_patterns, &perm_cache,
3498 &compat_cache);
3500 /* If any were found optimize permutations of loads. */
3501 if (pattern_found)
3503 hash_map<slp_tree, slp_tree> load_map;
3504 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3506 slp_tree root = SLP_INSTANCE_TREE (instance);
3507 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3508 &load_map, root);
3514 /* The map keeps a reference on SLP nodes built, release that. */
3515 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3516 it != bst_map->end (); ++it)
3517 if ((*it).second)
3518 vect_free_slp_tree ((*it).second);
3519 delete bst_map;
3521 if (pattern_found && dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE, vect_location,
3524 "Pattern matched SLP tree\n");
3525 hash_set<slp_tree> visited;
3526 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3527 vect_print_slp_graph (MSG_NOTE, vect_location,
3528 SLP_INSTANCE_TREE (instance), visited);
3531 return opt_result::success ();
3534 struct slpg_vertex
3536 slpg_vertex (slp_tree node_)
3537 : node (node_), perm_in (-1), perm_out (-1) {}
3539 int get_perm_materialized () const
3540 { return perm_in != perm_out ? perm_in : 0; }
3542 slp_tree node;
3543 /* The common permutation on the incoming lanes (towards SLP children). */
3544 int perm_in;
3545 /* The permutation on the outgoing lanes (towards SLP parents). When
3546 the node is a materialization point for a permute this differs
3547 from perm_in (and is then usually zero). Materialization happens
3548 on the input side. */
3549 int perm_out;
3552 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3554 static void
3555 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3556 vec<slpg_vertex> &vertices, vec<int> &leafs)
3558 unsigned i;
3559 slp_tree child;
3561 if (visited.add (node))
3562 return;
3564 node->vertex = vertices.length ();
3565 vertices.safe_push (slpg_vertex (node));
3567 bool leaf = true;
3568 bool force_leaf = false;
3569 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3570 if (child)
3572 leaf = false;
3573 vect_slp_build_vertices (visited, child, vertices, leafs);
3575 else
3576 force_leaf = true;
3577 /* Since SLP discovery works along use-def edges all cycles have an
3578 entry - but there's the exception of cycles where we do not handle
3579 the entry explicitly (but with a NULL SLP node), like some reductions
3580 and inductions. Force those SLP PHIs to act as leafs to make them
3581 backwards reachable. */
3582 if (leaf || force_leaf)
3583 leafs.safe_push (node->vertex);
3586 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3588 static void
3589 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3590 vec<int> &leafs)
3592 hash_set<slp_tree> visited;
3593 unsigned i;
3594 slp_instance instance;
3595 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3596 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3597 leafs);
3600 /* Apply the (reverse) bijective permutation PERM to VEC. */
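/* With REVERSE true the inverse permutation is applied, i.e. element I
   of VEC ends up at position PERM[I]; otherwise position I receives
   element PERM[I].  */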
3602 template <class T>
3603 static void
3604 vect_slp_permute (vec<unsigned> perm,
3605 vec<T> &vec, bool reverse)
3607 auto_vec<T, 64> saved;
3608 saved.create (vec.length ());
3609 for (unsigned i = 0; i < vec.length (); ++i)
3610 saved.quick_push (vec[i]);
3612 if (reverse)
3614 for (unsigned i = 0; i < vec.length (); ++i)
3615 vec[perm[i]] = saved[i];
3616 for (unsigned i = 0; i < vec.length (); ++i)
3617 gcc_assert (vec[perm[i]] == saved[i]);
3619 else
3621 for (unsigned i = 0; i < vec.length (); ++i)
3622 vec[i] = saved[perm[i]];
3623 for (unsigned i = 0; i < vec.length (); ++i)
3624 gcc_assert (vec[i] == saved[perm[i]]);
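/* For example, with PERM = { 2, 0, 1 } and VEC = { a, b, c } the forward
   application (REVERSE == false) computes vec[i] = saved[perm[i]] and
   yields { c, a, b }, while the reverse application computes
   vec[perm[i]] = saved[i] and yields { b, c, a }.  The two modes are
   inverses of each other, which is what the asserts above verify.  */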
3628 /* Return whether permutations PERM_A and PERM_B as recorded in the
3629 PERMS vector are equal. */
3631 static bool
3632 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3633 int perm_a, int perm_b)
3635 return (perm_a == perm_b
3636 || (perm_a != -1 && perm_b != -1
3637 && perms[perm_a].length () == perms[perm_b].length ()
3638 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3639 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3642 /* Optimize the SLP graph of VINFO. */
3644 void
3645 vect_optimize_slp (vec_info *vinfo)
3647 if (vinfo->slp_instances.is_empty ())
3648 return;
3650 slp_tree node;
3651 unsigned i;
3652 auto_vec<slpg_vertex> vertices;
3653 auto_vec<int> leafs;
3654 vect_slp_build_vertices (vinfo, vertices, leafs);
3656 struct graph *slpg = new_graph (vertices.length ());
3657 for (slpg_vertex &v : vertices)
3658 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3659 if (child)
3660 add_edge (slpg, v.node->vertex, child->vertex);
3662 /* Compute (reverse) postorder on the inverted graph. */
3663 auto_vec<int> ipo;
3664 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3666 auto_vec<vec<unsigned> > perms;
3667 perms.safe_push (vNULL); /* zero is no permute */
3669 /* Produce initial permutations. */
3670 for (i = 0; i < leafs.length (); ++i)
3672 int idx = leafs[i];
3673 slp_tree node = vertices[idx].node;
3675 /* Handle externals and constants optimistically throughout the
3676 iteration. But treat existing vectors as fixed since we
3677 do not handle permuting them below. */
3678 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3679 && !SLP_TREE_VEC_DEFS (node).exists ())
3680 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3681 continue;
3683 /* Leafs do not change across iterations. Note leafs also double
3684 as entries to the reverse graph. */
3685 if (!slpg->vertices[idx].succ)
3687 vertices[idx].perm_in = 0;
3688 vertices[idx].perm_out = 0;
3691 /* Loads are the only thing generating permutes. */
3692 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3693 continue;
3695 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3696 node unpermuted, record this permute. */
3697 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3698 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3699 continue;
3700 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3701 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3702 bool any_permute = false;
3703 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3705 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3706 imin = MIN (imin, idx);
3707 imax = MAX (imax, idx);
3708 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3709 any_permute = true;
3711 /* If there's no permute no need to split one out. */
3712 if (!any_permute)
3713 continue;
3714 /* If the span doesn't match we'd disrupt VF computation, avoid
3715 that for now. */
3716 if (imax - imin + 1 != SLP_TREE_LANES (node))
3717 continue;
3719 /* For now only handle true permutes, like
3720 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3721 when permuting constants and invariants keeping the permute
3722 bijective. */
3723 auto_sbitmap load_index (SLP_TREE_LANES (node));
3724 bitmap_clear (load_index);
3725 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3726 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3727 unsigned j;
3728 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3729 if (!bitmap_bit_p (load_index, j))
3730 break;
3731 if (j != SLP_TREE_LANES (node))
3732 continue;
3734 vec<unsigned> perm = vNULL;
3735 perm.safe_grow (SLP_TREE_LANES (node), true);
3736 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3737 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3738 perms.safe_push (perm);
3739 vertices[idx].perm_in = perms.length () - 1;
3740 vertices[idx].perm_out = perms.length () - 1;
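      /* For example, a load node with SLP_TREE_LOAD_PERMUTATION { 5, 4, 6, 7 }
	 has imin == 4 and imax == 7, spans exactly four lanes and is
	 bijective, so the permutation { 1, 0, 2, 3 } is pushed to PERMS and
	 used as both the incoming and outgoing permute of this leaf.  A
	 non-bijective permutation like { 4, 4, 6, 7 } is skipped by the
	 bitmap check above.  */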
3743 /* In addition to the above we have to mark outgoing permutes facing
3744 non-reduction graph entries that are not represented as to be
3745 materialized. */
3746 for (slp_instance instance : vinfo->slp_instances)
3747 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3749 /* Just setting perm_out isn't enough for the propagation to
3750 pick this up. */
3751 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3752 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3755 /* Propagate permutes along the graph and compute materialization points. */
3756 bool changed;
3757 bool do_materialization = false;
3758 unsigned iteration = 0;
3761 changed = false;
3762 ++iteration;
3764 if (dump_enabled_p ())
3765 dump_printf_loc (MSG_NOTE, vect_location,
3766 "SLP optimize iteration %d\n", iteration);
3768 for (i = vertices.length (); i > 0 ; --i)
3770 int idx = ipo[i-1];
3771 slp_tree node = vertices[idx].node;
3773 /* Handle externals and constants optimistically throughout the
3774 iteration. */
3775 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3776 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3777 continue;
3779 /* We still eventually have failed backedge SLP nodes in the
3780 graph, those are only cancelled when analyzing operations.
3781 Simply treat them as transparent ops, propagating permutes
3782 through them. */
3783 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3785 /* We do not handle stores with a permutation, so all
3786 incoming permutes must have been materialized. */
3787 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3788 if (STMT_VINFO_DATA_REF (rep)
3789 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3791 /* ??? We're forcing materialization in place
3792 of the child here, we'd need special handling
3793 in materialization to leave perm_in -1 here. */
3794 vertices[idx].perm_in = 0;
3795 vertices[idx].perm_out = 0;
3797 /* We cannot move a permute across an operation that does not
3798 operate on each lane independently. Note this is an explicit
3799 negative list since that's much shorter than the respective
3800 positive one but it's critical to keep maintaining it. */
3801 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3802 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3804 case CFN_COMPLEX_ADD_ROT90:
3805 case CFN_COMPLEX_ADD_ROT270:
3806 case CFN_COMPLEX_MUL:
3807 case CFN_COMPLEX_MUL_CONJ:
3808 case CFN_VEC_ADDSUB:
3809 case CFN_VEC_FMADDSUB:
3810 case CFN_VEC_FMSUBADD:
3811 vertices[idx].perm_in = 0;
3812 vertices[idx].perm_out = 0;
3813 default:;
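	      /* These combined functions do not operate on each lane
		 independently (complex arithmetic works on lane pairs,
		 the addsub variants alternate between lanes), so a lane
		 permute cannot be propagated across them and has to be
		 materialized on their inputs instead.  */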
3817 if (!slpg->vertices[idx].succ)
3818 /* Pick up pre-computed leaf values. */
3820 else
3822 bool any_succ_perm_out_m1 = false;
3823 int perm_in = vertices[idx].perm_in;
3824 for (graph_edge *succ = slpg->vertices[idx].succ;
3825 succ; succ = succ->succ_next)
3827 int succ_idx = succ->dest;
3828 int succ_perm = vertices[succ_idx].perm_out;
3829 /* Handle unvisited (and constant) nodes optimistically. */
3830 /* ??? But for constants once we want to handle
3831 non-bijective permutes we have to verify the permute,
3832 when unifying lanes, will not unify different constants.
3833 For example see gcc.dg/vect/bb-slp-14.c for a case
3834 that would break. */
3835 if (succ_perm == -1)
3837 /* When we handled a non-leaf optimistically, note
3838 that so we can adjust its outgoing permute below. */
3839 slp_tree succ_node = vertices[succ_idx].node;
3840 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3841 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3842 any_succ_perm_out_m1 = true;
3843 continue;
3845 if (perm_in == -1)
3846 perm_in = succ_perm;
3847 else if (succ_perm == 0
3848 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3850 perm_in = 0;
3851 break;
3855 /* Adjust any incoming permutes we treated optimistically. */
3856 if (perm_in != -1 && any_succ_perm_out_m1)
3858 for (graph_edge *succ = slpg->vertices[idx].succ;
3859 succ; succ = succ->succ_next)
3861 slp_tree succ_node = vertices[succ->dest].node;
3862 if (vertices[succ->dest].perm_out == -1
3863 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3864 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3866 vertices[succ->dest].perm_out = perm_in;
3867 /* And ensure this propagates. */
3868 if (vertices[succ->dest].perm_in == -1)
3869 vertices[succ->dest].perm_in = perm_in;
3872 changed = true;
3875 if (!vect_slp_perms_eq (perms, perm_in,
3876 vertices[idx].perm_in))
3878 /* Make sure we eventually converge. */
3879 gcc_checking_assert (vertices[idx].perm_in == -1
3880 || perm_in == 0);
3881 vertices[idx].perm_in = perm_in;
3883 /* While we can handle VEC_PERM nodes as transparent
3884 pass-through, they can also be a cheap materialization
3885 point. In addition they can act as the source
3886 of an arbitrary permutation.
3887 The following ensures that former materialization
3888 points that now have zero incoming permutes no
3889 longer appear as such and that former "any" permutes
3890 get pass-through. We keep VEC_PERM nodes optimistic
3891 as "any" outgoing permute though. */
3892 if (vertices[idx].perm_out != 0
3893 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3894 vertices[idx].perm_out = perm_in;
3895 changed = true;
3899 /* Elide pruning at materialization points in the first
3900 iteration phase. */
3901 if (!do_materialization)
3902 continue;
3904 int perm = vertices[idx].perm_out;
3905 if (perm == 0 || perm == -1)
3906 continue;
3908 /* Decide on permute materialization. Look whether there's
3909 a use (pred) edge that is permuted differently than us.
3910 In that case mark ourselves so the permutation is applied. */
3911 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3912 if (all_preds_permuted)
3913 for (graph_edge *pred = slpg->vertices[idx].pred;
3914 pred; pred = pred->pred_next)
3916 int pred_perm = vertices[pred->src].perm_in;
3917 gcc_checking_assert (pred_perm != -1);
3918 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3920 all_preds_permuted = false;
3921 break;
3924 if (!all_preds_permuted)
3926 vertices[idx].perm_out = 0;
3927 changed = true;
3931 /* If the initial propagation converged, switch on materialization
3932 and re-propagate. */
3933 if (!changed && !do_materialization)
3935 do_materialization = true;
3936 changed = true;
3939 while (changed);
3940 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
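  /* At this point the propagation has reached a fixed point: PERM_IN
     describes the common permute on a node's inputs, PERM_OUT the permute
     its outputs are left in, and a node where the two differ is a
     materialization point (see get_perm_materialized).  The first phase
     only propagated permutes, the second phase additionally decided where
     to materialize them.  */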
3942 /* Materialize. */
3943 for (i = 0; i < vertices.length (); ++i)
3945 int perm_in = vertices[i].perm_in;
3946 slp_tree node = vertices[i].node;
3948 /* First permute invariant/external original successors; we handle
3949 those optimistically during propagation and duplicate them if
3950 they are used with different permutations. */
3951 unsigned j;
3952 slp_tree child;
3953 if (perm_in > 0)
3954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3956 if (!child
3957 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3958 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3959 continue;
3961 /* If the vector is uniform there's nothing to do. */
3962 if (vect_slp_tree_uniform_p (child))
3963 continue;
3965 /* We can end up sharing some externals via two_operator
3966 handling. Be prepared to unshare those. */
3967 if (child->refcnt != 1)
3969 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3970 SLP_TREE_CHILDREN (node)[j] = child
3971 = vect_create_new_slp_node
3972 (SLP_TREE_SCALAR_OPS (child).copy ());
3974 vect_slp_permute (perms[perm_in],
3975 SLP_TREE_SCALAR_OPS (child), true);
3978 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3980 /* Apply the common permutes to the input vectors. */
3981 if (perm_in > 0)
3983 /* If the node is already a permute node we can apply
3984 the permutation to the lane selection, effectively
3985 materializing it on the incoming vectors. */
3986 if (dump_enabled_p ())
3987 dump_printf_loc (MSG_NOTE, vect_location,
3988 "simplifying permute node %p\n",
3989 node);
3990 for (unsigned k = 0;
3991 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3992 SLP_TREE_LANE_PERMUTATION (node)[k].second
3993 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3995 /* Apply the anticipated output permute to the permute and
3996 stmt vectors. */
3997 int perm_out = vertices[i].perm_out;
3998 if (perm_out > 0)
4000 vect_slp_permute (perms[perm_out],
4001 SLP_TREE_SCALAR_STMTS (node), true);
4002 vect_slp_permute (perms[perm_out],
4003 SLP_TREE_LANE_PERMUTATION (node), true);
4006 else if (vertices[i].get_perm_materialized () != 0)
4008 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4009 /* For loads simply drop the permutation, the load permutation
4010 already performs the desired permutation. */
4012 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4013 gcc_unreachable ();
4014 else
4016 if (dump_enabled_p ())
4017 dump_printf_loc (MSG_NOTE, vect_location,
4018 "inserting permute node in place of %p\n",
4019 node);
4021 /* Make a copy of NODE and in-place change it to a
4022 VEC_PERM node to permute the lanes of the copy. */
4023 slp_tree copy = new _slp_tree;
4024 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
4025 SLP_TREE_CHILDREN (node) = vNULL;
4026 SLP_TREE_SCALAR_STMTS (copy)
4027 = SLP_TREE_SCALAR_STMTS (node).copy ();
4028 vect_slp_permute (perms[perm_in],
4029 SLP_TREE_SCALAR_STMTS (copy), true);
4030 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
4031 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
4032 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4033 SLP_TREE_LANE_PERMUTATION (copy)
4034 = SLP_TREE_LANE_PERMUTATION (node);
4035 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4036 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4037 copy->refcnt = 1;
4038 copy->max_nunits = node->max_nunits;
4039 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4040 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4041 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4043 /* Now turn NODE into a VEC_PERM. */
4044 SLP_TREE_CHILDREN (node).safe_push (copy);
4045 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4046 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4047 SLP_TREE_LANE_PERMUTATION (node)
4048 .quick_push (std::make_pair (0, perms[perm_in][j]));
4049 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4052 else if (perm_in > 0) /* perm_in == perm_out */
4054 /* Apply the reverse permutation to our stmts. */
4055 vect_slp_permute (perms[perm_in],
4056 SLP_TREE_SCALAR_STMTS (node), true);
4057 /* And to the lane/load permutation, which we can simply
4058 make regular by design. */
4059 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4061 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4062 /* ??? When we handle non-bijective permutes the idea
4063 is that we can force the load-permutation to be
4064 { min, min + 1, min + 2, ... max }. But then the
4065 scalar defs might no longer match the lane content
4066 which means wrong-code with live lane vectorization.
4067 So we possibly have to have NULL entries for those. */
4068 vect_slp_permute (perms[perm_in],
4069 SLP_TREE_LOAD_PERMUTATION (node), true);
4071 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4072 gcc_unreachable ();
4076 /* Elide any permutations at BB reduction roots. */
4077 if (is_a <bb_vec_info> (vinfo))
4079 for (slp_instance instance : vinfo->slp_instances)
4081 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4082 continue;
4083 slp_tree old = SLP_INSTANCE_TREE (instance);
4084 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4085 && SLP_TREE_CHILDREN (old).length () == 1)
4087 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4088 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4090 /* Preserve the special VEC_PERM we use to shield existing
4091 vector defs from the rest. But make it a no-op. */
4092 unsigned i = 0;
4093 for (std::pair<unsigned, unsigned> &p
4094 : SLP_TREE_LANE_PERMUTATION (old))
4095 p.second = i++;
4097 else
4099 SLP_INSTANCE_TREE (instance) = child;
4100 SLP_TREE_REF_COUNT (child)++;
4101 vect_free_slp_tree (old);
4104 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4105 && SLP_TREE_REF_COUNT (old) == 1
4106 && vertices[old->vertex].get_perm_materialized () != 0)
4108 /* ??? For loads the situation is more complex since
4109 we can't modify the permute in place in case the
4110 node is used multiple times. In fact for loads this
4111 should be somehow handled in the propagation engine. */
4112 /* Apply the reverse permutation to our stmts. */
4113 int perm = vertices[old->vertex].get_perm_materialized ();
4114 vect_slp_permute (perms[perm],
4115 SLP_TREE_SCALAR_STMTS (old), true);
4116 vect_slp_permute (perms[perm],
4117 SLP_TREE_LOAD_PERMUTATION (old), true);
4122 /* Free the perms vector used for propagation. */
4123 while (!perms.is_empty ())
4124 perms.pop ().release ();
4125 free_graph (slpg);
4128 /* Now elide load permutations that are not necessary. */
4129 for (i = 0; i < leafs.length (); ++i)
4131 node = vertices[leafs[i]].node;
4132 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4133 continue;
4135 /* In basic block vectorization we allow any subchain of an interleaving
4136 chain.
4137 FORNOW: not in loop SLP because of realignment complications. */
4138 if (is_a <bb_vec_info> (vinfo))
4140 bool subchain_p = true;
4141 stmt_vec_info next_load_info = NULL;
4142 stmt_vec_info load_info;
4143 unsigned j;
4144 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4146 if (j != 0
4147 && (next_load_info != load_info
4148 || DR_GROUP_GAP (load_info) != 1))
4150 subchain_p = false;
4151 break;
4153 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4155 if (subchain_p)
4157 SLP_TREE_LOAD_PERMUTATION (node).release ();
4158 continue;
4161 else
4163 stmt_vec_info load_info;
4164 bool this_load_permuted = false;
4165 unsigned j;
4166 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4167 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4169 this_load_permuted = true;
4170 break;
4172 stmt_vec_info first_stmt_info
4173 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4174 if (!this_load_permuted
4175 /* The load requires permutation when unrolling exposes
4176 a gap either because the group is larger than the SLP
4177 group-size or because there is a gap between the groups. */
4178 && (known_eq (LOOP_VINFO_VECT_FACTOR
4179 (as_a <loop_vec_info> (vinfo)), 1U)
4180 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4181 && DR_GROUP_GAP (first_stmt_info) == 0)))
4183 SLP_TREE_LOAD_PERMUTATION (node).release ();
4184 continue;
4190 /* Gather loads reachable from the individual SLP graph entries. */
4192 void
4193 vect_gather_slp_loads (vec_info *vinfo)
4195 unsigned i;
4196 slp_instance instance;
4197 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4199 hash_set<slp_tree> visited;
4200 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4201 SLP_INSTANCE_TREE (instance), visited);
4206 /* For each possible SLP instance decide whether to SLP it and calculate overall
4207 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4208 least one instance. */
4210 bool
4211 vect_make_slp_decision (loop_vec_info loop_vinfo)
4213 unsigned int i;
4214 poly_uint64 unrolling_factor = 1;
4215 const vec<slp_instance> &slp_instances
4216 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4217 slp_instance instance;
4218 int decided_to_slp = 0;
4220 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4222 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4224 /* FORNOW: SLP if you can. */
4225 /* All unroll factors have the form:
4227 GET_MODE_SIZE (vinfo->vector_mode) * X
4229 for some rational X, so they must have a common multiple. */
4230 unrolling_factor
4231 = force_common_multiple (unrolling_factor,
4232 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4234 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4235 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4236 loop-based vectorization. Such stmts will be marked as HYBRID. */
4237 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4238 decided_to_slp++;
4241 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4243 if (decided_to_slp && dump_enabled_p ())
4245 dump_printf_loc (MSG_NOTE, vect_location,
4246 "Decided to SLP %d instances. Unrolling factor ",
4247 decided_to_slp);
4248 dump_dec (MSG_NOTE, unrolling_factor);
4249 dump_printf (MSG_NOTE, "\n");
4252 return (decided_to_slp > 0);
4255 /* Private data for vect_detect_hybrid_slp. */
4256 struct vdhs_data
4258 loop_vec_info loop_vinfo;
4259 vec<stmt_vec_info> *worklist;
4262 /* Walker for walk_gimple_op. */
4264 static tree
4265 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4267 walk_stmt_info *wi = (walk_stmt_info *)data;
4268 vdhs_data *dat = (vdhs_data *)wi->info;
4270 if (wi->is_lhs)
4271 return NULL_TREE;
4273 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4274 if (!def_stmt_info)
4275 return NULL_TREE;
4276 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4277 if (PURE_SLP_STMT (def_stmt_info))
4279 if (dump_enabled_p ())
4280 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4281 def_stmt_info->stmt);
4282 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4283 dat->worklist->safe_push (def_stmt_info);
4286 return NULL_TREE;
4289 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4290 if so, otherwise push it to WORKLIST. */
4292 static void
4293 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4294 vec<stmt_vec_info> &worklist,
4295 stmt_vec_info stmt_info)
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_NOTE, vect_location,
4299 "Processing hybrid candidate : %G", stmt_info->stmt);
4300 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4301 imm_use_iterator iter2;
4302 ssa_op_iter iter1;
4303 use_operand_p use_p;
4304 def_operand_p def_p;
4305 bool any_def = false;
4306 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4308 any_def = true;
4309 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4311 if (is_gimple_debug (USE_STMT (use_p)))
4312 continue;
4313 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4314 /* An out-of-loop use means this is a loop_vect sink. */
4315 if (!use_info)
4317 if (dump_enabled_p ())
4318 dump_printf_loc (MSG_NOTE, vect_location,
4319 "Found loop_vect sink: %G", stmt_info->stmt);
4320 worklist.safe_push (stmt_info);
4321 return;
4323 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4325 if (dump_enabled_p ())
4326 dump_printf_loc (MSG_NOTE, vect_location,
4327 "Found loop_vect use: %G", use_info->stmt);
4328 worklist.safe_push (stmt_info);
4329 return;
4334 /* No def means this is a loop_vect sink. */
4334 if (!any_def)
4336 if (dump_enabled_p ())
4337 dump_printf_loc (MSG_NOTE, vect_location,
4338 "Found loop_vect sink: %G", stmt_info->stmt);
4339 worklist.safe_push (stmt_info);
4340 return;
4342 if (dump_enabled_p ())
4343 dump_printf_loc (MSG_NOTE, vect_location,
4344 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4345 STMT_SLP_TYPE (stmt_info) = pure_slp;
4348 /* Find stmts that must be both vectorized and SLPed. */
4350 void
4351 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4353 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4355 /* All stmts participating in SLP are marked pure_slp, all other
4356 stmts are loop_vect.
4357 First collect all loop_vect stmts into a worklist.
4358 SLP patterns cause not all original scalar stmts to appear in
4359 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4360 Rectify this here and do a backward walk over the IL only considering
4361 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4362 mark them as pure_slp. */
4363 auto_vec<stmt_vec_info> worklist;
4364 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4366 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4367 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4368 gsi_next (&gsi))
4370 gphi *phi = gsi.phi ();
4371 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4372 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4373 maybe_push_to_hybrid_worklist (loop_vinfo,
4374 worklist, stmt_info);
4376 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4377 gsi_prev (&gsi))
4379 gimple *stmt = gsi_stmt (gsi);
4380 if (is_gimple_debug (stmt))
4381 continue;
4382 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4383 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4385 for (gimple_stmt_iterator gsi2
4386 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4387 !gsi_end_p (gsi2); gsi_next (&gsi2))
4389 stmt_vec_info patt_info
4390 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4391 if (!STMT_SLP_TYPE (patt_info)
4392 && STMT_VINFO_RELEVANT (patt_info))
4393 maybe_push_to_hybrid_worklist (loop_vinfo,
4394 worklist, patt_info);
4396 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4398 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4399 maybe_push_to_hybrid_worklist (loop_vinfo,
4400 worklist, stmt_info);
4404 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4405 mark any SLP vectorized stmt as hybrid.
4406 ??? We're visiting def stmts N times (once for each non-SLP and
4407 once for each hybrid-SLP use). */
4408 walk_stmt_info wi;
4409 vdhs_data dat;
4410 dat.worklist = &worklist;
4411 dat.loop_vinfo = loop_vinfo;
4412 memset (&wi, 0, sizeof (wi));
4413 wi.info = (void *)&dat;
4414 while (!worklist.is_empty ())
4416 stmt_vec_info stmt_info = worklist.pop ();
4417 /* Since SSA operands are not set up for pattern stmts we need
4418 to use walk_gimple_op. */
4419 wi.is_lhs = 0;
4420 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
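  /* The net effect is that a stmt ends up marked hybrid when it is
     vectorized as part of an SLP instance but its result is also consumed
     by a loop-vectorized (non-SLP) stmt, so it additionally needs
     loop-based vectorization.  */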
4425 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4427 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4428 : vec_info (vec_info::bb, shared),
4429 bbs (_bbs),
4430 roots (vNULL)
4432 for (unsigned i = 0; i < bbs.length (); ++i)
4434 if (i != 0)
4435 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4436 gsi_next (&si))
4438 gphi *phi = si.phi ();
4439 gimple_set_uid (phi, 0);
4440 add_stmt (phi);
4442 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4443 !gsi_end_p (gsi); gsi_next (&gsi))
4445 gimple *stmt = gsi_stmt (gsi);
4446 gimple_set_uid (stmt, 0);
4447 if (is_gimple_debug (stmt))
4448 continue;
4449 add_stmt (stmt);
4455 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4456 stmts in the basic block. */
4458 _bb_vec_info::~_bb_vec_info ()
4460 /* Reset region marker. */
4461 for (unsigned i = 0; i < bbs.length (); ++i)
4463 if (i != 0)
4464 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4465 gsi_next (&si))
4467 gphi *phi = si.phi ();
4468 gimple_set_uid (phi, -1);
4470 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4471 !gsi_end_p (gsi); gsi_next (&gsi))
4473 gimple *stmt = gsi_stmt (gsi);
4474 gimple_set_uid (stmt, -1);
4478 for (unsigned i = 0; i < roots.length (); ++i)
4480 roots[i].stmts.release ();
4481 roots[i].roots.release ();
4483 roots.release ();
4486 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4487 given that child nodes have already been processed, and that
4488 their def types currently match their SLP node's def type. */
4490 static bool
4491 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4492 slp_instance node_instance,
4493 stmt_vector_for_cost *cost_vec)
4495 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4497 /* Calculate the number of vector statements to be created for the
4498 scalar stmts in this node. For SLP reductions it is equal to the
4499 number of vector statements in the children (which has already been
4500 calculated by the recursive call). Otherwise it is the number of
4501 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4502 VF divided by the number of elements in a vector. */
4503 if (!STMT_VINFO_DATA_REF (stmt_info)
4504 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4506 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4507 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4509 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4510 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4511 break;
4514 else
4516 poly_uint64 vf;
4517 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4518 vf = loop_vinfo->vectorization_factor;
4519 else
4520 vf = 1;
4521 unsigned int group_size = SLP_TREE_LANES (node);
4522 tree vectype = SLP_TREE_VECTYPE (node);
4523 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4524 = vect_get_num_vectors (vf * group_size, vectype);
4527 /* Handle purely internal nodes. */
4528 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4529 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4531 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4533 bool dummy;
4534 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4535 node, node_instance, cost_vec);
4538 /* Try to build NODE from scalars, returning true on success.
4539 NODE_INSTANCE is the SLP instance that contains NODE. */
4541 static bool
4542 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4543 slp_instance node_instance)
4545 stmt_vec_info stmt_info;
4546 unsigned int i;
4548 if (!is_a <bb_vec_info> (vinfo)
4549 || node == SLP_INSTANCE_TREE (node_instance)
4550 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4551 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
4552 /* Force the mask use to be built from scalars instead. */
4553 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
4554 return false;
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_NOTE, vect_location,
4558 "Building vector operands of %p from scalars instead\n", node);
4560 /* Don't remove and free the child nodes here, since they could be
4561 referenced by other structures. The analysis and scheduling phases
4562 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4563 unsigned int group_size = SLP_TREE_LANES (node);
4564 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4565 /* Invariants get their vector type from the uses. */
4566 SLP_TREE_VECTYPE (node) = NULL_TREE;
4567 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4568 SLP_TREE_LOAD_PERMUTATION (node).release ();
4569 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4571 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4572 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4574 return true;
4577 /* Return true if all elements of the slice are the same. */
4578 bool
4579 vect_scalar_ops_slice::all_same_p () const
4581 for (unsigned int i = 1; i < length; ++i)
4582 if (!operand_equal_p (op (0), op (i)))
4583 return false;
4584 return true;
4587 hashval_t
4588 vect_scalar_ops_slice_hash::hash (const value_type &s)
4590 hashval_t hash = 0;
4591 for (unsigned i = 0; i < s.length; ++i)
4592 hash = iterative_hash_expr (s.op (i), hash);
4593 return hash;
4596 bool
4597 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4598 const compare_type &s2)
4600 if (s1.length != s2.length)
4601 return false;
4602 for (unsigned i = 0; i < s1.length; ++i)
4603 if (!operand_equal_p (s1.op (i), s2.op (i)))
4604 return false;
4605 return true;
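/* These helpers allow hashing vector-sized slices of a node's scalar
   operands, which vect_prologue_cost_for_slp below uses to cost identical
   constant or invariant vectors within a node only once.  */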
4608 /* Compute the prologue cost for invariant or constant operands represented
4609 by NODE. */
4611 static void
4612 vect_prologue_cost_for_slp (slp_tree node,
4613 stmt_vector_for_cost *cost_vec)
4616 /* There's a special case of an existing vector, which costs nothing. */
4616 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4617 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4618 return;
4619 /* Without looking at the actual initializer a vector of
4620 constants can be implemented as a load from the constant pool.
4621 When all elements are the same we can use a splat. */
4622 tree vectype = SLP_TREE_VECTYPE (node);
4623 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4624 unsigned HOST_WIDE_INT const_nunits;
4625 unsigned nelt_limit;
4626 auto ops = &SLP_TREE_SCALAR_OPS (node);
4627 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4628 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4629 && ! multiple_p (const_nunits, group_size))
4631 nelt_limit = const_nunits;
4632 hash_set<vect_scalar_ops_slice_hash> vector_ops;
4633 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4634 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4635 starts.quick_push (i * const_nunits);
4637 else
4639 /* If either the vector has variable length or the vectors
4640 are composed of repeated whole groups we only need to
4641 cost construction once. All vectors will be the same. */
4642 nelt_limit = group_size;
4643 starts.quick_push (0);
4645 /* ??? We're just tracking whether vectors in a single node are the same.
4646 Ideally we'd do something more global. */
4647 for (unsigned int start : starts)
4649 vect_cost_for_stmt kind;
4650 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4651 kind = vector_load;
4652 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4653 kind = scalar_to_vec;
4654 else
4655 kind = vec_construct;
4656 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
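/* So, roughly, a constant vector is costed as a load from the constant
   pool (vector_load), an invariant whose elements are all the same as a
   splat (scalar_to_vec), and everything else as gathering the scalars
   into a vector (vec_construct), once for each distinct vector that has
   to be built.  */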
4660 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4661 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4663 Return true if the operations are supported. */
4665 static bool
4666 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4667 slp_instance node_instance,
4668 hash_set<slp_tree> &visited_set,
4669 vec<slp_tree> &visited_vec,
4670 stmt_vector_for_cost *cost_vec)
4672 int i, j;
4673 slp_tree child;
4675 /* Assume we can code-generate all invariants. */
4676 if (!node
4677 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4678 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4679 return true;
4681 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4683 if (dump_enabled_p ())
4684 dump_printf_loc (MSG_NOTE, vect_location,
4685 "Failed cyclic SLP reference in %p\n", node);
4686 return false;
4688 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4690 /* If we already analyzed the exact same set of scalar stmts we're done.
4691 We share the generated vector stmts for those. */
4692 if (visited_set.add (node))
4693 return true;
4694 visited_vec.safe_push (node);
4696 bool res = true;
4697 unsigned visited_rec_start = visited_vec.length ();
4698 unsigned cost_vec_rec_start = cost_vec->length ();
4699 bool seen_non_constant_child = false;
4700 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4702 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4703 visited_set, visited_vec,
4704 cost_vec);
4705 if (!res)
4706 break;
4707 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4708 seen_non_constant_child = true;
4710 /* We're having difficulties scheduling nodes with just constant
4711 operands and no scalar stmts since we then cannot compute a stmt
4712 insertion place. */
4713 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4715 if (dump_enabled_p ())
4716 dump_printf_loc (MSG_NOTE, vect_location,
4717 "Cannot vectorize all-constant op node %p\n", node);
4718 res = false;
4721 if (res)
4722 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4723 cost_vec);
4724 /* If analysis failed we have to pop all recursive visited nodes
4725 plus ourselves. */
4726 if (!res)
4728 while (visited_vec.length () >= visited_rec_start)
4729 visited_set.remove (visited_vec.pop ());
4730 cost_vec->truncate (cost_vec_rec_start);
4733 /* When the node can be vectorized, cost the invariant nodes it references.
4734 This is not done in DFS order to allow the referring node's
4735 vectorizable_* calls to nail down the invariant nodes' vector type
4736 and possibly unshare it if it needs a different vector type than
4737 other referrers. */
4738 if (res)
4739 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4740 if (child
4741 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4742 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4743 /* Perform usual caching, note code-generation still
4744 code-gens these nodes multiple times but we expect
4745 to CSE them later. */
4746 && !visited_set.add (child))
4748 visited_vec.safe_push (child);
4749 /* ??? After auditing more code paths make a "default"
4750 and push the vector type from NODE to all children
4751 if it is not already set. */
4752 /* Compute the number of vectors to be generated. */
4753 tree vector_type = SLP_TREE_VECTYPE (child);
4754 if (!vector_type)
4756 /* For shifts with a scalar argument we don't need
4757 to cost or code-generate anything.
4758 ??? Represent this more explicitly. */
4759 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4760 == shift_vec_info_type)
4761 && j == 1);
4762 continue;
4764 unsigned group_size = SLP_TREE_LANES (child);
4765 poly_uint64 vf = 1;
4766 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4767 vf = loop_vinfo->vectorization_factor;
4768 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4769 = vect_get_num_vectors (vf * group_size, vector_type);
4770 /* And cost them. */
4771 vect_prologue_cost_for_slp (child, cost_vec);
4774 /* If this node or any of its children can't be vectorized, try pruning
4775 the tree here rather than felling the whole thing. */
4776 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4778 /* We'll need to revisit this for invariant costing and number
4779 of vectorized stmt setting. */
4780 res = true;
4783 return res;
4786 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4787 region and that can be vectorized using vectorizable_live_operation
4788 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
4789 the scalar code computing them to be retained. */
4791 static void
4792 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4793 slp_instance instance,
4794 stmt_vector_for_cost *cost_vec,
4795 hash_set<stmt_vec_info> &svisited,
4796 hash_set<slp_tree> &visited)
4798 if (visited.add (node))
4799 return;
4801 unsigned i;
4802 stmt_vec_info stmt_info;
4803 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4804 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4806 if (svisited.contains (stmt_info))
4807 continue;
4808 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4809 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4810 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4811 /* Only the pattern root stmt computes the original scalar value. */
4812 continue;
4813 bool mark_visited = true;
4814 gimple *orig_stmt = orig_stmt_info->stmt;
4815 ssa_op_iter op_iter;
4816 def_operand_p def_p;
4817 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4819 imm_use_iterator use_iter;
4820 gimple *use_stmt;
4821 stmt_vec_info use_stmt_info;
4822 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4823 if (!is_gimple_debug (use_stmt))
4825 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4826 if (!use_stmt_info
4827 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4829 STMT_VINFO_LIVE_P (stmt_info) = true;
4830 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4831 NULL, node, instance, i,
4832 false, cost_vec))
4833 /* ??? So we know we can vectorize the live stmt
4834 from one SLP node. If we cannot do so from all
4835 or none consistently we'd have to record which
4836 SLP node (and lane) we want to use for the live
4837 operation. So make sure we can code-generate
4838 from all nodes. */
4839 mark_visited = false;
4840 else
4841 STMT_VINFO_LIVE_P (stmt_info) = false;
4842 break;
4845 /* We have to verify whether we can insert the lane extract
4846 before all uses. The following is a conservative approximation.
4847 We cannot put this into vectorizable_live_operation because
4848 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4849 doesn't work.
4850 Note that while the fact that we emit code for loads at the
4851 first load should make this a non-problem, leafs we construct
4852 from scalars are vectorized after the last scalar def.
4853 ??? If we'd actually compute the insert location during
4854 analysis we could use sth less conservative than the last
4855 scalar stmt in the node for the dominance check. */
4856 /* ??? What remains is "live" uses in vector CTORs in the same
4857 SLP graph which is where those uses can end up code-generated
4858 right after their definition instead of close to their original
4859 use. But that would restrict us to code-generate lane-extracts
4860 from the latest stmt in a node. So we compensate for this
4861 during code-generation, simply not replacing uses for those
4862 hopefully rare cases. */
4863 if (STMT_VINFO_LIVE_P (stmt_info))
4864 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4865 if (!is_gimple_debug (use_stmt)
4866 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4867 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4868 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4870 if (dump_enabled_p ())
4871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4872 "Cannot determine insertion place for "
4873 "lane extract\n");
4874 STMT_VINFO_LIVE_P (stmt_info) = false;
4875 mark_visited = true;
4878 if (mark_visited)
4879 svisited.add (stmt_info);
4882 slp_tree child;
4883 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4884 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4885 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4886 cost_vec, svisited, visited);
4889 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4891 static bool
4892 vectorizable_bb_reduc_epilogue (slp_instance instance,
4893 stmt_vector_for_cost *cost_vec)
4895 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4896 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4897 if (reduc_code == MINUS_EXPR)
4898 reduc_code = PLUS_EXPR;
4899 internal_fn reduc_fn;
4900 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4901 if (!vectype
4902 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4903 || reduc_fn == IFN_LAST
4904 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4905 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4906 TREE_TYPE (vectype)))
4907 return false;
4909 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4910 cost log2 vector operations plus shuffles and one extraction. */
4911 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4912 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4913 vectype, 0, vect_body);
4914 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4915 vectype, 0, vect_body);
4916 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4917 vectype, 0, vect_body);
4918 return true;
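/* For example, for a reduction of an 8-lane vector this records
   floor_log2 (8) == 3 vector operations, 3 permutes and one final
   vec_to_scalar extraction.  */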
4921 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4922 and recurse to children. */
4924 static void
4925 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4926 hash_set<slp_tree> &visited)
4928 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4929 || visited.add (node))
4930 return;
4932 stmt_vec_info stmt;
4933 unsigned i;
4934 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4935 roots.remove (vect_orig_stmt (stmt));
4937 slp_tree child;
4938 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4939 if (child)
4940 vect_slp_prune_covered_roots (child, roots, visited);
4943 /* Analyze statements in SLP instances of VINFO. Return true if the
4944 operations are supported. */
4946 bool
4947 vect_slp_analyze_operations (vec_info *vinfo)
4949 slp_instance instance;
4950 int i;
4952 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4954 hash_set<slp_tree> visited;
4955 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4957 auto_vec<slp_tree> visited_vec;
4958 stmt_vector_for_cost cost_vec;
4959 cost_vec.create (2);
4960 if (is_a <bb_vec_info> (vinfo))
4961 vect_location = instance->location ();
4962 if (!vect_slp_analyze_node_operations (vinfo,
4963 SLP_INSTANCE_TREE (instance),
4964 instance, visited, visited_vec,
4965 &cost_vec)
4966 /* CTOR instances require vectorized defs for the SLP tree root. */
4967 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4968 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4969 != vect_internal_def
4970 /* Make sure we vectorized with the expected type. */
4971 || !useless_type_conversion_p
4972 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
4973 (instance->root_stmts[0]->stmt))),
4974 TREE_TYPE (SLP_TREE_VECTYPE
4975 (SLP_INSTANCE_TREE (instance))))))
4976 /* Check we can vectorize the reduction. */
4977 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4978 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4980 slp_tree node = SLP_INSTANCE_TREE (instance);
4981 stmt_vec_info stmt_info;
4982 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4983 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4984 else
4985 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_NOTE, vect_location,
4988 "removing SLP instance operations starting from: %G",
4989 stmt_info->stmt);
4990 vect_free_slp_instance (instance);
4991 vinfo->slp_instances.ordered_remove (i);
4992 cost_vec.release ();
4993 while (!visited_vec.is_empty ())
4994 visited.remove (visited_vec.pop ());
4996 else
4998 i++;
4999 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
5001 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
5002 cost_vec.release ();
5004 else
5005 /* For BB vectorization remember the SLP graph entry
5006 cost for later. */
5007 instance->cost_vec = cost_vec;
5011 /* Now look for SLP instances with a root that are covered by other
5012 instances and remove them. */
5013 hash_set<stmt_vec_info> roots;
5014 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5015 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5016 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
5017 if (!roots.is_empty ())
5019 visited.empty ();
5020 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5021 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
5022 visited);
5023 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
5024 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
5025 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
5027 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5028 if (dump_enabled_p ())
5029 dump_printf_loc (MSG_NOTE, vect_location,
5030 "removing SLP instance operations starting "
5031 "from: %G", root->stmt);
5032 vect_free_slp_instance (instance);
5033 vinfo->slp_instances.ordered_remove (i);
5035 else
5036 ++i;
5039 /* Compute vectorizable live stmts. */
5040 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5042 hash_set<stmt_vec_info> svisited;
5043 hash_set<slp_tree> visited;
5044 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5046 vect_location = instance->location ();
5047 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5048 instance, &instance->cost_vec, svisited,
5049 visited);
5053 return !vinfo->slp_instances.is_empty ();
5056 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
5057 closing any chain of leaders on the way. */
5059 static slp_instance
5060 get_ultimate_leader (slp_instance instance,
5061 hash_map<slp_instance, slp_instance> &instance_leader)
5063 auto_vec<slp_instance *, 8> chain;
5064 slp_instance *tem;
5065 while (*(tem = instance_leader.get (instance)) != instance)
5067 chain.safe_push (tem);
5068 instance = *tem;
5070 while (!chain.is_empty ())
5071 *chain.pop () = instance;
5072 return instance;
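/* For example, if INSTANCE_LEADER currently maps A -> B, B -> C and
   C -> C, then get_ultimate_leader (A, ...) returns C and rewrites the
   entries for A and B to point directly at C, keeping later lookups
   cheap.  */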
5075 /* Worker of vect_bb_partition_graph, recurse on NODE. */
5077 static void
5078 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5079 slp_instance instance, slp_tree node,
5080 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5081 hash_map<slp_instance, slp_instance> &instance_leader,
5082 hash_set<slp_tree> &visited)
5084 stmt_vec_info stmt_info;
5085 unsigned i;
5087 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5089 bool existed_p;
5090 slp_instance &stmt_instance
5091 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5092 if (!existed_p)
5094 else if (stmt_instance != instance)
5096 /* If we're running into a previously marked stmt make us the
5097 leader of the stmt's current ultimate leader. This keeps the
5098 leader chain acyclic and works even when the current instance
5099 connects two previously independent graph parts. */
5100 slp_instance stmt_leader
5101 = get_ultimate_leader (stmt_instance, instance_leader);
5102 if (stmt_leader != instance)
5103 instance_leader.put (stmt_leader, instance);
5105 stmt_instance = instance;
5108 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5109 return;
5111 slp_tree child;
5112 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5113 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5114 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5115 instance_leader, visited);
5118 /* Partition the SLP graph into pieces that can be costed independently. */
5120 static void
5121 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5123 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5125 /* First walk the SLP graph assigning each involved scalar stmt a
5126 corresponding SLP graph entry and upon visiting a previously
5127 marked stmt, make the stmt's leader the current SLP graph entry. */
5128 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5129 hash_map<slp_instance, slp_instance> instance_leader;
5130 hash_set<slp_tree> visited;
5131 slp_instance instance;
5132 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5134 instance_leader.put (instance, instance);
5135 vect_bb_partition_graph_r (bb_vinfo,
5136 instance, SLP_INSTANCE_TREE (instance),
5137 stmt_to_instance, instance_leader,
5138 visited);
5141 /* Then collect entries to each independent subgraph. */
5142 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5144 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5145 leader->subgraph_entries.safe_push (instance);
5146 if (dump_enabled_p ()
5147 && leader != instance)
5148 dump_printf_loc (MSG_NOTE, vect_location,
5149 "instance %p is leader of %p\n",
5150 leader, instance);
5154 /* Compute the set of scalar stmts participating in internal and external
5155 nodes. */
5157 static void
5158 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5159 hash_set<slp_tree> &visited,
5160 hash_set<stmt_vec_info> &vstmts,
5161 hash_set<stmt_vec_info> &estmts)
5163 int i;
5164 stmt_vec_info stmt_info;
5165 slp_tree child;
5167 if (visited.add (node))
5168 return;
5170 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5172 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5173 vstmts.add (stmt_info);
5175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5176 if (child)
5177 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5178 vstmts, estmts);
5180 else
5181 for (tree def : SLP_TREE_SCALAR_OPS (node))
5183 stmt_vec_info def_stmt = vinfo->lookup_def (def);
5184 if (def_stmt)
5185 estmts.add (def_stmt);
5190 /* Compute the scalar cost of the SLP node NODE and its children
5191 and record it in COST_VEC. Do not account defs that are marked in LIFE and
5192 update LIFE according to uses of NODE. */
5194 static void
5195 vect_bb_slp_scalar_cost (vec_info *vinfo,
5196 slp_tree node, vec<bool, va_heap> *life,
5197 stmt_vector_for_cost *cost_vec,
5198 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5199 hash_set<slp_tree> &visited)
5201 unsigned i;
5202 stmt_vec_info stmt_info;
5203 slp_tree child;
5205 if (visited.add (node))
5206 return;
5208 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5210 ssa_op_iter op_iter;
5211 def_operand_p def_p;
5213 if ((*life)[i])
5214 continue;
5216 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5217 gimple *orig_stmt = orig_stmt_info->stmt;
5219 /* If there is a non-vectorized use of the defs then the scalar
5220 stmt is kept live in which case we do not account it or any
5221 required defs in the SLP children in the scalar cost. This
5222 way we make the vectorization more costly when compared to
5223 the scalar cost. */
5224 if (!STMT_VINFO_LIVE_P (stmt_info))
5226 auto_vec<gimple *, 8> worklist;
5227 hash_set<gimple *> *worklist_visited = NULL;
5228 worklist.quick_push (orig_stmt);
5231 gimple *work_stmt = worklist.pop ();
5232 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
5234 imm_use_iterator use_iter;
5235 gimple *use_stmt;
5236 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
5237 DEF_FROM_PTR (def_p))
5238 if (!is_gimple_debug (use_stmt))
5240 stmt_vec_info use_stmt_info
5241 = vinfo->lookup_stmt (use_stmt);
5242 if (!use_stmt_info
5243 || !vectorized_scalar_stmts.contains (use_stmt_info))
5245 if (use_stmt_info
5246 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
5248 /* For stmts participating in patterns we have
5249 to check their uses recursively. */
5250 if (!worklist_visited)
5251 worklist_visited = new hash_set<gimple *> ();
5252 if (!worklist_visited->add (use_stmt))
5253 worklist.safe_push (use_stmt);
5254 continue;
5256 (*life)[i] = true;
5257 goto next_lane;
5262 while (!worklist.is_empty ());
5263 next_lane:
5264 if (worklist_visited)
5265 delete worklist_visited;
5266 if ((*life)[i])
5267 continue;
5270 /* Count scalar stmts only once. */
5271 if (gimple_visited_p (orig_stmt))
5272 continue;
5273 gimple_set_visited (orig_stmt, true);
5275 vect_cost_for_stmt kind;
5276 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5278 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5279 kind = scalar_load;
5280 else
5281 kind = scalar_store;
5283 else if (vect_nop_conversion_p (orig_stmt_info))
5284 continue;
5285 /* For single-argument PHIs assume coalescing which means zero cost
5286 for the scalar and the vector PHIs. This avoids artificially
5287 favoring the vector path (but may pessimize it in some cases). */
5288 else if (is_a <gphi *> (orig_stmt_info->stmt)
5289 && gimple_phi_num_args
5290 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5291 continue;
5292 else
5293 kind = scalar_stmt;
5294 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5295 SLP_TREE_VECTYPE (node), 0, vect_body);
5298 auto_vec<bool, 20> subtree_life;
5299 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5301 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5303 /* Do not directly pass LIFE to the recursive call, copy it to
5304 confine changes in the callee to the current child/subtree. */
5305 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5307 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5308 for (unsigned j = 0;
5309 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5311 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5312 if (perm.first == i)
5313 subtree_life[perm.second] = (*life)[j];
5316 else
5318 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5319 subtree_life.safe_splice (*life);
5321 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5322 vectorized_scalar_stmts, visited);
5323 subtree_life.truncate (0);
5328 /* Comparator for the loop-index sorted cost vectors. */
5330 static int
5331 li_cost_vec_cmp (const void *a_, const void *b_)
5333 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5334 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5335 if (a->first < b->first)
5336 return -1;
5337 else if (a->first == b->first)
5338 return 0;
5339 return 1;
5342 /* Check if vectorization of the basic block is profitable for the
5343 subgraph denoted by SLP_INSTANCES. */
5345 static bool
5346 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5347 vec<slp_instance> slp_instances,
5348 loop_p orig_loop)
5350 slp_instance instance;
5351 int i;
5352 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5353 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5355 if (dump_enabled_p ())
5357 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5358 hash_set<slp_tree> visited;
5359 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5360 vect_print_slp_graph (MSG_NOTE, vect_location,
5361 SLP_INSTANCE_TREE (instance), visited);
5364 /* Compute the set of scalar stmts we know will go away 'locally' when
5365 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5366 not accurate for nodes promoted extern late or for scalar stmts that
5367 are used both in extern defs and in vectorized defs. */
5368 hash_set<stmt_vec_info> vectorized_scalar_stmts;
5369 hash_set<stmt_vec_info> scalar_stmts_in_externs;
5370 hash_set<slp_tree> visited;
5371 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5373 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5374 SLP_INSTANCE_TREE (instance),
5375 visited,
5376 vectorized_scalar_stmts,
5377 scalar_stmts_in_externs);
5378 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5379 vectorized_scalar_stmts.add (rstmt);
5381 /* Scalar stmts used as defs in external nodes need to be preserved, so
5382 remove them from vectorized_scalar_stmts. */
5383 for (stmt_vec_info stmt : scalar_stmts_in_externs)
5384 vectorized_scalar_stmts.remove (stmt);
5386 /* Calculate scalar cost and sum the cost for the vector stmts
5387 previously collected. */
5388 stmt_vector_for_cost scalar_costs = vNULL;
5389 stmt_vector_for_cost vector_costs = vNULL;
5390 visited.empty ();
5391 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5393 auto_vec<bool, 20> life;
5394 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5395 true);
5396 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5397 record_stmt_cost (&scalar_costs,
5398 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5399 scalar_stmt,
5400 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5401 vect_bb_slp_scalar_cost (bb_vinfo,
5402 SLP_INSTANCE_TREE (instance),
5403 &life, &scalar_costs, vectorized_scalar_stmts,
5404 visited);
5405 vector_costs.safe_splice (instance->cost_vec);
5406 instance->cost_vec.release ();
5409 if (dump_enabled_p ())
5410 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5412 /* When costing non-loop vectorization we need to consider each covered
5413 loop independently and make sure vectorization is profitable. For
5414 now we assume a loop may not be entered or may be executed an arbitrary
5415 number of iterations (??? static information can provide more
5416 precise info here) which means we can simply cost each containing
5417 loop's stmts separately. */
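/* Illustration (added for exposition, roughly what the code below does):
   if a subgraph spans stmts in loop 1 and loop 2, the scalar and vector
   costs for loop 1 are compared independently of those for loop 2, and
   the subgraph is only kept profitable if no loop part ends up with a
   vector cost exceeding its scalar cost.  */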
5419 /* First produce cost vectors sorted by loop index. */
5420 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5421 li_scalar_costs (scalar_costs.length ());
5422 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5423 li_vector_costs (vector_costs.length ());
5424 stmt_info_for_cost *cost;
5425 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5427 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5428 li_scalar_costs.quick_push (std::make_pair (l, cost));
5430 /* Use the loop of the first scalar cost entry as fallback in case the
5431 first vector_costs entry does not have a stmt_info associated with it. */
5432 unsigned l = li_scalar_costs[0].first;
5433 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5435 /* We inherit the loop from the previous COST; invariants, externals and
5436 extracts immediately follow the cost for the related stmt. */
5437 if (cost->stmt_info)
5438 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5439 li_vector_costs.quick_push (std::make_pair (l, cost));
5441 li_scalar_costs.qsort (li_cost_vec_cmp);
5442 li_vector_costs.qsort (li_cost_vec_cmp);
5444 /* Now cost the portions individually. */
5445 unsigned vi = 0;
5446 unsigned si = 0;
5447 bool profitable = true;
5448 while (si < li_scalar_costs.length ()
5449 && vi < li_vector_costs.length ())
5451 unsigned sl = li_scalar_costs[si].first;
5452 unsigned vl = li_vector_costs[vi].first;
5453 if (sl != vl)
5455 if (dump_enabled_p ())
5456 dump_printf_loc (MSG_NOTE, vect_location,
5457 "Scalar %d and vector %d loop part do not "
5458 "match up, skipping scalar part\n", sl, vl);
5459 /* Skip the scalar part, assuming zero cost on the vector side. */
5462 si++;
5464 while (si < li_scalar_costs.length ()
5465 && li_scalar_costs[si].first == sl);
5466 continue;
5469 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5472 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5473 si++;
5475 while (si < li_scalar_costs.length ()
5476 && li_scalar_costs[si].first == sl);
5477 unsigned dummy;
5478 finish_cost (scalar_target_cost_data, nullptr,
5479 &dummy, &scalar_cost, &dummy);
5481 /* Complete the target-specific vector cost calculation. */
5482 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5485 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5486 vi++;
5488 while (vi < li_vector_costs.length ()
5489 && li_vector_costs[vi].first == vl);
5490 finish_cost (vect_target_cost_data, scalar_target_cost_data,
5491 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5492 delete scalar_target_cost_data;
5493 delete vect_target_cost_data;
5495 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5497 if (dump_enabled_p ())
5499 dump_printf_loc (MSG_NOTE, vect_location,
5500 "Cost model analysis for part in loop %d:\n", sl);
5501 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5502 vec_inside_cost + vec_outside_cost);
5503 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5506 /* Vectorization is profitable if its cost is less than or equal to the
5507 cost of the scalar version. Note that we err on the vector side for
5508 equal cost because the cost estimate is otherwise quite pessimistic
5509 (constant uses are free on the scalar side but cost a load on the
5510 vector side, for example). */
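/* For example (illustration added for exposition): with vec_inside_cost
   == 4, vec_outside_cost == 2 and scalar_cost == 6 the costs tie and the
   subgraph part is still considered profitable; only a strictly larger
   vector cost makes us give up.  */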
5511 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5513 profitable = false;
5514 break;
5517 if (profitable && vi < li_vector_costs.length ())
5519 if (dump_enabled_p ())
5520 dump_printf_loc (MSG_NOTE, vect_location,
5521 "Excess vector cost for part in loop %d:\n",
5522 li_vector_costs[vi].first);
5523 profitable = false;
5526 /* Unset visited flag. This is delayed when the subgraph is profitable
5527 and we process the loop for remaining unvectorized if-converted code. */
5528 if (!orig_loop || !profitable)
5529 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5530 gimple_set_visited (cost->stmt_info->stmt, false);
5532 scalar_costs.release ();
5533 vector_costs.release ();
5535 return profitable;
5538 /* qsort comparator for lane defs. */
5540 static int
5541 vld_cmp (const void *a_, const void *b_)
5543 auto *a = (const std::pair<unsigned, tree> *)a_;
5544 auto *b = (const std::pair<unsigned, tree> *)b_;
5545 return a->first - b->first;
5548 /* Return true if USE_STMT is a vector lane insert into VEC and set
5549 *THIS_LANE to the lane number that is inserted. */
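/* For instance (illustration added for exposition), given a V4SF vector
   _1, the statement

     _2 = BIT_INSERT_EXPR <_1, x_3(D), 64>;

   inserts x_3 at bit position 64, i.e. 64 / 32 == lane 2, so *THIS_LANE
   is set to 2 and true is returned.  */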
5551 static bool
5552 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5554 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5555 if (!use_ass
5556 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5557 || (vec
5558 ? gimple_assign_rhs1 (use_ass) != vec
5559 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5560 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5561 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5562 || !constant_multiple_p
5563 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5564 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5565 this_lane))
5566 return false;
5567 return true;
5570 /* Find any vectorizable constructors and add them to the grouped_store
5571 array. */
5573 static void
5574 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5576 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5577 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5578 !gsi_end_p (gsi); gsi_next (&gsi))
5580 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5581 if (!assign)
5582 continue;
5584 tree rhs = gimple_assign_rhs1 (assign);
5585 enum tree_code code = gimple_assign_rhs_code (assign);
5586 use_operand_p use_p;
5587 gimple *use_stmt;
5588 if (code == CONSTRUCTOR)
5590 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5591 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5592 CONSTRUCTOR_NELTS (rhs))
5593 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5594 || uniform_vector_p (rhs))
5595 continue;
5597 unsigned j;
5598 tree val;
5599 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5600 if (TREE_CODE (val) != SSA_NAME
5601 || !bb_vinfo->lookup_def (val))
5602 break;
5603 if (j != CONSTRUCTOR_NELTS (rhs))
5604 continue;
5606 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5607 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5609 else if (code == BIT_INSERT_EXPR
5610 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5611 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5612 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5613 && integer_zerop (gimple_assign_rhs3 (assign))
5614 && useless_type_conversion_p
5615 (TREE_TYPE (TREE_TYPE (rhs)),
5616 TREE_TYPE (gimple_assign_rhs2 (assign)))
5617 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5619 /* We start to match on insert to lane zero but since the
5620 inserts need not be ordered we'd have to search both
5621 the def and the use chains. */
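/* Illustration (added for exposition, assuming a_3 and b_5 are defined
   inside the analyzed region): for a V2SF vector built as

     v1_2 = BIT_INSERT_EXPR <v0_1(D), a_3, 0>;
     v2_4 = BIT_INSERT_EXPR <v1_2, b_5, 32>;

   we match on the lane-zero insert v1_2, walk its single use to collect
   lane 1 from v2_4, and record the last insert as the instance root.  */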
5622 tree vectype = TREE_TYPE (rhs);
5623 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5624 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5625 auto_sbitmap lanes (nlanes);
5626 bitmap_clear (lanes);
5627 bitmap_set_bit (lanes, 0);
5628 tree def = gimple_assign_lhs (assign);
5629 lane_defs.quick_push
5630 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5631 unsigned lanes_found = 1;
5632 /* Start with the use chains, the last stmt will be the root. */
5633 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5634 vec<stmt_vec_info> roots = vNULL;
5635 roots.safe_push (last);
5638 use_operand_p use_p;
5639 gimple *use_stmt;
5640 if (!single_imm_use (def, &use_p, &use_stmt))
5641 break;
5642 unsigned this_lane;
5643 if (!bb_vinfo->lookup_stmt (use_stmt)
5644 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5645 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5646 break;
5647 if (bitmap_bit_p (lanes, this_lane))
5648 break;
5649 lanes_found++;
5650 bitmap_set_bit (lanes, this_lane);
5651 gassign *use_ass = as_a <gassign *> (use_stmt);
5652 lane_defs.quick_push (std::make_pair
5653 (this_lane, gimple_assign_rhs2 (use_ass)));
5654 last = bb_vinfo->lookup_stmt (use_ass);
5655 roots.safe_push (last);
5656 def = gimple_assign_lhs (use_ass);
5658 while (lanes_found < nlanes);
5659 if (roots.length () > 1)
5660 std::swap(roots[0], roots[roots.length () - 1]);
5661 if (lanes_found < nlanes)
5663 /* Now search the def chain. */
5664 def = gimple_assign_rhs1 (assign);
5667 if (TREE_CODE (def) != SSA_NAME
5668 || !has_single_use (def))
5669 break;
5670 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5671 unsigned this_lane;
5672 if (!bb_vinfo->lookup_stmt (def_stmt)
5673 || !vect_slp_is_lane_insert (def_stmt,
5674 NULL_TREE, &this_lane)
5675 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5676 break;
5677 if (bitmap_bit_p (lanes, this_lane))
5678 break;
5679 lanes_found++;
5680 bitmap_set_bit (lanes, this_lane);
5681 lane_defs.quick_push (std::make_pair
5682 (this_lane,
5683 gimple_assign_rhs2 (def_stmt)));
5684 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5685 def = gimple_assign_rhs1 (def_stmt);
5687 while (lanes_found < nlanes);
5689 if (lanes_found == nlanes)
5691 /* Sort lane_defs by lane index and register the root. */
5692 lane_defs.qsort (vld_cmp);
5693 vec<stmt_vec_info> stmts;
5694 stmts.create (nlanes);
5695 for (unsigned i = 0; i < nlanes; ++i)
5696 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5697 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5698 stmts, roots));
5700 else
5701 roots.release ();
5703 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5704 && (associative_tree_code (code) || code == MINUS_EXPR)
5705 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5706 checks pessimize a two-element reduction. PR54400.
5707 ??? In-order reduction could be handled if we only
5708 traverse one operand chain in vect_slp_linearize_chain. */
5709 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5710 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5711 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5712 /* Ops with constants at the tail can be stripped here. */
5713 && TREE_CODE (rhs) == SSA_NAME
5714 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5715 /* Should be the chain end. */
5716 && (!single_imm_use (gimple_assign_lhs (assign),
5717 &use_p, &use_stmt)
5718 || !is_gimple_assign (use_stmt)
5719 || (gimple_assign_rhs_code (use_stmt) != code
5720 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5721 || (gimple_assign_rhs_code (use_stmt)
5722 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5724 /* We start the match at the end of a possible association
5725 chain. */
5726 auto_vec<chain_op_t> chain;
5727 auto_vec<std::pair<tree_code, gimple *> > worklist;
5728 auto_vec<gimple *> chain_stmts;
5729 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5730 if (code == MINUS_EXPR)
5731 code = PLUS_EXPR;
5732 internal_fn reduc_fn;
5733 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5734 || reduc_fn == IFN_LAST)
5735 continue;
5736 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5737 /* ??? */
5738 code_stmt, alt_code_stmt, &chain_stmts);
5739 if (chain.length () > 1)
5741 /* Sort the chain according to def_type and operation. */
5742 chain.sort (dt_sort_cmp, bb_vinfo);
5743 /* ??? Now we'd want to strip externals and constants
5744 but record those to be handled in the epilogue. */
5745 /* ??? For now do not allow mixing ops or externs/constants. */
5746 bool invalid = false;
5747 for (unsigned i = 0; i < chain.length (); ++i)
5748 if (chain[i].dt != vect_internal_def
5749 || chain[i].code != code)
5750 invalid = true;
5751 if (!invalid)
5753 vec<stmt_vec_info> stmts;
5754 stmts.create (chain.length ());
5755 for (unsigned i = 0; i < chain.length (); ++i)
5756 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5757 vec<stmt_vec_info> roots;
5758 roots.create (chain_stmts.length ());
5759 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5760 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5761 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5762 stmts, roots));
5769 /* Walk the grouped store chains and replace entries with their
5770 pattern variant if any. */
5772 static void
5773 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5775 stmt_vec_info first_element;
5776 unsigned i;
5778 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5780 /* We also have CTORs in this array. */
5781 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5782 continue;
5783 if (STMT_VINFO_IN_PATTERN_P (first_element))
5785 stmt_vec_info orig = first_element;
5786 first_element = STMT_VINFO_RELATED_STMT (first_element);
5787 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5788 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5789 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5790 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5791 vinfo->grouped_stores[i] = first_element;
5793 stmt_vec_info prev = first_element;
5794 while (DR_GROUP_NEXT_ELEMENT (prev))
5796 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5797 if (STMT_VINFO_IN_PATTERN_P (elt))
5799 stmt_vec_info orig = elt;
5800 elt = STMT_VINFO_RELATED_STMT (elt);
5801 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5802 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5803 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5805 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5806 prev = elt;
5811 /* Check if the region described by BB_VINFO can be vectorized, returning
5812 true if so. When returning false, set FATAL to true if the same failure
5813 would prevent vectorization at other vector sizes, false if it is still
5814 worth trying other sizes. N_STMTS is the number of statements in the
5815 region. */
5817 static bool
5818 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5819 vec<int> *dataref_groups)
5821 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5823 slp_instance instance;
5824 int i;
5825 poly_uint64 min_vf = 2;
5827 /* The first group of checks is independent of the vector size. */
5828 fatal = true;
5830 /* Analyze the data references. */
5832 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5834 if (dump_enabled_p ())
5835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5836 "not vectorized: unhandled data-ref in basic "
5837 "block.\n");
5838 return false;
5841 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5843 if (dump_enabled_p ())
5844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5845 "not vectorized: unhandled data access in "
5846 "basic block.\n");
5847 return false;
5850 vect_slp_check_for_constructors (bb_vinfo);
5852 /* If there are no grouped stores and no constructors in the region
5853 there is no need to continue with pattern recog as vect_analyze_slp
5854 will fail anyway. */
5855 if (bb_vinfo->grouped_stores.is_empty ()
5856 && bb_vinfo->roots.is_empty ())
5858 if (dump_enabled_p ())
5859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5860 "not vectorized: no grouped stores in "
5861 "basic block.\n");
5862 return false;
5865 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
5866 fatal = false;
5868 vect_pattern_recog (bb_vinfo);
5870 /* Update store groups from pattern processing. */
5871 vect_fixup_store_groups_with_patterns (bb_vinfo);
5873 /* Check the SLP opportunities in the basic block, analyze and build SLP
5874 trees. */
5875 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5877 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5880 "Failed to SLP the basic block.\n");
5881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5882 "not vectorized: failed to find SLP opportunities "
5883 "in basic block.\n");
5885 return false;
5888 /* Optimize permutations. */
5889 vect_optimize_slp (bb_vinfo);
5891 /* Gather the loads reachable from the SLP graph entries. */
5892 vect_gather_slp_loads (bb_vinfo);
5894 vect_record_base_alignments (bb_vinfo);
5896 /* Analyze and verify the alignment of data references and the
5897 dependence in the SLP instances. */
5898 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5900 vect_location = instance->location ();
5901 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5902 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5904 slp_tree node = SLP_INSTANCE_TREE (instance);
5905 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5906 if (dump_enabled_p ())
5907 dump_printf_loc (MSG_NOTE, vect_location,
5908 "removing SLP instance operations starting from: %G",
5909 stmt_info->stmt);
5910 vect_free_slp_instance (instance);
5911 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5912 continue;
5915 /* Mark all the statements that we want to vectorize as pure SLP and
5916 relevant. */
5917 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5918 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5919 unsigned j;
5920 stmt_vec_info root;
5921 /* Likewise consider instance root stmts as vectorized. */
5922 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5923 STMT_SLP_TYPE (root) = pure_slp;
5925 i++;
5927 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5928 return false;
5930 if (!vect_slp_analyze_operations (bb_vinfo))
5932 if (dump_enabled_p ())
5933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5934 "not vectorized: bad operation in basic block.\n");
5935 return false;
5938 vect_bb_partition_graph (bb_vinfo);
5940 return true;
5943 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
5944 basic blocks in BBS, returning true on success.
5945 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5947 static bool
5948 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5949 vec<int> *dataref_groups, unsigned int n_stmts,
5950 loop_p orig_loop)
5952 bb_vec_info bb_vinfo;
5953 auto_vector_modes vector_modes;
5955 /* Autodetect first vector size we try. */
5956 machine_mode next_vector_mode = VOIDmode;
5957 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5958 unsigned int mode_i = 0;
5960 vec_info_shared shared;
5962 machine_mode autodetected_vector_mode = VOIDmode;
5963 while (1)
5965 bool vectorized = false;
5966 bool fatal = false;
5967 bb_vinfo = new _bb_vec_info (bbs, &shared);
5969 bool first_time_p = shared.datarefs.is_empty ();
5970 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5971 if (first_time_p)
5972 bb_vinfo->shared->save_datarefs ();
5973 else
5974 bb_vinfo->shared->check_datarefs ();
5975 bb_vinfo->vector_mode = next_vector_mode;
5977 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5979 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE, vect_location,
5982 "***** Analysis succeeded with vector mode"
5983 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5984 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5987 bb_vinfo->shared->check_datarefs ();
5989 auto_vec<slp_instance> profitable_subgraphs;
5990 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
5992 if (instance->subgraph_entries.is_empty ())
5993 continue;
5995 vect_location = instance->location ();
5996 if (!unlimited_cost_model (NULL)
5997 && !vect_bb_vectorization_profitable_p
5998 (bb_vinfo, instance->subgraph_entries, orig_loop))
6000 if (dump_enabled_p ())
6001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6002 "not vectorized: vectorization is not "
6003 "profitable.\n");
6004 continue;
6007 if (!dbg_cnt (vect_slp))
6008 continue;
6010 profitable_subgraphs.safe_push (instance);
6013 /* When we're vectorizing an if-converted loop body make sure
6014 we vectorized all if-converted code. */
6015 if (!profitable_subgraphs.is_empty ()
6016 && orig_loop)
6018 gcc_assert (bb_vinfo->bbs.length () == 1);
6019 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
6020 !gsi_end_p (gsi); gsi_next (&gsi))
6022 /* The costing above left us with DCEable vectorized scalar
6023 stmts having the visited flag set on profitable
6024 subgraphs. Do the delayed clearing of the flag here. */
6025 if (gimple_visited_p (gsi_stmt (gsi)))
6027 gimple_set_visited (gsi_stmt (gsi), false);
6028 continue;
6030 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
6031 continue;
6033 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
6034 if (gimple_assign_rhs_code (ass) == COND_EXPR)
6036 if (!profitable_subgraphs.is_empty ()
6037 && dump_enabled_p ())
6038 dump_printf_loc (MSG_NOTE, vect_location,
6039 "not profitable because of "
6040 "unprofitable if-converted scalar "
6041 "code\n");
6042 profitable_subgraphs.truncate (0);
6047 /* Finally schedule the profitable subgraphs. */
6048 for (slp_instance instance : profitable_subgraphs)
6050 if (!vectorized && dump_enabled_p ())
6051 dump_printf_loc (MSG_NOTE, vect_location,
6052 "Basic block will be vectorized "
6053 "using SLP\n");
6054 vectorized = true;
6056 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
6058 unsigned HOST_WIDE_INT bytes;
6059 if (dump_enabled_p ())
6061 if (GET_MODE_SIZE
6062 (bb_vinfo->vector_mode).is_constant (&bytes))
6063 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6064 "basic block part vectorized using %wu "
6065 "byte vectors\n", bytes);
6066 else
6067 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6068 "basic block part vectorized using "
6069 "variable length vectors\n");
6073 else
6075 if (dump_enabled_p ())
6076 dump_printf_loc (MSG_NOTE, vect_location,
6077 "***** Analysis failed with vector mode %s\n",
6078 GET_MODE_NAME (bb_vinfo->vector_mode));
6081 if (mode_i == 0)
6082 autodetected_vector_mode = bb_vinfo->vector_mode;
6084 if (!fatal)
6085 while (mode_i < vector_modes.length ()
6086 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6088 if (dump_enabled_p ())
6089 dump_printf_loc (MSG_NOTE, vect_location,
6090 "***** The result for vector mode %s would"
6091 " be the same\n",
6092 GET_MODE_NAME (vector_modes[mode_i]));
6093 mode_i += 1;
6096 delete bb_vinfo;
6098 if (mode_i < vector_modes.length ()
6099 && VECTOR_MODE_P (autodetected_vector_mode)
6100 && (related_vector_mode (vector_modes[mode_i],
6101 GET_MODE_INNER (autodetected_vector_mode))
6102 == autodetected_vector_mode)
6103 && (related_vector_mode (autodetected_vector_mode,
6104 GET_MODE_INNER (vector_modes[mode_i]))
6105 == vector_modes[mode_i]))
6107 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_NOTE, vect_location,
6109 "***** Skipping vector mode %s, which would"
6110 " repeat the analysis for %s\n",
6111 GET_MODE_NAME (vector_modes[mode_i]),
6112 GET_MODE_NAME (autodetected_vector_mode));
6113 mode_i += 1;
6116 if (vectorized
6117 || mode_i == vector_modes.length ()
6118 || autodetected_vector_mode == VOIDmode
6119 /* If vect_slp_analyze_bb_1 signaled that analysis for all
6120 vector sizes will fail do not bother iterating. */
6121 || fatal)
6122 return vectorized;
6124 /* Try the next biggest vector size. */
6125 next_vector_mode = vector_modes[mode_i++];
6126 if (dump_enabled_p ())
6127 dump_printf_loc (MSG_NOTE, vect_location,
6128 "***** Re-trying analysis with vector mode %s\n",
6129 GET_MODE_NAME (next_vector_mode));
6134 /* Main entry for the BB vectorizer. Analyze and transform the basic
6135 blocks in BBS, returning true if anything in them was vectorized. */
6137 static bool
6138 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6140 vec<data_reference_p> datarefs = vNULL;
6141 auto_vec<int> dataref_groups;
6142 int insns = 0;
6143 int current_group = 0;
6145 for (unsigned i = 0; i < bbs.length (); i++)
6147 basic_block bb = bbs[i];
6148 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6149 gsi_next (&gsi))
6151 gimple *stmt = gsi_stmt (gsi);
6152 if (is_gimple_debug (stmt))
6153 continue;
6155 insns++;
6157 if (gimple_location (stmt) != UNKNOWN_LOCATION)
6158 vect_location = stmt;
6160 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6161 &dataref_groups, current_group))
6162 ++current_group;
6164 /* New BBs always start a new DR group. */
6165 ++current_group;
6168 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6171 /* Special entry for the BB vectorizer. Analyze and transform a single
6172 if-converted BB with ORIG_LOOP's body being the not-if-converted
6173 representation. Returns true if anything in the basic-block was
6174 vectorized. */
6176 bool
6177 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6179 auto_vec<basic_block> bbs;
6180 bbs.safe_push (bb);
6181 return vect_slp_bbs (bbs, orig_loop);
6184 /* Main entry for the BB vectorizer. Analyze and transform the function
6185 FUN, returning true if anything in it was vectorized. */
6187 bool
6188 vect_slp_function (function *fun)
6190 bool r = false;
6191 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6192 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6194 /* For the moment split the function into pieces to avoid making
6195 the iteration on the vector mode moot. Split at points we know
6196 to not handle well, which are CFG merges (SLP discovery doesn't
6197 handle non-loop-header PHIs) and loop exits. Since pattern
6198 recog requires reverse iteration to visit uses before defs
6199 simply chop RPO into pieces. */
6200 auto_vec<basic_block> bbs;
6201 for (unsigned i = 0; i < n; i++)
6203 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6204 bool split = false;
6206 /* Split when a BB is not dominated by the first block. */
6207 if (!bbs.is_empty ()
6208 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 "splitting region at dominance boundary bb%d\n",
6213 bb->index);
6214 split = true;
6216 /* Split when the loop determined by the first block
6217 is exited. This is because we eventually insert
6218 invariants at region begin. */
6219 else if (!bbs.is_empty ()
6220 && bbs[0]->loop_father != bb->loop_father
6221 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6225 "splitting region at loop %d exit at bb%d\n",
6226 bbs[0]->loop_father->num, bb->index);
6227 split = true;
6230 if (split && !bbs.is_empty ())
6232 r |= vect_slp_bbs (bbs, NULL);
6233 bbs.truncate (0);
6234 bbs.quick_push (bb);
6236 else
6237 bbs.safe_push (bb);
6239 /* When a stmt ends this block and defines a value, inserting a
6240 vector containing that definition after it would require inserting
6241 on edges. Avoid this for now. */
6242 if (gimple *last = last_stmt (bb))
6243 if (gimple_get_lhs (last)
6244 && is_ctrl_altering_stmt (last))
6246 if (dump_enabled_p ())
6247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6248 "splitting region at control altering "
6249 "definition %G", last);
6250 r |= vect_slp_bbs (bbs, NULL);
6251 bbs.truncate (0);
6255 if (!bbs.is_empty ())
6256 r |= vect_slp_bbs (bbs, NULL);
6258 free (rpo);
6260 return r;
6263 /* Build a variable-length vector in which the elements in ELTS are repeated
6264 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6265 RESULTS and add any new instructions to SEQ.
6267 The approach we use is:
6269 (1) Find a vector mode VM with integer elements of mode IM.
6271 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6272 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6273 from small vectors to IM.
6275 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6277 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6278 correct byte contents.
6280 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6282 We try to find the largest IM for which this sequence works, in order
6283 to cut down on the number of interleaves. */
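/* A sketch (added for exposition; the modes actually chosen depend on what
   can_duplicate_and_interleave_p reports for the target): to repeat four
   16-bit elements {a, b, c, d} we might get NVECTORS == 2 with IM == SImode,
   view-convert the two-element pieces {a, b} and {c, d} to SImode scalars,
   duplicate each into a variable-length SImode vector, and interleave the
   two duplicates to get the byte pattern abcdabcd..., which is finally
   VIEW_CONVERT_EXPRed to the requested vector type.  */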
6285 void
6286 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6287 const vec<tree> &elts, unsigned int nresults,
6288 vec<tree> &results)
6290 unsigned int nelts = elts.length ();
6291 tree element_type = TREE_TYPE (vector_type);
6293 /* (1) Find a vector mode VM with integer elements of mode IM. */
6294 unsigned int nvectors = 1;
6295 tree new_vector_type;
6296 tree permutes[2];
6297 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6298 &nvectors, &new_vector_type,
6299 permutes))
6300 gcc_unreachable ();
6302 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6303 unsigned int partial_nelts = nelts / nvectors;
6304 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6306 tree_vector_builder partial_elts;
6307 auto_vec<tree, 32> pieces (nvectors * 2);
6308 pieces.quick_grow_cleared (nvectors * 2);
6309 for (unsigned int i = 0; i < nvectors; ++i)
6311 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6312 ELTS' has mode IM. */
6313 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6314 for (unsigned int j = 0; j < partial_nelts; ++j)
6315 partial_elts.quick_push (elts[i * partial_nelts + j]);
6316 tree t = gimple_build_vector (seq, &partial_elts);
6317 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6318 TREE_TYPE (new_vector_type), t);
6320 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6321 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6324 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6325 correct byte contents.
6327 Conceptually, we need to repeat the following operation log2(nvectors)
6328 times, where hi_start = nvectors / 2:
6330 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6331 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6333 However, if each input repeats every N elements and the VF is
6334 a multiple of N * 2, the HI result is the same as the LO result.
6335 This will be true for the first N1 iterations of the outer loop,
6336 followed by N2 iterations for which both the LO and HI results
6337 are needed. I.e.:
6339 N1 + N2 = log2(nvectors)
6341 Each "N1 iteration" doubles the number of redundant vectors and the
6342 effect of the process as a whole is to have a sequence of nvectors/2**N1
6343 vectors that repeats 2**N1 times. Rather than generate these redundant
6344 vectors, we halve the number of vectors for each N1 iteration. */
6345 unsigned int in_start = 0;
6346 unsigned int out_start = nvectors;
6347 unsigned int new_nvectors = nvectors;
6348 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6350 unsigned int hi_start = new_nvectors / 2;
6351 unsigned int out_i = 0;
6352 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6354 if ((in_i & 1) != 0
6355 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6356 2 * in_repeat))
6357 continue;
6359 tree output = make_ssa_name (new_vector_type);
6360 tree input1 = pieces[in_start + (in_i / 2)];
6361 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6362 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6363 input1, input2,
6364 permutes[in_i & 1]);
6365 gimple_seq_add_stmt (seq, stmt);
6366 pieces[out_start + out_i] = output;
6367 out_i += 1;
6369 std::swap (in_start, out_start);
6370 new_nvectors = out_i;
6373 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6374 results.reserve (nresults);
6375 for (unsigned int i = 0; i < nresults; ++i)
6376 if (i < new_nvectors)
6377 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6378 pieces[in_start + i]));
6379 else
6380 results.quick_push (results[i - new_nvectors]);
6384 /* For constant and loop invariant defs in OP_NODE this function creates
6385 vector defs that will be used in the vectorized stmts and stores them
6386 to SLP_TREE_VEC_DEFS of OP_NODE. */
6388 static void
6389 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6391 unsigned HOST_WIDE_INT nunits;
6392 tree vec_cst;
6393 unsigned j, number_of_places_left_in_vector;
6394 tree vector_type;
6395 tree vop;
6396 int group_size = op_node->ops.length ();
6397 unsigned int vec_num, i;
6398 unsigned number_of_copies = 1;
6399 bool constant_p;
6400 gimple_seq ctor_seq = NULL;
6401 auto_vec<tree, 16> permute_results;
6403 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6404 vector_type = SLP_TREE_VECTYPE (op_node);
6406 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6407 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6408 auto_vec<tree> voprnds (number_of_vectors);
6410 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6411 created vectors. It is greater than 1 if unrolling is performed.
6413 For example, we have two scalar operands, s1 and s2 (e.g., group of
6414 strided accesses of size two), while NUNITS is four (i.e., four scalars
6415 of this type can be packed in a vector). The output vector will contain
6416 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6417 will be 2).
6419 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6420 containing the operands.
6422 For example, NUNITS is four as before, and the group size is 8
6423 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6424 {s5, s6, s7, s8}. */
6426 /* When using duplicate_and_interleave, we just need one element for
6427 each scalar statement. */
6428 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6429 nunits = group_size;
6431 number_of_copies = nunits * number_of_vectors / group_size;
6433 number_of_places_left_in_vector = nunits;
6434 constant_p = true;
6435 tree_vector_builder elts (vector_type, nunits, 1);
6436 elts.quick_grow (nunits);
6437 stmt_vec_info insert_after = NULL;
6438 for (j = 0; j < number_of_copies; j++)
6440 tree op;
6441 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6443 /* Create 'vect_ = {op0,op1,...,opn}'. */
6444 number_of_places_left_in_vector--;
6445 tree orig_op = op;
6446 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6448 if (CONSTANT_CLASS_P (op))
6450 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6452 /* Can't use VIEW_CONVERT_EXPR for booleans because
6453 of possibly different sizes of scalar value and
6454 vector element. */
6455 if (integer_zerop (op))
6456 op = build_int_cst (TREE_TYPE (vector_type), 0);
6457 else if (integer_onep (op))
6458 op = build_all_ones_cst (TREE_TYPE (vector_type));
6459 else
6460 gcc_unreachable ();
6462 else
6463 op = fold_unary (VIEW_CONVERT_EXPR,
6464 TREE_TYPE (vector_type), op);
6465 gcc_assert (op && CONSTANT_CLASS_P (op));
6467 else
6469 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6470 gimple *init_stmt;
6471 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6473 tree true_val
6474 = build_all_ones_cst (TREE_TYPE (vector_type));
6475 tree false_val
6476 = build_zero_cst (TREE_TYPE (vector_type));
6477 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6478 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6479 op, true_val,
6480 false_val);
6482 else
6484 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6485 op);
6486 init_stmt
6487 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6488 op);
6490 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6491 op = new_temp;
6494 elts[number_of_places_left_in_vector] = op;
6495 if (!CONSTANT_CLASS_P (op))
6496 constant_p = false;
6497 /* For BB vectorization we have to compute an insert location
6498 when a def is inside the analyzed region since we cannot
6499 simply insert at the BB start in this case. */
6500 stmt_vec_info opdef;
6501 if (TREE_CODE (orig_op) == SSA_NAME
6502 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6503 && is_a <bb_vec_info> (vinfo)
6504 && (opdef = vinfo->lookup_def (orig_op)))
6506 if (!insert_after)
6507 insert_after = opdef;
6508 else
6509 insert_after = get_later_stmt (insert_after, opdef);
6512 if (number_of_places_left_in_vector == 0)
6514 if (constant_p
6515 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6516 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6517 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6518 else
6520 if (permute_results.is_empty ())
6521 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6522 elts, number_of_vectors,
6523 permute_results);
6524 vec_cst = permute_results[number_of_vectors - j - 1];
6526 if (!gimple_seq_empty_p (ctor_seq))
6528 if (insert_after)
6530 gimple_stmt_iterator gsi;
6531 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6533 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6534 gsi_insert_seq_before (&gsi, ctor_seq,
6535 GSI_CONTINUE_LINKING);
6537 else if (!stmt_ends_bb_p (insert_after->stmt))
6539 gsi = gsi_for_stmt (insert_after->stmt);
6540 gsi_insert_seq_after (&gsi, ctor_seq,
6541 GSI_CONTINUE_LINKING);
6543 else
6545 /* When we want to insert after a def where the
6546 defining stmt throws then insert on the fallthru
6547 edge. */
6548 edge e = find_fallthru_edge
6549 (gimple_bb (insert_after->stmt)->succs);
6550 basic_block new_bb
6551 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6552 gcc_assert (!new_bb);
6555 else
6556 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6557 ctor_seq = NULL;
6559 voprnds.quick_push (vec_cst);
6560 insert_after = NULL;
6561 number_of_places_left_in_vector = nunits;
6562 constant_p = true;
6563 elts.new_vector (vector_type, nunits, 1);
6564 elts.quick_grow (nunits);
6569 /* Since the vectors are created in the reverse order, we should invert
6570 them. */
6571 vec_num = voprnds.length ();
6572 for (j = vec_num; j != 0; j--)
6574 vop = voprnds[j - 1];
6575 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6578 /* In case that VF is greater than the unrolling factor needed for the SLP
6579 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6580 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6581 to replicate the vectors. */
6582 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6583 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6584 i++)
6585 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6588 /* Get the Ith vectorized definition from SLP_NODE. */
6590 tree
6591 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6593 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6594 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6595 else
6596 return SLP_TREE_VEC_DEFS (slp_node)[i];
6599 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6601 void
6602 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6604 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6605 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6607 unsigned j;
6608 gimple *vec_def_stmt;
6609 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6610 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6612 else
6613 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6616 /* Get N vectorized definitions for SLP_NODE. */
6618 void
6619 vect_get_slp_defs (vec_info *,
6620 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6622 if (n == -1U)
6623 n = SLP_TREE_CHILDREN (slp_node).length ();
6625 for (unsigned i = 0; i < n; ++i)
6627 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6628 vec<tree> vec_defs = vNULL;
6629 vect_get_slp_defs (child, &vec_defs);
6630 vec_oprnds->quick_push (vec_defs);
6634 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6635 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6636 permute statements for the SLP node NODE. Store the number of vector
6637 permute instructions in *N_PERMS and the number of vector load
6638 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6639 that were not needed. */
6641 bool
6642 vect_transform_slp_perm_load (vec_info *vinfo,
6643 slp_tree node, const vec<tree> &dr_chain,
6644 gimple_stmt_iterator *gsi, poly_uint64 vf,
6645 bool analyze_only, unsigned *n_perms,
6646 unsigned int *n_loads, bool dce_chain)
6648 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6649 int vec_index = 0;
6650 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6651 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6652 unsigned int mask_element;
6653 machine_mode mode;
6655 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6656 return false;
6658 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6660 mode = TYPE_MODE (vectype);
6661 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6663 /* Initialize the vect stmts of NODE to properly insert the generated
6664 stmts later. */
6665 if (! analyze_only)
6666 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6667 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6668 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6670 /* Generate permutation masks for every NODE. Number of masks for each NODE
6671 is equal to GROUP_SIZE.
6672 E.g., we have a group of three nodes with three loads from the same
6673 location in each node, and the vector size is 4. I.e., we have an
6674 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6675 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6676 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6679 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6680 The last mask is illegal since we assume two operands for permute
6681 operation, and the mask element values can't be outside that range.
6682 Hence, the last mask must be converted into {2,5,5,5}.
6683 For the first two permutations we need the first and the second input
6684 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6685 we need the second and the third vectors: {b1,c1,a2,b2} and
6686 {c2,a3,b3,c3}. */
6688 int vect_stmts_counter = 0;
6689 unsigned int index = 0;
6690 int first_vec_index = -1;
6691 int second_vec_index = -1;
6692 bool noop_p = true;
6693 *n_perms = 0;
6695 vec_perm_builder mask;
6696 unsigned int nelts_to_build;
6697 unsigned int nvectors_per_build;
6698 unsigned int in_nlanes;
6699 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6700 && multiple_p (nunits, group_size));
6701 if (repeating_p)
6703 /* A single vector contains a whole number of copies of the node, so:
6704 (a) all permutes can use the same mask; and
6705 (b) the permutes only need a single vector input. */
6706 mask.new_vector (nunits, group_size, 3);
6707 nelts_to_build = mask.encoded_nelts ();
6708 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6709 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6711 else
6713 /* We need to construct a separate mask for each vector statement. */
6714 unsigned HOST_WIDE_INT const_nunits, const_vf;
6715 if (!nunits.is_constant (&const_nunits)
6716 || !vf.is_constant (&const_vf))
6717 return false;
6718 mask.new_vector (const_nunits, const_nunits, 1);
6719 nelts_to_build = const_vf * group_size;
6720 nvectors_per_build = 1;
6721 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6723 auto_sbitmap used_in_lanes (in_nlanes);
6724 bitmap_clear (used_in_lanes);
6725 auto_bitmap used_defs;
6727 unsigned int count = mask.encoded_nelts ();
6728 mask.quick_grow (count);
6729 vec_perm_indices indices;
6731 for (unsigned int j = 0; j < nelts_to_build; j++)
6733 unsigned int iter_num = j / group_size;
6734 unsigned int stmt_num = j % group_size;
6735 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6736 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6737 bitmap_set_bit (used_in_lanes, i);
6738 if (repeating_p)
6740 first_vec_index = 0;
6741 mask_element = i;
6743 else
6745 /* Enforced before the loop when !repeating_p. */
6746 unsigned int const_nunits = nunits.to_constant ();
6747 vec_index = i / const_nunits;
6748 mask_element = i % const_nunits;
6749 if (vec_index == first_vec_index
6750 || first_vec_index == -1)
6752 first_vec_index = vec_index;
6754 else if (vec_index == second_vec_index
6755 || second_vec_index == -1)
6757 second_vec_index = vec_index;
6758 mask_element += const_nunits;
6760 else
6762 if (dump_enabled_p ())
6763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6764 "permutation requires at "
6765 "least three vectors %G",
6766 stmt_info->stmt);
6767 gcc_assert (analyze_only);
6768 return false;
6771 gcc_assert (mask_element < 2 * const_nunits);
6774 if (mask_element != index)
6775 noop_p = false;
6776 mask[index++] = mask_element;
6778 if (index == count && !noop_p)
6780 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6781 if (!can_vec_perm_const_p (mode, mode, indices))
6783 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6786 vect_location,
6787 "unsupported vect permute { ");
6788 for (i = 0; i < count; ++i)
6790 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6791 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6793 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6795 gcc_assert (analyze_only);
6796 return false;
6799 ++*n_perms;
6802 if (index == count)
6804 if (!analyze_only)
6806 tree mask_vec = NULL_TREE;
6808 if (! noop_p)
6809 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6811 if (second_vec_index == -1)
6812 second_vec_index = first_vec_index;
6814 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6816 /* Generate the permute statement if necessary. */
6817 tree first_vec = dr_chain[first_vec_index + ri];
6818 tree second_vec = dr_chain[second_vec_index + ri];
6819 gimple *perm_stmt;
6820 if (! noop_p)
6822 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6823 tree perm_dest
6824 = vect_create_destination_var (gimple_assign_lhs (stmt),
6825 vectype);
6826 perm_dest = make_ssa_name (perm_dest);
6827 perm_stmt
6828 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6829 first_vec, second_vec,
6830 mask_vec);
6831 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6832 gsi);
6833 if (dce_chain)
6835 bitmap_set_bit (used_defs, first_vec_index + ri);
6836 bitmap_set_bit (used_defs, second_vec_index + ri);
6839 else
6841 /* If mask was NULL_TREE generate the requested
6842 identity transform. */
6843 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6844 if (dce_chain)
6845 bitmap_set_bit (used_defs, first_vec_index + ri);
6848 /* Store the vector statement in NODE. */
6849 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6853 index = 0;
6854 first_vec_index = -1;
6855 second_vec_index = -1;
6856 noop_p = true;
6860 if (n_loads)
6862 if (repeating_p)
6863 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6864 else
6866 /* Enforced above when !repeating_p. */
6867 unsigned int const_nunits = nunits.to_constant ();
6868 *n_loads = 0;
6869 bool load_seen = false;
6870 for (unsigned i = 0; i < in_nlanes; ++i)
6872 if (i % const_nunits == 0)
6874 if (load_seen)
6875 *n_loads += 1;
6876 load_seen = false;
6878 if (bitmap_bit_p (used_in_lanes, i))
6879 load_seen = true;
6881 if (load_seen)
6882 *n_loads += 1;
6886 if (dce_chain)
6887 for (unsigned i = 0; i < dr_chain.length (); ++i)
6888 if (!bitmap_bit_p (used_defs, i))
6890 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6891 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6892 gsi_remove (&rgsi, true);
6893 release_defs (stmt);
6896 return true;
6899 /* Produce the next vector result for SLP permutation NODE by adding a vector
6900 statement at GSI. If MASK_VEC is nonnull, add:
6902 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6904 otherwise add:
6906 <new SSA name> = FIRST_DEF. */
6908 static void
6909 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6910 slp_tree node, tree first_def, tree second_def,
6911 tree mask_vec)
6913 tree vectype = SLP_TREE_VECTYPE (node);
6915 /* ??? We SLP match existing vector element extracts but
6916 allow punning which we need to re-instantiate at uses
6917 but have no good way of explicitly representing. */
6918 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
6919 && !types_compatible_p (TREE_TYPE (first_def), vectype))
6921 gassign *conv_stmt
6922 = gimple_build_assign (make_ssa_name (vectype),
6923 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6924 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6925 first_def = gimple_assign_lhs (conv_stmt);
6927 gassign *perm_stmt;
6928 tree perm_dest = make_ssa_name (vectype);
6929 if (mask_vec)
6931 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
6932 TYPE_SIZE (vectype))
6933 && !types_compatible_p (TREE_TYPE (second_def), vectype))
6935 gassign *conv_stmt
6936 = gimple_build_assign (make_ssa_name (vectype),
6937 build1 (VIEW_CONVERT_EXPR,
6938 vectype, second_def));
6939 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6940 second_def = gimple_assign_lhs (conv_stmt);
6942 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6943 first_def, second_def,
6944 mask_vec);
6946 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6948 /* For identity permutes we still need to handle the case
6949 of lowpart extracts or concats. */
6950 unsigned HOST_WIDE_INT c;
6951 auto first_def_nunits
6952 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
6953 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
6955 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
6956 TYPE_SIZE (vectype), bitsize_zero_node);
6957 perm_stmt = gimple_build_assign (perm_dest, lowpart);
6959 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
6960 first_def_nunits, &c) && c == 2)
6962 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
6963 NULL_TREE, second_def);
6964 perm_stmt = gimple_build_assign (perm_dest, ctor);
6966 else
6967 gcc_unreachable ();
6969 else
6971 /* We need a copy here in case the def was external. */
6972 perm_stmt = gimple_build_assign (perm_dest, first_def);
6974 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6975 /* Store the vector statement in NODE. */
6976 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6979 /* Vectorize the SLP permutations in NODE as specified
6980 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6981 child number and lane number.
6982 Interleaving of two two-lane two-child SLP subtrees (not supported):
6983 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6984 A blend of two four-lane two-child SLP subtrees:
6985 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6986 Highpart of a four-lane one-child SLP subtree (not supported):
6987 [ { 0, 2 }, { 0, 3 } ]
6988 Currently only a subset of these is supported by the code generation below. */
6990 static bool
6991 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6992 slp_tree node, stmt_vector_for_cost *cost_vec)
6994 tree vectype = SLP_TREE_VECTYPE (node);
6996 /* ??? We currently only support all same vector input types
6997 while the SLP IL should really do a concat + select and thus accept
6998 arbitrary mismatches. */
6999 slp_tree child;
7000 unsigned i;
7001 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7002 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
7003 tree op_vectype = NULL_TREE;
7004 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7005 if (SLP_TREE_VECTYPE (child))
7007 op_vectype = SLP_TREE_VECTYPE (child);
7008 break;
7010 if (!op_vectype)
7011 op_vectype = vectype;
7012 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7014 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
7015 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
7016 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
7017 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
7019 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7021 "Unsupported vector types in lane permutation\n");
7022 return false;
7024 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
7025 repeating_p = false;
7028 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
7029 gcc_assert (perm.length () == SLP_TREE_LANES (node));
7030 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_NOTE, vect_location,
7033 "vectorizing permutation");
7034 for (unsigned i = 0; i < perm.length (); ++i)
7035 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
7036 if (repeating_p)
7037 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
7038 dump_printf (MSG_NOTE, "\n");
7041 /* REPEATING_P is true if every output vector is guaranteed to use the
7042 same permute vector. We can handle that case for both variable-length
7043 and constant-length vectors, but we only handle other cases for
7044 constant-length vectors.
7046 Set:
7048 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
7049 mask vector that we want to build.
7051 - NCOPIES to the number of copies of PERM that we need in order
7052 to build the necessary permute mask vectors.
7054 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
7055 for each permute mask vector. This is only relevant when GSI is
7056 nonnull. */
7057 uint64_t npatterns;
7058 unsigned nelts_per_pattern;
7059 uint64_t ncopies;
7060 unsigned noutputs_per_mask;
7061 if (repeating_p)
7063 /* We need a single permute mask vector that has the form:
7065 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
7067 In other words, the original n-element permute in PERM is
7068 "unrolled" to fill a full vector. The stepped vector encoding
7069 that we use for permutes requires 3n elements. */
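/* For example (illustration added for exposition): a two-lane swap
   { 1, 0 } is encoded with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 as
   the stepped series { 1, 0, 3, 2, 5, 4 }, which the target expands to
   the full (possibly variable) vector length.  */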
7070 npatterns = SLP_TREE_LANES (node);
7071 nelts_per_pattern = ncopies = 3;
7072 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7074 else
7076 /* Calculate every element of every permute mask vector explicitly,
7077 instead of relying on the pattern described above. */
7078 if (!nunits.is_constant (&npatterns))
7079 return false;
7080 nelts_per_pattern = ncopies = 1;
7081 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
7082 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
7083 return false;
7084 noutputs_per_mask = 1;
7086 unsigned olanes = ncopies * SLP_TREE_LANES (node);
7087 gcc_assert (repeating_p || multiple_p (olanes, nunits));
7089 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
7090 from the { SLP operand, scalar lane } permutation as recorded in the
7091 SLP node as an intermediate step. This part should already work
7092 with SLP children with an arbitrary number of lanes. */
7093 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
7094 auto_vec<unsigned> active_lane;
7095 vperm.create (olanes);
7096 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
7097 for (unsigned i = 0; i < ncopies; ++i)
7099 for (unsigned pi = 0; pi < perm.length (); ++pi)
7101 std::pair<unsigned, unsigned> p = perm[pi];
7102 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
7103 if (repeating_p)
7104 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
7105 else
7107 /* We checked above that the vectors are constant-length. */
7108 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
7109 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
7110 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
7111 vperm.quick_push ({{p.first, vi}, vl});
7114 /* Advance to the next group. */
7115 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
7116 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
7119 if (dump_enabled_p ())
7121 dump_printf_loc (MSG_NOTE, vect_location, "as");
7122 for (unsigned i = 0; i < vperm.length (); ++i)
7124 if (i != 0
7125 && (repeating_p
7126 ? multiple_p (i, npatterns)
7127 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
7128 dump_printf (MSG_NOTE, ",");
7129 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
7130 vperm[i].first.first, vperm[i].first.second,
7131 vperm[i].second);
7133 dump_printf (MSG_NOTE, "\n");
7136 /* We can only handle two-vector permutes; everything else should
7137 be lowered on the SLP level. The following is closely inspired
7138 by vect_transform_slp_perm_load and is supposed to eventually
7139 replace it.
7140 ??? As an intermediate step, do code-gen in the SLP tree
7141 representation somehow? */
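/* For illustration: VEC_PERM_EXPR addresses lanes of the second input as
   NUNITS + lane, so selecting lane 2 of the second vector of a V4SI permute
   yields mask element 6; that is what the mask_element += nunits adjustment
   below implements.  */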
7142 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
7143 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
7144 unsigned int index = 0;
7145 poly_uint64 mask_element;
7146 vec_perm_builder mask;
7147 mask.new_vector (nunits, npatterns, nelts_per_pattern);
7148 unsigned int count = mask.encoded_nelts ();
7149 mask.quick_grow (count);
7150 vec_perm_indices indices;
7151 unsigned nperms = 0;
7152 for (unsigned i = 0; i < vperm.length (); ++i)
7154 mask_element = vperm[i].second;
7155 if (first_vec.first == -1U
7156 || first_vec == vperm[i].first)
7157 first_vec = vperm[i].first;
7158 else if (second_vec.first == -1U
7159 || second_vec == vperm[i].first)
7161 second_vec = vperm[i].first;
7162 mask_element += nunits;
7164 else
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168 "permutation requires at "
7169 "least three vectors\n");
7170 gcc_assert (!gsi);
7171 return false;
7174 mask[index++] = mask_element;
7176 if (index == count)
7178 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
7179 TYPE_VECTOR_SUBPARTS (op_vectype));
7180 bool identity_p = indices.series_p (0, 1, 0, 1);
7181 machine_mode vmode = TYPE_MODE (vectype);
7182 machine_mode op_vmode = TYPE_MODE (op_vectype);
7183 unsigned HOST_WIDE_INT c;
7184 if ((!identity_p
7185 && !can_vec_perm_const_p (vmode, op_vmode, indices))
7186 || (identity_p
7187 && !known_le (nunits,
7188 TYPE_VECTOR_SUBPARTS (op_vectype))
7189 && (!constant_multiple_p (nunits,
7190 TYPE_VECTOR_SUBPARTS (op_vectype),
7191 &c) || c != 2)))
7193 if (dump_enabled_p ())
7195 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7196 vect_location,
7197 "unsupported vect permute { ");
7198 for (i = 0; i < count; ++i)
7200 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7201 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7203 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7205 gcc_assert (!gsi);
7206 return false;
7209 if (!identity_p)
7210 nperms++;
7211 if (gsi)
7213 if (second_vec.first == -1U)
7214 second_vec = first_vec;
7216 slp_tree
7217 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7218 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7220 tree mask_vec = NULL_TREE;
7221 if (!identity_p)
7222 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7224 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7226 tree first_def
7227 = vect_get_slp_vect_def (first_node,
7228 first_vec.second + vi);
7229 tree second_def
7230 = vect_get_slp_vect_def (second_node,
7231 second_vec.second + vi);
7232 vect_add_slp_permutation (vinfo, gsi, node, first_def,
7233 second_def, mask_vec);
7237 index = 0;
7238 first_vec = std::make_pair (-1U, -1U);
7239 second_vec = std::make_pair (-1U, -1U);
7243 if (!gsi)
7244 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
7246 return true;
7249 /* Vectorize SLP NODE. */
7251 static void
7252 vect_schedule_slp_node (vec_info *vinfo,
7253 slp_tree node, slp_instance instance)
7255 gimple_stmt_iterator si;
7256 int i;
7257 slp_tree child;
7259 /* For existing vectors there's nothing to do. */
7260 if (SLP_TREE_VEC_DEFS (node).exists ())
7261 return;
7263 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
7265 /* Vectorize externals and constants. */
7266 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7267 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7269 /* ??? vectorizable_shift can end up using a scalar operand which is
7270 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7271 node in this case. */
7272 if (!SLP_TREE_VECTYPE (node))
7273 return;
7275 vect_create_constant_vectors (vinfo, node);
7276 return;
7279 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7281 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7282 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7284 if (dump_enabled_p ())
7285 dump_printf_loc (MSG_NOTE, vect_location,
7286 "------>vectorizing SLP node starting from: %G",
7287 stmt_info->stmt);
7289 if (STMT_VINFO_DATA_REF (stmt_info)
7290 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7292 /* Vectorized loads go before the first scalar load to make it
7293 ready early; vectorized stores go before the last scalar
7294 stmt, which is where all uses are ready. */
7295 stmt_vec_info last_stmt_info = NULL;
7296 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7297 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7298 else /* DR_IS_WRITE */
7299 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7300 si = gsi_for_stmt (last_stmt_info->stmt);
7302 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7303 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7304 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7305 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7307 /* For PHI node vectorization we do not use the insertion iterator. */
7308 si = gsi_none ();
7310 else
7312 /* Emit other stmts after the children's vectorized defs, which is
7313 the earliest possible insertion point. */
7314 gimple *last_stmt = NULL;
7315 bool seen_vector_def = false;
7316 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7317 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7319 /* For fold-left reductions we are retaining the scalar
7320 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
7321 set, so the representation isn't perfect. Resort to the
7322 last scalar def here. */
7323 if (SLP_TREE_VEC_STMTS (child).is_empty ())
7325 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7326 == cycle_phi_info_type);
7327 gphi *phi = as_a <gphi *>
7328 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7329 if (!last_stmt
7330 || vect_stmt_dominates_stmt_p (last_stmt, phi))
7331 last_stmt = phi;
7333 /* We are emitting all vectorized stmts in the same place, so
7334 the last one in SLP_TREE_VEC_STMTS is also the last one in the IL.
7335 ??? Unless we have a load permutation applied and that
7336 figures to re-use an earlier generated load. */
7337 unsigned j;
7338 gimple *vstmt;
7339 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7340 if (!last_stmt
7341 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7342 last_stmt = vstmt;
7344 else if (!SLP_TREE_VECTYPE (child))
7346 /* For externals without a vector type we use the unvectorized scalar defs. */
7347 unsigned j;
7348 tree def;
7349 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7350 if (TREE_CODE (def) == SSA_NAME
7351 && !SSA_NAME_IS_DEFAULT_DEF (def))
7353 gimple *stmt = SSA_NAME_DEF_STMT (def);
7354 if (!last_stmt
7355 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7356 last_stmt = stmt;
7359 else
7361 /* For externals we have to look at all defs since their
7362 insertion place is decided per vector. But beware
7363 of pre-existing vectors where we need to make sure
7364 we do not insert before the region boundary. */
7365 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7366 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7367 seen_vector_def = true;
7368 else
7370 unsigned j;
7371 tree vdef;
7372 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7373 if (TREE_CODE (vdef) == SSA_NAME
7374 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7376 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7377 if (!last_stmt
7378 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7379 last_stmt = vstmt;
7383 /* This can happen when all children are pre-existing vectors or
7384 constants. */
7385 if (!last_stmt)
7386 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7387 if (!last_stmt)
7389 gcc_assert (seen_vector_def);
7390 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7392 else if (is_ctrl_altering_stmt (last_stmt))
7394 /* We split regions to vectorize at control-altering stmts
7395 with a definition, so this must be an external which
7396 we can insert at the start of the region. */
7397 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7399 else if (is_a <bb_vec_info> (vinfo)
7400 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7401 && gimple_could_trap_p (stmt_info->stmt))
7403 /* We've constrained possibly trapping operations to all come
7404 from the same basic-block; even if vectorized defs would allow
7405 earlier scheduling, still force vectorized stmts to the original block.
7406 This is only necessary for BB vectorization since for loop vect
7407 all operations are in a single BB and scalar-stmt-based
7408 placement doesn't play well with epilogue vectorization. */
7409 gcc_assert (dominated_by_p (CDI_DOMINATORS,
7410 gimple_bb (stmt_info->stmt),
7411 gimple_bb (last_stmt)));
7412 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7414 else if (is_a <gphi *> (last_stmt))
7415 si = gsi_after_labels (gimple_bb (last_stmt));
7416 else
7418 si = gsi_for_stmt (last_stmt);
7419 gsi_next (&si);
7423 bool done_p = false;
7425 /* Handle purely internal nodes. */
7426 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7428 /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
7429 be shared with different SLP nodes (but usually it's the same
7430 operation apart from the case where the stmt is only there to denote
7431 the actual scalar lane defs ...). So do not call vect_transform_stmt
7432 but open-code it here (partly). */
7433 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7434 gcc_assert (done);
7435 done_p = true;
7437 if (!done_p)
7438 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7441 /* Replace scalar calls from SLP node NODE with assignments of zero to their lhs.
7442 For loop vectorization this is done in vectorizable_call, but for SLP
7443 it needs to be deferred until the end of vect_schedule_slp, because multiple
7444 SLP instances may refer to the same scalar stmt. */
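/* For illustration: a fully vectorized scalar call such as
   lhs_1 = foo (x_2, y_3) is rewritten by the loop below to lhs_1 = 0;
   the call itself is dropped while the SSA definition of lhs_1 is
   preserved, so the IL stays valid until the dead assignment is cleaned up.  */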
7446 static void
7447 vect_remove_slp_scalar_calls (vec_info *vinfo,
7448 slp_tree node, hash_set<slp_tree> &visited)
7450 gimple *new_stmt;
7451 gimple_stmt_iterator gsi;
7452 int i;
7453 slp_tree child;
7454 tree lhs;
7455 stmt_vec_info stmt_info;
7457 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7458 return;
7460 if (visited.add (node))
7461 return;
7463 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7464 vect_remove_slp_scalar_calls (vinfo, child, visited);
7466 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7468 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7469 if (!stmt || gimple_bb (stmt) == NULL)
7470 continue;
7471 if (is_pattern_stmt_p (stmt_info)
7472 || !PURE_SLP_STMT (stmt_info))
7473 continue;
7474 lhs = gimple_call_lhs (stmt);
7475 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7476 gsi = gsi_for_stmt (stmt);
7477 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7478 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7482 static void
7483 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7485 hash_set<slp_tree> visited;
7486 vect_remove_slp_scalar_calls (vinfo, node, visited);
7489 /* Vectorize the instance root. */
7491 void
7492 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7494 gassign *rstmt = NULL;
7496 if (instance->kind == slp_inst_kind_ctor)
7498 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7500 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7501 tree vect_lhs = gimple_get_lhs (child_stmt);
7502 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7503 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7504 TREE_TYPE (vect_lhs)))
7505 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7506 vect_lhs);
7507 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7509 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7511 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7512 gimple *child_stmt;
7513 int j;
7514 vec<constructor_elt, va_gc> *v;
7515 vec_alloc (v, nelts);
7517 /* A CTOR can handle V16HI composition from VNx8HI, so we
7518 do not need to convert vector elements if the types
7519 do not match. */
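/* For illustration: with two vector stmts whose lhs are v0_1 and v1_2 the
   root stmt x = { a, b, c, ... } is rewritten to x = { v0_1, v1_2 }, a
   CONSTRUCTOR of the root statement's rhs vector type built from the
   vectorized defs.  */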
7520 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7521 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7522 gimple_get_lhs (child_stmt));
7523 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7524 tree rtype
7525 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7526 tree r_constructor = build_constructor (rtype, v);
7527 rstmt = gimple_build_assign (lhs, r_constructor);
7530 else if (instance->kind == slp_inst_kind_bb_reduc)
7532 /* Largely inspired by reduction chain epilogue handling in
7533 vect_create_epilog_for_reduction. */
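/* For illustration: with vector defs v0_1 and v1_2 and a PLUS_EXPR
   reduction the epilogue built below is roughly
     tmp_3 = v0_1 + v1_2;
     scalar_4 = .REDUC_PLUS (tmp_3);
   and the root statement's rhs is then replaced by scalar_4.  */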
7534 vec<tree> vec_defs = vNULL;
7535 vect_get_slp_defs (node, &vec_defs);
7536 enum tree_code reduc_code
7537 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7538 /* ??? We actually have to reflect signs somewhere. */
7539 if (reduc_code == MINUS_EXPR)
7540 reduc_code = PLUS_EXPR;
7541 gimple_seq epilogue = NULL;
7542 /* We may end up with more than one vector result; reduce them
7543 to a single vector. */
7544 tree vec_def = vec_defs[0];
7545 for (unsigned i = 1; i < vec_defs.length (); ++i)
7546 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7547 vec_def, vec_defs[i]);
7548 vec_defs.release ();
7549 /* ??? Support other schemes than direct internal fn. */
7550 internal_fn reduc_fn;
7551 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7552 || reduc_fn == IFN_LAST)
7553 gcc_unreachable ();
7554 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7555 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7557 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7558 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7559 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7560 update_stmt (gsi_stmt (rgsi));
7561 return;
7563 else
7564 gcc_unreachable ();
7566 gcc_assert (rstmt);
7568 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7569 gsi_replace (&rgsi, rstmt, true);
7572 struct slp_scc_info
7574 bool on_stack;
7575 int dfs;
7576 int lowlink;
7579 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
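/* This is essentially Tarjan's SCC algorithm: INFO->dfs is the DFS number,
   INFO->lowlink is propagated from children, nodes stay on STACK while their
   SCC is still open, and a node whose lowlink equals its dfs number is an
   SCC root whose component is then scheduled.  */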
7581 static void
7582 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7583 hash_map<slp_tree, slp_scc_info> &scc_info,
7584 int &maxdfs, vec<slp_tree> &stack)
7586 bool existed_p;
7587 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7588 gcc_assert (!existed_p);
7589 info->dfs = maxdfs;
7590 info->lowlink = maxdfs;
7591 maxdfs++;
7593 /* Leaf. */
7594 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7596 info->on_stack = false;
7597 vect_schedule_slp_node (vinfo, node, instance);
7598 return;
7601 info->on_stack = true;
7602 stack.safe_push (node);
7604 unsigned i;
7605 slp_tree child;
7606 /* DFS recurse. */
7607 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7609 if (!child)
7610 continue;
7611 slp_scc_info *child_info = scc_info.get (child);
7612 if (!child_info)
7614 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7615 /* Recursion might have re-allocated the node. */
7616 info = scc_info.get (node);
7617 child_info = scc_info.get (child);
7618 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7620 else if (child_info->on_stack)
7621 info->lowlink = MIN (info->lowlink, child_info->dfs);
7623 if (info->lowlink != info->dfs)
7624 return;
7626 auto_vec<slp_tree, 4> phis_to_fixup;
7628 /* Singleton. */
7629 if (stack.last () == node)
7631 stack.pop ();
7632 info->on_stack = false;
7633 vect_schedule_slp_node (vinfo, node, instance);
7634 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7635 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7636 phis_to_fixup.quick_push (node);
7638 else
7640 /* SCC. */
7641 int last_idx = stack.length () - 1;
7642 while (stack[last_idx] != node)
7643 last_idx--;
7644 /* We can break the cycle at PHIs that have at least one child
7645 already code-generated. Then we could re-start the DFS walk until
7646 all nodes in the SCC are covered (we might have new entries
7647 for only back-reachable nodes). But it's simpler to just
7648 iterate and schedule those that are ready. */
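/* Readiness criterion used below: a PHI is ready as soon as at least one of
   its children is already scheduled (or is a missing backedge child), while a
   non-PHI is ready only when none of its children is still on the stack.  */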
7649 unsigned todo = stack.length () - last_idx;
7652 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7654 slp_tree entry = stack[idx];
7655 if (!entry)
7656 continue;
7657 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7658 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7659 bool ready = !phi;
7660 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7661 if (!child)
7663 gcc_assert (phi);
7664 ready = true;
7665 break;
7667 else if (scc_info.get (child)->on_stack)
7669 if (!phi)
7671 ready = false;
7672 break;
7675 else
7677 if (phi)
7679 ready = true;
7680 break;
7683 if (ready)
7685 vect_schedule_slp_node (vinfo, entry, instance);
7686 scc_info.get (entry)->on_stack = false;
7687 stack[idx] = NULL;
7688 todo--;
7689 if (phi)
7690 phis_to_fixup.safe_push (entry);
7694 while (todo != 0);
7696 /* Pop the SCC. */
7697 stack.truncate (last_idx);
7700 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7701 slp_tree phi_node;
7702 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7704 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7705 edge_iterator ei;
7706 edge e;
7707 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7709 unsigned dest_idx = e->dest_idx;
7710 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7711 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7712 continue;
7713 /* Simply fill all args. */
7714 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7715 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7716 vect_get_slp_vect_def (child, i),
7717 e, gimple_phi_arg_location (phi, dest_idx));
7722 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7724 void
7725 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7727 slp_instance instance;
7728 unsigned int i;
7730 hash_map<slp_tree, slp_scc_info> scc_info;
7731 int maxdfs = 0;
7732 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7734 slp_tree node = SLP_INSTANCE_TREE (instance);
7735 if (dump_enabled_p ())
7737 dump_printf_loc (MSG_NOTE, vect_location,
7738 "Vectorizing SLP tree:\n");
7739 /* ??? Dump all? */
7740 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7741 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7742 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7743 vect_print_slp_graph (MSG_NOTE, vect_location,
7744 SLP_INSTANCE_TREE (instance));
7746 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
7747 has a PHI be the node breaking the cycle. */
7748 auto_vec<slp_tree> stack;
7749 if (!scc_info.get (node))
7750 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7752 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7753 vectorize_slp_instance_root_stmt (node, instance);
7755 if (dump_enabled_p ())
7756 dump_printf_loc (MSG_NOTE, vect_location,
7757 "vectorizing stmts using SLP.\n");
7760 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7762 slp_tree root = SLP_INSTANCE_TREE (instance);
7763 stmt_vec_info store_info;
7764 unsigned int j;
7766 /* Remove scalar call stmts. Do not do this for basic-block
7767 vectorization as not all uses may be vectorized.
7768 ??? Why should this be necessary? DCE should be able to
7769 remove the stmts by itself.
7770 ??? For BB vectorization we could just as well remove scalar
7771 stmts starting from the SLP tree root if they have no
7772 uses. */
7773 if (is_a <loop_vec_info> (vinfo))
7774 vect_remove_slp_scalar_calls (vinfo, root);
7776 /* Remove the vectorized stores' original scalar stmts. */
7777 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7779 if (!STMT_VINFO_DATA_REF (store_info)
7780 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7781 break;
7783 store_info = vect_orig_stmt (store_info);
7784 /* Free the attached stmt_vec_info and remove the stmt. */
7785 vinfo->remove_stmt (store_info);
7787 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
7788 so that we do not crash in vect_free_slp_tree later. */
7789 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7790 SLP_TREE_REPRESENTATIVE (root) = NULL;