gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
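/* Note: every _slp_tree is allocated from SLP_TREE_POOL and additionally
   chained into a doubly-linked list headed by SLP_FIRST_NODE (maintained by
   the _slp_tree constructor and destructor) so that vect_slp_fini can
   reclaim any nodes that are still live.  */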
60 void
61 vect_slp_init (void)
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
66 void
67 vect_slp_fini (void)
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
75 void *
76 _slp_tree::operator new (size_t n)
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
82 void
83 _slp_tree::operator delete (void *node, size_t n)
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
90 /* Initialize a SLP node. */
92 _slp_tree::_slp_tree ()
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
117 /* Tear down a SLP node. */
119 _slp_tree::~_slp_tree ()
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
140 void
141 vect_free_slp_tree (slp_tree node)
143 int i;
144 slp_tree child;
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
 153   /* If the node defines any SLP-only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
163 delete node;
 166 /* Return a location suitable for dumps related to the SLP instance.  */
168 dump_user_location_t
169 _slp_instance::location () const
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
178 /* Free the memory allocated for the SLP instance. */
180 void
181 vect_free_slp_instance (slp_instance instance)
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
 192 /* Create an SLP node for NOPS children with operation CODE.  */
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
204 /* Create an SLP node for SCALAR_STMTS. */
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
226 /* Create an SLP node for OPS. */
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
237 /* Create an SLP node for OPS. */
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
242 return vect_create_new_slp_node (new _slp_tree, ops);
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
285 return oprnds_info;
289 /* Free operands info. */
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
294 int i;
295 slp_oprnd_info oprnd_info;
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
304 oprnds_info.release ();
308 /* Return true if STMTS contains a pattern statement. */
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 330   /* Pre-existing vectors.  */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
342 return true;
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
367 while (next_stmt_info);
369 return -1;
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull). */
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
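/* For example, for nelts == 8 the selectors constructed below extend to
   sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 } and sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 },
   i.e. the interleave-low and interleave-high permutes of the two
   concatenated input vectors.  */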
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
414 for (unsigned int i = 0; i < 3; ++i)
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
426 if (nvectors_out)
427 *nvectors_out = nvectors;
428 if (vector_type_out)
429 *vector_type_out = vector_type;
430 if (permutes)
432 permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 indices1);
434 permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 indices2);
437 return true;
441 if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 return false;
443 nvectors *= 2;
447 /* Return true if DTA and DTB match. */
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
457 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
458 they are of a valid type and that they match the defs of the first stmt of
459 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
460 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
461 indicates swap is required for cond_expr stmts. Specifically, *SWAP
462 is 1 if STMT is cond and operands of comparison need to be swapped;
463 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
464 If there is any operand swap in this function, *SWAP is set to non-zero
465 value.
466 If there was a fatal error return -1; if the error could be corrected by
467 swapping operands of father node of this one, return 1; if everything is
468 ok return 0. */
469 static int
470 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
471 bool *skip_args,
472 vec<stmt_vec_info> stmts, unsigned stmt_num,
473 vec<slp_oprnd_info> *oprnds_info)
475 stmt_vec_info stmt_info = stmts[stmt_num];
476 tree oprnd;
477 unsigned int i, number_of_oprnds;
478 enum vect_def_type dt = vect_uninitialized_def;
479 slp_oprnd_info oprnd_info;
480 int first_op_idx = 1;
481 unsigned int commutative_op = -1U;
482 bool first_op_cond = false;
483 bool first = stmt_num == 0;
485 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
487 number_of_oprnds = gimple_call_num_args (stmt);
488 first_op_idx = 3;
489 if (gimple_call_internal_p (stmt))
491 internal_fn ifn = gimple_call_internal_fn (stmt);
492 commutative_op = first_commutative_argument (ifn);
494 /* Masked load, only look at mask. */
495 if (ifn == IFN_MASK_LOAD)
497 number_of_oprnds = 1;
498 /* Mask operand index. */
499 first_op_idx = 5;
503 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
505 enum tree_code code = gimple_assign_rhs_code (stmt);
506 number_of_oprnds = gimple_num_ops (stmt) - 1;
507 /* Swap can only be done for cond_expr if asked to, otherwise we
508 could result in different comparison code to the first stmt. */
509 if (code == COND_EXPR
510 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
512 first_op_cond = true;
513 number_of_oprnds++;
515 else
516 commutative_op = commutative_tree_code (code) ? 0U : -1U;
518 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
519 number_of_oprnds = gimple_phi_num_args (stmt);
520 else
521 return -1;
523 bool swapped = (swap != 0);
524 bool backedge = false;
525 gcc_assert (!swapped || first_op_cond);
526 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
527 for (i = 0; i < number_of_oprnds; i++)
529 if (first_op_cond)
531 /* Map indicating how operands of cond_expr should be swapped. */
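/* maps[0] is the identity, maps[1] (SWAP == 1) exchanges the two operands
   of the embedded comparison, and maps[2] (SWAP == 2) exchanges the
   then/else values, which corresponds to inverting the comparison code
   (A < B ? X : Y is equivalent to A >= B ? Y : X).  */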
532 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
533 int *map = maps[swap];
535 if (i < 2)
536 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
537 first_op_idx), map[i]);
538 else
539 oprnd = gimple_op (stmt_info->stmt, map[i]);
541 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
543 oprnd = gimple_phi_arg_def (stmt, i);
544 backedge = dominated_by_p (CDI_DOMINATORS,
545 gimple_phi_arg_edge (stmt, i)->src,
546 gimple_bb (stmt_info->stmt));
548 else
549 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
550 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
551 oprnd = TREE_OPERAND (oprnd, 0);
553 oprnd_info = (*oprnds_info)[i];
555 stmt_vec_info def_stmt_info;
556 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560 "Build SLP failed: can't analyze def for %T\n",
561 oprnd);
563 return -1;
566 if (skip_args[i])
568 oprnd_info->def_stmts.quick_push (NULL);
569 oprnd_info->ops.quick_push (NULL_TREE);
570 oprnd_info->first_dt = vect_uninitialized_def;
571 continue;
574 oprnd_info->def_stmts.quick_push (def_stmt_info);
575 oprnd_info->ops.quick_push (oprnd);
577 if (def_stmt_info
578 && is_pattern_stmt_p (def_stmt_info))
580 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
581 != def_stmt_info)
582 oprnd_info->any_pattern = true;
583 else
584 /* If we promote this to external use the original stmt def. */
585 oprnd_info->ops.last ()
586 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 589       /* If there's an extern def on a backedge make sure we can
590 code-generate at the region start.
591 ??? This is another case that could be fixed by adjusting
592 how we split the function but at the moment we'd have conflicting
593 goals there. */
594 if (backedge
595 && dts[i] == vect_external_def
596 && is_a <bb_vec_info> (vinfo)
597 && TREE_CODE (oprnd) == SSA_NAME
598 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
599 && !dominated_by_p (CDI_DOMINATORS,
600 as_a <bb_vec_info> (vinfo)->bbs[0],
601 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Build SLP failed: extern def %T only defined "
606 "on backedge\n", oprnd);
607 return -1;
610 if (first)
612 tree type = TREE_TYPE (oprnd);
613 dt = dts[i];
614 if ((dt == vect_constant_def
615 || dt == vect_external_def)
616 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
617 && (TREE_CODE (type) == BOOLEAN_TYPE
618 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
619 type)))
621 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 "Build SLP failed: invalid type of def "
624 "for variable-length SLP %T\n", oprnd);
625 return -1;
628 /* For the swapping logic below force vect_reduction_def
629 for the reduction op in a SLP reduction group. */
630 if (!STMT_VINFO_DATA_REF (stmt_info)
631 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
632 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
633 && def_stmt_info)
634 dts[i] = dt = vect_reduction_def;
636 /* Check the types of the definition. */
637 switch (dt)
639 case vect_external_def:
640 case vect_constant_def:
641 case vect_internal_def:
642 case vect_reduction_def:
643 case vect_induction_def:
644 case vect_nested_cycle:
645 break;
647 default:
648 /* FORNOW: Not supported. */
649 if (dump_enabled_p ())
650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
651 "Build SLP failed: illegal type of def %T\n",
652 oprnd);
653 return -1;
656 oprnd_info->first_dt = dt;
657 oprnd_info->first_op_type = type;
660 if (first)
661 return 0;
663 /* Now match the operand definition types to that of the first stmt. */
664 for (i = 0; i < number_of_oprnds;)
666 if (skip_args[i])
668 ++i;
669 continue;
672 oprnd_info = (*oprnds_info)[i];
673 dt = dts[i];
674 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
675 oprnd = oprnd_info->ops[stmt_num];
676 tree type = TREE_TYPE (oprnd);
678 if (!types_compatible_p (oprnd_info->first_op_type, type))
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "Build SLP failed: different operand types\n");
683 return 1;
686 /* Not first stmt of the group, check that the def-stmt/s match
687 the def-stmt/s of the first stmt. Allow different definition
688 types for reduction chains: the first stmt must be a
689 vect_reduction_def (a phi node), and the rest
690 end in the reduction chain. */
691 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
692 && !(oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && def_stmt_info
696 && !STMT_VINFO_DATA_REF (def_stmt_info)
697 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
698 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
699 || (!STMT_VINFO_DATA_REF (stmt_info)
700 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
701 && ((!def_stmt_info
702 || STMT_VINFO_DATA_REF (def_stmt_info)
703 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
704 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
705 != (oprnd_info->first_dt != vect_reduction_def))))
707 /* Try swapping operands if we got a mismatch. For BB
708 vectorization only in case it will clearly improve things. */
709 if (i == commutative_op && !swapped
710 && (!is_a <bb_vec_info> (vinfo)
711 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
712 dts[i+1])
713 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
714 || vect_def_types_match
715 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_NOTE, vect_location,
719 "trying swapped operands\n");
720 std::swap (dts[i], dts[i+1]);
721 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
722 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
723 std::swap ((*oprnds_info)[i]->ops[stmt_num],
724 (*oprnds_info)[i+1]->ops[stmt_num]);
725 swapped = true;
726 continue;
729 if (is_a <bb_vec_info> (vinfo)
730 && !oprnd_info->any_pattern)
732 /* Now for commutative ops we should see whether we can
 733		 make the other operand match.  */
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
736 "treating operand as external\n");
737 oprnd_info->first_dt = dt = vect_external_def;
739 else
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "Build SLP failed: different types\n");
744 return 1;
748 /* Make sure to demote the overall operand to external. */
749 if (dt == vect_external_def)
750 oprnd_info->first_dt = vect_external_def;
751 /* For a SLP reduction chain we want to duplicate the reduction to
752 each of the chain members. That gets us a sane SLP graph (still
753 the stmts are not 100% correct wrt the initial values). */
754 else if ((dt == vect_internal_def
755 || dt == vect_reduction_def)
756 && oprnd_info->first_dt == vect_reduction_def
757 && !STMT_VINFO_DATA_REF (stmt_info)
758 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
759 && !STMT_VINFO_DATA_REF (def_stmt_info)
760 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
761 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
763 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
764 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
767 ++i;
770 /* Swap operands. */
771 if (swapped)
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "swapped operands to match def types in %G",
776 stmt_info->stmt);
779 return 0;
782 /* Return true if call statements CALL1 and CALL2 are similar enough
783 to be combined into the same SLP group. */
785 static bool
786 compatible_calls_p (gcall *call1, gcall *call2)
788 unsigned int nargs = gimple_call_num_args (call1);
789 if (nargs != gimple_call_num_args (call2))
790 return false;
792 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
793 return false;
795 if (gimple_call_internal_p (call1))
797 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
798 TREE_TYPE (gimple_call_lhs (call2))))
799 return false;
800 for (unsigned int i = 0; i < nargs; ++i)
801 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
802 TREE_TYPE (gimple_call_arg (call2, i))))
803 return false;
805 else
807 if (!operand_equal_p (gimple_call_fn (call1),
808 gimple_call_fn (call2), 0))
809 return false;
811 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
812 return false;
814 return true;
817 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
818 caller's attempt to find the vector type in STMT_INFO with the narrowest
819 element type. Return true if VECTYPE is nonnull and if it is valid
820 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
821 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
822 vect_build_slp_tree. */
824 static bool
825 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
826 unsigned int group_size,
827 tree vectype, poly_uint64 *max_nunits)
829 if (!vectype)
831 if (dump_enabled_p ())
832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 "Build SLP failed: unsupported data-type in %G\n",
834 stmt_info->stmt);
835 /* Fatal mismatch. */
836 return false;
839 /* If populating the vector type requires unrolling then fail
840 before adjusting *max_nunits for basic-block vectorization. */
841 if (is_a <bb_vec_info> (vinfo)
842 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
846 "Build SLP failed: unrolling required "
847 "in basic block SLP\n");
848 /* Fatal mismatch. */
849 return false;
852 /* In case of multiple types we need to detect the smallest type. */
853 vect_update_max_nunits (max_nunits, vectype);
854 return true;
 857 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
 858    permutation and are not of unsupported types of operation.  Return
859 true if they are, otherwise return false and indicate in *MATCHES
860 which stmts are not isomorphic to the first one. If MATCHES[0]
861 is false then this indicates the comparison could not be
862 carried out or the stmts will never be vectorized by SLP.
864 Note COND_EXPR is possibly isomorphic to another one after swapping its
865 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
866 the first stmt by swapping the two operands of comparison; set SWAP[i]
 867    to 2 if stmt I is isomorphic to the first stmt by inverting the code
 868    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
869 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
871 static bool
872 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
873 vec<stmt_vec_info> stmts, unsigned int group_size,
874 poly_uint64 *max_nunits, bool *matches,
875 bool *two_operators, tree *node_vectype)
877 unsigned int i;
878 stmt_vec_info first_stmt_info = stmts[0];
879 enum tree_code first_stmt_code = ERROR_MARK;
880 enum tree_code alt_stmt_code = ERROR_MARK;
881 enum tree_code rhs_code = ERROR_MARK;
882 enum tree_code first_cond_code = ERROR_MARK;
883 tree lhs;
884 bool need_same_oprnds = false;
885 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
886 optab optab;
887 int icode;
888 machine_mode optab_op2_mode;
889 machine_mode vec_mode;
890 stmt_vec_info first_load = NULL, prev_first_load = NULL;
891 bool first_stmt_load_p = false, load_p = false;
892 bool first_stmt_phi_p = false, phi_p = false;
893 bool maybe_soft_fail = false;
894 tree soft_fail_nunits_vectype = NULL_TREE;
896 /* For every stmt in NODE find its def stmt/s. */
897 stmt_vec_info stmt_info;
898 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
900 gimple *stmt = stmt_info->stmt;
901 swap[i] = 0;
902 matches[i] = false;
904 if (dump_enabled_p ())
905 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 907       /* Fail to vectorize statements marked as unvectorizable, that can throw
 908	  internally or that have volatile operands.  */
909 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
910 || stmt_can_throw_internal (cfun, stmt)
911 || gimple_has_volatile_ops (stmt))
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: unvectorizable statement %G",
916 stmt);
917 /* ??? For BB vectorization we want to commutate operands in a way
918 to shuffle all unvectorizable defs into one operand and have
919 the other still vectorized. The following doesn't reliably
 920	     work for this, but it's the easiest we can do here.  */
921 if (is_a <bb_vec_info> (vinfo) && i != 0)
922 continue;
923 /* Fatal mismatch. */
924 matches[0] = false;
925 return false;
928 lhs = gimple_get_lhs (stmt);
929 if (lhs == NULL_TREE)
931 if (dump_enabled_p ())
932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
933 "Build SLP failed: not GIMPLE_ASSIGN nor "
934 "GIMPLE_CALL %G", stmt);
935 if (is_a <bb_vec_info> (vinfo) && i != 0)
936 continue;
937 /* Fatal mismatch. */
938 matches[0] = false;
939 return false;
942 tree nunits_vectype;
943 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
944 &nunits_vectype, group_size))
946 if (is_a <bb_vec_info> (vinfo) && i != 0)
947 continue;
948 /* Fatal mismatch. */
949 matches[0] = false;
950 return false;
952 /* Record nunits required but continue analysis, producing matches[]
953 as if nunits was not an issue. This allows splitting of groups
954 to happen. */
955 if (nunits_vectype
956 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
957 nunits_vectype, max_nunits))
959 gcc_assert (is_a <bb_vec_info> (vinfo));
960 maybe_soft_fail = true;
961 soft_fail_nunits_vectype = nunits_vectype;
964 gcc_assert (vectype);
966 gcall *call_stmt = dyn_cast <gcall *> (stmt);
967 if (call_stmt)
969 rhs_code = CALL_EXPR;
971 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
972 load_p = true;
973 else if ((gimple_call_internal_p (call_stmt)
974 && (!vectorizable_internal_fn_p
975 (gimple_call_internal_fn (call_stmt))))
976 || gimple_call_tail_p (call_stmt)
977 || gimple_call_noreturn_p (call_stmt)
978 || !gimple_call_nothrow_p (call_stmt)
979 || gimple_call_chain (call_stmt))
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Build SLP failed: unsupported call type %G",
984 call_stmt);
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
992 else if (gimple_code (stmt) == GIMPLE_PHI)
994 rhs_code = ERROR_MARK;
995 phi_p = true;
997 else
999 rhs_code = gimple_assign_rhs_code (stmt);
1000 load_p = gimple_vuse (stmt);
1003 /* Check the operation. */
1004 if (i == 0)
1006 *node_vectype = vectype;
1007 first_stmt_code = rhs_code;
1008 first_stmt_load_p = load_p;
1009 first_stmt_phi_p = phi_p;
1011 /* Shift arguments should be equal in all the packed stmts for a
1012 vector shift with scalar shift operand. */
1013 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1014 || rhs_code == LROTATE_EXPR
1015 || rhs_code == RROTATE_EXPR)
1017 vec_mode = TYPE_MODE (vectype);
1019 /* First see if we have a vector/vector shift. */
1020 optab = optab_for_tree_code (rhs_code, vectype,
1021 optab_vector);
1023 if (!optab
1024 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1026 /* No vector/vector shift, try for a vector/scalar shift. */
1027 optab = optab_for_tree_code (rhs_code, vectype,
1028 optab_scalar);
1030 if (!optab)
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Build SLP failed: no optab.\n");
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1041 icode = (int) optab_handler (optab, vec_mode);
1042 if (icode == CODE_FOR_nothing)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "Build SLP failed: "
1047 "op not supported by target.\n");
1048 if (is_a <bb_vec_info> (vinfo) && i != 0)
1049 continue;
1050 /* Fatal mismatch. */
1051 matches[0] = false;
1052 return false;
1054 optab_op2_mode = insn_data[icode].operand[2].mode;
1055 if (!VECTOR_MODE_P (optab_op2_mode))
1057 need_same_oprnds = true;
1058 first_op1 = gimple_assign_rhs2 (stmt);
1062 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1064 need_same_oprnds = true;
1065 first_op1 = gimple_assign_rhs2 (stmt);
1067 else if (!load_p
1068 && rhs_code == BIT_FIELD_REF)
1070 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1071 if (!is_a <bb_vec_info> (vinfo)
1072 || TREE_CODE (vec) != SSA_NAME
1073 || !operand_equal_p (TYPE_SIZE (vectype),
1074 TYPE_SIZE (TREE_TYPE (vec))))
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1078 "Build SLP failed: "
1079 "BIT_FIELD_REF not supported\n");
1080 /* Fatal mismatch. */
1081 matches[0] = false;
1082 return false;
1085 else if (call_stmt
1086 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1088 need_same_oprnds = true;
1089 first_op1 = gimple_call_arg (call_stmt, 1);
1092 else
1094 if (first_stmt_code != rhs_code
1095 && alt_stmt_code == ERROR_MARK)
1096 alt_stmt_code = rhs_code;
1097 if ((first_stmt_code != rhs_code
1098 && (first_stmt_code != IMAGPART_EXPR
1099 || rhs_code != REALPART_EXPR)
1100 && (first_stmt_code != REALPART_EXPR
1101 || rhs_code != IMAGPART_EXPR)
1102 /* Handle mismatches in plus/minus by computing both
1103 and merging the results. */
1104 && !((first_stmt_code == PLUS_EXPR
1105 || first_stmt_code == MINUS_EXPR)
1106 && (alt_stmt_code == PLUS_EXPR
1107 || alt_stmt_code == MINUS_EXPR)
1108 && rhs_code == alt_stmt_code)
1109 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110 && (first_stmt_code == ARRAY_REF
1111 || first_stmt_code == BIT_FIELD_REF
1112 || first_stmt_code == INDIRECT_REF
1113 || first_stmt_code == COMPONENT_REF
1114 || first_stmt_code == MEM_REF)))
1115 || first_stmt_load_p != load_p
1116 || first_stmt_phi_p != phi_p)
1118 if (dump_enabled_p ())
1120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1121 "Build SLP failed: different operation "
1122 "in stmt %G", stmt);
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "original stmt %G", first_stmt_info->stmt);
1126 /* Mismatch. */
1127 continue;
1130 if (!load_p
1131 && first_stmt_code == BIT_FIELD_REF
1132 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1133 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1135 if (dump_enabled_p ())
1136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137 "Build SLP failed: different BIT_FIELD_REF "
1138 "arguments in %G", stmt);
1139 /* Mismatch. */
1140 continue;
1143 if (!load_p && rhs_code == CALL_EXPR)
1145 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1146 as_a <gcall *> (stmt)))
1148 if (dump_enabled_p ())
1149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1150 "Build SLP failed: different calls in %G",
1151 stmt);
1152 /* Mismatch. */
1153 continue;
1157 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1158 && (gimple_bb (first_stmt_info->stmt)
1159 != gimple_bb (stmt_info->stmt)))
1161 if (dump_enabled_p ())
1162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1163 "Build SLP failed: different BB for PHI "
1164 "or possibly trapping operation in %G", stmt);
1165 /* Mismatch. */
1166 continue;
1169 if (need_same_oprnds)
1171 tree other_op1 = (call_stmt
1172 ? gimple_call_arg (call_stmt, 1)
1173 : gimple_assign_rhs2 (stmt));
1174 if (!operand_equal_p (first_op1, other_op1, 0))
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "Build SLP failed: different shift "
1179 "arguments in %G", stmt);
1180 /* Mismatch. */
1181 continue;
1185 if (!types_compatible_p (vectype, *node_vectype))
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "Build SLP failed: different vector type "
1190 "in %G", stmt);
1191 /* Mismatch. */
1192 continue;
1196 /* Grouped store or load. */
1197 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1199 if (REFERENCE_CLASS_P (lhs))
1201 /* Store. */
1204 else
1206 /* Load. */
1207 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1208 if (prev_first_load)
1210 /* Check that there are no loads from different interleaving
1211 chains in the same node. */
1212 if (prev_first_load != first_load)
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1216 vect_location,
1217 "Build SLP failed: different "
1218 "interleaving chains in one node %G",
1219 stmt);
1220 /* Mismatch. */
1221 continue;
1224 else
1225 prev_first_load = first_load;
1227 } /* Grouped access. */
1228 else
1230 if (load_p)
1232	      /* Non-grouped load.  */
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: not grouped load %G", stmt);
1237	      /* FORNOW: Non-grouped loads are not supported.  */
1238 if (is_a <bb_vec_info> (vinfo) && i != 0)
1239 continue;
1240 /* Fatal mismatch. */
1241 matches[0] = false;
1242 return false;
1245	  /* Not a memory operation.  */
1246 if (!phi_p
1247 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1248 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1249 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1250 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1251 && rhs_code != VIEW_CONVERT_EXPR
1252 && rhs_code != CALL_EXPR
1253 && rhs_code != BIT_FIELD_REF)
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "Build SLP failed: operation unsupported %G",
1258 stmt);
1259 if (is_a <bb_vec_info> (vinfo) && i != 0)
1260 continue;
1261 /* Fatal mismatch. */
1262 matches[0] = false;
1263 return false;
1266 if (rhs_code == COND_EXPR)
1268 tree cond_expr = gimple_assign_rhs1 (stmt);
1269 enum tree_code cond_code = TREE_CODE (cond_expr);
1270 enum tree_code swap_code = ERROR_MARK;
1271 enum tree_code invert_code = ERROR_MARK;
1273 if (i == 0)
1274 first_cond_code = TREE_CODE (cond_expr);
1275 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1277 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1278 swap_code = swap_tree_comparison (cond_code);
1279 invert_code = invert_tree_comparison (cond_code, honor_nans);
1282 if (first_cond_code == cond_code)
1284 /* Isomorphic can be achieved by swapping. */
1285 else if (first_cond_code == swap_code)
1286 swap[i] = 1;
1287 /* Isomorphic can be achieved by inverting. */
1288 else if (first_cond_code == invert_code)
1289 swap[i] = 2;
1290 else
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1294 "Build SLP failed: different"
1295 " operation %G", stmt);
1296 /* Mismatch. */
1297 continue;
1302 matches[i] = true;
1305 for (i = 0; i < group_size; ++i)
1306 if (!matches[i])
1307 return false;
1309 /* If we allowed a two-operation SLP node verify the target can cope
1310 with the permute we are going to use. */
1311 if (alt_stmt_code != ERROR_MARK
1312 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1314 *two_operators = true;
1317 if (maybe_soft_fail)
1319 unsigned HOST_WIDE_INT const_nunits;
1320 if (!TYPE_VECTOR_SUBPARTS
1321 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1322 || const_nunits > group_size)
1323 matches[0] = false;
1324 else
1326 /* With constant vector elements simulate a mismatch at the
1327 point we need to split. */
1328 unsigned tail = group_size & (const_nunits - 1);
1329 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1331 return false;
1334 return true;
1337 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1338 Note we never remove apart from at destruction time so we do not
1339 need a special value for deleted that differs from empty. */
1340 struct bst_traits
1342 typedef vec <stmt_vec_info> value_type;
1343 typedef vec <stmt_vec_info> compare_type;
1344 static inline hashval_t hash (value_type);
1345 static inline bool equal (value_type existing, value_type candidate);
1346 static inline bool is_empty (value_type x) { return !x.exists (); }
1347 static inline bool is_deleted (value_type x) { return !x.exists (); }
1348 static const bool empty_zero_p = true;
1349 static inline void mark_empty (value_type &x) { x.release (); }
1350 static inline void mark_deleted (value_type &x) { x.release (); }
1351 static inline void remove (value_type &x) { x.release (); }
1353 inline hashval_t
1354 bst_traits::hash (value_type x)
1356 inchash::hash h;
1357 for (unsigned i = 0; i < x.length (); ++i)
1358 h.add_int (gimple_uid (x[i]->stmt));
1359 return h.end ();
1361 inline bool
1362 bst_traits::equal (value_type existing, value_type candidate)
1364 if (existing.length () != candidate.length ())
1365 return false;
1366 for (unsigned i = 0; i < existing.length (); ++i)
1367 if (existing[i] != candidate[i])
1368 return false;
1369 return true;
1372 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1373 but then vec::insert does memmove and that's not compatible with
1374 std::pair. */
1375 struct chain_op_t
1377 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1378 : code (code_), dt (dt_), op (op_) {}
1379 tree_code code;
1380 vect_def_type dt;
1381 tree op;
1384 /* Comparator for sorting associatable chains. */
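/* Sorting by def type first and operation code second lines up matching
   operands across lanes before children are built; the stable sort keeps
   the original order within equal keys.  */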
1386 static int
1387 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1389 auto *op1 = (const chain_op_t *) op1_;
1390 auto *op2 = (const chain_op_t *) op2_;
1391 if (op1->dt != op2->dt)
1392 return (int)op1->dt - (int)op2->dt;
1393 return (int)op1->code - (int)op2->code;
1396 /* Linearize the associatable expression chain at START with the
1397 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1398 filling CHAIN with the result and using WORKLIST as intermediate storage.
1399 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1400 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1401 stmts, starting with START. */
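/* For example, a lane computing (a - b) + c with CODE == PLUS_EXPR is
   flattened into the chain entries +a, -b and +c (the order of the entries
   depends on the traversal of the expression tree).  */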
1403 static void
1404 vect_slp_linearize_chain (vec_info *vinfo,
1405 vec<std::pair<tree_code, gimple *> > &worklist,
1406 vec<chain_op_t> &chain,
1407 enum tree_code code, gimple *start,
1408 gimple *&code_stmt, gimple *&alt_code_stmt,
1409 vec<gimple *> *chain_stmts)
1411 /* For each lane linearize the addition/subtraction (or other
1412 uniform associatable operation) expression tree. */
1413 worklist.safe_push (std::make_pair (code, start));
1414 while (!worklist.is_empty ())
1416 auto entry = worklist.pop ();
1417 gassign *stmt = as_a <gassign *> (entry.second);
1418 enum tree_code in_code = entry.first;
1419 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1420 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1421 if (!code_stmt
1422 && gimple_assign_rhs_code (stmt) == code)
1423 code_stmt = stmt;
1424 else if (!alt_code_stmt
1425 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1426 alt_code_stmt = stmt;
1427 if (chain_stmts)
1428 chain_stmts->safe_push (stmt);
1429 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1431 tree op = gimple_op (stmt, opnum);
1432 vect_def_type dt;
1433 stmt_vec_info def_stmt_info;
1434 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1435 gcc_assert (res);
1436 if (dt == vect_internal_def
1437 && is_pattern_stmt_p (def_stmt_info))
1438 op = gimple_get_lhs (def_stmt_info->stmt);
1439 gimple *use_stmt;
1440 use_operand_p use_p;
1441 if (dt == vect_internal_def
1442 && single_imm_use (op, &use_p, &use_stmt)
1443 && is_gimple_assign (def_stmt_info->stmt)
1444 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1445 || (code == PLUS_EXPR
1446 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1447 == MINUS_EXPR))))
1449 tree_code op_def_code = this_code;
1450 if (op_def_code == MINUS_EXPR && opnum == 1)
1451 op_def_code = PLUS_EXPR;
1452 if (in_code == MINUS_EXPR)
1453 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1454 worklist.safe_push (std::make_pair (op_def_code,
1455 def_stmt_info->stmt));
1457 else
1459 tree_code op_def_code = this_code;
1460 if (op_def_code == MINUS_EXPR && opnum == 1)
1461 op_def_code = PLUS_EXPR;
1462 if (in_code == MINUS_EXPR)
1463 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1464 chain.safe_push (chain_op_t (op_def_code, dt, op));
1470 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1471 simple_hashmap_traits <bst_traits, slp_tree> >
1472 scalar_stmts_to_slp_tree_map_t;
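/* The map caches, per set of scalar stmts, the SLP node discovered for it;
   a cached node with ->failed set records a failed discovery attempt so it
   is not retried.  The map is also what allows picking up already built
   nodes as backedge destinations during discovery.  */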
1474 static slp_tree
1475 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1476 vec<stmt_vec_info> stmts, unsigned int group_size,
1477 poly_uint64 *max_nunits,
1478 bool *matches, unsigned *limit, unsigned *tree_size,
1479 scalar_stmts_to_slp_tree_map_t *bst_map);
1481 static slp_tree
1482 vect_build_slp_tree (vec_info *vinfo,
1483 vec<stmt_vec_info> stmts, unsigned int group_size,
1484 poly_uint64 *max_nunits,
1485 bool *matches, unsigned *limit, unsigned *tree_size,
1486 scalar_stmts_to_slp_tree_map_t *bst_map)
1488 if (slp_tree *leader = bst_map->get (stmts))
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1492 !(*leader)->failed ? "" : "failed ", *leader);
1493 if (!(*leader)->failed)
1495 SLP_TREE_REF_COUNT (*leader)++;
1496 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1497 stmts.release ();
1498 return *leader;
1500 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1501 return NULL;
1504 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1505 so we can pick up backedge destinations during discovery. */
1506 slp_tree res = new _slp_tree;
1507 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1508 SLP_TREE_SCALAR_STMTS (res) = stmts;
1509 bst_map->put (stmts.copy (), res);
1511 if (*limit == 0)
1513 if (dump_enabled_p ())
1514 dump_printf_loc (MSG_NOTE, vect_location,
1515 "SLP discovery limit exceeded\n");
1516 /* Mark the node invalid so we can detect those when still in use
1517 as backedge destinations. */
1518 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1519 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1520 res->failed = XNEWVEC (bool, group_size);
1521 memset (res->failed, 0, sizeof (bool) * group_size);
1522 memset (matches, 0, sizeof (bool) * group_size);
1523 return NULL;
1525 --*limit;
1527 if (dump_enabled_p ())
1528 dump_printf_loc (MSG_NOTE, vect_location,
1529 "starting SLP discovery for node %p\n", res);
1531 poly_uint64 this_max_nunits = 1;
1532 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1533 &this_max_nunits,
1534 matches, limit, tree_size, bst_map);
1535 if (!res_)
1537 if (dump_enabled_p ())
1538 dump_printf_loc (MSG_NOTE, vect_location,
1539 "SLP discovery for node %p failed\n", res);
1540 /* Mark the node invalid so we can detect those when still in use
1541 as backedge destinations. */
1542 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1543 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1544 res->failed = XNEWVEC (bool, group_size);
1545 if (flag_checking)
1547 unsigned i;
1548 for (i = 0; i < group_size; ++i)
1549 if (!matches[i])
1550 break;
1551 gcc_assert (i < group_size);
1553 memcpy (res->failed, matches, sizeof (bool) * group_size);
1555 else
1557 if (dump_enabled_p ())
1558 dump_printf_loc (MSG_NOTE, vect_location,
1559 "SLP discovery for node %p succeeded\n", res);
1560 gcc_assert (res_ == res);
1561 res->max_nunits = this_max_nunits;
1562 vect_update_max_nunits (max_nunits, this_max_nunits);
1563 /* Keep a reference for the bst_map use. */
1564 SLP_TREE_REF_COUNT (res)++;
1566 return res_;
1569 /* Helper for building an associated SLP node chain. */
1571 static void
1572 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1573 slp_tree op0, slp_tree op1,
1574 stmt_vec_info oper1, stmt_vec_info oper2,
1575 vec<std::pair<unsigned, unsigned> > lperm)
1577 unsigned group_size = SLP_TREE_LANES (op1);
1579 slp_tree child1 = new _slp_tree;
1580 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1581 SLP_TREE_VECTYPE (child1) = vectype;
1582 SLP_TREE_LANES (child1) = group_size;
1583 SLP_TREE_CHILDREN (child1).create (2);
1584 SLP_TREE_CHILDREN (child1).quick_push (op0);
1585 SLP_TREE_CHILDREN (child1).quick_push (op1);
1586 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1588 slp_tree child2 = new _slp_tree;
1589 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1590 SLP_TREE_VECTYPE (child2) = vectype;
1591 SLP_TREE_LANES (child2) = group_size;
1592 SLP_TREE_CHILDREN (child2).create (2);
1593 SLP_TREE_CHILDREN (child2).quick_push (op0);
1594 SLP_TREE_REF_COUNT (op0)++;
1595 SLP_TREE_CHILDREN (child2).quick_push (op1);
1596 SLP_TREE_REF_COUNT (op1)++;
1597 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1599 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1600 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1601 SLP_TREE_VECTYPE (perm) = vectype;
1602 SLP_TREE_LANES (perm) = group_size;
1603 /* ??? We should set this NULL but that's not expected. */
1604 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1605 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1606 SLP_TREE_CHILDREN (perm).quick_push (child1);
1607 SLP_TREE_CHILDREN (perm).quick_push (child2);
1610 /* Recursively build an SLP tree starting from NODE.
1611    Fail (and return NULL) if def-stmts are not
1612    isomorphic, require data permutation or are of unsupported types of
1613    operation.  Otherwise, return the built SLP node; the lanes for which
1614    a mismatch was found are recorded
1615    in MATCHES.  */
1617 static slp_tree
1618 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1619 vec<stmt_vec_info> stmts, unsigned int group_size,
1620 poly_uint64 *max_nunits,
1621 bool *matches, unsigned *limit, unsigned *tree_size,
1622 scalar_stmts_to_slp_tree_map_t *bst_map)
1624 unsigned nops, i, this_tree_size = 0;
1625 poly_uint64 this_max_nunits = *max_nunits;
1627 matches[0] = false;
1629 stmt_vec_info stmt_info = stmts[0];
1630 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1631 nops = gimple_call_num_args (stmt);
1632 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1634 nops = gimple_num_ops (stmt) - 1;
1635 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1636 nops++;
1638 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1639 nops = gimple_phi_num_args (phi);
1640 else
1641 return NULL;
1643 /* If the SLP node is a PHI (induction or reduction), terminate
1644 the recursion. */
1645 bool *skip_args = XALLOCAVEC (bool, nops);
1646 memset (skip_args, 0, sizeof (bool) * nops);
1647 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1648 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1650 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1651 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1652 group_size);
1653 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1654 max_nunits))
1655 return NULL;
1657 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1658 if (def_type == vect_induction_def)
1660 /* Induction PHIs are not cycles but walk the initial
1661	   value.  Only for inner loops though, for outer loops
1662 we need to pick up the value from the actual PHIs
1663 to more easily support peeling and epilogue vectorization. */
1664 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1665 if (!nested_in_vect_loop_p (loop, stmt_info))
1666 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1667 else
1668 loop = loop->inner;
1669 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1671 else if (def_type == vect_reduction_def
1672 || def_type == vect_double_reduction_def
1673 || def_type == vect_nested_cycle)
1675 /* Else def types have to match. */
1676 stmt_vec_info other_info;
1677 bool all_same = true;
1678 FOR_EACH_VEC_ELT (stmts, i, other_info)
1680 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1681 return NULL;
1682 if (other_info != stmt_info)
1683 all_same = false;
1685 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1686	    /* Reduction initial values are not explicitly represented.  */
1687 if (!nested_in_vect_loop_p (loop, stmt_info))
1688 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1689 /* Reduction chain backedge defs are filled manually.
1690 ??? Need a better way to identify a SLP reduction chain PHI.
1691 Or a better overall way to SLP match those. */
1692 if (all_same && def_type == vect_reduction_def)
1693 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1695 else if (def_type != vect_internal_def)
1696 return NULL;
1700 bool two_operators = false;
1701 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1702 tree vectype = NULL_TREE;
1703 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1704 &this_max_nunits, matches, &two_operators,
1705 &vectype))
1706 return NULL;
1708 /* If the SLP node is a load, terminate the recursion unless masked. */
1709 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1710 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1712 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1714 /* Masked load. */
1715 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1716 nops = 1;
1718 else
1720 *max_nunits = this_max_nunits;
1721 (*tree_size)++;
1722 node = vect_create_new_slp_node (node, stmts, 0);
1723 SLP_TREE_VECTYPE (node) = vectype;
1724 /* And compute the load permutation. Whether it is actually
1725 a permutation depends on the unrolling factor which is
1726 decided later. */
1727 vec<unsigned> load_permutation;
1728 int j;
1729 stmt_vec_info load_info;
1730 load_permutation.create (group_size);
1731 stmt_vec_info first_stmt_info
1732 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1733 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1735 int load_place = vect_get_place_in_interleaving_chain
1736 (load_info, first_stmt_info);
1737 gcc_assert (load_place != -1);
1738 load_permutation.safe_push (load_place);
1740 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1741 return node;
1744 else if (gimple_assign_single_p (stmt_info->stmt)
1745 && !gimple_vuse (stmt_info->stmt)
1746 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1748       /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1749	  the same SSA name vector of a type compatible with vectype.  */
1750 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1751 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1752 stmt_vec_info estmt_info;
1753 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1755 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1756 tree bfref = gimple_assign_rhs1 (estmt);
1757 HOST_WIDE_INT lane;
1758 if (!known_eq (bit_field_size (bfref),
1759 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1760 || !constant_multiple_p (bit_field_offset (bfref),
1761 bit_field_size (bfref), &lane))
1763 lperm.release ();
1764 return NULL;
1766 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1768 slp_tree vnode = vect_create_new_slp_node (vNULL);
1769 /* ??? We record vectype here but we hide eventually necessary
1770 punning and instead rely on code generation to materialize
1771 VIEW_CONVERT_EXPRs as necessary. We instead should make
1772 this explicit somehow. */
1773 SLP_TREE_VECTYPE (vnode) = vectype;
1774 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1775 /* We are always building a permutation node even if it is an identity
1776 permute to shield the rest of the vectorizer from the odd node
1777 representing an actual vector without any scalar ops.
1778	 ??? We could hide it completely by making the permute node
1779 external? */
1780 node = vect_create_new_slp_node (node, stmts, 1);
1781 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1782 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1783 SLP_TREE_VECTYPE (node) = vectype;
1784 SLP_TREE_CHILDREN (node).quick_push (vnode);
1785 return node;
1787 /* When discovery reaches an associatable operation see whether we can
1788 improve that to match up lanes in a way superior to the operand
1789 swapping code which at most looks at two defs.
1790 ??? For BB vectorization we cannot do the brute-force search
1791 for matching as we can succeed by means of builds from scalars
1792 and have no good way to "cost" one build against another. */
1793 else if (is_a <loop_vec_info> (vinfo)
1794 /* ??? We don't handle !vect_internal_def defs below. */
1795 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1796 && is_gimple_assign (stmt_info->stmt)
1797 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1798 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1799 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1800 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1801 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1803 /* See if we have a chain of (mixed) adds or subtracts or other
1804 associatable ops. */
1805 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1806 if (code == MINUS_EXPR)
1807 code = PLUS_EXPR;
1808 stmt_vec_info other_op_stmt_info = NULL;
1809 stmt_vec_info op_stmt_info = NULL;
1810 unsigned chain_len = 0;
1811 auto_vec<chain_op_t> chain;
1812 auto_vec<std::pair<tree_code, gimple *> > worklist;
1813 auto_vec<vec<chain_op_t> > chains (group_size);
1814 auto_vec<slp_tree, 4> children;
1815 bool hard_fail = true;
1816 for (unsigned lane = 0; lane < group_size; ++lane)
1818 /* For each lane linearize the addition/subtraction (or other
1819 uniform associatable operation) expression tree. */
1820 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1821 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1822 stmts[lane]->stmt, op_stmt, other_op_stmt,
1823 NULL);
1824 if (!op_stmt_info && op_stmt)
1825 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1826 if (!other_op_stmt_info && other_op_stmt)
1827 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1828 if (chain.length () == 2)
1830 /* In a chain of just two elements resort to the regular
1831 operand swapping scheme. If we run into a length
1832 mismatch still hard-FAIL. */
1833 if (chain_len == 0)
1834 hard_fail = false;
1835 else
1837 matches[lane] = false;
1838 /* ??? We might want to process the other lanes, but
1839 make sure to not give false matching hints to the
1840 caller for lanes we did not process. */
1841 if (lane != group_size - 1)
1842 matches[0] = false;
1844 break;
1846 else if (chain_len == 0)
1847 chain_len = chain.length ();
1848 else if (chain.length () != chain_len)
1850 /* ??? Here we could slip in magic to compensate with
1851 neutral operands. */
1852 matches[lane] = false;
1853 if (lane != group_size - 1)
1854 matches[0] = false;
1855 break;
1857 chains.quick_push (chain.copy ());
1858 chain.truncate (0);
1860 if (chains.length () == group_size)
1862 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1863 if (!op_stmt_info)
1865 hard_fail = false;
1866 goto out;
1868 /* Now we have a set of chains with the same length. */
1869 /* 1. pre-sort according to def_type and operation. */
1870 for (unsigned lane = 0; lane < group_size; ++lane)
1871 chains[lane].stablesort (dt_sort_cmp, vinfo);
1872 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_NOTE, vect_location,
1875 "pre-sorted chains of %s\n",
1876 get_tree_code_name (code));
1877 for (unsigned lane = 0; lane < group_size; ++lane)
1879 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1880 dump_printf (MSG_NOTE, "%s %T ",
1881 get_tree_code_name (chains[lane][opnum].code),
1882 chains[lane][opnum].op);
1883 dump_printf (MSG_NOTE, "\n");
1886 /* 2. try to build children nodes, associating as necessary. */
1887 for (unsigned n = 0; n < chain_len; ++n)
1889 vect_def_type dt = chains[0][n].dt;
1890 unsigned lane;
1891 for (lane = 0; lane < group_size; ++lane)
1892 if (chains[lane][n].dt != dt)
1894 if (dt == vect_constant_def
1895 && chains[lane][n].dt == vect_external_def)
1896 dt = vect_external_def;
1897 else if (dt == vect_external_def
1898 && chains[lane][n].dt == vect_constant_def)
1900 else
1901 break;
1903 if (lane != group_size)
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "giving up on chain due to mismatched "
1908 "def types\n");
1909 matches[lane] = false;
1910 if (lane != group_size - 1)
1911 matches[0] = false;
1912 goto out;
1914 if (dt == vect_constant_def
1915 || dt == vect_external_def)
1917 /* We can always build those. Might want to sort last
1918 or defer building. */
1919 vec<tree> ops;
1920 ops.create (group_size);
1921 for (lane = 0; lane < group_size; ++lane)
1922 ops.quick_push (chains[lane][n].op);
1923 slp_tree child = vect_create_new_slp_node (ops);
1924 SLP_TREE_DEF_TYPE (child) = dt;
1925 children.safe_push (child);
1927 else if (dt != vect_internal_def)
1929 /* Not sure, we might need something special.
1930 gcc.dg/vect/pr96854.c,
1931 gfortran.dg/vect/fast-math-pr37021.f90
1932 and gfortran.dg/vect/pr61171.f trigger. */
1933 /* Soft-fail for now. */
1934 hard_fail = false;
1935 goto out;
1937 else
1939 vec<stmt_vec_info> op_stmts;
1940 op_stmts.create (group_size);
1941 slp_tree child = NULL;
1942 /* Brute-force our way. We have to consider a lane
1943 failing after fixing an earlier fail up in the
1944 SLP discovery recursion. So track the current
1945 permute per lane. */
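/* For example, if discovery fails with lane 2 mismatching at operand
position n, chains[2][n] is swapped with chains[2][n + perms[2] + 1],
perms[2] is incremented and discovery is retried; the search stops
once a mismatching lane has no further elements left to try. */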
1946 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1947 memset (perms, 0, sizeof (unsigned) * group_size);
1950 op_stmts.truncate (0);
1951 for (lane = 0; lane < group_size; ++lane)
1952 op_stmts.quick_push
1953 (vinfo->lookup_def (chains[lane][n].op));
1954 child = vect_build_slp_tree (vinfo, op_stmts,
1955 group_size, &this_max_nunits,
1956 matches, limit,
1957 &this_tree_size, bst_map);
1958 /* ??? We're likely getting too many fatal mismatches
1959 here so maybe we want to ignore them (but then we
1960 have no idea which lanes fatally mismatched). */
1961 if (child || !matches[0])
1962 break;
1963 /* Swap another lane we have not yet matched up into
1964 lanes that did not match. If we run out of
1965 permute possibilities for a lane terminate the
1966 search. */
1967 bool term = false;
1968 for (lane = 1; lane < group_size; ++lane)
1969 if (!matches[lane])
1971 if (n + perms[lane] + 1 == chain_len)
1973 term = true;
1974 break;
1976 std::swap (chains[lane][n],
1977 chains[lane][n + perms[lane] + 1]);
1978 perms[lane]++;
1980 if (term)
1981 break;
1983 while (1);
1984 if (!child)
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "failed to match up op %d\n", n);
1989 op_stmts.release ();
1990 if (lane != group_size - 1)
1991 matches[0] = false;
1992 else
1993 matches[lane] = false;
1994 goto out;
1996 if (dump_enabled_p ())
1998 dump_printf_loc (MSG_NOTE, vect_location,
1999 "matched up op %d to\n", n);
2000 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2002 children.safe_push (child);
2005 /* 3. build SLP nodes to combine the chain. */
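/* The children are combined left to right, so an illustrative chain
c0, c1, c2, c3 is emitted as ((c0 op c1) op c2) op c3 with one
two-operand SLP node per step (blended via a lane permute when the
lanes mix the two operation codes). */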
2006 for (unsigned lane = 0; lane < group_size; ++lane)
2007 if (chains[lane][0].code != code)
2009 /* See if there's any alternate all-PLUS entry. */
2010 unsigned n;
2011 for (n = 1; n < chain_len; ++n)
2013 for (lane = 0; lane < group_size; ++lane)
2014 if (chains[lane][n].code != code)
2015 break;
2016 if (lane == group_size)
2017 break;
2019 if (n != chain_len)
2021 /* Swap that in at first position. */
2022 std::swap (children[0], children[n]);
2023 for (lane = 0; lane < group_size; ++lane)
2024 std::swap (chains[lane][0], chains[lane][n]);
2026 else
2028 /* ??? When this triggers and we end up with two
2029 vect_constant/external_def operands up-front, things break (ICE)
2030 spectacularly when finding an insertion place for the
2031 all-constant op. We should have a fully
2032 vect_internal_def operand though(?), so we can swap
2033 that into first place and then prepend the all-zero
2034 constant. */
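/* As an illustration, a lane like -a + b is effectively handled as
0 - a + b: an all-zero constant operand is prepended to every lane's
chain so the negated first operand has something to be subtracted
from. */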
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE, vect_location,
2037 "inserting constant zero to compensate "
2038 "for (partially) negated first "
2039 "operand\n");
2040 chain_len++;
2041 for (lane = 0; lane < group_size; ++lane)
2042 chains[lane].safe_insert
2043 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2044 vec<tree> zero_ops;
2045 zero_ops.create (group_size);
2046 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2047 for (lane = 1; lane < group_size; ++lane)
2048 zero_ops.quick_push (zero_ops[0]);
2049 slp_tree zero = vect_create_new_slp_node (zero_ops);
2050 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2051 children.safe_insert (0, zero);
2053 break;
2055 for (unsigned i = 1; i < children.length (); ++i)
2057 slp_tree op0 = children[i - 1];
2058 slp_tree op1 = children[i];
2059 bool this_two_op = false;
2060 for (unsigned lane = 0; lane < group_size; ++lane)
2061 if (chains[lane][i].code != chains[0][i].code)
2063 this_two_op = true;
2064 break;
2066 slp_tree child;
2067 if (i == children.length () - 1)
2068 child = vect_create_new_slp_node (node, stmts, 2);
2069 else
2070 child = vect_create_new_slp_node (2, ERROR_MARK);
2071 if (this_two_op)
2073 vec<std::pair<unsigned, unsigned> > lperm;
2074 lperm.create (group_size);
2075 for (unsigned lane = 0; lane < group_size; ++lane)
2076 lperm.quick_push (std::make_pair
2077 (chains[lane][i].code != chains[0][i].code, lane));
2078 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2079 (chains[0][i].code == code
2080 ? op_stmt_info
2081 : other_op_stmt_info),
2082 (chains[0][i].code == code
2083 ? other_op_stmt_info
2084 : op_stmt_info),
2085 lperm);
2087 else
2089 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2090 SLP_TREE_VECTYPE (child) = vectype;
2091 SLP_TREE_LANES (child) = group_size;
2092 SLP_TREE_CHILDREN (child).quick_push (op0);
2093 SLP_TREE_CHILDREN (child).quick_push (op1);
2094 SLP_TREE_REPRESENTATIVE (child)
2095 = (chains[0][i].code == code
2096 ? op_stmt_info : other_op_stmt_info);
2098 children[i] = child;
2100 *tree_size += this_tree_size + 1;
2101 *max_nunits = this_max_nunits;
2102 while (!chains.is_empty ())
2103 chains.pop ().release ();
2104 return node;
2106 out:
2107 while (!children.is_empty ())
2108 vect_free_slp_tree (children.pop ());
2109 while (!chains.is_empty ())
2110 chains.pop ().release ();
2111 /* Hard-fail; otherwise we might run into quadratic processing of the
2112 chains by starting discovery again one stmt into the chain. */
2113 if (hard_fail)
2114 return NULL;
2115 /* Fall thru to normal processing. */
2118 /* Get at the operands, verifying they are compatible. */
2119 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2120 slp_oprnd_info oprnd_info;
2121 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2123 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2124 stmts, i, &oprnds_info);
2125 if (res != 0)
2126 matches[(res == -1) ? 0 : i] = false;
2127 if (!matches[0])
2128 break;
2130 for (i = 0; i < group_size; ++i)
2131 if (!matches[i])
2133 vect_free_oprnd_info (oprnds_info);
2134 return NULL;
2136 swap = NULL;
2138 auto_vec<slp_tree, 4> children;
2140 stmt_info = stmts[0];
2142 /* Create SLP_TREE nodes for the definition node/s. */
2143 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2145 slp_tree child;
2146 unsigned int j;
2148 /* We're skipping certain operands from processing, for example
2149 outer loop reduction initial defs. */
2150 if (skip_args[i])
2152 children.safe_push (NULL);
2153 continue;
2156 if (oprnd_info->first_dt == vect_uninitialized_def)
2158 /* COND_EXPRs eventually have one operand too many if the
2159 condition is an SSA name. */
2160 gcc_assert (i == 3 && nops == 4);
2161 continue;
2164 if (is_a <bb_vec_info> (vinfo)
2165 && oprnd_info->first_dt == vect_internal_def
2166 && !oprnd_info->any_pattern)
2168 /* For BB vectorization, if all defs are the same do not
2169 bother to continue the build along the single-lane
2170 graph but use a splat of the scalar value. */
2171 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2172 for (j = 1; j < group_size; ++j)
2173 if (oprnd_info->def_stmts[j] != first_def)
2174 break;
2175 if (j == group_size
2176 /* But avoid doing this for loads where we may be
2177 able to CSE things, unless the stmt is not
2178 vectorizable. */
2179 && (!STMT_VINFO_VECTORIZABLE (first_def)
2180 || !gimple_vuse (first_def->stmt)))
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Using a splat of the uniform operand\n");
2185 oprnd_info->first_dt = vect_external_def;
2189 if (oprnd_info->first_dt == vect_external_def
2190 || oprnd_info->first_dt == vect_constant_def)
2192 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2193 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2194 oprnd_info->ops = vNULL;
2195 children.safe_push (invnode);
2196 continue;
2199 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2200 group_size, &this_max_nunits,
2201 matches, limit,
2202 &this_tree_size, bst_map)) != NULL)
2204 oprnd_info->def_stmts = vNULL;
2205 children.safe_push (child);
2206 continue;
2209 /* If the SLP build for operand zero failed and operands zero
2210 and one can be commuted, try that for the scalar stmts
2211 that failed the match. */
2212 if (i == 0
2213 /* A first scalar stmt mismatch signals a fatal mismatch. */
2214 && matches[0]
2215 /* ??? For COND_EXPRs we can swap the comparison operands
2216 as well as the arms under some constraints. */
2217 && nops == 2
2218 && oprnds_info[1]->first_dt == vect_internal_def
2219 && is_gimple_assign (stmt_info->stmt)
2220 /* Swapping operands for reductions breaks assumptions later on. */
2221 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2222 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2224 /* See whether we can swap the matching or the non-matching
2225 stmt operands. */
2226 bool swap_not_matching = true;
2229 for (j = 0; j < group_size; ++j)
2231 if (matches[j] != !swap_not_matching)
2232 continue;
2233 stmt_vec_info stmt_info = stmts[j];
2234 /* Verify if we can swap operands of this stmt. */
2235 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2236 if (!stmt
2237 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2239 if (!swap_not_matching)
2240 goto fail;
2241 swap_not_matching = false;
2242 break;
2246 while (j != group_size);
2248 /* Swap mismatched definition stmts. */
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "Re-trying with swapped operands of stmts ");
2252 for (j = 0; j < group_size; ++j)
2253 if (matches[j] == !swap_not_matching)
2255 std::swap (oprnds_info[0]->def_stmts[j],
2256 oprnds_info[1]->def_stmts[j]);
2257 std::swap (oprnds_info[0]->ops[j],
2258 oprnds_info[1]->ops[j]);
2259 if (dump_enabled_p ())
2260 dump_printf (MSG_NOTE, "%d ", j);
2262 if (dump_enabled_p ())
2263 dump_printf (MSG_NOTE, "\n");
2264 /* After swapping some operands we lost track of whether an
2265 operand has any pattern defs, so be conservative here. */
2266 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2267 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2268 /* And try again with scratch 'matches' ... */
2269 bool *tem = XALLOCAVEC (bool, group_size);
2270 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2271 group_size, &this_max_nunits,
2272 tem, limit,
2273 &this_tree_size, bst_map)) != NULL)
2275 oprnd_info->def_stmts = vNULL;
2276 children.safe_push (child);
2277 continue;
2280 fail:
2282 /* If the SLP build failed and we are analyzing a basic block,
2283 simply treat nodes we fail to build as externally defined
2284 (and thus build vectors from the scalar defs).
2285 The cost model will reject outright expensive cases.
2286 ??? This doesn't treat cases where permutation ultimately
2287 fails (or we don't try permutation below). Ideally we'd
2288 even compute a permutation that will end up with the maximum
2289 SLP tree size... */
2290 if (is_a <bb_vec_info> (vinfo)
2291 /* ??? Rejecting patterns this way doesn't work. We'd have to
2292 do extra work to cancel the pattern so the uses see the
2293 scalar version. */
2294 && !is_pattern_stmt_p (stmt_info)
2295 && !oprnd_info->any_pattern)
2297 /* But if there's a leading vector-sized set of matching stmts,
2298 fail here so we can split the group. This matches the condition
2299 vect_analyze_slp_instance uses. */
2300 /* ??? We might want to split here and combine the results to support
2301 multiple vector sizes better. */
2302 for (j = 0; j < group_size; ++j)
2303 if (!matches[j])
2304 break;
2305 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_NOTE, vect_location,
2309 "Building vector operands from scalars\n");
2310 this_tree_size++;
2311 child = vect_create_new_slp_node (oprnd_info->ops);
2312 children.safe_push (child);
2313 oprnd_info->ops = vNULL;
2314 continue;
2318 gcc_assert (child == NULL);
2319 FOR_EACH_VEC_ELT (children, j, child)
2320 if (child)
2321 vect_free_slp_tree (child);
2322 vect_free_oprnd_info (oprnds_info);
2323 return NULL;
2326 vect_free_oprnd_info (oprnds_info);
2328 /* If all children of this node are built up from uniform scalars,
2329 or building them requires more than one possibly expensive vector
2330 construction, then throw the node away so it is instead built up
2331 from scalars. The exception is the SLP node for the vector store. */
2332 if (is_a <bb_vec_info> (vinfo)
2333 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2334 /* ??? Rejecting patterns this way doesn't work. We'd have to
2335 do extra work to cancel the pattern so the uses see the
2336 scalar version. */
2337 && !is_pattern_stmt_p (stmt_info))
2339 slp_tree child;
2340 unsigned j;
2341 bool all_uniform_p = true;
2342 unsigned n_vector_builds = 0;
2343 FOR_EACH_VEC_ELT (children, j, child)
2345 if (!child)
2347 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2348 all_uniform_p = false;
2349 else if (!vect_slp_tree_uniform_p (child))
2351 all_uniform_p = false;
2352 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2353 n_vector_builds++;
2356 if (all_uniform_p
2357 || n_vector_builds > 1
2358 || (n_vector_builds == children.length ()
2359 && is_a <gphi *> (stmt_info->stmt)))
2361 /* Roll back. */
2362 matches[0] = false;
2363 FOR_EACH_VEC_ELT (children, j, child)
2364 if (child)
2365 vect_free_slp_tree (child);
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Building parent vector operands from "
2370 "scalars instead\n");
2371 return NULL;
2375 *tree_size += this_tree_size + 1;
2376 *max_nunits = this_max_nunits;
2378 if (two_operators)
2380 /* ??? We'd likely want to either cache in bst_map sth like
2381 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2382 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2383 explicit stmts to put in so the keying on 'stmts' doesn't
2384 work (but we have the same issue with nodes that use 'ops'). */
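/* For example, for the group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }
node "one" computes all additions, node "two" all subtractions and
the lane permutation { (0,0) (1,1) (0,2) (1,3) } on the VEC_PERM
parent picks the plus lanes from "one" and the minus lanes from
"two". */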
2385 slp_tree one = new _slp_tree;
2386 slp_tree two = new _slp_tree;
2387 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2388 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2389 SLP_TREE_VECTYPE (one) = vectype;
2390 SLP_TREE_VECTYPE (two) = vectype;
2391 SLP_TREE_CHILDREN (one).safe_splice (children);
2392 SLP_TREE_CHILDREN (two).safe_splice (children);
2393 slp_tree child;
2394 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2395 SLP_TREE_REF_COUNT (child)++;
2397 /* Here we record the original defs since this
2398 node represents the final lane configuration. */
2399 node = vect_create_new_slp_node (node, stmts, 2);
2400 SLP_TREE_VECTYPE (node) = vectype;
2401 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2402 SLP_TREE_CHILDREN (node).quick_push (one);
2403 SLP_TREE_CHILDREN (node).quick_push (two);
2404 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2405 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2406 enum tree_code ocode = ERROR_MARK;
2407 stmt_vec_info ostmt_info;
2408 unsigned j = 0;
2409 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2411 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2412 if (gimple_assign_rhs_code (ostmt) != code0)
2414 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2415 ocode = gimple_assign_rhs_code (ostmt);
2416 j = i;
2418 else
2419 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2421 SLP_TREE_CODE (one) = code0;
2422 SLP_TREE_CODE (two) = ocode;
2423 SLP_TREE_LANES (one) = stmts.length ();
2424 SLP_TREE_LANES (two) = stmts.length ();
2425 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2426 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2427 return node;
2430 node = vect_create_new_slp_node (node, stmts, nops);
2431 SLP_TREE_VECTYPE (node) = vectype;
2432 SLP_TREE_CHILDREN (node).splice (children);
2433 return node;
2436 /* Dump a single SLP tree NODE. */
2438 static void
2439 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2440 slp_tree node)
2442 unsigned i, j;
2443 slp_tree child;
2444 stmt_vec_info stmt_info;
2445 tree op;
2447 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2448 dump_user_location_t user_loc = loc.get_user_location ();
2449 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2450 SLP_TREE_DEF_TYPE (node) == vect_external_def
2451 ? " (external)"
2452 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2453 ? " (constant)"
2454 : ""), node,
2455 estimated_poly_value (node->max_nunits),
2456 SLP_TREE_REF_COUNT (node));
2457 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2459 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2460 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2461 else
2462 dump_printf_loc (metadata, user_loc, "op template: %G",
2463 SLP_TREE_REPRESENTATIVE (node)->stmt);
2465 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2466 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2467 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2468 else
2470 dump_printf_loc (metadata, user_loc, "\t{ ");
2471 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2472 dump_printf (metadata, "%T%s ", op,
2473 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2474 dump_printf (metadata, "}\n");
2476 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2478 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2479 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2480 dump_printf (dump_kind, " %u", j);
2481 dump_printf (dump_kind, " }\n");
2483 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2485 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2486 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2487 dump_printf (dump_kind, " %u[%u]",
2488 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2489 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2490 dump_printf (dump_kind, " }\n");
2492 if (SLP_TREE_CHILDREN (node).is_empty ())
2493 return;
2494 dump_printf_loc (metadata, user_loc, "\tchildren");
2495 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2496 dump_printf (dump_kind, " %p", (void *)child);
2497 dump_printf (dump_kind, "\n");
2500 DEBUG_FUNCTION void
2501 debug (slp_tree node)
2503 debug_dump_context ctx;
2504 vect_print_slp_tree (MSG_NOTE,
2505 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2506 node);
2509 /* Recursive helper for the dot producer below. */
2511 static void
2512 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2514 if (visited.add (node))
2515 return;
2517 fprintf (f, "\"%p\" [label=\"", (void *)node);
2518 vect_print_slp_tree (MSG_NOTE,
2519 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2520 node);
2521 fprintf (f, "\"];\n");
2524 for (slp_tree child : SLP_TREE_CHILDREN (node))
2525 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2527 for (slp_tree child : SLP_TREE_CHILDREN (node))
2528 dot_slp_tree (f, child, visited);
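/* Produce a GraphViz dot file FNAME for the SLP graph rooted at NODE.
This is a debug helper meant to be called manually, e.g. from a
debugger session; the resulting file can be rendered with the
GraphViz dot tool. */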
2531 DEBUG_FUNCTION void
2532 dot_slp_tree (const char *fname, slp_tree node)
2534 FILE *f = fopen (fname, "w");
2535 fprintf (f, "digraph {\n");
2536 fflush (f);
2538 debug_dump_context ctx (f);
2539 hash_set<slp_tree> visited;
2540 dot_slp_tree (f, node, visited);
2542 fflush (f);
2543 fprintf (f, "}\n");
2544 fclose (f);
2547 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2549 static void
2550 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2551 slp_tree node, hash_set<slp_tree> &visited)
2553 unsigned i;
2554 slp_tree child;
2556 if (visited.add (node))
2557 return;
2559 vect_print_slp_tree (dump_kind, loc, node);
2561 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2562 if (child)
2563 vect_print_slp_graph (dump_kind, loc, child, visited);
2566 static void
2567 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2568 slp_tree entry)
2570 hash_set<slp_tree> visited;
2571 vect_print_slp_graph (dump_kind, loc, entry, visited);
2574 /* Mark the tree rooted at NODE with PURE_SLP. */
2576 static void
2577 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2579 int i;
2580 stmt_vec_info stmt_info;
2581 slp_tree child;
2583 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2584 return;
2586 if (visited.add (node))
2587 return;
2589 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2590 STMT_SLP_TYPE (stmt_info) = pure_slp;
2592 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2593 if (child)
2594 vect_mark_slp_stmts (child, visited);
2597 static void
2598 vect_mark_slp_stmts (slp_tree node)
2600 hash_set<slp_tree> visited;
2601 vect_mark_slp_stmts (node, visited);
2604 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2606 static void
2607 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2609 int i;
2610 stmt_vec_info stmt_info;
2611 slp_tree child;
2613 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2614 return;
2616 if (visited.add (node))
2617 return;
2619 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2621 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2622 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2623 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2626 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2627 if (child)
2628 vect_mark_slp_stmts_relevant (child, visited);
2631 static void
2632 vect_mark_slp_stmts_relevant (slp_tree node)
2634 hash_set<slp_tree> visited;
2635 vect_mark_slp_stmts_relevant (node, visited);
2639 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2641 static void
2642 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2643 hash_set<slp_tree> &visited)
2645 if (!node || visited.add (node))
2646 return;
2648 if (SLP_TREE_CHILDREN (node).length () == 0)
2650 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2651 return;
2652 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2653 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2654 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2655 loads.safe_push (node);
2657 else
2659 unsigned i;
2660 slp_tree child;
2661 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2662 vect_gather_slp_loads (loads, child, visited);
2667 /* Find the last scalar stmt in NODE. */
2669 stmt_vec_info
2670 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2672 stmt_vec_info last = NULL;
2673 stmt_vec_info stmt_vinfo;
2675 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2677 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2678 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2681 return last;
2684 /* Find the first stmt in NODE. */
2686 stmt_vec_info
2687 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2689 stmt_vec_info first = NULL;
2690 stmt_vec_info stmt_vinfo;
2692 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2694 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2695 if (!first
2696 || get_later_stmt (stmt_vinfo, first) == first)
2697 first = stmt_vinfo;
2700 return first;
2703 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2704 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2705 (also containing the first GROUP1_SIZE stmts, since stores are
2706 consecutive), the second containing the remainder.
2707 Return the first stmt in the second group. */
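/* For example (illustrative sizes): splitting a group of 7 stores with
GROUP1_SIZE 4 yields a first group of size 4 whose DR_GROUP_GAP is
increased by 3 to skip over the second group, and a second group of
size 3 whose DR_GROUP_GAP is the original gap plus 4. */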
2709 static stmt_vec_info
2710 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2712 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2713 gcc_assert (group1_size > 0);
2714 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2715 gcc_assert (group2_size > 0);
2716 DR_GROUP_SIZE (first_vinfo) = group1_size;
2718 stmt_vec_info stmt_info = first_vinfo;
2719 for (unsigned i = group1_size; i > 1; i--)
2721 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2722 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2724 /* STMT is now the last element of the first group. */
2725 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2726 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2728 DR_GROUP_SIZE (group2) = group2_size;
2729 for (stmt_info = group2; stmt_info;
2730 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2732 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2733 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2736 /* For the second group, the DR_GROUP_GAP is that before the original group,
2737 plus skipping over the first vector. */
2738 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2740 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2741 DR_GROUP_GAP (first_vinfo) += group2_size;
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2745 group1_size, group2_size);
2747 return group2;
2750 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2751 statements and a vector of NUNITS elements. */
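/* E.g. (illustrative values) for NUNITS 4 and GROUP_SIZE 6 the common
multiple is 12 and the unrolling factor is 2, while for GROUP_SIZE 8
and NUNITS 4 the factor is 1. */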
2753 static poly_uint64
2754 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2756 return exact_div (common_multiple (nunits, group_size), group_size);
2759 /* Helper that checks to see if a node is a load node. */
2761 static inline bool
2762 vect_is_slp_load_node (slp_tree root)
2764 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2765 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2766 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2770 /* Helper function of optimize_load_redistribution that performs the operation
2771 recursively. */
2773 static slp_tree
2774 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2775 vec_info *vinfo, unsigned int group_size,
2776 hash_map<slp_tree, slp_tree> *load_map,
2777 slp_tree root)
2779 if (slp_tree *leader = load_map->get (root))
2780 return *leader;
2782 slp_tree node;
2783 unsigned i;
2785 /* For now, we don't know anything about externals so do not do anything. */
2786 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2787 return NULL;
2788 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2790 /* First convert this node into a load node and add it to the leaves
2791 list and flatten the permute from a lane to a load one. If it's
2792 unneeded it will be elided later. */
2793 vec<stmt_vec_info> stmts;
2794 stmts.create (SLP_TREE_LANES (root));
2795 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2796 for (unsigned j = 0; j < lane_perm.length (); j++)
2798 std::pair<unsigned, unsigned> perm = lane_perm[j];
2799 node = SLP_TREE_CHILDREN (root)[perm.first];
2801 if (!vect_is_slp_load_node (node)
2802 || SLP_TREE_CHILDREN (node).exists ())
2804 stmts.release ();
2805 goto next;
2808 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2811 if (dump_enabled_p ())
2812 dump_printf_loc (MSG_NOTE, vect_location,
2813 "converting stmts on permute node %p\n", root);
2815 bool *matches = XALLOCAVEC (bool, group_size);
2816 poly_uint64 max_nunits = 1;
2817 unsigned tree_size = 0, limit = 1;
2818 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2819 matches, &limit, &tree_size, bst_map);
2820 if (!node)
2821 stmts.release ();
2823 load_map->put (root, node);
2824 return node;
2827 next:
2828 load_map->put (root, NULL);
2830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2832 slp_tree value
2833 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2834 node);
2835 if (value)
2837 SLP_TREE_REF_COUNT (value)++;
2838 SLP_TREE_CHILDREN (root)[i] = value;
2839 /* ??? We know the original leafs of the replaced nodes will
2840 be referenced by bst_map, only the permutes created by
2841 pattern matching are not. */
2842 if (SLP_TREE_REF_COUNT (node) == 1)
2843 load_map->remove (node);
2844 vect_free_slp_tree (node);
2848 return NULL;
2851 /* Temporary workaround for loads not being CSEd during SLP build. This
2852 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2853 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2854 same DR such that the final operation is equal to a permuted load. Such
2855 NODES are then directly converted into LOADS themselves. The nodes are
2856 CSEd using BST_MAP. */
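/* For example, a VEC_PERM node whose lane permutation only picks lanes
from load-only children reading the same data reference is
re-discovered as a single load node with a corresponding load
permutation; if that permutation turns out to be unnecessary it is
elided later. */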
2858 static void
2859 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2860 vec_info *vinfo, unsigned int group_size,
2861 hash_map<slp_tree, slp_tree> *load_map,
2862 slp_tree root)
2864 slp_tree node;
2865 unsigned i;
2867 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2869 slp_tree value
2870 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2871 node);
2872 if (value)
2874 SLP_TREE_REF_COUNT (value)++;
2875 SLP_TREE_CHILDREN (root)[i] = value;
2876 /* ??? We know the original leafs of the replaced nodes will
2877 be referenced by bst_map, only the permutes created by
2878 pattern matching are not. */
2879 if (SLP_TREE_REF_COUNT (node) == 1)
2880 load_map->remove (node);
2881 vect_free_slp_tree (node);
2886 /* Helper function of vect_match_slp_patterns.
2888 Attempts to match patterns against the SLP tree rooted in REF_NODE using
2889 VINFO. Patterns are matched in post-order traversal.
2891 Returns true if any pattern matched; on a match the node referenced by
2892 REF_NODE is updated in place. */
2894 static bool
2895 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2896 slp_tree_to_load_perm_map_t *perm_cache,
2897 hash_set<slp_tree> *visited)
2899 unsigned i;
2900 slp_tree node = *ref_node;
2901 bool found_p = false;
2902 if (!node || visited->add (node))
2903 return false;
2905 slp_tree child;
2906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2907 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2908 vinfo, perm_cache, visited);
2910 for (unsigned x = 0; x < num__slp_patterns; x++)
2912 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2913 if (pattern)
2915 pattern->build (vinfo);
2916 delete pattern;
2917 found_p = true;
2921 return found_p;
2924 /* Applies pattern matching to the SLP tree of INSTANCE using
2925 vec_info VINFO.
2927 The tree is modified in place and true is returned if any pattern
2928 matched. Patterns are tried in order and multiple patterns may match. */
2930 static bool
2931 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2932 hash_set<slp_tree> *visited,
2933 slp_tree_to_load_perm_map_t *perm_cache)
2935 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2936 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 "Analyzing SLP tree %p for patterns\n",
2941 SLP_INSTANCE_TREE (instance));
2943 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2946 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2947 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2948 Return true if we could use IFN_STORE_LANES instead and if that appears
2949 to be the better approach. */
2951 static bool
2952 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2953 unsigned int group_size,
2954 unsigned int new_group_size)
2956 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2957 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2958 if (!vectype)
2959 return false;
2960 /* Allow the split if one of the two new groups would operate on full
2961 vectors *within* rather than across one scalar loop iteration.
2962 This is purely a heuristic, but it should work well for group
2963 sizes of 3 and 4, where the possible splits are:
2965 3->2+1: OK if the vector has exactly two elements
2966 4->2+2: Likewise
2967 4->3+1: Less clear-cut. */
2968 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
2969 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2970 return false;
2971 return vect_store_lanes_supported (vectype, group_size, false);
2974 /* Analyze an SLP instance starting from a group of grouped stores. Call
2975 vect_build_slp_tree to build a tree of packed stmts if possible.
2976 Return FALSE if it's impossible to SLP any stmt in the loop. */
2978 static bool
2979 vect_analyze_slp_instance (vec_info *vinfo,
2980 scalar_stmts_to_slp_tree_map_t *bst_map,
2981 stmt_vec_info stmt_info, slp_instance_kind kind,
2982 unsigned max_tree_size, unsigned *limit);
2984 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2985 of KIND. Return true if successful. */
2987 static bool
2988 vect_build_slp_instance (vec_info *vinfo,
2989 slp_instance_kind kind,
2990 vec<stmt_vec_info> &scalar_stmts,
2991 vec<stmt_vec_info> &root_stmt_infos,
2992 unsigned max_tree_size, unsigned *limit,
2993 scalar_stmts_to_slp_tree_map_t *bst_map,
2994 /* ??? We need stmt_info for group splitting. */
2995 stmt_vec_info stmt_info_)
2997 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_NOTE, vect_location,
3000 "Starting SLP discovery for\n");
3001 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3002 dump_printf_loc (MSG_NOTE, vect_location,
3003 " %G", scalar_stmts[i]->stmt);
3006 /* Build the tree for the SLP instance. */
3007 unsigned int group_size = scalar_stmts.length ();
3008 bool *matches = XALLOCAVEC (bool, group_size);
3009 poly_uint64 max_nunits = 1;
3010 unsigned tree_size = 0;
3011 unsigned i;
3012 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3013 &max_nunits, matches, limit,
3014 &tree_size, bst_map);
3015 if (node != NULL)
3017 /* Calculate the unrolling factor based on the smallest type. */
3018 poly_uint64 unrolling_factor
3019 = calculate_unrolling_factor (max_nunits, group_size);
3021 if (maybe_ne (unrolling_factor, 1U)
3022 && is_a <bb_vec_info> (vinfo))
3024 unsigned HOST_WIDE_INT const_max_nunits;
3025 if (!max_nunits.is_constant (&const_max_nunits)
3026 || const_max_nunits > group_size)
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "Build SLP failed: store group "
3031 "size not a multiple of the vector size "
3032 "in basic block SLP\n");
3033 vect_free_slp_tree (node);
3034 return false;
3036 /* Fatal mismatch. */
3037 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_NOTE, vect_location,
3039 "SLP discovery succeeded but node needs "
3040 "splitting\n");
3041 memset (matches, true, group_size);
3042 matches[group_size / const_max_nunits * const_max_nunits] = false;
3043 vect_free_slp_tree (node);
3045 else
3047 /* Create a new SLP instance. */
3048 slp_instance new_instance = XNEW (class _slp_instance);
3049 SLP_INSTANCE_TREE (new_instance) = node;
3050 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3051 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3052 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3053 SLP_INSTANCE_KIND (new_instance) = kind;
3054 new_instance->reduc_phis = NULL;
3055 new_instance->cost_vec = vNULL;
3056 new_instance->subgraph_entries = vNULL;
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 "SLP size %u vs. limit %u.\n",
3061 tree_size, max_tree_size);
3063 /* Fixup SLP reduction chains. */
3064 if (kind == slp_inst_kind_reduc_chain)
3066 /* If this is a reduction chain with a conversion in front
3067 amend the SLP tree with a node for that. */
3068 gimple *scalar_def
3069 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3070 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3072 /* Get at the conversion stmt - we know it's the single use
3073 of the last stmt of the reduction chain. */
3074 use_operand_p use_p;
3075 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3076 &use_p, &scalar_def);
3077 gcc_assert (r);
3078 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3079 next_info = vect_stmt_to_vectorize (next_info);
3080 scalar_stmts = vNULL;
3081 scalar_stmts.create (group_size);
3082 for (unsigned i = 0; i < group_size; ++i)
3083 scalar_stmts.quick_push (next_info);
3084 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3085 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3086 SLP_TREE_CHILDREN (conv).quick_push (node);
3087 SLP_INSTANCE_TREE (new_instance) = conv;
3088 /* We also have to fake this conversion stmt as SLP reduction
3089 group so we don't have to mess with too much code
3090 elsewhere. */
3091 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3092 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3094 /* Fill the backedge child of the PHI SLP node. The
3095 general matching code cannot find it because the
3096 scalar code does not reflect how we vectorize the
3097 reduction. */
3098 use_operand_p use_p;
3099 imm_use_iterator imm_iter;
3100 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3101 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3102 gimple_get_lhs (scalar_def))
3103 /* There are exactly two non-debug uses, the reduction
3104 PHI and the loop-closed PHI node. */
3105 if (!is_gimple_debug (USE_STMT (use_p))
3106 && gimple_bb (USE_STMT (use_p)) == loop->header)
3108 auto_vec<stmt_vec_info, 64> phis (group_size);
3109 stmt_vec_info phi_info
3110 = vinfo->lookup_stmt (USE_STMT (use_p));
3111 for (unsigned i = 0; i < group_size; ++i)
3112 phis.quick_push (phi_info);
3113 slp_tree *phi_node = bst_map->get (phis);
3114 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3115 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3116 = SLP_INSTANCE_TREE (new_instance);
3117 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3121 vinfo->slp_instances.safe_push (new_instance);
3123 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3124 the number of scalar stmts in the root in a few places.
3125 Verify that assumption holds. */
3126 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3127 .length () == group_size);
3129 if (dump_enabled_p ())
3131 dump_printf_loc (MSG_NOTE, vect_location,
3132 "Final SLP tree for instance %p:\n", new_instance);
3133 vect_print_slp_graph (MSG_NOTE, vect_location,
3134 SLP_INSTANCE_TREE (new_instance));
3137 return true;
3140 else
3142 /* Failed to SLP. */
3143 /* Free the allocated memory. */
3144 scalar_stmts.release ();
3147 stmt_vec_info stmt_info = stmt_info_;
3148 /* Try to break the group up into pieces. */
3149 if (kind == slp_inst_kind_store)
3151 /* ??? We could delay all the actual splitting of store-groups
3152 until after SLP discovery of the original group completed.
3153 Then we can recurse to vect_build_slp_instance directly. */
3154 for (i = 0; i < group_size; i++)
3155 if (!matches[i])
3156 break;
3158 /* For basic block SLP, try to break the group up into multiples of
3159 a vector size. */
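/* Illustrative numbers: if the first i = 6 of 8 grouped stores matched
and the chosen vector type has 4 lanes, the group is first split at
stmt 4 (6 & ~3), the remainder is split again at the failure point
and every resulting piece with at least two lanes is re-analyzed. */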
3160 if (is_a <bb_vec_info> (vinfo)
3161 && (i > 1 && i < group_size))
3163 tree scalar_type
3164 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3165 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3166 1 << floor_log2 (i));
3167 unsigned HOST_WIDE_INT const_nunits;
3168 if (vectype
3169 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3171 /* Split into two groups at the first vector boundary. */
3172 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3173 unsigned group1_size = i & ~(const_nunits - 1);
3175 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_NOTE, vect_location,
3177 "Splitting SLP group at stmt %u\n", i);
3178 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3179 group1_size);
3180 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3181 kind, max_tree_size,
3182 limit);
3183 /* Split the rest at the failure point and possibly
3184 re-analyze the remaining matching part if it has
3185 at least two lanes. */
3186 if (group1_size < i
3187 && (i + 1 < group_size
3188 || i - group1_size > 1))
3190 stmt_vec_info rest2 = rest;
3191 rest = vect_split_slp_store_group (rest, i - group1_size);
3192 if (i - group1_size > 1)
3193 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3194 kind, max_tree_size,
3195 limit);
3197 /* Re-analyze the non-matching tail if it has at least
3198 two lanes. */
3199 if (i + 1 < group_size)
3200 res |= vect_analyze_slp_instance (vinfo, bst_map,
3201 rest, kind, max_tree_size,
3202 limit);
3203 return res;
3207 /* For loop vectorization split into arbitrary pieces of size > 1. */
3208 if (is_a <loop_vec_info> (vinfo)
3209 && (i > 1 && i < group_size)
3210 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3212 unsigned group1_size = i;
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "Splitting SLP group at stmt %u\n", i);
3218 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3219 group1_size);
3220 /* Loop vectorization cannot handle gaps in stores; make sure
3221 the split group appears as strided. */
3222 STMT_VINFO_STRIDED_P (rest) = 1;
3223 DR_GROUP_GAP (rest) = 0;
3224 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3225 DR_GROUP_GAP (stmt_info) = 0;
3227 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3228 kind, max_tree_size, limit);
3229 if (i + 1 < group_size)
3230 res |= vect_analyze_slp_instance (vinfo, bst_map,
3231 rest, kind, max_tree_size, limit);
3233 return res;
3236 /* Even though the first vector did not all match, we might be able to SLP
3237 (some) of the remainder. FORNOW ignore this possibility. */
3240 /* Failed to SLP. */
3241 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3243 return false;
3247 /* Analyze an SLP instance starting from a group of grouped stores. Call
3248 vect_build_slp_tree to build a tree of packed stmts if possible.
3249 Return FALSE if it's impossible to SLP any stmt in the loop. */
3251 static bool
3252 vect_analyze_slp_instance (vec_info *vinfo,
3253 scalar_stmts_to_slp_tree_map_t *bst_map,
3254 stmt_vec_info stmt_info,
3255 slp_instance_kind kind,
3256 unsigned max_tree_size, unsigned *limit)
3258 unsigned int i;
3259 vec<stmt_vec_info> scalar_stmts;
3261 if (is_a <bb_vec_info> (vinfo))
3262 vect_location = stmt_info->stmt;
3264 stmt_vec_info next_info = stmt_info;
3265 if (kind == slp_inst_kind_store)
3267 /* Collect the stores and store them in scalar_stmts. */
3268 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3269 while (next_info)
3271 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3272 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3275 else if (kind == slp_inst_kind_reduc_chain)
3277 /* Collect the reduction stmts and store them in scalar_stmts. */
3278 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3279 while (next_info)
3281 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3282 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3284 /* Mark the first element of the reduction chain as reduction to properly
3285 transform the node. In the reduction analysis phase only the last
3286 element of the chain is marked as reduction. */
3287 STMT_VINFO_DEF_TYPE (stmt_info)
3288 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3289 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3290 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3292 else if (kind == slp_inst_kind_ctor)
3294 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3295 tree val;
3296 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3297 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3299 stmt_vec_info def_info = vinfo->lookup_def (val);
3300 def_info = vect_stmt_to_vectorize (def_info);
3301 scalar_stmts.quick_push (def_info);
3303 if (dump_enabled_p ())
3304 dump_printf_loc (MSG_NOTE, vect_location,
3305 "Analyzing vectorizable constructor: %G\n",
3306 stmt_info->stmt);
3308 else if (kind == slp_inst_kind_reduc_group)
3310 /* Collect reduction statements. */
3311 const vec<stmt_vec_info> &reductions
3312 = as_a <loop_vec_info> (vinfo)->reductions;
3313 scalar_stmts.create (reductions.length ());
3314 for (i = 0; reductions.iterate (i, &next_info); i++)
3315 if (STMT_VINFO_RELEVANT_P (next_info)
3316 || STMT_VINFO_LIVE_P (next_info))
3317 scalar_stmts.quick_push (next_info);
3318 /* If fewer than two were relevant/live there's nothing to SLP. */
3319 if (scalar_stmts.length () < 2)
3320 return false;
3322 else
3323 gcc_unreachable ();
3325 vec<stmt_vec_info> roots = vNULL;
3326 if (kind == slp_inst_kind_ctor)
3328 roots.create (1);
3329 roots.quick_push (stmt_info);
3331 /* Build the tree for the SLP instance. */
3332 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3333 roots,
3334 max_tree_size, limit, bst_map,
3335 kind == slp_inst_kind_store
3336 ? stmt_info : NULL);
3337 if (!res)
3338 roots.release ();
3340 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3341 where we should do store group splitting. */
3343 return res;
3346 /* Check if there are stmts in the loop that can be vectorized using SLP.
3347 Build SLP trees of packed scalar stmts if SLP is possible. */
3349 opt_result
3350 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3352 unsigned int i;
3353 stmt_vec_info first_element;
3354 slp_instance instance;
3356 DUMP_VECT_SCOPE ("vect_analyze_slp");
3358 unsigned limit = max_tree_size;
3360 scalar_stmts_to_slp_tree_map_t *bst_map
3361 = new scalar_stmts_to_slp_tree_map_t ();
3363 /* Find SLP sequences starting from groups of grouped stores. */
3364 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3365 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3366 STMT_VINFO_GROUPED_ACCESS (first_element)
3367 ? slp_inst_kind_store : slp_inst_kind_ctor,
3368 max_tree_size, &limit);
3370 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3372 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3374 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3375 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3376 bb_vinfo->roots[i].stmts,
3377 bb_vinfo->roots[i].roots,
3378 max_tree_size, &limit, bst_map, NULL))
3380 bb_vinfo->roots[i].stmts = vNULL;
3381 bb_vinfo->roots[i].roots = vNULL;
3386 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3388 /* Find SLP sequences starting from reduction chains. */
3389 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3390 if (! STMT_VINFO_RELEVANT_P (first_element)
3391 && ! STMT_VINFO_LIVE_P (first_element))
3393 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3394 slp_inst_kind_reduc_chain,
3395 max_tree_size, &limit))
3397 /* Dissolve reduction chain group. */
3398 stmt_vec_info vinfo = first_element;
3399 stmt_vec_info last = NULL;
3400 while (vinfo)
3402 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3403 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3404 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3405 last = vinfo;
3406 vinfo = next;
3408 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3409 /* It can be still vectorized as part of an SLP reduction. */
3410 loop_vinfo->reductions.safe_push (last);
3413 /* Find SLP sequences starting from groups of reductions. */
3414 if (loop_vinfo->reductions.length () > 1)
3415 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3416 slp_inst_kind_reduc_group, max_tree_size,
3417 &limit);
3420 hash_set<slp_tree> visited_patterns;
3421 slp_tree_to_load_perm_map_t perm_cache;
3423 /* See if any patterns can be found in the SLP tree. */
3424 bool pattern_found = false;
3425 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3426 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3427 &visited_patterns, &perm_cache);
3429 /* If any were found optimize permutations of loads. */
3430 if (pattern_found)
3432 hash_map<slp_tree, slp_tree> load_map;
3433 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3435 slp_tree root = SLP_INSTANCE_TREE (instance);
3436 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3437 &load_map, root);
3443 /* The map keeps a reference on SLP nodes built, release that. */
3444 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3445 it != bst_map->end (); ++it)
3446 if ((*it).second)
3447 vect_free_slp_tree ((*it).second);
3448 delete bst_map;
3450 if (pattern_found && dump_enabled_p ())
3452 dump_printf_loc (MSG_NOTE, vect_location,
3453 "Pattern matched SLP tree\n");
3454 hash_set<slp_tree> visited;
3455 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3456 vect_print_slp_graph (MSG_NOTE, vect_location,
3457 SLP_INSTANCE_TREE (instance), visited);
3460 return opt_result::success ();
3463 struct slpg_vertex
3465 slpg_vertex (slp_tree node_)
3466 : node (node_), perm_in (-1), perm_out (-1) {}
3468 int get_perm_materialized () const
3469 { return perm_in != perm_out ? perm_in : 0; }
3471 slp_tree node;
3472 /* The common permutation on the incoming lanes (towards SLP children). */
3473 int perm_in;
3474 /* The permutation on the outgoing lanes (towards SLP parents). When
3475 the node is a materialization point for a permute this differs
3476 from perm_in (and is then usually zero). Materialization happens
3477 on the input side. */
3478 int perm_out;
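/* For example a vertex with perm_in == 2 and perm_out == 0 is a
materialization point: permutation 2 from the PERMS vector is
materialized on its incoming lanes while parents see unpermuted
output. */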
3481 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3483 static void
3484 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3485 vec<slpg_vertex> &vertices, vec<int> &leafs)
3487 unsigned i;
3488 slp_tree child;
3490 if (visited.add (node))
3491 return;
3493 node->vertex = vertices.length ();
3494 vertices.safe_push (slpg_vertex (node));
3496 bool leaf = true;
3497 bool force_leaf = false;
3498 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3499 if (child)
3501 leaf = false;
3502 vect_slp_build_vertices (visited, child, vertices, leafs);
3504 else
3505 force_leaf = true;
3506 /* Since SLP discovery works along use-def edges all cycles have an
3507 entry - but there's the exception of cycles where we do not handle
3508 the entry explicitly (but with a NULL SLP node), like some reductions
3509 and inductions. Force those SLP PHIs to act as leafs to make them
3510 backwards reachable. */
3511 if (leaf || force_leaf)
3512 leafs.safe_push (node->vertex);
3515 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3517 static void
3518 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3519 vec<int> &leafs)
3521 hash_set<slp_tree> visited;
3522 unsigned i;
3523 slp_instance instance;
3524 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3525 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3526 leafs);
3529 /* Apply (reverse) bijective PERM to VEC. */
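/* For example, with PERM = { 2, 0, 1 } and VEC = { a, b, c } the
forward application yields { c, a, b } (vec[i] = saved[perm[i]])
while the reverse application yields { b, c, a }
(vec[perm[i]] = saved[i]). */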
3531 template <class T>
3532 static void
3533 vect_slp_permute (vec<unsigned> perm,
3534 vec<T> &vec, bool reverse)
3536 auto_vec<T, 64> saved;
3537 saved.create (vec.length ());
3538 for (unsigned i = 0; i < vec.length (); ++i)
3539 saved.quick_push (vec[i]);
3541 if (reverse)
3543 for (unsigned i = 0; i < vec.length (); ++i)
3544 vec[perm[i]] = saved[i];
3545 for (unsigned i = 0; i < vec.length (); ++i)
3546 gcc_assert (vec[perm[i]] == saved[i]);
3548 else
3550 for (unsigned i = 0; i < vec.length (); ++i)
3551 vec[i] = saved[perm[i]];
3552 for (unsigned i = 0; i < vec.length (); ++i)
3553 gcc_assert (vec[i] == saved[perm[i]]);
3557 /* Return whether permutations PERM_A and PERM_B as recorded in the
3558 PERMS vector are equal. */
3560 static bool
3561 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3562 int perm_a, int perm_b)
3564 return (perm_a == perm_b
3565 || (perm_a != -1 && perm_b != -1
3566 && perms[perm_a].length () == perms[perm_b].length ()
3567 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3568 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3571 /* Optimize the SLP graph of VINFO. */
3573 void
3574 vect_optimize_slp (vec_info *vinfo)
3576 if (vinfo->slp_instances.is_empty ())
3577 return;
3579 slp_tree node;
3580 unsigned i;
3581 auto_vec<slpg_vertex> vertices;
3582 auto_vec<int> leafs;
3583 vect_slp_build_vertices (vinfo, vertices, leafs);
3585 struct graph *slpg = new_graph (vertices.length ());
3586 for (slpg_vertex &v : vertices)
3587 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3588 if (child)
3589 add_edge (slpg, v.node->vertex, child->vertex);
3591 /* Compute (reverse) postorder on the inverted graph. */
3592 auto_vec<int> ipo;
3593 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3595 auto_vec<vec<unsigned> > perms;
3596 perms.safe_push (vNULL); /* zero is no permute */
3598 /* Produce initial permutations. */
3599 for (i = 0; i < leafs.length (); ++i)
3601 int idx = leafs[i];
3602 slp_tree node = vertices[idx].node;
3604 /* Handle externals and constants optimistically throughout the
3605 iteration. But treat existing vectors as fixed since we
3606 do not handle permuting them below. */
3607 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3608 && !SLP_TREE_VEC_DEFS (node).exists ())
3609 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3610 continue;
3612 /* Leafs do not change across iterations. Note leafs also double
3613 as entries to the reverse graph. */
3614 if (!slpg->vertices[idx].succ)
3616 vertices[idx].perm_in = 0;
3617 vertices[idx].perm_out = 0;
3620 /* Loads are the only thing generating permutes. */
3621 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3622 continue;
3624 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3625 node unpermuted, record this permute. */
3626 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3627 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3628 continue;
3629 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3630 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3631 bool any_permute = false;
3632 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3634 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3635 imin = MIN (imin, idx);
3636 imax = MAX (imax, idx);
3637 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3638 any_permute = true;
3640 /* If there's no permute no need to split one out. */
3641 if (!any_permute)
3642 continue;
3643 /* If the span doesn't match we'd disrupt VF computation, avoid
3644 that for now. */
3645 if (imax - imin + 1 != SLP_TREE_LANES (node))
3646 continue;
3648 /* For now only handle true permutes, like
3649 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3650 when permuting constants and invariants, keeping the permute
3651 bijective. */
3652 auto_sbitmap load_index (SLP_TREE_LANES (node));
3653 bitmap_clear (load_index);
3654 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3655 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3656 unsigned j;
3657 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3658 if (!bitmap_bit_p (load_index, j))
3659 break;
3660 if (j != SLP_TREE_LANES (node))
3661 continue;
3663 vec<unsigned> perm = vNULL;
3664 perm.safe_grow (SLP_TREE_LANES (node), true);
3665 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3666 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3667 perms.safe_push (perm);
3668 vertices[idx].perm_in = perms.length () - 1;
3669 vertices[idx].perm_out = perms.length () - 1;
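/* As an illustration (hypothetical load group, not from a testcase):
   a load node with SLP_TREE_LOAD_PERMUTATION { 5, 4, 7, 6 } has imin == 4,
   imax == 7 and a span of four lanes matching the node, so the bijective
   permute { 1, 0, 3, 2 } is recorded here and the node can be treated as an
   unpermuted contiguous load during propagation.  */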
3672 /* In addition to the above we have to mark outgoing permutes facing
3673 non-reduction graph entries (which are not otherwise represented)
3674 as to be materialized. */
3675 for (slp_instance instance : vinfo->slp_instances)
3676 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3678 /* Just setting perm_out isn't enough for the propagation to
3679 pick this up. */
3680 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3681 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3684 /* Propagate permutes along the graph and compute materialization points. */
3685 bool changed;
3686 bool do_materialization = false;
3687 unsigned iteration = 0;
3690 changed = false;
3691 ++iteration;
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 "SLP optimize iteration %d\n", iteration);
3697 for (i = vertices.length (); i > 0 ; --i)
3699 int idx = ipo[i-1];
3700 slp_tree node = vertices[idx].node;
3702 /* Handle externals and constants optimistically throughout the
3703 iteration. */
3704 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3705 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3706 continue;
3708 /* We still eventually have failed backedge SLP nodes in the
3709 graph, those are only cancelled when analyzing operations.
3710 Simply treat them as transparent ops, propagating permutes
3711 through them. */
3712 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3714 /* We do not handle stores with a permutation, so all
3715 incoming permutes must have been materialized. */
3716 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3717 if (STMT_VINFO_DATA_REF (rep)
3718 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3720 /* ??? We're forcing materialization in place
3721 of the child here; we'd need special handling
3722 in materialization to leave perm_in -1 here. */
3723 vertices[idx].perm_in = 0;
3724 vertices[idx].perm_out = 0;
3726 /* We cannot move a permute across an operation that is
3727 not independent of lanes. Note this is an explicit
3728 negative list since it is much shorter than the corresponding
3729 positive one, but it is critical to keep it up to date. */
3730 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3731 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3733 case CFN_COMPLEX_ADD_ROT90:
3734 case CFN_COMPLEX_ADD_ROT270:
3735 case CFN_COMPLEX_MUL:
3736 case CFN_COMPLEX_MUL_CONJ:
3737 case CFN_VEC_ADDSUB:
3738 case CFN_VEC_FMADDSUB:
3739 case CFN_VEC_FMSUBADD:
3740 vertices[idx].perm_in = 0;
3741 vertices[idx].perm_out = 0;
3742 default:;
3746 if (!slpg->vertices[idx].succ)
3747 /* Pick up pre-computed leaf values. */
3749 else
3751 bool any_succ_perm_out_m1 = false;
3752 int perm_in = vertices[idx].perm_in;
3753 for (graph_edge *succ = slpg->vertices[idx].succ;
3754 succ; succ = succ->succ_next)
3756 int succ_idx = succ->dest;
3757 int succ_perm = vertices[succ_idx].perm_out;
3758 /* Handle unvisited (and constant) nodes optimistically. */
3759 /* ??? But for constants once we want to handle
3760 non-bijective permutes we have to verify the permute,
3761 when unifying lanes, will not unify different constants.
3762 For example see gcc.dg/vect/bb-slp-14.c for a case
3763 that would break. */
3764 if (succ_perm == -1)
3766 /* When we handled a non-leaf optimistically, note
3767 that so we can adjust its outgoing permute below. */
3768 slp_tree succ_node = vertices[succ_idx].node;
3769 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3770 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3771 any_succ_perm_out_m1 = true;
3772 continue;
3774 if (perm_in == -1)
3775 perm_in = succ_perm;
3776 else if (succ_perm == 0
3777 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3779 perm_in = 0;
3780 break;
3784 /* Adjust any incoming permutes we treated optimistically. */
3785 if (perm_in != -1 && any_succ_perm_out_m1)
3787 for (graph_edge *succ = slpg->vertices[idx].succ;
3788 succ; succ = succ->succ_next)
3790 slp_tree succ_node = vertices[succ->dest].node;
3791 if (vertices[succ->dest].perm_out == -1
3792 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3793 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3795 vertices[succ->dest].perm_out = perm_in;
3796 /* And ensure this propagates. */
3797 if (vertices[succ->dest].perm_in == -1)
3798 vertices[succ->dest].perm_in = perm_in;
3801 changed = true;
3804 if (!vect_slp_perms_eq (perms, perm_in,
3805 vertices[idx].perm_in))
3807 /* Make sure we eventually converge. */
3808 gcc_checking_assert (vertices[idx].perm_in == -1
3809 || perm_in == 0);
3810 vertices[idx].perm_in = perm_in;
3812 /* While we can handle VEC_PERM nodes as transparent
3813 pass-through, they can also serve as a cheap
3814 materialization point and as the source of an
3815 arbitrary permutation.
3816 The following ensures that former materialization
3817 points that now have zero incoming permutes no
3818 longer appear as such and that former "any" permutes
3819 become pass-through. We keep VEC_PERM nodes optimistic
3820 as "any" outgoing permute though. */
3821 if (vertices[idx].perm_out != 0
3822 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3823 vertices[idx].perm_out = perm_in;
3824 changed = true;
3828 /* Elide pruning at materialization points in the first
3829 iteration phase. */
3830 if (!do_materialization)
3831 continue;
3833 int perm = vertices[idx].perm_out;
3834 if (perm == 0 || perm == -1)
3835 continue;
3837 /* Decide on permute materialization. Check whether there's
3838 a use (pred) edge that is permuted differently from us.
3839 In that case mark ourselves so the permutation is applied. */
3840 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3841 if (all_preds_permuted)
3842 for (graph_edge *pred = slpg->vertices[idx].pred;
3843 pred; pred = pred->pred_next)
3845 int pred_perm = vertices[pred->src].perm_in;
3846 gcc_checking_assert (pred_perm != -1);
3847 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3849 all_preds_permuted = false;
3850 break;
3853 if (!all_preds_permuted)
3855 vertices[idx].perm_out = 0;
3856 changed = true;
3860 /* If the initial propagation converged, switch on materialization
3861 and re-propagate. */
3862 if (!changed && !do_materialization)
3864 do_materialization = true;
3865 changed = true;
3868 while (changed);
3869 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3871 /* Materialize. */
3872 for (i = 0; i < vertices.length (); ++i)
3874 int perm_in = vertices[i].perm_in;
3875 slp_tree node = vertices[i].node;
3877 /* First permute invariant/external original successors; we handle
3878 those optimistically during propagation and duplicate them if
3879 they are used with different permutations. */
3880 unsigned j;
3881 slp_tree child;
3882 if (perm_in > 0)
3883 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3885 if (!child
3886 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3887 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3888 continue;
3890 /* If the vector is uniform there's nothing to do. */
3891 if (vect_slp_tree_uniform_p (child))
3892 continue;
3894 /* We can end up sharing some externals via two_operator
3895 handling. Be prepared to unshare those. */
3896 if (child->refcnt != 1)
3898 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3899 SLP_TREE_CHILDREN (node)[j] = child
3900 = vect_create_new_slp_node
3901 (SLP_TREE_SCALAR_OPS (child).copy ());
3903 vect_slp_permute (perms[perm_in],
3904 SLP_TREE_SCALAR_OPS (child), true);
3907 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3909 /* Apply the common permutes to the input vectors. */
3910 if (perm_in > 0)
3912 /* If the node is already a permute node we can apply
3913 the permutation to the lane selection, effectively
3914 materializing it on the incoming vectors. */
3915 if (dump_enabled_p ())
3916 dump_printf_loc (MSG_NOTE, vect_location,
3917 "simplifying permute node %p\n",
3918 node);
3919 for (unsigned k = 0;
3920 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3921 SLP_TREE_LANE_PERMUTATION (node)[k].second
3922 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3924 /* Apply the anticipated output permute to the permute and
3925 stmt vectors. */
3926 int perm_out = vertices[i].perm_out;
3927 if (perm_out > 0)
3929 vect_slp_permute (perms[perm_out],
3930 SLP_TREE_SCALAR_STMTS (node), true);
3931 vect_slp_permute (perms[perm_out],
3932 SLP_TREE_LANE_PERMUTATION (node), true);
3935 else if (vertices[i].get_perm_materialized () != 0)
3937 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3938 /* For loads simply drop the permutation, the load permutation
3939 already performs the desired permutation. */
3941 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3942 gcc_unreachable ();
3943 else
3945 if (dump_enabled_p ())
3946 dump_printf_loc (MSG_NOTE, vect_location,
3947 "inserting permute node in place of %p\n",
3948 node);
3950 /* Make a copy of NODE and in-place change it to a
3951 VEC_PERM node to permute the lanes of the copy. */
3952 slp_tree copy = new _slp_tree;
3953 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3954 SLP_TREE_CHILDREN (node) = vNULL;
3955 SLP_TREE_SCALAR_STMTS (copy)
3956 = SLP_TREE_SCALAR_STMTS (node).copy ();
3957 vect_slp_permute (perms[perm_in],
3958 SLP_TREE_SCALAR_STMTS (copy), true);
3959 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3960 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3961 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3962 SLP_TREE_LANE_PERMUTATION (copy)
3963 = SLP_TREE_LANE_PERMUTATION (node);
3964 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3965 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3966 copy->refcnt = 1;
3967 copy->max_nunits = node->max_nunits;
3968 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3969 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3970 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3972 /* Now turn NODE into a VEC_PERM. */
3973 SLP_TREE_CHILDREN (node).safe_push (copy);
3974 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3975 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3976 SLP_TREE_LANE_PERMUTATION (node)
3977 .quick_push (std::make_pair (0, perms[perm_in][j]));
3978 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
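/* For example (values purely illustrative): with a recorded permute
   { 1, 0, 3, 2 } NODE now becomes a VEC_PERM_EXPR with the single child
   COPY and lane permutation { (0,1), (0,0), (0,3), (0,2) }, selecting the
   permuted lanes from the copy whose scalar stmts were permuted the
   reverse way above.  */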
3981 else if (perm_in > 0) /* perm_in == perm_out */
3983 /* Apply the reverse permutation to our stmts. */
3984 vect_slp_permute (perms[perm_in],
3985 SLP_TREE_SCALAR_STMTS (node), true);
3986 /* And to the lane/load permutation, which we can simply
3987 make regular by design. */
3988 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3990 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
3991 /* ??? When we handle non-bijective permutes the idea
3992 is that we can force the load-permutation to be
3993 { min, min + 1, min + 2, ... max }. But then the
3994 scalar defs might no longer match the lane content
3995 which means wrong-code with live lane vectorization.
3996 So we possibly have to have NULL entries for those. */
3997 vect_slp_permute (perms[perm_in],
3998 SLP_TREE_LOAD_PERMUTATION (node), true);
4000 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4001 gcc_unreachable ();
4005 /* Elide any permutations at BB reduction roots. */
4006 if (is_a <bb_vec_info> (vinfo))
4008 for (slp_instance instance : vinfo->slp_instances)
4010 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4011 continue;
4012 slp_tree old = SLP_INSTANCE_TREE (instance);
4013 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4014 && SLP_TREE_CHILDREN (old).length () == 1)
4016 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4017 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4019 /* Preserve the special VEC_PERM we use to shield existing
4020 vector defs from the rest. But make it a no-op. */
4021 unsigned i = 0;
4022 for (std::pair<unsigned, unsigned> &p
4023 : SLP_TREE_LANE_PERMUTATION (old))
4024 p.second = i++;
4026 else
4028 SLP_INSTANCE_TREE (instance) = child;
4029 SLP_TREE_REF_COUNT (child)++;
4030 vect_free_slp_tree (old);
4033 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4034 && SLP_TREE_REF_COUNT (old) == 1
4035 && vertices[old->vertex].get_perm_materialized () != 0)
4037 /* ??? For loads the situation is more complex since
4038 we can't modify the permute in place in case the
4039 node is used multiple times. In fact for loads this
4040 should be somehow handled in the propagation engine. */
4041 /* Apply the reverse permutation to our stmts. */
4042 int perm = vertices[old->vertex].get_perm_materialized ();
4043 vect_slp_permute (perms[perm],
4044 SLP_TREE_SCALAR_STMTS (old), true);
4045 vect_slp_permute (perms[perm],
4046 SLP_TREE_LOAD_PERMUTATION (old), true);
4051 /* Free the perms vector used for propagation. */
4052 while (!perms.is_empty ())
4053 perms.pop ().release ();
4054 free_graph (slpg);
4057 /* Now elide load permutations that are not necessary. */
4058 for (i = 0; i < leafs.length (); ++i)
4060 node = vertices[leafs[i]].node;
4061 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4062 continue;
4064 /* In basic block vectorization we allow any subchain of an interleaving
4065 chain.
4066 FORNOW: not in loop SLP because of realignment complications. */
4067 if (is_a <bb_vec_info> (vinfo))
4069 bool subchain_p = true;
4070 stmt_vec_info next_load_info = NULL;
4071 stmt_vec_info load_info;
4072 unsigned j;
4073 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4075 if (j != 0
4076 && (next_load_info != load_info
4077 || DR_GROUP_GAP (load_info) != 1))
4079 subchain_p = false;
4080 break;
4082 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4084 if (subchain_p)
4086 SLP_TREE_LOAD_PERMUTATION (node).release ();
4087 continue;
4090 else
4092 stmt_vec_info load_info;
4093 bool this_load_permuted = false;
4094 unsigned j;
4095 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4096 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4098 this_load_permuted = true;
4099 break;
4101 stmt_vec_info first_stmt_info
4102 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4103 if (!this_load_permuted
4104 /* The load requires permutation when unrolling exposes
4105 a gap either because the group is larger than the SLP
4106 group-size or because there is a gap between the groups. */
4107 && (known_eq (LOOP_VINFO_VECT_FACTOR
4108 (as_a <loop_vec_info> (vinfo)), 1U)
4109 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4110 && DR_GROUP_GAP (first_stmt_info) == 0)))
4112 SLP_TREE_LOAD_PERMUTATION (node).release ();
4113 continue;
4119 /* Gather loads reachable from the individual SLP graph entries. */
4121 void
4122 vect_gather_slp_loads (vec_info *vinfo)
4124 unsigned i;
4125 slp_instance instance;
4126 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4128 hash_set<slp_tree> visited;
4129 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4130 SLP_INSTANCE_TREE (instance), visited);
4135 /* For each possible SLP instance decide whether to SLP it and calculate overall
4136 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4137 least one instance. */
4139 bool
4140 vect_make_slp_decision (loop_vec_info loop_vinfo)
4142 unsigned int i;
4143 poly_uint64 unrolling_factor = 1;
4144 const vec<slp_instance> &slp_instances
4145 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4146 slp_instance instance;
4147 int decided_to_slp = 0;
4149 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4151 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4153 /* FORNOW: SLP if you can. */
4154 /* All unroll factors have the form:
4156 GET_MODE_SIZE (vinfo->vector_mode) * X
4158 for some rational X, so they must have a common multiple. */
4159 unrolling_factor
4160 = force_common_multiple (unrolling_factor,
4161 SLP_INSTANCE_UNROLLING_FACTOR (instance));
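/* For example (hypothetical instances): unrolling factors of 2 and 3 for
   two instances combine to an overall unrolling factor of 6, their least
   common multiple.  */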
4163 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4164 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4165 loop-based vectorization. Such stmts will be marked as HYBRID. */
4166 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4167 decided_to_slp++;
4170 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4172 if (decided_to_slp && dump_enabled_p ())
4174 dump_printf_loc (MSG_NOTE, vect_location,
4175 "Decided to SLP %d instances. Unrolling factor ",
4176 decided_to_slp);
4177 dump_dec (MSG_NOTE, unrolling_factor);
4178 dump_printf (MSG_NOTE, "\n");
4181 return (decided_to_slp > 0);
4184 /* Private data for vect_detect_hybrid_slp. */
4185 struct vdhs_data
4187 loop_vec_info loop_vinfo;
4188 vec<stmt_vec_info> *worklist;
4191 /* Walker for walk_gimple_op. */
4193 static tree
4194 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4196 walk_stmt_info *wi = (walk_stmt_info *)data;
4197 vdhs_data *dat = (vdhs_data *)wi->info;
4199 if (wi->is_lhs)
4200 return NULL_TREE;
4202 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4203 if (!def_stmt_info)
4204 return NULL_TREE;
4205 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4206 if (PURE_SLP_STMT (def_stmt_info))
4208 if (dump_enabled_p ())
4209 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4210 def_stmt_info->stmt);
4211 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4212 dat->worklist->safe_push (def_stmt_info);
4215 return NULL_TREE;
4218 /* Check whether STMT_INFO is indirectly consumed by SLP and mark it
4219 pure_slp if so; otherwise push it to WORKLIST. */
4221 static void
4222 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4223 vec<stmt_vec_info> &worklist,
4224 stmt_vec_info stmt_info)
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "Processing hybrid candidate : %G", stmt_info->stmt);
4229 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4230 imm_use_iterator iter2;
4231 ssa_op_iter iter1;
4232 use_operand_p use_p;
4233 def_operand_p def_p;
4234 bool any_def = false;
4235 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4237 any_def = true;
4238 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4240 if (is_gimple_debug (USE_STMT (use_p)))
4241 continue;
4242 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4244 /* An out-of-loop use means this is a loop_vect sink. */
4244 if (!use_info)
4246 if (dump_enabled_p ())
4247 dump_printf_loc (MSG_NOTE, vect_location,
4248 "Found loop_vect sink: %G", stmt_info->stmt);
4249 worklist.safe_push (stmt_info);
4250 return;
4252 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4254 if (dump_enabled_p ())
4255 dump_printf_loc (MSG_NOTE, vect_location,
4256 "Found loop_vect use: %G", use_info->stmt);
4257 worklist.safe_push (stmt_info);
4258 return;
4262 /* No def means this is a loop_vect sink. */
4263 if (!any_def)
4265 if (dump_enabled_p ())
4266 dump_printf_loc (MSG_NOTE, vect_location,
4267 "Found loop_vect sink: %G", stmt_info->stmt);
4268 worklist.safe_push (stmt_info);
4269 return;
4271 if (dump_enabled_p ())
4272 dump_printf_loc (MSG_NOTE, vect_location,
4273 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4274 STMT_SLP_TYPE (stmt_info) = pure_slp;
4277 /* Find stmts that must be both vectorized and SLPed. */
4279 void
4280 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4282 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4284 /* All stmts participating in SLP are marked pure_slp, all other
4285 stmts are loop_vect.
4286 First collect all loop_vect stmts into a worklist.
4287 SLP patterns cause not all original scalar stmts to appear in
4288 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4289 Rectify this here and do a backward walk over the IL only considering
4290 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4291 marking them as pure_slp. */
4292 auto_vec<stmt_vec_info> worklist;
4293 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4295 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4296 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4297 gsi_next (&gsi))
4299 gphi *phi = gsi.phi ();
4300 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4301 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4302 maybe_push_to_hybrid_worklist (loop_vinfo,
4303 worklist, stmt_info);
4305 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4306 gsi_prev (&gsi))
4308 gimple *stmt = gsi_stmt (gsi);
4309 if (is_gimple_debug (stmt))
4310 continue;
4311 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4312 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4314 for (gimple_stmt_iterator gsi2
4315 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4316 !gsi_end_p (gsi2); gsi_next (&gsi2))
4318 stmt_vec_info patt_info
4319 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4320 if (!STMT_SLP_TYPE (patt_info)
4321 && STMT_VINFO_RELEVANT (patt_info))
4322 maybe_push_to_hybrid_worklist (loop_vinfo,
4323 worklist, patt_info);
4325 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4327 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4328 maybe_push_to_hybrid_worklist (loop_vinfo,
4329 worklist, stmt_info);
4333 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4334 mark any SLP vectorized stmt as hybrid.
4335 ??? We're visiting def stmts N times (once for each non-SLP and
4336 once for each hybrid-SLP use). */
4337 walk_stmt_info wi;
4338 vdhs_data dat;
4339 dat.worklist = &worklist;
4340 dat.loop_vinfo = loop_vinfo;
4341 memset (&wi, 0, sizeof (wi));
4342 wi.info = (void *)&dat;
4343 while (!worklist.is_empty ())
4345 stmt_vec_info stmt_info = worklist.pop ();
4346 /* Since SSA operands are not set up for pattern stmts we need
4347 to use walk_gimple_op. */
4348 wi.is_lhs = 0;
4349 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4354 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4356 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4357 : vec_info (vec_info::bb, init_cost (NULL, false), shared),
4358 bbs (_bbs),
4359 roots (vNULL)
4361 for (unsigned i = 0; i < bbs.length (); ++i)
4363 if (i != 0)
4364 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4365 gsi_next (&si))
4367 gphi *phi = si.phi ();
4368 gimple_set_uid (phi, 0);
4369 add_stmt (phi);
4371 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4372 !gsi_end_p (gsi); gsi_next (&gsi))
4374 gimple *stmt = gsi_stmt (gsi);
4375 gimple_set_uid (stmt, 0);
4376 if (is_gimple_debug (stmt))
4377 continue;
4378 add_stmt (stmt);
4384 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4385 stmts in the basic blocks. */
4387 _bb_vec_info::~_bb_vec_info ()
4389 /* Reset region marker. */
4390 for (unsigned i = 0; i < bbs.length (); ++i)
4392 if (i != 0)
4393 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4394 gsi_next (&si))
4396 gphi *phi = si.phi ();
4397 gimple_set_uid (phi, -1);
4399 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4400 !gsi_end_p (gsi); gsi_next (&gsi))
4402 gimple *stmt = gsi_stmt (gsi);
4403 gimple_set_uid (stmt, -1);
4407 for (unsigned i = 0; i < roots.length (); ++i)
4409 roots[i].stmts.release ();
4410 roots[i].roots.release ();
4412 roots.release ();
4415 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4416 given that child nodes have already been processed, and that
4417 their def types currently match their SLP node's def type. */
4419 static bool
4420 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4421 slp_instance node_instance,
4422 stmt_vector_for_cost *cost_vec)
4424 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4426 /* Calculate the number of vector statements to be created for the
4427 scalar stmts in this node. For SLP reductions it is equal to the
4428 number of vector statements in the children (which has already been
4429 calculated by the recursive call). Otherwise it is the number of
4430 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4431 VF divided by the number of elements in a vector. */
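/* As a hypothetical example: a group size of 4, a vectorization factor
   of 4 and a vector type with 8 elements yield 4 * 4 / 8 == 2 vector
   statements for the node.  */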
4432 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
4433 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4435 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4436 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4438 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4439 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4440 break;
4443 else
4445 poly_uint64 vf;
4446 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4447 vf = loop_vinfo->vectorization_factor;
4448 else
4449 vf = 1;
4450 unsigned int group_size = SLP_TREE_LANES (node);
4451 tree vectype = SLP_TREE_VECTYPE (node);
4452 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4453 = vect_get_num_vectors (vf * group_size, vectype);
4456 /* Handle purely internal nodes. */
4457 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4458 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4460 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4462 bool dummy;
4463 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4464 node, node_instance, cost_vec);
4467 /* Try to build NODE from scalars, returning true on success.
4468 NODE_INSTANCE is the SLP instance that contains NODE. */
4470 static bool
4471 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4472 slp_instance node_instance)
4474 stmt_vec_info stmt_info;
4475 unsigned int i;
4477 if (!is_a <bb_vec_info> (vinfo)
4478 || node == SLP_INSTANCE_TREE (node_instance)
4479 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4480 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4481 return false;
4483 if (dump_enabled_p ())
4484 dump_printf_loc (MSG_NOTE, vect_location,
4485 "Building vector operands of %p from scalars instead\n", node);
4487 /* Don't remove and free the child nodes here, since they could be
4488 referenced by other structures. The analysis and scheduling phases
4489 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4490 unsigned int group_size = SLP_TREE_LANES (node);
4491 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4492 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4493 SLP_TREE_LOAD_PERMUTATION (node).release ();
4494 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4496 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4497 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4499 return true;
4502 /* Compute the prologue cost for invariant or constant operands represented
4503 by NODE. */
4505 static void
4506 vect_prologue_cost_for_slp (slp_tree node,
4507 stmt_vector_for_cost *cost_vec)
4509 /* There's a special case of an existing vector, which costs nothing. */
4510 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4511 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4512 return;
4513 /* Without looking at the actual initializer a vector of
4514 constants can be implemented as a load from the constant pool.
4515 When all elements are the same we can use a splat. */
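/* Purely illustrative operand vectors: a constant node { 3, 3, 3, 3 } is
   costed as a single constant-pool vector load, an external node
   { a_1, a_1, a_1, a_1 } as one scalar_to_vec splat and an external node
   { a_1, b_2, c_3, d_4 } as one vec_construct.  */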
4516 tree vectype = SLP_TREE_VECTYPE (node);
4517 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4518 unsigned num_vects_to_check;
4519 unsigned HOST_WIDE_INT const_nunits;
4520 unsigned nelt_limit;
4521 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4522 && ! multiple_p (const_nunits, group_size))
4524 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4525 nelt_limit = const_nunits;
4527 else
4529 /* If either the vector has variable length or the vectors
4530 are composed of repeated whole groups we only need to
4531 cost construction once. All vectors will be the same. */
4532 num_vects_to_check = 1;
4533 nelt_limit = group_size;
4535 tree elt = NULL_TREE;
4536 unsigned nelt = 0;
4537 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
4539 unsigned si = j % group_size;
4540 if (nelt == 0)
4541 elt = SLP_TREE_SCALAR_OPS (node)[si];
4542 /* ??? We're just tracking whether all operands of a single
4543 vector initializer are the same, ideally we'd check if
4544 we emitted the same one already. */
4545 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
4546 elt = NULL_TREE;
4547 nelt++;
4548 if (nelt == nelt_limit)
4550 record_stmt_cost (cost_vec, 1,
4551 SLP_TREE_DEF_TYPE (node) == vect_external_def
4552 ? (elt ? scalar_to_vec : vec_construct)
4553 : vector_load,
4554 NULL, vectype, 0, vect_prologue);
4555 nelt = 0;
4560 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4561 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
4563 Return true if the operations are supported. */
4565 static bool
4566 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4567 slp_instance node_instance,
4568 hash_set<slp_tree> &visited_set,
4569 vec<slp_tree> &visited_vec,
4570 stmt_vector_for_cost *cost_vec)
4572 int i, j;
4573 slp_tree child;
4575 /* Assume we can code-generate all invariants. */
4576 if (!node
4577 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4578 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4579 return true;
4581 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4583 if (dump_enabled_p ())
4584 dump_printf_loc (MSG_NOTE, vect_location,
4585 "Failed cyclic SLP reference in %p\n", node);
4586 return false;
4588 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4590 /* If we already analyzed the exact same set of scalar stmts we're done.
4591 We share the generated vector stmts for those. */
4592 if (visited_set.add (node))
4593 return true;
4594 visited_vec.safe_push (node);
4596 bool res = true;
4597 unsigned visited_rec_start = visited_vec.length ();
4598 unsigned cost_vec_rec_start = cost_vec->length ();
4599 bool seen_non_constant_child = false;
4600 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4602 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4603 visited_set, visited_vec,
4604 cost_vec);
4605 if (!res)
4606 break;
4607 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4608 seen_non_constant_child = true;
4610 /* We're having difficulties scheduling nodes with just constant
4611 operands and no scalar stmts since we then cannot compute a stmt
4612 insertion place. */
4613 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4615 if (dump_enabled_p ())
4616 dump_printf_loc (MSG_NOTE, vect_location,
4617 "Cannot vectorize all-constant op node %p\n", node);
4618 res = false;
4621 if (res)
4622 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4623 cost_vec);
4624 /* If analysis failed we have to pop all recursive visited nodes
4625 plus ourselves. */
4626 if (!res)
4628 while (visited_vec.length () >= visited_rec_start)
4629 visited_set.remove (visited_vec.pop ());
4630 cost_vec->truncate (cost_vec_rec_start);
4633 /* When the node can be vectorized cost invariant nodes it references.
4634 This is not done in DFS order to allow the referring node's
4635 vectorizable_* calls to nail down the invariant node's vector type
4636 and possibly unshare it if it needs a different vector type than
4637 other referrers. */
4638 if (res)
4639 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4640 if (child
4641 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4642 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4643 /* Perform usual caching, note code-generation still
4644 code-gens these nodes multiple times but we expect
4645 to CSE them later. */
4646 && !visited_set.add (child))
4648 visited_vec.safe_push (child);
4649 /* ??? After auditing more code paths make a "default"
4650 and push the vector type from NODE to all children
4651 if it is not already set. */
4652 /* Compute the number of vectors to be generated. */
4653 tree vector_type = SLP_TREE_VECTYPE (child);
4654 if (!vector_type)
4656 /* For shifts with a scalar argument we don't need
4657 to cost or code-generate anything.
4658 ??? Represent this more explicitly. */
4659 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4660 == shift_vec_info_type)
4661 && j == 1);
4662 continue;
4664 unsigned group_size = SLP_TREE_LANES (child);
4665 poly_uint64 vf = 1;
4666 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4667 vf = loop_vinfo->vectorization_factor;
4668 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4669 = vect_get_num_vectors (vf * group_size, vector_type);
4670 /* And cost them. */
4671 vect_prologue_cost_for_slp (child, cost_vec);
4674 /* If this node or any of its children can't be vectorized, try pruning
4675 the tree here rather than felling the whole thing. */
4676 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4678 /* We'll need to revisit this for invariant costing and number
4679 of vectorized stmt setting. */
4680 res = true;
4683 return res;
4686 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4687 region and that can be vectorized using vectorizable_live_operation
4688 with STMT_VINFO_LIVE_P. Unhandled live operations will cause the
4689 scalar code computing them to be retained. */
4691 static void
4692 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4693 slp_instance instance,
4694 stmt_vector_for_cost *cost_vec,
4695 hash_set<stmt_vec_info> &svisited,
4696 hash_set<slp_tree> &visited)
4698 if (visited.add (node))
4699 return;
4701 unsigned i;
4702 stmt_vec_info stmt_info;
4703 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4704 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4706 if (svisited.contains (stmt_info))
4707 continue;
4708 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4709 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4710 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4711 /* Only the pattern root stmt computes the original scalar value. */
4712 continue;
4713 bool mark_visited = true;
4714 gimple *orig_stmt = orig_stmt_info->stmt;
4715 ssa_op_iter op_iter;
4716 def_operand_p def_p;
4717 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4719 imm_use_iterator use_iter;
4720 gimple *use_stmt;
4721 stmt_vec_info use_stmt_info;
4722 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4723 if (!is_gimple_debug (use_stmt))
4725 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4726 if (!use_stmt_info
4727 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4729 STMT_VINFO_LIVE_P (stmt_info) = true;
4730 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4731 NULL, node, instance, i,
4732 false, cost_vec))
4733 /* ??? So we know we can vectorize the live stmt
4734 from one SLP node. If we cannot do so from all
4735 or none consistently we'd have to record which
4736 SLP node (and lane) we want to use for the live
4737 operation. So make sure we can code-generate
4738 from all nodes. */
4739 mark_visited = false;
4740 else
4741 STMT_VINFO_LIVE_P (stmt_info) = false;
4742 break;
4745 /* We have to verify whether we can insert the lane extract
4746 before all uses. The following is a conservative approximation.
4747 We cannot put this into vectorizable_live_operation because
4748 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4749 doesn't work.
4750 Note that while the fact that we emit code for loads at the
4751 first load should make this a non-problem, leaves we construct
4752 from scalars are vectorized after the last scalar def.
4753 ??? If we'd actually compute the insert location during
4754 analysis we could use something less conservative than the last
4755 scalar stmt in the node for the dominance check. */
4756 /* ??? What remains is "live" uses in vector CTORs in the same
4757 SLP graph which is where those uses can end up code-generated
4758 right after their definition instead of close to their original
4759 use. But that would restrict us to code-generate lane-extracts
4760 from the latest stmt in a node. So we compensate for this
4761 during code-generation, simply not replacing uses for those
4762 hopefully rare cases. */
4763 if (STMT_VINFO_LIVE_P (stmt_info))
4764 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4765 if (!is_gimple_debug (use_stmt)
4766 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4767 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4768 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4770 if (dump_enabled_p ())
4771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4772 "Cannot determine insertion place for "
4773 "lane extract\n");
4774 STMT_VINFO_LIVE_P (stmt_info) = false;
4775 mark_visited = true;
4778 if (mark_visited)
4779 svisited.add (stmt_info);
4782 slp_tree child;
4783 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4784 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4785 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4786 cost_vec, svisited, visited);
4789 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4791 static bool
4792 vectorizable_bb_reduc_epilogue (slp_instance instance,
4793 stmt_vector_for_cost *cost_vec)
4795 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4796 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4797 if (reduc_code == MINUS_EXPR)
4798 reduc_code = PLUS_EXPR;
4799 internal_fn reduc_fn;
4800 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4801 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4802 || reduc_fn == IFN_LAST
4803 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4804 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4805 TREE_TYPE (vectype)))
4806 return false;
4808 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4809 cost log2 vector operations plus shuffles and one extraction. */
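/* E.g. for a hypothetical eight-element vector type vect_nunits_for_cost
   returns 8, so STEPS == floor_log2 (8) == 3 and we record three
   vector_stmt costs, three vec_perm costs and one vec_to_scalar cost.  */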
4810 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4811 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4812 vectype, 0, vect_body);
4813 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4814 vectype, 0, vect_body);
4815 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4816 vectype, 0, vect_body);
4817 return true;
4820 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4821 and recurse to children. */
4823 static void
4824 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4825 hash_set<slp_tree> &visited)
4827 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4828 || visited.add (node))
4829 return;
4831 stmt_vec_info stmt;
4832 unsigned i;
4833 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4834 roots.remove (vect_orig_stmt (stmt));
4836 slp_tree child;
4837 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4838 if (child)
4839 vect_slp_prune_covered_roots (child, roots, visited);
4842 /* Analyze statements in SLP instances of VINFO. Return true if the
4843 operations are supported. */
4845 bool
4846 vect_slp_analyze_operations (vec_info *vinfo)
4848 slp_instance instance;
4849 int i;
4851 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4853 hash_set<slp_tree> visited;
4854 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4856 auto_vec<slp_tree> visited_vec;
4857 stmt_vector_for_cost cost_vec;
4858 cost_vec.create (2);
4859 if (is_a <bb_vec_info> (vinfo))
4860 vect_location = instance->location ();
4861 if (!vect_slp_analyze_node_operations (vinfo,
4862 SLP_INSTANCE_TREE (instance),
4863 instance, visited, visited_vec,
4864 &cost_vec)
4865 /* CTOR instances require vectorized defs for the SLP tree root. */
4866 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4867 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4868 != vect_internal_def))
4869 /* Check we can vectorize the reduction. */
4870 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4871 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4873 slp_tree node = SLP_INSTANCE_TREE (instance);
4874 stmt_vec_info stmt_info;
4875 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4876 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4877 else
4878 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4879 if (dump_enabled_p ())
4880 dump_printf_loc (MSG_NOTE, vect_location,
4881 "removing SLP instance operations starting from: %G",
4882 stmt_info->stmt);
4883 vect_free_slp_instance (instance);
4884 vinfo->slp_instances.ordered_remove (i);
4885 cost_vec.release ();
4886 while (!visited_vec.is_empty ())
4887 visited.remove (visited_vec.pop ());
4889 else
4891 i++;
4893 /* For BB vectorization remember the SLP graph entry
4894 cost for later. */
4895 if (is_a <bb_vec_info> (vinfo))
4896 instance->cost_vec = cost_vec;
4897 else
4899 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4900 cost_vec.release ();
4905 /* Now look for SLP instances with a root that are covered by other
4906 instances and remove them. */
4907 hash_set<stmt_vec_info> roots;
4908 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4909 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4910 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
4911 if (!roots.is_empty ())
4913 visited.empty ();
4914 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4915 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
4916 visited);
4917 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4918 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4919 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
4921 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4922 if (dump_enabled_p ())
4923 dump_printf_loc (MSG_NOTE, vect_location,
4924 "removing SLP instance operations starting "
4925 "from: %G", root->stmt);
4926 vect_free_slp_instance (instance);
4927 vinfo->slp_instances.ordered_remove (i);
4929 else
4930 ++i;
4933 /* Compute vectorizable live stmts. */
4934 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4936 hash_set<stmt_vec_info> svisited;
4937 hash_set<slp_tree> visited;
4938 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4940 vect_location = instance->location ();
4941 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4942 instance, &instance->cost_vec, svisited,
4943 visited);
4947 return !vinfo->slp_instances.is_empty ();
4950 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
4951 following and compressing the leader chain along the way. */
4953 static slp_instance
4954 get_ultimate_leader (slp_instance instance,
4955 hash_map<slp_instance, slp_instance> &instance_leader)
4957 auto_vec<slp_instance *, 8> chain;
4958 slp_instance *tem;
4959 while (*(tem = instance_leader.get (instance)) != instance)
4961 chain.safe_push (tem);
4962 instance = *tem;
4964 while (!chain.is_empty ())
4965 *chain.pop () = instance;
4966 return instance;
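/* As an illustration (hypothetical instances A, B and C): if the map
   contains A -> B, B -> C and C -> C, calling this on A returns C and
   rewrites the map to A -> C and B -> C, keeping later lookups cheap.  */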
4969 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4971 static void
4972 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4973 slp_instance instance, slp_tree node,
4974 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4975 hash_map<slp_instance, slp_instance> &instance_leader,
4976 hash_set<slp_tree> &visited)
4978 stmt_vec_info stmt_info;
4979 unsigned i;
4981 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4983 bool existed_p;
4984 slp_instance &stmt_instance
4985 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4986 if (!existed_p)
4988 else if (stmt_instance != instance)
4990 /* If we run into a previously marked stmt make the current instance
4991 the leader of that stmt's ultimate leader. This keeps the
4992 leader chain acyclic and works even when the current instance
4993 connects two previously independent graph parts. */
4994 slp_instance stmt_leader
4995 = get_ultimate_leader (stmt_instance, instance_leader);
4996 if (stmt_leader != instance)
4997 instance_leader.put (stmt_leader, instance);
4999 stmt_instance = instance;
5002 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5003 return;
5005 slp_tree child;
5006 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5007 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5008 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5009 instance_leader, visited);
5012 /* Partition the SLP graph into pieces that can be costed independently. */
5014 static void
5015 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5017 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5019 /* First walk the SLP graph assigning each involved scalar stmt a
5020 corresponding SLP graph entry and upon visiting a previously
5021 marked stmt, make the stmt's leader the current SLP graph entry. */
5022 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5023 hash_map<slp_instance, slp_instance> instance_leader;
5024 hash_set<slp_tree> visited;
5025 slp_instance instance;
5026 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5028 instance_leader.put (instance, instance);
5029 vect_bb_partition_graph_r (bb_vinfo,
5030 instance, SLP_INSTANCE_TREE (instance),
5031 stmt_to_instance, instance_leader,
5032 visited);
5035 /* Then collect entries to each independent subgraph. */
5036 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5038 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5039 leader->subgraph_entries.safe_push (instance);
5040 if (dump_enabled_p ()
5041 && leader != instance)
5042 dump_printf_loc (MSG_NOTE, vect_location,
5043 "instance %p is leader of %p\n",
5044 leader, instance);
5048 /* Compute the set of scalar stmts participating in internal and external
5049 nodes. */
5051 static void
5052 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5053 hash_set<slp_tree> &visited,
5054 hash_set<stmt_vec_info> &vstmts,
5055 hash_set<stmt_vec_info> &estmts)
5057 int i;
5058 stmt_vec_info stmt_info;
5059 slp_tree child;
5061 if (visited.add (node))
5062 return;
5064 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5066 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5067 vstmts.add (stmt_info);
5069 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5070 if (child)
5071 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5072 vstmts, estmts);
5074 else
5075 for (tree def : SLP_TREE_SCALAR_OPS (node))
5077 stmt_vec_info def_stmt = vinfo->lookup_def (def);
5078 if (def_stmt)
5079 estmts.add (def_stmt);
5084 /* Compute the scalar cost of the SLP node NODE and its children
5085 and record it in COST_VEC. Do not account defs that are marked
5086 in LIFE and update LIFE according to uses of NODE. */
5088 static void
5089 vect_bb_slp_scalar_cost (vec_info *vinfo,
5090 slp_tree node, vec<bool, va_heap> *life,
5091 stmt_vector_for_cost *cost_vec,
5092 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5093 hash_set<slp_tree> &visited)
5095 unsigned i;
5096 stmt_vec_info stmt_info;
5097 slp_tree child;
5099 if (visited.add (node))
5100 return;
5102 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5104 ssa_op_iter op_iter;
5105 def_operand_p def_p;
5107 if ((*life)[i])
5108 continue;
5110 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5111 gimple *orig_stmt = orig_stmt_info->stmt;
5113 /* If there is a non-vectorized use of the defs then the scalar
5114 stmt is kept live in which case we do not account it or any
5115 required defs in the SLP children in the scalar cost. This
5116 way we make the vectorization more costly when compared to
5117 the scalar cost. */
5118 if (!STMT_VINFO_LIVE_P (stmt_info))
5120 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
5122 imm_use_iterator use_iter;
5123 gimple *use_stmt;
5124 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
5125 if (!is_gimple_debug (use_stmt))
5127 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
5128 if (!use_stmt_info
5129 || !vectorized_scalar_stmts.contains (use_stmt_info))
5131 (*life)[i] = true;
5132 break;
5136 if ((*life)[i])
5137 continue;
5140 /* Count scalar stmts only once. */
5141 if (gimple_visited_p (orig_stmt))
5142 continue;
5143 gimple_set_visited (orig_stmt, true);
5145 vect_cost_for_stmt kind;
5146 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5148 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5149 kind = scalar_load;
5150 else
5151 kind = scalar_store;
5153 else if (vect_nop_conversion_p (orig_stmt_info))
5154 continue;
5155 /* For single-argument PHIs assume coalescing which means zero cost
5156 for the scalar and the vector PHIs. This avoids artificially
5157 favoring the vector path (but may pessimize it in some cases). */
5158 else if (is_a <gphi *> (orig_stmt_info->stmt)
5159 && gimple_phi_num_args
5160 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5161 continue;
5162 else
5163 kind = scalar_stmt;
5164 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5165 SLP_TREE_VECTYPE (node), 0, vect_body);
5168 auto_vec<bool, 20> subtree_life;
5169 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5171 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5173 /* Do not directly pass LIFE to the recursive call, copy it to
5174 confine changes in the callee to the current child/subtree. */
5175 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5177 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5178 for (unsigned j = 0;
5179 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5181 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5182 if (perm.first == i)
5183 subtree_life[perm.second] = (*life)[j];
5186 else
5188 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5189 subtree_life.safe_splice (*life);
5191 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5192 vectorized_scalar_stmts, visited);
5193 subtree_life.truncate (0);
5198 /* Comparator for the loop-index sorted cost vectors. */
5200 static int
5201 li_cost_vec_cmp (const void *a_, const void *b_)
5203 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5204 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5205 if (a->first < b->first)
5206 return -1;
5207 else if (a->first == b->first)
5208 return 0;
5209 return 1;
5212 /* Check if vectorization of the basic block is profitable for the
5213 subgraph denoted by SLP_INSTANCES. */
5215 static bool
5216 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5217 vec<slp_instance> slp_instances,
5218 loop_p orig_loop)
5220 slp_instance instance;
5221 int i;
5222 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5223 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5225 if (dump_enabled_p ())
5227 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5228 hash_set<slp_tree> visited;
5229 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5230 vect_print_slp_graph (MSG_NOTE, vect_location,
5231 SLP_INSTANCE_TREE (instance), visited);
5234 /* Compute the set of scalar stmts we know will go away 'locally' when
5235 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5236 not accurate for nodes promoted extern late or for scalar stmts that
5237 are used both in extern defs and in vectorized defs. */
5238 hash_set<stmt_vec_info> vectorized_scalar_stmts;
5239 hash_set<stmt_vec_info> scalar_stmts_in_externs;
5240 hash_set<slp_tree> visited;
5241 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5243 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5244 SLP_INSTANCE_TREE (instance),
5245 visited,
5246 vectorized_scalar_stmts,
5247 scalar_stmts_in_externs);
5248 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5249 vectorized_scalar_stmts.add (rstmt);
5251 /* Scalar stmts used as defs in external nodes need to be preserved, so
5252 remove them from vectorized_scalar_stmts. */
5253 for (stmt_vec_info stmt : scalar_stmts_in_externs)
5254 vectorized_scalar_stmts.remove (stmt);
5256 /* Calculate scalar cost and sum the cost for the vector stmts
5257 previously collected. */
5258 stmt_vector_for_cost scalar_costs = vNULL;
5259 stmt_vector_for_cost vector_costs = vNULL;
5260 visited.empty ();
5261 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5263 auto_vec<bool, 20> life;
5264 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5265 true);
5266 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5267 record_stmt_cost (&scalar_costs,
5268 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5269 scalar_stmt,
5270 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5271 vect_bb_slp_scalar_cost (bb_vinfo,
5272 SLP_INSTANCE_TREE (instance),
5273 &life, &scalar_costs, vectorized_scalar_stmts,
5274 visited);
5275 vector_costs.safe_splice (instance->cost_vec);
5276 instance->cost_vec.release ();
5279 if (dump_enabled_p ())
5280 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5282 /* When costing non-loop vectorization we need to consider each covered
5283 loop independently and make sure vectorization is profitable. For
5284 now we assume a loop may not be entered or may be executed an arbitrary
5285 number of iterations (??? static information can provide more
5286 precise info here) which means we can simply cost each containing
5287 loop's stmts separately. */
5289 /* First produce cost vectors sorted by loop index. */
5290 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5291 li_scalar_costs (scalar_costs.length ());
5292 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5293 li_vector_costs (vector_costs.length ());
5294 stmt_info_for_cost *cost;
5295 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5297 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5298 li_scalar_costs.quick_push (std::make_pair (l, cost));
5300 /* Use an arbitrary loop from the scalar costs as fallback in case the
5301 first vector_costs entry does not have a stmt_info associated with it. */
5302 unsigned l = li_scalar_costs[0].first;
5303 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5305 /* We inherit from the previous COST; invariants, externals and
5306 extracts immediately follow the cost for the related stmt. */
5307 if (cost->stmt_info)
5308 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5309 li_vector_costs.quick_push (std::make_pair (l, cost));
5311 li_scalar_costs.qsort (li_cost_vec_cmp);
5312 li_vector_costs.qsort (li_cost_vec_cmp);
5314 /* Now cost the portions individually. */
5315 unsigned vi = 0;
5316 unsigned si = 0;
5317 bool profitable = true;
5318 while (si < li_scalar_costs.length ()
5319 && vi < li_vector_costs.length ())
5321 unsigned sl = li_scalar_costs[si].first;
5322 unsigned vl = li_vector_costs[vi].first;
5323 if (sl != vl)
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_NOTE, vect_location,
5327 "Scalar %d and vector %d loop part do not "
5328 "match up, skipping scalar part\n", sl, vl);
5329 /* Skip the scalar part, assuming zero cost on the vector side. */
5332 si++;
5334 while (si < li_scalar_costs.length ()
5335 && li_scalar_costs[si].first == sl);
5336 continue;
5339 void *scalar_target_cost_data = init_cost (NULL, true);
5342 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
5343 li_scalar_costs[si].second);
5344 si++;
5346 while (si < li_scalar_costs.length ()
5347 && li_scalar_costs[si].first == sl);
5348 unsigned dummy;
5349 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
5350 destroy_cost_data (scalar_target_cost_data);
5352 /* Complete the target-specific vector cost calculation. */
5353 void *vect_target_cost_data = init_cost (NULL, false);
5356 add_stmt_cost (bb_vinfo, vect_target_cost_data,
5357 li_vector_costs[vi].second);
5358 vi++;
5360 while (vi < li_vector_costs.length ()
5361 && li_vector_costs[vi].first == vl);
5362 finish_cost (vect_target_cost_data, &vec_prologue_cost,
5363 &vec_inside_cost, &vec_epilogue_cost);
5364 destroy_cost_data (vect_target_cost_data);
5366 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5368 if (dump_enabled_p ())
5370 dump_printf_loc (MSG_NOTE, vect_location,
5371 "Cost model analysis for part in loop %d:\n", sl);
5372 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5373 vec_inside_cost + vec_outside_cost);
5374 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5377 /* Vectorization is profitable if its cost is no more than the cost of the
5378 scalar version. Note that we err on the vector side for equal cost because
5379 the cost estimate is otherwise quite pessimistic (constant uses are
5380 free on the scalar side but cost a load on the vector side for
5381 example). */
5382 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5384 profitable = false;
5385 break;
5388 if (profitable && vi < li_vector_costs.length ())
5390 if (dump_enabled_p ())
5391 dump_printf_loc (MSG_NOTE, vect_location,
5392 "Excess vector cost for part in loop %d:\n",
5393 li_vector_costs[vi].first);
5394 profitable = false;
5397 /* Unset visited flag. This is delayed when the subgraph is profitable
5398 and we process the loop for remaining unvectorized if-converted code. */
5399 if (!orig_loop || !profitable)
5400 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5401 gimple_set_visited (cost->stmt_info->stmt, false);
5403 scalar_costs.release ();
5404 vector_costs.release ();
5406 return profitable;
5409 /* qsort comparator for lane defs. */
5411 static int
5412 vld_cmp (const void *a_, const void *b_)
5414 auto *a = (const std::pair<unsigned, tree> *)a_;
5415 auto *b = (const std::pair<unsigned, tree> *)b_;
5416 return a->first - b->first;
5419 /* Return true if USE_STMT is a vector lane insert into VEC and set
5420 *THIS_LANE to the lane number that is set. */
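/* For illustration (a sketch assuming 32-bit vector elements, so lane 1
   starts at bit position 32), the kind of stmt this matches is

     v1_1 = BIT_INSERT_EXPR <v0_2, s_3, 32>;

   with VEC == v0_2 (or VEC == NULL_TREE) it returns true and sets
   *THIS_LANE to 1.  */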
5422 static bool
5423 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5425 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5426 if (!use_ass
5427 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5428 || (vec
5429 ? gimple_assign_rhs1 (use_ass) != vec
5430 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5431 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5432 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5433 || !constant_multiple_p
5434 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5435 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5436 this_lane))
5437 return false;
5438 return true;
5441 /* Find any vectorizable constructors and add them to the grouped_store
5442 array. */
5444 static void
5445 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5447 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5448 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5449 !gsi_end_p (gsi); gsi_next (&gsi))
5451 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5452 if (!assign)
5453 continue;
5455 tree rhs = gimple_assign_rhs1 (assign);
5456 enum tree_code code = gimple_assign_rhs_code (assign);
5457 use_operand_p use_p;
5458 gimple *use_stmt;
5459 if (code == CONSTRUCTOR)
5461 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5462 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5463 CONSTRUCTOR_NELTS (rhs))
5464 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5465 || uniform_vector_p (rhs))
5466 continue;
5468 unsigned j;
5469 tree val;
5470 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5471 if (TREE_CODE (val) != SSA_NAME
5472 || !bb_vinfo->lookup_def (val))
5473 break;
5474 if (j != CONSTRUCTOR_NELTS (rhs))
5475 continue;
5477 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5478 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5480 else if (code == BIT_INSERT_EXPR
5481 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5482 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5483 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5484 && integer_zerop (gimple_assign_rhs3 (assign))
5485 && useless_type_conversion_p
5486 (TREE_TYPE (TREE_TYPE (rhs)),
5487 TREE_TYPE (gimple_assign_rhs2 (assign)))
5488 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5490 /* We start to match on insert to lane zero but since the
5491 inserts need not be ordered we'd have to search both
5492 the def and the use chains. */
5493 tree vectype = TREE_TYPE (rhs);
5494 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5495 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5496 auto_sbitmap lanes (nlanes);
5497 bitmap_clear (lanes);
5498 bitmap_set_bit (lanes, 0);
5499 tree def = gimple_assign_lhs (assign);
5500 lane_defs.quick_push
5501 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5502 unsigned lanes_found = 1;
5503 /* Start with the use chains, the last stmt will be the root. */
5504 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5505 vec<stmt_vec_info> roots = vNULL;
5506 roots.safe_push (last);
5509 use_operand_p use_p;
5510 gimple *use_stmt;
5511 if (!single_imm_use (def, &use_p, &use_stmt))
5512 break;
5513 unsigned this_lane;
5514 if (!bb_vinfo->lookup_stmt (use_stmt)
5515 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5516 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5517 break;
5518 if (bitmap_bit_p (lanes, this_lane))
5519 break;
5520 lanes_found++;
5521 bitmap_set_bit (lanes, this_lane);
5522 gassign *use_ass = as_a <gassign *> (use_stmt);
5523 lane_defs.quick_push (std::make_pair
5524 (this_lane, gimple_assign_rhs2 (use_ass)));
5525 last = bb_vinfo->lookup_stmt (use_ass);
5526 roots.safe_push (last);
5527 def = gimple_assign_lhs (use_ass);
5529 while (lanes_found < nlanes);
5530 if (roots.length () > 1)
5531 std::swap(roots[0], roots[roots.length () - 1]);
5532 if (lanes_found < nlanes)
5534 /* Now search the def chain. */
5535 def = gimple_assign_rhs1 (assign);
5538 if (TREE_CODE (def) != SSA_NAME
5539 || !has_single_use (def))
5540 break;
5541 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5542 unsigned this_lane;
5543 if (!bb_vinfo->lookup_stmt (def_stmt)
5544 || !vect_slp_is_lane_insert (def_stmt,
5545 NULL_TREE, &this_lane)
5546 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5547 break;
5548 if (bitmap_bit_p (lanes, this_lane))
5549 break;
5550 lanes_found++;
5551 bitmap_set_bit (lanes, this_lane);
5552 lane_defs.quick_push (std::make_pair
5553 (this_lane,
5554 gimple_assign_rhs2 (def_stmt)));
5555 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5556 def = gimple_assign_rhs1 (def_stmt);
5558 while (lanes_found < nlanes);
5560 if (lanes_found == nlanes)
5562 /* Sort lane_defs after the lane index and register the root. */
5563 lane_defs.qsort (vld_cmp);
5564 vec<stmt_vec_info> stmts;
5565 stmts.create (nlanes);
5566 for (unsigned i = 0; i < nlanes; ++i)
5567 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5568 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5569 stmts, roots));
5571 else
5572 roots.release ();
5574 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5575 && (associative_tree_code (code) || code == MINUS_EXPR)
5576 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5577 checks pessimize a two-element reduction. PR54400.
5578 ??? In-order reduction could be handled if we only
5579 traverse one operand chain in vect_slp_linearize_chain. */
5580 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5581 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5582 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5583 /* Ops with constants at the tail can be stripped here. */
5584 && TREE_CODE (rhs) == SSA_NAME
5585 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5586 /* Should be the chain end. */
5587 && (!single_imm_use (gimple_assign_lhs (assign),
5588 &use_p, &use_stmt)
5589 || !is_gimple_assign (use_stmt)
5590 || (gimple_assign_rhs_code (use_stmt) != code
5591 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5592 || (gimple_assign_rhs_code (use_stmt)
5593 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5595 /* We start the match at the end of a possible association
5596 chain. */
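/* For illustration (a sketch, assuming a wrapping integral type so the
   checks above accept it):

     t0_1 = a_2 + b_3;
     t1_4 = t0_1 + c_5;
     r_6  = t1_4 + d_7;   <-- ASSIGN, matched as the chain end

   vect_slp_linearize_chain collects the leaves a_2, b_3, c_5 and d_7
   whose definitions then become the lanes of a bb-reduction SLP
   instance.  */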
5597 auto_vec<chain_op_t> chain;
5598 auto_vec<std::pair<tree_code, gimple *> > worklist;
5599 auto_vec<gimple *> chain_stmts;
5600 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5601 if (code == MINUS_EXPR)
5602 code = PLUS_EXPR;
5603 internal_fn reduc_fn;
5604 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5605 || reduc_fn == IFN_LAST)
5606 continue;
5607 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5608 /* ??? */
5609 code_stmt, alt_code_stmt, &chain_stmts);
5610 if (chain.length () > 1)
5612 /* Sort the chain according to def_type and operation. */
5613 chain.sort (dt_sort_cmp, bb_vinfo);
5614 /* ??? Now we'd want to strip externals and constants
5615 but record those to be handled in the epilogue. */
5616 /* ??? For now do not allow mixing ops or externs/constants. */
5617 bool invalid = false;
5618 for (unsigned i = 0; i < chain.length (); ++i)
5619 if (chain[i].dt != vect_internal_def
5620 || chain[i].code != code)
5621 invalid = true;
5622 if (!invalid)
5624 vec<stmt_vec_info> stmts;
5625 stmts.create (chain.length ());
5626 for (unsigned i = 0; i < chain.length (); ++i)
5627 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5628 vec<stmt_vec_info> roots;
5629 roots.create (chain_stmts.length ());
5630 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5631 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5632 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5633 stmts, roots));
5640 /* Walk the grouped store chains and replace entries with their
5641 pattern variant if any. */
5643 static void
5644 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5646 stmt_vec_info first_element;
5647 unsigned i;
5649 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5651 /* We also have CTORs in this array. */
5652 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5653 continue;
5654 if (STMT_VINFO_IN_PATTERN_P (first_element))
5656 stmt_vec_info orig = first_element;
5657 first_element = STMT_VINFO_RELATED_STMT (first_element);
5658 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5659 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5660 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5661 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5662 vinfo->grouped_stores[i] = first_element;
5664 stmt_vec_info prev = first_element;
5665 while (DR_GROUP_NEXT_ELEMENT (prev))
5667 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5668 if (STMT_VINFO_IN_PATTERN_P (elt))
5670 stmt_vec_info orig = elt;
5671 elt = STMT_VINFO_RELATED_STMT (elt);
5672 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5673 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5674 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5676 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5677 prev = elt;
5682 /* Check if the region described by BB_VINFO can be vectorized, returning
5683 true if so. When returning false, set FATAL to true if the same failure
5684 would prevent vectorization at other vector sizes, false if it is still
5685 worth trying other sizes. N_STMTS is the number of statements in the
5686 region. */
5688 static bool
5689 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5690 vec<int> *dataref_groups)
5692 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5694 slp_instance instance;
5695 int i;
5696 poly_uint64 min_vf = 2;
5698 /* The first group of checks is independent of the vector size. */
5699 fatal = true;
5701 /* Analyze the data references. */
5703 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707 "not vectorized: unhandled data-ref in basic "
5708 "block.\n");
5709 return false;
5712 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5714 if (dump_enabled_p ())
5715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5716 "not vectorized: unhandled data access in "
5717 "basic block.\n");
5718 return false;
5721 vect_slp_check_for_constructors (bb_vinfo);
5723 /* If there are no grouped stores and no constructors in the region
5724 there is no need to continue with pattern recog as vect_analyze_slp
5725 will fail anyway. */
5726 if (bb_vinfo->grouped_stores.is_empty ()
5727 && bb_vinfo->roots.is_empty ())
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5731 "not vectorized: no grouped stores in "
5732 "basic block.\n");
5733 return false;
5736 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
5737 fatal = false;
5739 vect_pattern_recog (bb_vinfo);
5741 /* Update store groups from pattern processing. */
5742 vect_fixup_store_groups_with_patterns (bb_vinfo);
5744 /* Check the SLP opportunities in the basic block, analyze and build SLP
5745 trees. */
5746 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5748 if (dump_enabled_p ())
5750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5751 "Failed to SLP the basic block.\n");
5752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753 "not vectorized: failed to find SLP opportunities "
5754 "in basic block.\n");
5756 return false;
5759 /* Optimize permutations. */
5760 vect_optimize_slp (bb_vinfo);
5762 /* Gather the loads reachable from the SLP graph entries. */
5763 vect_gather_slp_loads (bb_vinfo);
5765 vect_record_base_alignments (bb_vinfo);
5767 /* Analyze and verify the alignment of data references and the
5768 dependence in the SLP instances. */
5769 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5771 vect_location = instance->location ();
5772 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5773 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5775 slp_tree node = SLP_INSTANCE_TREE (instance);
5776 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5777 if (dump_enabled_p ())
5778 dump_printf_loc (MSG_NOTE, vect_location,
5779 "removing SLP instance operations starting from: %G",
5780 stmt_info->stmt);
5781 vect_free_slp_instance (instance);
5782 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5783 continue;
5786 /* Mark all the statements that we want to vectorize as pure SLP and
5787 relevant. */
5788 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5789 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5790 unsigned j;
5791 stmt_vec_info root;
5792 /* Likewise consider instance root stmts as vectorized. */
5793 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5794 STMT_SLP_TYPE (root) = pure_slp;
5796 i++;
5798 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5799 return false;
5801 if (!vect_slp_analyze_operations (bb_vinfo))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "not vectorized: bad operation in basic block.\n");
5806 return false;
5809 vect_bb_partition_graph (bb_vinfo);
5811 return true;
5814 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5815 basic blocks in BBS, returning true on success.
5816 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5818 static bool
5819 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5820 vec<int> *dataref_groups, unsigned int n_stmts,
5821 loop_p orig_loop)
5823 bb_vec_info bb_vinfo;
5824 auto_vector_modes vector_modes;
5826 /* Autodetect first vector size we try. */
5827 machine_mode next_vector_mode = VOIDmode;
5828 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5829 unsigned int mode_i = 0;
5831 vec_info_shared shared;
5833 machine_mode autodetected_vector_mode = VOIDmode;
5834 while (1)
5836 bool vectorized = false;
5837 bool fatal = false;
5838 bb_vinfo = new _bb_vec_info (bbs, &shared);
5840 bool first_time_p = shared.datarefs.is_empty ();
5841 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5842 if (first_time_p)
5843 bb_vinfo->shared->save_datarefs ();
5844 else
5845 bb_vinfo->shared->check_datarefs ();
5846 bb_vinfo->vector_mode = next_vector_mode;
5848 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5850 if (dump_enabled_p ())
5852 dump_printf_loc (MSG_NOTE, vect_location,
5853 "***** Analysis succeeded with vector mode"
5854 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5855 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5858 bb_vinfo->shared->check_datarefs ();
5860 auto_vec<slp_instance> profitable_subgraphs;
5861 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
5863 if (instance->subgraph_entries.is_empty ())
5864 continue;
5866 vect_location = instance->location ();
5867 if (!unlimited_cost_model (NULL)
5868 && !vect_bb_vectorization_profitable_p
5869 (bb_vinfo, instance->subgraph_entries, orig_loop))
5871 if (dump_enabled_p ())
5872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5873 "not vectorized: vectorization is not "
5874 "profitable.\n");
5875 continue;
5878 if (!dbg_cnt (vect_slp))
5879 continue;
5881 profitable_subgraphs.safe_push (instance);
5884 /* When we're vectorizing an if-converted loop body with the
5885 very-cheap cost model, make sure we vectorized all if-converted
5886 code. */
5887 if (!profitable_subgraphs.is_empty ()
5888 && orig_loop)
5890 gcc_assert (bb_vinfo->bbs.length () == 1);
5891 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
5892 !gsi_end_p (gsi); gsi_next (&gsi))
5894 /* The costing above left us with DCEable vectorized scalar
5895 stmts having the visited flag set on profitable
5896 subgraphs. Do the delayed clearing of the flag here. */
5897 if (gimple_visited_p (gsi_stmt (gsi)))
5899 gimple_set_visited (gsi_stmt (gsi), false);
5900 continue;
5902 if (flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
5903 continue;
5905 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
5906 if (gimple_assign_rhs_code (ass) == COND_EXPR)
5908 if (!profitable_subgraphs.is_empty ()
5909 && dump_enabled_p ())
5910 dump_printf_loc (MSG_NOTE, vect_location,
5911 "not profitable because of "
5912 "unprofitable if-converted scalar "
5913 "code\n");
5914 profitable_subgraphs.truncate (0);
5919 /* Finally schedule the profitable subgraphs. */
5920 for (slp_instance instance : profitable_subgraphs)
5922 if (!vectorized && dump_enabled_p ())
5923 dump_printf_loc (MSG_NOTE, vect_location,
5924 "Basic block will be vectorized "
5925 "using SLP\n");
5926 vectorized = true;
5928 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5930 unsigned HOST_WIDE_INT bytes;
5931 if (dump_enabled_p ())
5933 if (GET_MODE_SIZE
5934 (bb_vinfo->vector_mode).is_constant (&bytes))
5935 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5936 "basic block part vectorized using %wu "
5937 "byte vectors\n", bytes);
5938 else
5939 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5940 "basic block part vectorized using "
5941 "variable length vectors\n");
5945 else
5947 if (dump_enabled_p ())
5948 dump_printf_loc (MSG_NOTE, vect_location,
5949 "***** Analysis failed with vector mode %s\n",
5950 GET_MODE_NAME (bb_vinfo->vector_mode));
5953 if (mode_i == 0)
5954 autodetected_vector_mode = bb_vinfo->vector_mode;
5956 if (!fatal)
5957 while (mode_i < vector_modes.length ()
5958 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5960 if (dump_enabled_p ())
5961 dump_printf_loc (MSG_NOTE, vect_location,
5962 "***** The result for vector mode %s would"
5963 " be the same\n",
5964 GET_MODE_NAME (vector_modes[mode_i]));
5965 mode_i += 1;
5968 delete bb_vinfo;
5970 if (mode_i < vector_modes.length ()
5971 && VECTOR_MODE_P (autodetected_vector_mode)
5972 && (related_vector_mode (vector_modes[mode_i],
5973 GET_MODE_INNER (autodetected_vector_mode))
5974 == autodetected_vector_mode)
5975 && (related_vector_mode (autodetected_vector_mode,
5976 GET_MODE_INNER (vector_modes[mode_i]))
5977 == vector_modes[mode_i]))
5979 if (dump_enabled_p ())
5980 dump_printf_loc (MSG_NOTE, vect_location,
5981 "***** Skipping vector mode %s, which would"
5982 " repeat the analysis for %s\n",
5983 GET_MODE_NAME (vector_modes[mode_i]),
5984 GET_MODE_NAME (autodetected_vector_mode));
5985 mode_i += 1;
5988 if (vectorized
5989 || mode_i == vector_modes.length ()
5990 || autodetected_vector_mode == VOIDmode
5991 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5992 vector sizes will fail do not bother iterating. */
5993 || fatal)
5994 return vectorized;
5996 /* Try the next vector mode. */
5997 next_vector_mode = vector_modes[mode_i++];
5998 if (dump_enabled_p ())
5999 dump_printf_loc (MSG_NOTE, vect_location,
6000 "***** Re-trying analysis with vector mode %s\n",
6001 GET_MODE_NAME (next_vector_mode));
6006 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
6007 true if anything in the basic-block was vectorized. */
6009 static bool
6010 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6012 vec<data_reference_p> datarefs = vNULL;
6013 auto_vec<int> dataref_groups;
6014 int insns = 0;
6015 int current_group = 0;
6017 for (unsigned i = 0; i < bbs.length (); i++)
6019 basic_block bb = bbs[i];
6020 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6021 gsi_next (&gsi))
6023 gimple *stmt = gsi_stmt (gsi);
6024 if (is_gimple_debug (stmt))
6025 continue;
6027 insns++;
6029 if (gimple_location (stmt) != UNKNOWN_LOCATION)
6030 vect_location = stmt;
6032 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6033 &dataref_groups, current_group))
6034 ++current_group;
6036 /* New BBs always start a new DR group. */
6037 ++current_group;
6040 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6043 /* Special entry for the BB vectorizer. Analyze and transform a single
6044 if-converted BB with ORIG_LOOPs body being the not if-converted
6045 representation. Returns true if anything in the basic-block was
6046 vectorized. */
6048 bool
6049 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6051 auto_vec<basic_block> bbs;
6052 bbs.safe_push (bb);
6053 return vect_slp_bbs (bbs, orig_loop);
6056 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
6057 of FUN, returning true if anything was vectorized. */
6059 bool
6060 vect_slp_function (function *fun)
6062 bool r = false;
6063 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6064 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6066 /* For the moment split the function into pieces to avoid making
6067 the iteration on the vector mode moot. Split at points we know
6068 we do not handle well, i.e. CFG merges (SLP discovery doesn't
6069 handle non-loop-header PHIs) and loop exits. Since pattern
6070 recog requires reverse iteration to visit uses before defs,
6071 simply chop the RPO into pieces. */
6072 auto_vec<basic_block> bbs;
6073 for (unsigned i = 0; i < n; i++)
6075 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6076 bool split = false;
6078 /* Split when a BB is not dominated by the first block. */
6079 if (!bbs.is_empty ()
6080 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6082 if (dump_enabled_p ())
6083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6084 "splitting region at dominance boundary bb%d\n",
6085 bb->index);
6086 split = true;
6088 /* Split when the loop determined by the first block
6089 is exited. This is because we eventually insert
6090 invariants at region begin. */
6091 else if (!bbs.is_empty ()
6092 && bbs[0]->loop_father != bb->loop_father
6093 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6095 if (dump_enabled_p ())
6096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6097 "splitting region at loop %d exit at bb%d\n",
6098 bbs[0]->loop_father->num, bb->index);
6099 split = true;
6102 if (split && !bbs.is_empty ())
6104 r |= vect_slp_bbs (bbs, NULL);
6105 bbs.truncate (0);
6106 bbs.quick_push (bb);
6108 else
6109 bbs.safe_push (bb);
6111 /* When a stmt ends this block and defines a value, inserting
6112 after it for a vector containing its definition would require
6113 inserting on edges. Avoid this for now. */
6114 if (gimple *last = last_stmt (bb))
6115 if (gimple_get_lhs (last)
6116 && is_ctrl_altering_stmt (last))
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "splitting region at control altering "
6121 "definition %G", last);
6122 r |= vect_slp_bbs (bbs, NULL);
6123 bbs.truncate (0);
6127 if (!bbs.is_empty ())
6128 r |= vect_slp_bbs (bbs, NULL);
6130 free (rpo);
6132 return r;
6135 /* Build a variable-length vector in which the elements in ELTS are repeated
6136 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6137 RESULTS and add any new instructions to SEQ.
6139 The approach we use is:
6141 (1) Find a vector mode VM with integer elements of mode IM.
6143 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6144 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6145 from small vectors to IM.
6147 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6149 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6150 correct byte contents.
6152 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6154 We try to find the largest IM for which this sequence works, in order
6155 to cut down on the number of interleaves. */
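/* A worked sketch (assuming the target provides the needed modes): for
   32-bit elements with ELTS = { a, b, c, d } one possible choice is
   IM == DImode, giving ELTS' = { VIEW_CONVERT <{a,b}>, VIEW_CONVERT <{c,d}> }.
   Step (3) builds { ab, ab, ... } and { cd, cd, ... }, step (4) interleaves
   them into { ab, cd, ab, cd, ... } and step (5) view-converts back so each
   result vector contains { a, b, c, d, a, b, c, d, ... }.  */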
6157 void
6158 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6159 const vec<tree> &elts, unsigned int nresults,
6160 vec<tree> &results)
6162 unsigned int nelts = elts.length ();
6163 tree element_type = TREE_TYPE (vector_type);
6165 /* (1) Find a vector mode VM with integer elements of mode IM. */
6166 unsigned int nvectors = 1;
6167 tree new_vector_type;
6168 tree permutes[2];
6169 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6170 &nvectors, &new_vector_type,
6171 permutes))
6172 gcc_unreachable ();
6174 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6175 unsigned int partial_nelts = nelts / nvectors;
6176 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6178 tree_vector_builder partial_elts;
6179 auto_vec<tree, 32> pieces (nvectors * 2);
6180 pieces.quick_grow_cleared (nvectors * 2);
6181 for (unsigned int i = 0; i < nvectors; ++i)
6183 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6184 ELTS' has mode IM. */
6185 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6186 for (unsigned int j = 0; j < partial_nelts; ++j)
6187 partial_elts.quick_push (elts[i * partial_nelts + j]);
6188 tree t = gimple_build_vector (seq, &partial_elts);
6189 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6190 TREE_TYPE (new_vector_type), t);
6192 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6193 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6196 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6197 correct byte contents.
6199 Conceptually, we need to repeat the following operation log2(nvectors)
6200 times, where hi_start = nvectors / 2:
6202 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6203 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6205 However, if each input repeats every N elements and the VF is
6206 a multiple of N * 2, the HI result is the same as the LO result.
6207 This will be true for the first N1 iterations of the outer loop,
6208 followed by N2 iterations for which both the LO and HI results
6209 are needed. I.e.:
6211 N1 + N2 = log2(nvectors)
6213 Each "N1 iteration" doubles the number of redundant vectors and the
6214 effect of the process as a whole is to have a sequence of nvectors/2**N1
6215 vectors that repeats 2**N1 times. Rather than generate these redundant
6216 vectors, we halve the number of vectors for each N1 iteration. */
6217 unsigned int in_start = 0;
6218 unsigned int out_start = nvectors;
6219 unsigned int new_nvectors = nvectors;
6220 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6222 unsigned int hi_start = new_nvectors / 2;
6223 unsigned int out_i = 0;
6224 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6226 if ((in_i & 1) != 0
6227 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6228 2 * in_repeat))
6229 continue;
6231 tree output = make_ssa_name (new_vector_type);
6232 tree input1 = pieces[in_start + (in_i / 2)];
6233 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6234 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6235 input1, input2,
6236 permutes[in_i & 1]);
6237 gimple_seq_add_stmt (seq, stmt);
6238 pieces[out_start + out_i] = output;
6239 out_i += 1;
6241 std::swap (in_start, out_start);
6242 new_nvectors = out_i;
6245 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6246 results.reserve (nresults);
6247 for (unsigned int i = 0; i < nresults; ++i)
6248 if (i < new_nvectors)
6249 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6250 pieces[in_start + i]));
6251 else
6252 results.quick_push (results[i - new_nvectors]);
6256 /* For constant and loop invariant defs in OP_NODE this function creates
6257 vector defs that will be used in the vectorized stmts and stores them
6258 to SLP_TREE_VEC_DEFS of OP_NODE. */
6260 static void
6261 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6263 unsigned HOST_WIDE_INT nunits;
6264 tree vec_cst;
6265 unsigned j, number_of_places_left_in_vector;
6266 tree vector_type;
6267 tree vop;
6268 int group_size = op_node->ops.length ();
6269 unsigned int vec_num, i;
6270 unsigned number_of_copies = 1;
6271 bool constant_p;
6272 gimple_seq ctor_seq = NULL;
6273 auto_vec<tree, 16> permute_results;
6275 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6276 vector_type = SLP_TREE_VECTYPE (op_node);
6278 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6279 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6280 auto_vec<tree> voprnds (number_of_vectors);
6282 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6283 created vectors. It is greater than 1 if unrolling is performed.
6285 For example, we have two scalar operands, s1 and s2 (e.g., group of
6286 strided accesses of size two), while NUNITS is four (i.e., four scalars
6287 of this type can be packed in a vector). The output vector will contain
6288 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6289 will be 2).
6291 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6292 containing the operands.
6294 For example, NUNITS is four as before, and the group size is 8
6295 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6296 {s5, s6, s7, s8}. */
6298 /* When using duplicate_and_interleave, we just need one element for
6299 each scalar statement. */
6300 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6301 nunits = group_size;
6303 number_of_copies = nunits * number_of_vectors / group_size;
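 /* E.g. for the {s1, s2, s1, s2} example above nunits == 4,
    number_of_vectors == 1 and group_size == 2 give number_of_copies == 2;
    for the 8-element group split into two vectors nunits == 4,
    number_of_vectors == 2 and group_size == 8 give number_of_copies == 1.  */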
6305 number_of_places_left_in_vector = nunits;
6306 constant_p = true;
6307 tree_vector_builder elts (vector_type, nunits, 1);
6308 elts.quick_grow (nunits);
6309 stmt_vec_info insert_after = NULL;
6310 for (j = 0; j < number_of_copies; j++)
6312 tree op;
6313 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6315 /* Create 'vect_ = {op0,op1,...,opn}'. */
6316 number_of_places_left_in_vector--;
6317 tree orig_op = op;
6318 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6320 if (CONSTANT_CLASS_P (op))
6322 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6324 /* Can't use VIEW_CONVERT_EXPR for booleans because
6325 of possibly different sizes of scalar value and
6326 vector element. */
6327 if (integer_zerop (op))
6328 op = build_int_cst (TREE_TYPE (vector_type), 0);
6329 else if (integer_onep (op))
6330 op = build_all_ones_cst (TREE_TYPE (vector_type));
6331 else
6332 gcc_unreachable ();
6334 else
6335 op = fold_unary (VIEW_CONVERT_EXPR,
6336 TREE_TYPE (vector_type), op);
6337 gcc_assert (op && CONSTANT_CLASS_P (op));
6339 else
6341 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6342 gimple *init_stmt;
6343 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6345 tree true_val
6346 = build_all_ones_cst (TREE_TYPE (vector_type));
6347 tree false_val
6348 = build_zero_cst (TREE_TYPE (vector_type));
6349 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6350 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6351 op, true_val,
6352 false_val);
6354 else
6356 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6357 op);
6358 init_stmt
6359 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6360 op);
6362 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6363 op = new_temp;
6366 elts[number_of_places_left_in_vector] = op;
6367 if (!CONSTANT_CLASS_P (op))
6368 constant_p = false;
6369 /* For BB vectorization we have to compute an insert location
6370 when a def is inside the analyzed region since we cannot
6371 simply insert at the BB start in this case. */
6372 stmt_vec_info opdef;
6373 if (TREE_CODE (orig_op) == SSA_NAME
6374 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6375 && is_a <bb_vec_info> (vinfo)
6376 && (opdef = vinfo->lookup_def (orig_op)))
6378 if (!insert_after)
6379 insert_after = opdef;
6380 else
6381 insert_after = get_later_stmt (insert_after, opdef);
6384 if (number_of_places_left_in_vector == 0)
6386 if (constant_p
6387 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6388 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6389 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6390 else
6392 if (permute_results.is_empty ())
6393 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6394 elts, number_of_vectors,
6395 permute_results);
6396 vec_cst = permute_results[number_of_vectors - j - 1];
6398 if (!gimple_seq_empty_p (ctor_seq))
6400 if (insert_after)
6402 gimple_stmt_iterator gsi;
6403 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6405 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6406 gsi_insert_seq_before (&gsi, ctor_seq,
6407 GSI_CONTINUE_LINKING);
6409 else if (!stmt_ends_bb_p (insert_after->stmt))
6411 gsi = gsi_for_stmt (insert_after->stmt);
6412 gsi_insert_seq_after (&gsi, ctor_seq,
6413 GSI_CONTINUE_LINKING);
6415 else
6417 /* When we want to insert after a def whose
6418 defining stmt throws, insert on the fallthru
6419 edge. */
6420 edge e = find_fallthru_edge
6421 (gimple_bb (insert_after->stmt)->succs);
6422 basic_block new_bb
6423 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6424 gcc_assert (!new_bb);
6427 else
6428 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6429 ctor_seq = NULL;
6431 voprnds.quick_push (vec_cst);
6432 insert_after = NULL;
6433 number_of_places_left_in_vector = nunits;
6434 constant_p = true;
6435 elts.new_vector (vector_type, nunits, 1);
6436 elts.quick_grow (nunits);
6441 /* Since the vectors are created in the reverse order, we should invert
6442 them. */
6443 vec_num = voprnds.length ();
6444 for (j = vec_num; j != 0; j--)
6446 vop = voprnds[j - 1];
6447 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6450 /* In case the VF is greater than the unrolling factor needed for the SLP
6451 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6452 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6453 to replicate the vectors. */
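 /* E.g. with two created vectors v0, v1 but NUMBER_OF_VECTORS == 4 the
    defs are replicated to v0, v1, v0, v1.  */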
6454 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6455 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6456 i++)
6457 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6460 /* Get the Ith vectorized definition from SLP_NODE. */
6462 tree
6463 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6465 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6466 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6467 else
6468 return SLP_TREE_VEC_DEFS (slp_node)[i];
6471 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6473 void
6474 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6476 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6477 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6479 unsigned j;
6480 gimple *vec_def_stmt;
6481 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6482 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6484 else
6485 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6488 /* Get N vectorized definitions for SLP_NODE. */
6490 void
6491 vect_get_slp_defs (vec_info *,
6492 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6494 if (n == -1U)
6495 n = SLP_TREE_CHILDREN (slp_node).length ();
6497 for (unsigned i = 0; i < n; ++i)
6499 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6500 vec<tree> vec_defs = vNULL;
6501 vect_get_slp_defs (child, &vec_defs);
6502 vec_oprnds->quick_push (vec_defs);
6506 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6507 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6508 permute statements for the SLP node NODE. Store the number of vector
6509 permute instructions in *N_PERMS and the number of vector load
6510 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6511 that were not needed. */
6513 bool
6514 vect_transform_slp_perm_load (vec_info *vinfo,
6515 slp_tree node, const vec<tree> &dr_chain,
6516 gimple_stmt_iterator *gsi, poly_uint64 vf,
6517 bool analyze_only, unsigned *n_perms,
6518 unsigned int *n_loads, bool dce_chain)
6520 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6521 int vec_index = 0;
6522 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6523 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6524 unsigned int mask_element;
6525 machine_mode mode;
6527 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6528 return false;
6530 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6532 mode = TYPE_MODE (vectype);
6533 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6535 /* Initialize the vect stmts of NODE to properly insert the generated
6536 stmts later. */
6537 if (! analyze_only)
6538 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6539 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6540 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6542 /* Generate permutation masks for every NODE. Number of masks for each NODE
6543 is equal to GROUP_SIZE.
6544 E.g., we have a group of three nodes with three loads from the same
6545 location in each node, and the vector size is 4. I.e., we have an
6546 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6547 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6548 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6551 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6552 The last mask is illegal since we assume two operands for permute
6553 operation, and the mask element values can't be outside that range.
6554 Hence, the last mask must be converted into {2,5,5,5}.
6555 For the first two permutations we need the first and the second input
6556 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6557 we need the second and the third vectors: {b1,c1,a2,b2} and
6558 {c2,a3,b3,c3}. */
6560 int vect_stmts_counter = 0;
6561 unsigned int index = 0;
6562 int first_vec_index = -1;
6563 int second_vec_index = -1;
6564 bool noop_p = true;
6565 *n_perms = 0;
6567 vec_perm_builder mask;
6568 unsigned int nelts_to_build;
6569 unsigned int nvectors_per_build;
6570 unsigned int in_nlanes;
6571 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6572 && multiple_p (nunits, group_size));
6573 if (repeating_p)
6575 /* A single vector contains a whole number of copies of the node, so:
6576 (a) all permutes can use the same mask; and
6577 (b) the permutes only need a single vector input. */
6578 mask.new_vector (nunits, group_size, 3);
6579 nelts_to_build = mask.encoded_nelts ();
6580 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6581 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6583 else
6585 /* We need to construct a separate mask for each vector statement. */
6586 unsigned HOST_WIDE_INT const_nunits, const_vf;
6587 if (!nunits.is_constant (&const_nunits)
6588 || !vf.is_constant (&const_vf))
6589 return false;
6590 mask.new_vector (const_nunits, const_nunits, 1);
6591 nelts_to_build = const_vf * group_size;
6592 nvectors_per_build = 1;
6593 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6595 auto_sbitmap used_in_lanes (in_nlanes);
6596 bitmap_clear (used_in_lanes);
6597 auto_bitmap used_defs;
6599 unsigned int count = mask.encoded_nelts ();
6600 mask.quick_grow (count);
6601 vec_perm_indices indices;
6603 for (unsigned int j = 0; j < nelts_to_build; j++)
6605 unsigned int iter_num = j / group_size;
6606 unsigned int stmt_num = j % group_size;
6607 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6608 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6609 bitmap_set_bit (used_in_lanes, i);
6610 if (repeating_p)
6612 first_vec_index = 0;
6613 mask_element = i;
6615 else
6617 /* Enforced before the loop when !repeating_p. */
6618 unsigned int const_nunits = nunits.to_constant ();
6619 vec_index = i / const_nunits;
6620 mask_element = i % const_nunits;
6621 if (vec_index == first_vec_index
6622 || first_vec_index == -1)
6624 first_vec_index = vec_index;
6626 else if (vec_index == second_vec_index
6627 || second_vec_index == -1)
6629 second_vec_index = vec_index;
6630 mask_element += const_nunits;
6632 else
6634 if (dump_enabled_p ())
6635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6636 "permutation requires at "
6637 "least three vectors %G",
6638 stmt_info->stmt);
6639 gcc_assert (analyze_only);
6640 return false;
6643 gcc_assert (mask_element < 2 * const_nunits);
6646 if (mask_element != index)
6647 noop_p = false;
6648 mask[index++] = mask_element;
6650 if (index == count && !noop_p)
6652 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6653 if (!can_vec_perm_const_p (mode, indices))
6655 if (dump_enabled_p ())
6657 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6658 vect_location,
6659 "unsupported vect permute { ");
6660 for (i = 0; i < count; ++i)
6662 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6663 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6665 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6667 gcc_assert (analyze_only);
6668 return false;
6671 ++*n_perms;
6674 if (index == count)
6676 if (!analyze_only)
6678 tree mask_vec = NULL_TREE;
6680 if (! noop_p)
6681 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6683 if (second_vec_index == -1)
6684 second_vec_index = first_vec_index;
6686 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6688 /* Generate the permute statement if necessary. */
6689 tree first_vec = dr_chain[first_vec_index + ri];
6690 tree second_vec = dr_chain[second_vec_index + ri];
6691 gimple *perm_stmt;
6692 if (! noop_p)
6694 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6695 tree perm_dest
6696 = vect_create_destination_var (gimple_assign_lhs (stmt),
6697 vectype);
6698 perm_dest = make_ssa_name (perm_dest);
6699 perm_stmt
6700 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6701 first_vec, second_vec,
6702 mask_vec);
6703 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6704 gsi);
6705 if (dce_chain)
6707 bitmap_set_bit (used_defs, first_vec_index + ri);
6708 bitmap_set_bit (used_defs, second_vec_index + ri);
6711 else
6713 /* If mask was NULL_TREE generate the requested
6714 identity transform. */
6715 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6716 if (dce_chain)
6717 bitmap_set_bit (used_defs, first_vec_index + ri);
6720 /* Store the vector statement in NODE. */
6721 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6725 index = 0;
6726 first_vec_index = -1;
6727 second_vec_index = -1;
6728 noop_p = true;
6732 if (n_loads)
6734 if (repeating_p)
6735 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6736 else
6738 /* Enforced above when !repeating_p. */
6739 unsigned int const_nunits = nunits.to_constant ();
6740 *n_loads = 0;
6741 bool load_seen = false;
6742 for (unsigned i = 0; i < in_nlanes; ++i)
6744 if (i % const_nunits == 0)
6746 if (load_seen)
6747 *n_loads += 1;
6748 load_seen = false;
6750 if (bitmap_bit_p (used_in_lanes, i))
6751 load_seen = true;
6753 if (load_seen)
6754 *n_loads += 1;
6758 if (dce_chain)
6759 for (unsigned i = 0; i < dr_chain.length (); ++i)
6760 if (!bitmap_bit_p (used_defs, i))
6762 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6763 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6764 gsi_remove (&rgsi, true);
6765 release_defs (stmt);
6768 return true;
6771 /* Produce the next vector result for SLP permutation NODE by adding a vector
6772 statement at GSI. If MASK_VEC is nonnull, add:
6774 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6776 otherwise add:
6778 <new SSA name> = FIRST_DEF. */
6780 static void
6781 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6782 slp_tree node, tree first_def, tree second_def,
6783 tree mask_vec)
6785 tree vectype = SLP_TREE_VECTYPE (node);
6787 /* ??? We SLP match existing vector element extracts but
6788 allow punning which we need to re-instantiate at uses
6789 but have no good way of explicitly representing. */
6790 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6792 gassign *conv_stmt
6793 = gimple_build_assign (make_ssa_name (vectype),
6794 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6795 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6796 first_def = gimple_assign_lhs (conv_stmt);
6798 gassign *perm_stmt;
6799 tree perm_dest = make_ssa_name (vectype);
6800 if (mask_vec)
6802 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6804 gassign *conv_stmt
6805 = gimple_build_assign (make_ssa_name (vectype),
6806 build1 (VIEW_CONVERT_EXPR,
6807 vectype, second_def));
6808 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6809 second_def = gimple_assign_lhs (conv_stmt);
6811 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6812 first_def, second_def,
6813 mask_vec);
6815 else
6816 /* We need a copy here in case the def was external. */
6817 perm_stmt = gimple_build_assign (perm_dest, first_def);
6818 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6819 /* Store the vector statement in NODE. */
6820 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6823 /* Vectorize the SLP permutations in NODE as specified
6824 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6825 child number and lane number.
6826 Interleaving of two two-lane two-child SLP subtrees (not supported):
6827 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6828 A blend of two four-lane two-child SLP subtrees:
6829 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6830 Highpart of a four-lane one-child SLP subtree (not supported):
6831 [ { 0, 2 }, { 0, 3 } ]
6832 Currently only a subset of these is supported by the code generation below. */
6834 static bool
6835 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6836 slp_tree node, stmt_vector_for_cost *cost_vec)
6838 tree vectype = SLP_TREE_VECTYPE (node);
6840 /* ??? We currently only support all same vector input and output types
6841 while the SLP IL should really do a concat + select and thus accept
6842 arbitrary mismatches. */
6843 slp_tree child;
6844 unsigned i;
6845 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6846 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6847 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6849 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6850 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "Unsupported lane permutation\n");
6855 return false;
6857 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6858 repeating_p = false;
6861 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6862 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6863 if (dump_enabled_p ())
6865 dump_printf_loc (MSG_NOTE, vect_location,
6866 "vectorizing permutation");
6867 for (unsigned i = 0; i < perm.length (); ++i)
6868 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6869 if (repeating_p)
6870 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6871 dump_printf (MSG_NOTE, "\n");
6874 /* REPEATING_P is true if every output vector is guaranteed to use the
6875 same permute vector. We can handle that case for both variable-length
6876 and constant-length vectors, but we only handle other cases for
6877 constant-length vectors.
6879 Set:
6881 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6882 mask vector that we want to build.
6884 - NCOPIES to the number of copies of PERM that we need in order
6885 to build the necessary permute mask vectors.
6887 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6888 for each permute mask vector. This is only relevant when GSI is
6889 nonnull. */
6890 uint64_t npatterns;
6891 unsigned nelts_per_pattern;
6892 uint64_t ncopies;
6893 unsigned noutputs_per_mask;
6894 if (repeating_p)
6896 /* We need a single permute mask vector that has the form:
6898 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6900 In other words, the original n-element permute in PERM is
6901 "unrolled" to fill a full vector. The stepped vector encoding
6902 that we use for permutes requires 3n elements. */
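 /* For instance, with a two-lane PERM = { 1, 0 } (n == 2) the encoded
    mask is { 1, 0, 3, 2, 5, 4 }, which the stepped encoding extends to
    however many lanes the vector actually has.  */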
6903 npatterns = SLP_TREE_LANES (node);
6904 nelts_per_pattern = ncopies = 3;
6905 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6907 else
6909 /* Calculate every element of every permute mask vector explicitly,
6910 instead of relying on the pattern described above. */
6911 if (!nunits.is_constant (&npatterns))
6912 return false;
6913 nelts_per_pattern = ncopies = 1;
6914 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6915 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6916 return false;
6917 noutputs_per_mask = 1;
6919 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6920 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6922 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6923 from the { SLP operand, scalar lane } permutation as recorded in the
6924 SLP node as an intermediate step. This part should already work
6925 with SLP children with an arbitrary number of lanes. */
6926 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6927 auto_vec<unsigned> active_lane;
6928 vperm.create (olanes);
6929 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6930 for (unsigned i = 0; i < ncopies; ++i)
6932 for (unsigned pi = 0; pi < perm.length (); ++pi)
6934 std::pair<unsigned, unsigned> p = perm[pi];
6935 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6936 if (repeating_p)
6937 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6938 else
6940 /* We checked above that the vectors are constant-length. */
6941 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6942 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6943 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6944 vperm.quick_push ({{p.first, vi}, vl});
6947 /* Advance to the next group. */
6948 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6949 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6952 if (dump_enabled_p ())
6954 dump_printf_loc (MSG_NOTE, vect_location, "as");
6955 for (unsigned i = 0; i < vperm.length (); ++i)
6957 if (i != 0
6958 && (repeating_p
6959 ? multiple_p (i, npatterns)
6960 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6961 dump_printf (MSG_NOTE, ",");
6962 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6963 vperm[i].first.first, vperm[i].first.second,
6964 vperm[i].second);
6966 dump_printf (MSG_NOTE, "\n");
6969 /* We can only handle two-vector permutes, everything else should
6970 be lowered on the SLP level. The following is closely inspired
6971 by vect_transform_slp_perm_load and is supposed to eventually
6972 replace it.
6973 ??? As intermediate step do code-gen in the SLP tree representation
6974 somehow? */
6975 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6976 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6977 unsigned int index = 0;
6978 poly_uint64 mask_element;
6979 vec_perm_builder mask;
6980 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6981 unsigned int count = mask.encoded_nelts ();
6982 mask.quick_grow (count);
6983 vec_perm_indices indices;
6984 unsigned nperms = 0;
6985 for (unsigned i = 0; i < vperm.length (); ++i)
6987 mask_element = vperm[i].second;
6988 if (first_vec.first == -1U
6989 || first_vec == vperm[i].first)
6990 first_vec = vperm[i].first;
6991 else if (second_vec.first == -1U
6992 || second_vec == vperm[i].first)
6994 second_vec = vperm[i].first;
6995 mask_element += nunits;
6997 else
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "permutation requires at "
7002 "least three vectors\n");
7003 gcc_assert (!gsi);
7004 return false;
7007 mask[index++] = mask_element;
7009 if (index == count)
7011 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
7012 bool identity_p = indices.series_p (0, 1, 0, 1);
7013 if (!identity_p
7014 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7016 if (dump_enabled_p ())
7018 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7019 vect_location,
7020 "unsupported vect permute { ");
7021 for (i = 0; i < count; ++i)
7023 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7024 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7026 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7028 gcc_assert (!gsi);
7029 return false;
7032 if (!identity_p)
7033 nperms++;
7034 if (gsi)
7036 if (second_vec.first == -1U)
7037 second_vec = first_vec;
7039 slp_tree
7040 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7041 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7043 tree mask_vec = NULL_TREE;
7044 if (!identity_p)
7045 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7047 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7049 tree first_def
7050 = vect_get_slp_vect_def (first_node,
7051 first_vec.second + vi);
7052 tree second_def
7053 = vect_get_slp_vect_def (second_node,
7054 second_vec.second + vi);
7055 vect_add_slp_permutation (vinfo, gsi, node, first_def,
7056 second_def, mask_vec);
7060 index = 0;
7061 first_vec = std::make_pair (-1U, -1U);
7062 second_vec = std::make_pair (-1U, -1U);
7066 if (!gsi)
7067 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
7069 return true;
7072 /* Vectorize SLP NODE. */
7074 static void
7075 vect_schedule_slp_node (vec_info *vinfo,
7076 slp_tree node, slp_instance instance)
7078 gimple_stmt_iterator si;
7079 int i;
7080 slp_tree child;
7082 /* For existing vectors there's nothing to do. */
7083 if (SLP_TREE_VEC_DEFS (node).exists ())
7084 return;
7086 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
7088 /* Vectorize externals and constants. */
7089 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7090 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7092 /* ??? vectorizable_shift can end up using a scalar operand which is
7093 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7094 node in this case. */
7095 if (!SLP_TREE_VECTYPE (node))
7096 return;
7098 vect_create_constant_vectors (vinfo, node);
7099 return;
7102 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7104 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7105 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_NOTE, vect_location,
7109 "------>vectorizing SLP node starting from: %G",
7110 stmt_info->stmt);
7112 if (STMT_VINFO_DATA_REF (stmt_info)
7113 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7115 /* Vectorized loads go before the first scalar load to make it
7116 ready early, vectorized stores go before the last scalar
7117 stmt which is where all uses are ready. */
7118 stmt_vec_info last_stmt_info = NULL;
7119 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7120 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7121 else /* DR_IS_WRITE */
7122 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7123 si = gsi_for_stmt (last_stmt_info->stmt);
7125 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7126 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7127 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7128 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7130 /* For PHI node vectorization we do not use the insertion iterator. */
7131 si = gsi_none ();
7133 else
7135 /* Emit other stmts after the children's vectorized defs, which is
7136 the earliest position possible. */
7137 gimple *last_stmt = NULL;
7138 bool seen_vector_def = false;
7139 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7140 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7142 /* For fold-left reductions we are retaining the scalar
7143 reduction PHI, but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
7144 set, so the representation isn't perfect. Resort to the
7145 last scalar def here. */
7146 if (SLP_TREE_VEC_STMTS (child).is_empty ())
7148 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7149 == cycle_phi_info_type);
7150 gphi *phi = as_a <gphi *>
7151 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7152 if (!last_stmt
7153 || vect_stmt_dominates_stmt_p (last_stmt, phi))
7154 last_stmt = phi;
7156 /* All vectorized stmts of the child are emitted in the same place,
7157 so the last one in SLP_TREE_VEC_STMTS is also the dominance-wise last.
7158 ??? Unless a load permutation is applied that happens to re-use
7159 an earlier generated load. */
7160 unsigned j;
7161 gimple *vstmt;
7162 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7163 if (!last_stmt
7164 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7165 last_stmt = vstmt;
7167 else if (!SLP_TREE_VECTYPE (child))
7169 /* Externals without a vector type are used unvectorized, so look at all their scalar defs. */
7170 unsigned j;
7171 tree def;
7172 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7173 if (TREE_CODE (def) == SSA_NAME
7174 && !SSA_NAME_IS_DEFAULT_DEF (def))
7176 gimple *stmt = SSA_NAME_DEF_STMT (def);
7177 if (!last_stmt
7178 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7179 last_stmt = stmt;
7182 else
7184 /* For externals we have to look at all defs since their
7185 insertion place is decided per vector. But beware
7186 of pre-existing vectors where we need to make sure
7187 we do not insert before the region boundary. */
7188 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7189 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7190 seen_vector_def = true;
7191 else
7193 unsigned j;
7194 tree vdef;
7195 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7196 if (TREE_CODE (vdef) == SSA_NAME
7197 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7199 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7200 if (!last_stmt
7201 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7202 last_stmt = vstmt;
7206 /* This can happen when all children are pre-existing vectors or
7207 constants. */
7208 if (!last_stmt)
7209 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7210 if (!last_stmt)
7212 gcc_assert (seen_vector_def);
7213 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7215 else if (is_a <bb_vec_info> (vinfo)
7216 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7217 && gimple_could_trap_p (stmt_info->stmt))
7219 /* We've constrained possibly trapping operations to all come
7220 from the same basic-block; if vectorized defs would allow earlier
7221 scheduling, still force the vectorized stmts into the original block.
7222 This is only necessary for BB vectorization since for loop vect
7223 all operations are in a single BB and scalar stmt based
7224 placement doesn't play well with epilogue vectorization. */
7225 gcc_assert (dominated_by_p (CDI_DOMINATORS,
7226 gimple_bb (stmt_info->stmt),
7227 gimple_bb (last_stmt)));
7228 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7230 else if (is_a <gphi *> (last_stmt))
7231 si = gsi_after_labels (gimple_bb (last_stmt));
7232 else
7234 si = gsi_for_stmt (last_stmt);
7235 gsi_next (&si);
7239 bool done_p = false;
7241 /* Handle purely internal nodes. */
7242 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7244 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
7245 be shared with different SLP nodes (but usually it's the same
7246 operation, apart from the case where the stmt is only there to denote
7247 the actual scalar lane defs ...). So do not call vect_transform_stmt
7248 but open-code it here (partly). */
7249 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7250 gcc_assert (done);
7251 done_p = true;
7253 if (!done_p)
7254 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7257 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
7258 For loop vectorization this is done in vectorizable_call, but for SLP
7259 it needs to be deferred until the end of vect_schedule_slp, because multiple
7260 SLP instances may refer to the same scalar stmt. */
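/* Replacing the call with LHS = 0 rather than deleting it keeps a valid
   SSA definition for any remaining scalar uses of LHS.  */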
7262 static void
7263 vect_remove_slp_scalar_calls (vec_info *vinfo,
7264 slp_tree node, hash_set<slp_tree> &visited)
7266 gimple *new_stmt;
7267 gimple_stmt_iterator gsi;
7268 int i;
7269 slp_tree child;
7270 tree lhs;
7271 stmt_vec_info stmt_info;
7273 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7274 return;
7276 if (visited.add (node))
7277 return;
7279 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7280 vect_remove_slp_scalar_calls (vinfo, child, visited);
7282 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7284 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7285 if (!stmt || gimple_bb (stmt) == NULL)
7286 continue;
7287 if (is_pattern_stmt_p (stmt_info)
7288 || !PURE_SLP_STMT (stmt_info))
7289 continue;
7290 lhs = gimple_call_lhs (stmt);
7291 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7292 gsi = gsi_for_stmt (stmt);
7293 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7294 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7298 static void
7299 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7301 hash_set<slp_tree> visited;
7302 vect_remove_slp_scalar_calls (vinfo, node, visited);
7305 /* Vectorize the instance root. */
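/* For a constructor instance the CONSTRUCTOR on the root stmt's RHS is
   replaced by the single vector def, or by a CONSTRUCTOR of the vector
   defs when more than one vector is needed.  For a BB reduction instance
   the vector defs are combined and reduced to a scalar via a REDUC_*
   internal function which then replaces the root stmt's RHS.  */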
7307 void
7308 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7310 gassign *rstmt = NULL;
7312 if (instance->kind == slp_inst_kind_ctor)
7314 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7316 gimple *child_stmt;
7317 int j;
7319 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7321 tree vect_lhs = gimple_get_lhs (child_stmt);
7322 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7323 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7324 TREE_TYPE (vect_lhs)))
7325 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7326 vect_lhs);
7327 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7328 break;
7331 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7333 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7334 gimple *child_stmt;
7335 int j;
7336 vec<constructor_elt, va_gc> *v;
7337 vec_alloc (v, nelts);
7339 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7340 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7341 gimple_get_lhs (child_stmt));
7342 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7343 tree rtype
7344 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7345 tree r_constructor = build_constructor (rtype, v);
7346 rstmt = gimple_build_assign (lhs, r_constructor);
7349 else if (instance->kind == slp_inst_kind_bb_reduc)
7351 /* Largely inspired by reduction chain epilogue handling in
7352 vect_create_epilog_for_reduction. */
7353 vec<tree> vec_defs = vNULL;
7354 vect_get_slp_defs (node, &vec_defs);
7355 enum tree_code reduc_code
7356 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7357 /* ??? We actually have to reflect signs somewhere. */
7358 if (reduc_code == MINUS_EXPR)
7359 reduc_code = PLUS_EXPR;
7360 gimple_seq epilogue = NULL;
7361 /* We may end up with more than one vector result; reduce them
7362 to a single vector. */
7363 tree vec_def = vec_defs[0];
7364 for (unsigned i = 1; i < vec_defs.length (); ++i)
7365 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7366 vec_def, vec_defs[i]);
7367 vec_defs.release ();
7368 /* ??? Support other schemes than direct internal fn. */
7369 internal_fn reduc_fn;
7370 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7371 || reduc_fn == IFN_LAST)
7372 gcc_unreachable ();
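/* For example a V4SI PLUS reduction emits .REDUC_PLUS (vec_def),
   summing the four lanes into a single int.  */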
7373 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7374 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7376 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7377 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7378 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7379 update_stmt (gsi_stmt (rgsi));
7380 return;
7382 else
7383 gcc_unreachable ();
7385 gcc_assert (rstmt);
7387 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7388 gsi_replace (&rgsi, rstmt, true);
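/* Per-node state of the Tarjan SCC walk in vect_schedule_scc: the DFS
   preorder number, the smallest DFS number reachable from the node
   (lowlink) and whether the node is still on the walk stack.  */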
7391 struct slp_scc_info
7393 bool on_stack;
7394 int dfs;
7395 int lowlink;
7398 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
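/* This is Tarjan's SCC algorithm: a node whose lowlink equals its own
   DFS number is the root of an SCC, which is then popped off the stack
   and scheduled.  */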
7400 static void
7401 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7402 hash_map<slp_tree, slp_scc_info> &scc_info,
7403 int &maxdfs, vec<slp_tree> &stack)
7405 bool existed_p;
7406 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7407 gcc_assert (!existed_p);
7408 info->dfs = maxdfs;
7409 info->lowlink = maxdfs;
7410 maxdfs++;
7412 /* Leaf. */
7413 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7415 info->on_stack = false;
7416 vect_schedule_slp_node (vinfo, node, instance);
7417 return;
7420 info->on_stack = true;
7421 stack.safe_push (node);
7423 unsigned i;
7424 slp_tree child;
7425 /* DFS recurse. */
7426 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7428 if (!child)
7429 continue;
7430 slp_scc_info *child_info = scc_info.get (child);
7431 if (!child_info)
7433 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7434 /* Recursion might have re-allocated the node. */
7435 info = scc_info.get (node);
7436 child_info = scc_info.get (child);
7437 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7439 else if (child_info->on_stack)
7440 info->lowlink = MIN (info->lowlink, child_info->dfs);
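/* If this node is not the root of an SCC leave it on the stack; the
   SCC root will schedule it.  */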
7442 if (info->lowlink != info->dfs)
7443 return;
7445 auto_vec<slp_tree, 4> phis_to_fixup;
7447 /* Singleton. */
7448 if (stack.last () == node)
7450 stack.pop ();
7451 info->on_stack = false;
7452 vect_schedule_slp_node (vinfo, node, instance);
7453 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7454 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7455 phis_to_fixup.quick_push (node);
7457 else
7459 /* SCC. */
7460 int last_idx = stack.length () - 1;
7461 while (stack[last_idx] != node)
7462 last_idx--;
7463 /* We can break the cycle at PHIs that have at least one child
7464 already code generated. Then we could re-start the DFS walk until
7465 all nodes in the SCC are covered (we might have new entries
7466 for only back-reachable nodes). But it's simpler to just
7467 iterate and schedule those that are ready. */
7468 unsigned todo = stack.length () - last_idx;
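/* Iterate until all SCC members are scheduled: a non-PHI is ready once
   none of its children is still on the stack, a PHI already when at
   least one child has been code generated; the remaining PHI arguments
   are filled in afterwards.  */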
7469 do
7471 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7473 slp_tree entry = stack[idx];
7474 if (!entry)
7475 continue;
7476 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7477 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7478 bool ready = !phi;
7479 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7480 if (!child)
7482 gcc_assert (phi);
7483 ready = true;
7484 break;
7486 else if (scc_info.get (child)->on_stack)
7488 if (!phi)
7490 ready = false;
7491 break;
7494 else
7496 if (phi)
7498 ready = true;
7499 break;
7502 if (ready)
7504 vect_schedule_slp_node (vinfo, entry, instance);
7505 scc_info.get (entry)->on_stack = false;
7506 stack[idx] = NULL;
7507 todo--;
7508 if (phi)
7509 phis_to_fixup.safe_push (entry);
7513 while (todo != 0);
7515 /* Pop the SCC. */
7516 stack.truncate (last_idx);
7519 /* Now fix up the backedge defs of the vectorized PHIs in this SCC. */
7520 slp_tree phi_node;
7521 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7523 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7524 edge_iterator ei;
7525 edge e;
7526 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7528 unsigned dest_idx = e->dest_idx;
7529 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7530 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7531 continue;
7532 /* Simply fill all args. */
7533 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7534 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7535 vect_get_slp_vect_def (child, i),
7536 e, gimple_phi_arg_location (phi, dest_idx));
7541 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
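/* Each instance tree is code generated via vect_schedule_scc, instance
   roots (constructors, BB reductions) are materialized afterwards and
   finally the scalar stmts of vectorized stores (and, for loop
   vectorization, of vectorized calls) are removed.  */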
7543 void
7544 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7546 slp_instance instance;
7547 unsigned int i;
7549 hash_map<slp_tree, slp_scc_info> scc_info;
7550 int maxdfs = 0;
7551 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7553 slp_tree node = SLP_INSTANCE_TREE (instance);
7554 if (dump_enabled_p ())
7556 dump_printf_loc (MSG_NOTE, vect_location,
7557 "Vectorizing SLP tree:\n");
7558 /* ??? Dump all? */
7559 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7560 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7561 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7562 vect_print_slp_graph (MSG_NOTE, vect_location,
7563 SLP_INSTANCE_TREE (instance));
7565 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
7566 is the node breaking the cycle. */
7567 auto_vec<slp_tree> stack;
7568 if (!scc_info.get (node))
7569 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7571 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7572 vectorize_slp_instance_root_stmt (node, instance);
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_NOTE, vect_location,
7576 "vectorizing stmts using SLP.\n");
7579 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7581 slp_tree root = SLP_INSTANCE_TREE (instance);
7582 stmt_vec_info store_info;
7583 unsigned int j;
7585 /* Remove scalar call stmts. Do not do this for basic-block
7586 vectorization as not all uses may be vectorized.
7587 ??? Why should this be necessary? DCE should be able to
7588 remove the stmts itself.
7589 ??? For BB vectorization we can as well remove scalar
7590 stmts starting from the SLP tree root if they have no
7591 uses. */
7592 if (is_a <loop_vec_info> (vinfo))
7593 vect_remove_slp_scalar_calls (vinfo, root);
7595 /* Remove the vectorized stores' original scalar stmts. */
7596 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7598 if (!STMT_VINFO_DATA_REF (store_info)
7599 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7600 break;
7602 store_info = vect_orig_stmt (store_info);
7603 /* Free the attached stmt_vec_info and remove the stmt. */
7604 vinfo->remove_stmt (store_info);
7606 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it, so we
7607 do not crash in vect_free_slp_tree later. */
7608 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7609 SLP_TREE_REPRESENTATIVE (root) = NULL;