mklog: add subject line skeleton
[official-gcc.git] / gcc / tree-vect-slp.c
blob0c1f85beeb2e9f3fb7c66c15d4d30594b2570f9e
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
60 void
61 vect_slp_init (void)
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
66 void
67 vect_slp_fini (void)
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
75 void *
76 _slp_tree::operator new (size_t n)
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
82 void
83 _slp_tree::operator delete (void *node, size_t n)
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
90 /* Initialize a SLP node. */
92 _slp_tree::_slp_tree ()
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
117 /* Tear down a SLP node. */
119 _slp_tree::~_slp_tree ()
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
140 void
141 vect_free_slp_tree (slp_tree node)
143 int i;
144 slp_tree child;
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
153 /* If the node defines any SLP only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
163 delete node;
166 /* Return a location suitable for dumpings related to the SLP instance. */
168 dump_user_location_t
169 _slp_instance::location () const
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
178 /* Free the memory allocated for the SLP instance. */
180 void
181 vect_free_slp_instance (slp_instance instance)
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
192 /* Create an SLP node for SCALAR_STMTS. */
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
204 /* Create an SLP node for SCALAR_STMTS. */
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
226 /* Create an SLP node for OPS. */
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
237 /* Create an SLP node for OPS. */
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
242 return vect_create_new_slp_node (new _slp_tree, ops);
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
285 return oprnds_info;
289 /* Free operands info. */
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
294 int i;
295 slp_oprnd_info oprnd_info;
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
304 oprnds_info.release ();
308 /* Return true if STMTS contains a pattern statement. */
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
330 /* Pre-exsting vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
342 return true;
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
367 while (next_stmt_info);
369 return -1;
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull). */
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
414 for (unsigned int i = 0; i < 3; ++i)
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
426 if (nvectors_out)
427 *nvectors_out = nvectors;
428 if (vector_type_out)
429 *vector_type_out = vector_type;
430 if (permutes)
432 permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 indices1);
434 permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 indices2);
437 return true;
441 if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 return false;
443 nvectors *= 2;
447 /* Return true if DTA and DTB match. */
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
457 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
458 they are of a valid type and that they match the defs of the first stmt of
459 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
460 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
461 indicates swap is required for cond_expr stmts. Specifically, *SWAP
462 is 1 if STMT is cond and operands of comparison need to be swapped;
463 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
464 If there is any operand swap in this function, *SWAP is set to non-zero
465 value.
466 If there was a fatal error return -1; if the error could be corrected by
467 swapping operands of father node of this one, return 1; if everything is
468 ok return 0. */
469 static int
470 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
471 bool *skip_args,
472 vec<stmt_vec_info> stmts, unsigned stmt_num,
473 vec<slp_oprnd_info> *oprnds_info)
475 stmt_vec_info stmt_info = stmts[stmt_num];
476 tree oprnd;
477 unsigned int i, number_of_oprnds;
478 enum vect_def_type dt = vect_uninitialized_def;
479 slp_oprnd_info oprnd_info;
480 int first_op_idx = 1;
481 unsigned int commutative_op = -1U;
482 bool first_op_cond = false;
483 bool first = stmt_num == 0;
485 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
487 number_of_oprnds = gimple_call_num_args (stmt);
488 first_op_idx = 3;
489 if (gimple_call_internal_p (stmt))
491 internal_fn ifn = gimple_call_internal_fn (stmt);
492 commutative_op = first_commutative_argument (ifn);
494 /* Masked load, only look at mask. */
495 if (ifn == IFN_MASK_LOAD)
497 number_of_oprnds = 1;
498 /* Mask operand index. */
499 first_op_idx = 5;
503 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
505 enum tree_code code = gimple_assign_rhs_code (stmt);
506 number_of_oprnds = gimple_num_ops (stmt) - 1;
507 /* Swap can only be done for cond_expr if asked to, otherwise we
508 could result in different comparison code to the first stmt. */
509 if (code == COND_EXPR
510 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
512 first_op_cond = true;
513 number_of_oprnds++;
515 else
516 commutative_op = commutative_tree_code (code) ? 0U : -1U;
518 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
519 number_of_oprnds = gimple_phi_num_args (stmt);
520 else
521 return -1;
523 bool swapped = (swap != 0);
524 bool backedge = false;
525 gcc_assert (!swapped || first_op_cond);
526 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
527 for (i = 0; i < number_of_oprnds; i++)
529 if (first_op_cond)
531 /* Map indicating how operands of cond_expr should be swapped. */
532 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
533 int *map = maps[swap];
535 if (i < 2)
536 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
537 first_op_idx), map[i]);
538 else
539 oprnd = gimple_op (stmt_info->stmt, map[i]);
541 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
543 oprnd = gimple_phi_arg_def (stmt, i);
544 backedge = dominated_by_p (CDI_DOMINATORS,
545 gimple_phi_arg_edge (stmt, i)->src,
546 gimple_bb (stmt_info->stmt));
548 else
549 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
550 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
551 oprnd = TREE_OPERAND (oprnd, 0);
553 oprnd_info = (*oprnds_info)[i];
555 stmt_vec_info def_stmt_info;
556 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560 "Build SLP failed: can't analyze def for %T\n",
561 oprnd);
563 return -1;
566 if (skip_args[i])
568 oprnd_info->def_stmts.quick_push (NULL);
569 oprnd_info->ops.quick_push (NULL_TREE);
570 oprnd_info->first_dt = vect_uninitialized_def;
571 continue;
574 oprnd_info->def_stmts.quick_push (def_stmt_info);
575 oprnd_info->ops.quick_push (oprnd);
577 if (def_stmt_info
578 && is_pattern_stmt_p (def_stmt_info))
580 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
581 != def_stmt_info)
582 oprnd_info->any_pattern = true;
583 else
584 /* If we promote this to external use the original stmt def. */
585 oprnd_info->ops.last ()
586 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
589 /* If there's a extern def on a backedge make sure we can
590 code-generate at the region start.
591 ??? This is another case that could be fixed by adjusting
592 how we split the function but at the moment we'd have conflicting
593 goals there. */
594 if (backedge
595 && dts[i] == vect_external_def
596 && is_a <bb_vec_info> (vinfo)
597 && TREE_CODE (oprnd) == SSA_NAME
598 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
599 && !dominated_by_p (CDI_DOMINATORS,
600 as_a <bb_vec_info> (vinfo)->bbs[0],
601 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Build SLP failed: extern def %T only defined "
606 "on backedge\n", oprnd);
607 return -1;
610 if (first)
612 tree type = TREE_TYPE (oprnd);
613 dt = dts[i];
614 if ((dt == vect_constant_def
615 || dt == vect_external_def)
616 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
617 && (TREE_CODE (type) == BOOLEAN_TYPE
618 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
619 type)))
621 if (dump_enabled_p ())
622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
623 "Build SLP failed: invalid type of def "
624 "for variable-length SLP %T\n", oprnd);
625 return -1;
628 /* For the swapping logic below force vect_reduction_def
629 for the reduction op in a SLP reduction group. */
630 if (!STMT_VINFO_DATA_REF (stmt_info)
631 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
632 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
633 && def_stmt_info)
634 dts[i] = dt = vect_reduction_def;
636 /* Check the types of the definition. */
637 switch (dt)
639 case vect_external_def:
640 case vect_constant_def:
641 case vect_internal_def:
642 case vect_reduction_def:
643 case vect_induction_def:
644 case vect_nested_cycle:
645 break;
647 default:
648 /* FORNOW: Not supported. */
649 if (dump_enabled_p ())
650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
651 "Build SLP failed: illegal type of def %T\n",
652 oprnd);
653 return -1;
656 oprnd_info->first_dt = dt;
657 oprnd_info->first_op_type = type;
660 if (first)
661 return 0;
663 /* Now match the operand definition types to that of the first stmt. */
664 for (i = 0; i < number_of_oprnds;)
666 if (skip_args[i])
668 ++i;
669 continue;
672 oprnd_info = (*oprnds_info)[i];
673 dt = dts[i];
674 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
675 oprnd = oprnd_info->ops[stmt_num];
676 tree type = TREE_TYPE (oprnd);
678 if (!types_compatible_p (oprnd_info->first_op_type, type))
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "Build SLP failed: different operand types\n");
683 return 1;
686 /* Not first stmt of the group, check that the def-stmt/s match
687 the def-stmt/s of the first stmt. Allow different definition
688 types for reduction chains: the first stmt must be a
689 vect_reduction_def (a phi node), and the rest
690 end in the reduction chain. */
691 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
692 && !(oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && def_stmt_info
696 && !STMT_VINFO_DATA_REF (def_stmt_info)
697 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
698 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
699 || (!STMT_VINFO_DATA_REF (stmt_info)
700 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
701 && ((!def_stmt_info
702 || STMT_VINFO_DATA_REF (def_stmt_info)
703 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
704 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
705 != (oprnd_info->first_dt != vect_reduction_def))))
707 /* Try swapping operands if we got a mismatch. For BB
708 vectorization only in case it will clearly improve things. */
709 if (i == commutative_op && !swapped
710 && (!is_a <bb_vec_info> (vinfo)
711 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
712 dts[i+1])
713 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
714 || vect_def_types_match
715 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_NOTE, vect_location,
719 "trying swapped operands\n");
720 std::swap (dts[i], dts[i+1]);
721 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
722 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
723 std::swap ((*oprnds_info)[i]->ops[stmt_num],
724 (*oprnds_info)[i+1]->ops[stmt_num]);
725 swapped = true;
726 continue;
729 if (is_a <bb_vec_info> (vinfo)
730 && !oprnd_info->any_pattern)
732 /* Now for commutative ops we should see whether we can
733 make the other operand matching. */
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
736 "treating operand as external\n");
737 oprnd_info->first_dt = dt = vect_external_def;
739 else
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "Build SLP failed: different types\n");
744 return 1;
748 /* Make sure to demote the overall operand to external. */
749 if (dt == vect_external_def)
750 oprnd_info->first_dt = vect_external_def;
751 /* For a SLP reduction chain we want to duplicate the reduction to
752 each of the chain members. That gets us a sane SLP graph (still
753 the stmts are not 100% correct wrt the initial values). */
754 else if ((dt == vect_internal_def
755 || dt == vect_reduction_def)
756 && oprnd_info->first_dt == vect_reduction_def
757 && !STMT_VINFO_DATA_REF (stmt_info)
758 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
759 && !STMT_VINFO_DATA_REF (def_stmt_info)
760 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
761 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
763 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
764 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
767 ++i;
770 /* Swap operands. */
771 if (swapped)
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location,
775 "swapped operands to match def types in %G",
776 stmt_info->stmt);
779 return 0;
782 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
783 Return true if we can, meaning that this choice doesn't conflict with
784 existing SLP nodes that use STMT_INFO. */
786 bool
787 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
789 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
790 if (old_vectype)
791 return useless_type_conversion_p (vectype, old_vectype);
793 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
795 /* We maintain the invariant that if any statement in the group is
796 used, all other members of the group have the same vector type. */
797 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
798 stmt_vec_info member_info = first_info;
799 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
800 if (is_pattern_stmt_p (member_info)
801 && !useless_type_conversion_p (vectype,
802 STMT_VINFO_VECTYPE (member_info)))
803 break;
805 if (!member_info)
807 for (member_info = first_info; member_info;
808 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
809 STMT_VINFO_VECTYPE (member_info) = vectype;
810 return true;
813 else if (!is_pattern_stmt_p (stmt_info))
815 STMT_VINFO_VECTYPE (stmt_info) = vectype;
816 return true;
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: incompatible vector"
823 " types for: %G", stmt_info->stmt);
824 dump_printf_loc (MSG_NOTE, vect_location,
825 " old vector type: %T\n", old_vectype);
826 dump_printf_loc (MSG_NOTE, vect_location,
827 " new vector type: %T\n", vectype);
829 return false;
832 /* Return true if call statements CALL1 and CALL2 are similar enough
833 to be combined into the same SLP group. */
835 static bool
836 compatible_calls_p (gcall *call1, gcall *call2)
838 unsigned int nargs = gimple_call_num_args (call1);
839 if (nargs != gimple_call_num_args (call2))
840 return false;
842 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
843 return false;
845 if (gimple_call_internal_p (call1))
847 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
848 TREE_TYPE (gimple_call_lhs (call2))))
849 return false;
850 for (unsigned int i = 0; i < nargs; ++i)
851 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
852 TREE_TYPE (gimple_call_arg (call2, i))))
853 return false;
855 else
857 if (!operand_equal_p (gimple_call_fn (call1),
858 gimple_call_fn (call2), 0))
859 return false;
861 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
862 return false;
864 return true;
867 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
868 caller's attempt to find the vector type in STMT_INFO with the narrowest
869 element type. Return true if VECTYPE is nonnull and if it is valid
870 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
871 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
872 vect_build_slp_tree. */
874 static bool
875 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
876 unsigned int group_size,
877 tree vectype, poly_uint64 *max_nunits)
879 if (!vectype)
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
883 "Build SLP failed: unsupported data-type in %G\n",
884 stmt_info->stmt);
885 /* Fatal mismatch. */
886 return false;
889 /* If populating the vector type requires unrolling then fail
890 before adjusting *max_nunits for basic-block vectorization. */
891 if (is_a <bb_vec_info> (vinfo)
892 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
896 "Build SLP failed: unrolling required "
897 "in basic block SLP\n");
898 /* Fatal mismatch. */
899 return false;
902 /* In case of multiple types we need to detect the smallest type. */
903 vect_update_max_nunits (max_nunits, vectype);
904 return true;
907 /* Verify if the scalar stmts STMTS are isomorphic, require data
908 permutation or are of unsupported types of operation. Return
909 true if they are, otherwise return false and indicate in *MATCHES
910 which stmts are not isomorphic to the first one. If MATCHES[0]
911 is false then this indicates the comparison could not be
912 carried out or the stmts will never be vectorized by SLP.
914 Note COND_EXPR is possibly isomorphic to another one after swapping its
915 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
916 the first stmt by swapping the two operands of comparison; set SWAP[i]
917 to 2 if stmt I is isormorphic to the first stmt by inverting the code
918 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
919 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
921 static bool
922 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
923 vec<stmt_vec_info> stmts, unsigned int group_size,
924 poly_uint64 *max_nunits, bool *matches,
925 bool *two_operators, tree *node_vectype)
927 unsigned int i;
928 stmt_vec_info first_stmt_info = stmts[0];
929 enum tree_code first_stmt_code = ERROR_MARK;
930 enum tree_code alt_stmt_code = ERROR_MARK;
931 enum tree_code rhs_code = ERROR_MARK;
932 enum tree_code first_cond_code = ERROR_MARK;
933 tree lhs;
934 bool need_same_oprnds = false;
935 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
936 optab optab;
937 int icode;
938 machine_mode optab_op2_mode;
939 machine_mode vec_mode;
940 stmt_vec_info first_load = NULL, prev_first_load = NULL;
941 bool first_stmt_load_p = false, load_p = false;
942 bool first_stmt_phi_p = false, phi_p = false;
943 bool maybe_soft_fail = false;
944 tree soft_fail_nunits_vectype = NULL_TREE;
946 /* For every stmt in NODE find its def stmt/s. */
947 stmt_vec_info stmt_info;
948 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
950 gimple *stmt = stmt_info->stmt;
951 swap[i] = 0;
952 matches[i] = false;
954 if (dump_enabled_p ())
955 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
957 /* Fail to vectorize statements marked as unvectorizable, throw
958 or are volatile. */
959 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
960 || stmt_can_throw_internal (cfun, stmt)
961 || gimple_has_volatile_ops (stmt))
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Build SLP failed: unvectorizable statement %G",
966 stmt);
967 /* ??? For BB vectorization we want to commutate operands in a way
968 to shuffle all unvectorizable defs into one operand and have
969 the other still vectorized. The following doesn't reliably
970 work for this though but it's the easiest we can do here. */
971 if (is_a <bb_vec_info> (vinfo) && i != 0)
972 continue;
973 /* Fatal mismatch. */
974 matches[0] = false;
975 return false;
978 lhs = gimple_get_lhs (stmt);
979 if (lhs == NULL_TREE)
981 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Build SLP failed: not GIMPLE_ASSIGN nor "
984 "GIMPLE_CALL %G", stmt);
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
992 tree nunits_vectype;
993 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
994 &nunits_vectype, group_size))
996 if (is_a <bb_vec_info> (vinfo) && i != 0)
997 continue;
998 /* Fatal mismatch. */
999 matches[0] = false;
1000 return false;
1002 /* Record nunits required but continue analysis, producing matches[]
1003 as if nunits was not an issue. This allows splitting of groups
1004 to happen. */
1005 if (nunits_vectype
1006 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1007 nunits_vectype, max_nunits))
1009 gcc_assert (is_a <bb_vec_info> (vinfo));
1010 maybe_soft_fail = true;
1011 soft_fail_nunits_vectype = nunits_vectype;
1014 gcc_assert (vectype);
1016 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1017 if (call_stmt)
1019 rhs_code = CALL_EXPR;
1021 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1022 load_p = true;
1023 else if ((gimple_call_internal_p (call_stmt)
1024 && (!vectorizable_internal_fn_p
1025 (gimple_call_internal_fn (call_stmt))))
1026 || gimple_call_tail_p (call_stmt)
1027 || gimple_call_noreturn_p (call_stmt)
1028 || !gimple_call_nothrow_p (call_stmt)
1029 || gimple_call_chain (call_stmt))
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: unsupported call type %G",
1034 call_stmt);
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1042 else if (gimple_code (stmt) == GIMPLE_PHI)
1044 rhs_code = ERROR_MARK;
1045 phi_p = true;
1047 else
1049 rhs_code = gimple_assign_rhs_code (stmt);
1050 load_p = gimple_vuse (stmt);
1053 /* Check the operation. */
1054 if (i == 0)
1056 *node_vectype = vectype;
1057 first_stmt_code = rhs_code;
1058 first_stmt_load_p = load_p;
1059 first_stmt_phi_p = phi_p;
1061 /* Shift arguments should be equal in all the packed stmts for a
1062 vector shift with scalar shift operand. */
1063 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1064 || rhs_code == LROTATE_EXPR
1065 || rhs_code == RROTATE_EXPR)
1067 vec_mode = TYPE_MODE (vectype);
1069 /* First see if we have a vector/vector shift. */
1070 optab = optab_for_tree_code (rhs_code, vectype,
1071 optab_vector);
1073 if (!optab
1074 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1076 /* No vector/vector shift, try for a vector/scalar shift. */
1077 optab = optab_for_tree_code (rhs_code, vectype,
1078 optab_scalar);
1080 if (!optab)
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "Build SLP failed: no optab.\n");
1085 if (is_a <bb_vec_info> (vinfo) && i != 0)
1086 continue;
1087 /* Fatal mismatch. */
1088 matches[0] = false;
1089 return false;
1091 icode = (int) optab_handler (optab, vec_mode);
1092 if (icode == CODE_FOR_nothing)
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "Build SLP failed: "
1097 "op not supported by target.\n");
1098 if (is_a <bb_vec_info> (vinfo) && i != 0)
1099 continue;
1100 /* Fatal mismatch. */
1101 matches[0] = false;
1102 return false;
1104 optab_op2_mode = insn_data[icode].operand[2].mode;
1105 if (!VECTOR_MODE_P (optab_op2_mode))
1107 need_same_oprnds = true;
1108 first_op1 = gimple_assign_rhs2 (stmt);
1112 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1114 need_same_oprnds = true;
1115 first_op1 = gimple_assign_rhs2 (stmt);
1117 else if (!load_p
1118 && rhs_code == BIT_FIELD_REF)
1120 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1121 if (!is_a <bb_vec_info> (vinfo)
1122 || TREE_CODE (vec) != SSA_NAME
1123 || !operand_equal_p (TYPE_SIZE (vectype),
1124 TYPE_SIZE (TREE_TYPE (vec))))
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Build SLP failed: "
1129 "BIT_FIELD_REF not supported\n");
1130 /* Fatal mismatch. */
1131 matches[0] = false;
1132 return false;
1135 else if (call_stmt
1136 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1138 need_same_oprnds = true;
1139 first_op1 = gimple_call_arg (call_stmt, 1);
1142 else
1144 if (first_stmt_code != rhs_code
1145 && alt_stmt_code == ERROR_MARK)
1146 alt_stmt_code = rhs_code;
1147 if ((first_stmt_code != rhs_code
1148 && (first_stmt_code != IMAGPART_EXPR
1149 || rhs_code != REALPART_EXPR)
1150 && (first_stmt_code != REALPART_EXPR
1151 || rhs_code != IMAGPART_EXPR)
1152 /* Handle mismatches in plus/minus by computing both
1153 and merging the results. */
1154 && !((first_stmt_code == PLUS_EXPR
1155 || first_stmt_code == MINUS_EXPR)
1156 && (alt_stmt_code == PLUS_EXPR
1157 || alt_stmt_code == MINUS_EXPR)
1158 && rhs_code == alt_stmt_code)
1159 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1160 && (first_stmt_code == ARRAY_REF
1161 || first_stmt_code == BIT_FIELD_REF
1162 || first_stmt_code == INDIRECT_REF
1163 || first_stmt_code == COMPONENT_REF
1164 || first_stmt_code == MEM_REF)))
1165 || first_stmt_load_p != load_p
1166 || first_stmt_phi_p != phi_p)
1168 if (dump_enabled_p ())
1170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171 "Build SLP failed: different operation "
1172 "in stmt %G", stmt);
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "original stmt %G", first_stmt_info->stmt);
1176 /* Mismatch. */
1177 continue;
1180 if (need_same_oprnds)
1182 tree other_op1 = (call_stmt
1183 ? gimple_call_arg (call_stmt, 1)
1184 : gimple_assign_rhs2 (stmt));
1185 if (!operand_equal_p (first_op1, other_op1, 0))
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "Build SLP failed: different shift "
1190 "arguments in %G", stmt);
1191 /* Mismatch. */
1192 continue;
1195 if (!load_p
1196 && first_stmt_code == BIT_FIELD_REF
1197 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1198 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1200 if (dump_enabled_p ())
1201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 "Build SLP failed: different BIT_FIELD_REF "
1203 "arguments in %G", stmt);
1204 /* Mismatch. */
1205 continue;
1208 if (!load_p && rhs_code == CALL_EXPR)
1210 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1211 as_a <gcall *> (stmt)))
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "Build SLP failed: different calls in %G",
1216 stmt);
1217 /* Mismatch. */
1218 continue;
1222 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1223 && (gimple_bb (first_stmt_info->stmt)
1224 != gimple_bb (stmt_info->stmt)))
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "Build SLP failed: different BB for PHI "
1229 "or possibly trapping operation in %G", stmt);
1230 /* Mismatch. */
1231 continue;
1234 if (!types_compatible_p (vectype, *node_vectype))
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "Build SLP failed: different vector type "
1239 "in %G", stmt);
1240 /* Mismatch. */
1241 continue;
1245 /* Grouped store or load. */
1246 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1248 if (REFERENCE_CLASS_P (lhs))
1250 /* Store. */
1253 else
1255 /* Load. */
1256 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1257 if (prev_first_load)
1259 /* Check that there are no loads from different interleaving
1260 chains in the same node. */
1261 if (prev_first_load != first_load)
1263 if (dump_enabled_p ())
1264 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1265 vect_location,
1266 "Build SLP failed: different "
1267 "interleaving chains in one node %G",
1268 stmt);
1269 /* Mismatch. */
1270 continue;
1273 else
1274 prev_first_load = first_load;
1276 } /* Grouped access. */
1277 else
1279 if (load_p)
1281 /* Not grouped load. */
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "Build SLP failed: not grouped load %G", stmt);
1286 /* FORNOW: Not grouped loads are not supported. */
1287 if (is_a <bb_vec_info> (vinfo) && i != 0)
1288 continue;
1289 /* Fatal mismatch. */
1290 matches[0] = false;
1291 return false;
1294 /* Not memory operation. */
1295 if (!phi_p
1296 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1297 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1298 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1299 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1300 && rhs_code != VIEW_CONVERT_EXPR
1301 && rhs_code != CALL_EXPR
1302 && rhs_code != BIT_FIELD_REF)
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "Build SLP failed: operation unsupported %G",
1307 stmt);
1308 if (is_a <bb_vec_info> (vinfo) && i != 0)
1309 continue;
1310 /* Fatal mismatch. */
1311 matches[0] = false;
1312 return false;
1315 if (rhs_code == COND_EXPR)
1317 tree cond_expr = gimple_assign_rhs1 (stmt);
1318 enum tree_code cond_code = TREE_CODE (cond_expr);
1319 enum tree_code swap_code = ERROR_MARK;
1320 enum tree_code invert_code = ERROR_MARK;
1322 if (i == 0)
1323 first_cond_code = TREE_CODE (cond_expr);
1324 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1326 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1327 swap_code = swap_tree_comparison (cond_code);
1328 invert_code = invert_tree_comparison (cond_code, honor_nans);
1331 if (first_cond_code == cond_code)
1333 /* Isomorphic can be achieved by swapping. */
1334 else if (first_cond_code == swap_code)
1335 swap[i] = 1;
1336 /* Isomorphic can be achieved by inverting. */
1337 else if (first_cond_code == invert_code)
1338 swap[i] = 2;
1339 else
1341 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "Build SLP failed: different"
1344 " operation %G", stmt);
1345 /* Mismatch. */
1346 continue;
1351 matches[i] = true;
1354 for (i = 0; i < group_size; ++i)
1355 if (!matches[i])
1356 return false;
1358 /* If we allowed a two-operation SLP node verify the target can cope
1359 with the permute we are going to use. */
1360 if (alt_stmt_code != ERROR_MARK
1361 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1363 *two_operators = true;
1366 if (maybe_soft_fail)
1368 unsigned HOST_WIDE_INT const_nunits;
1369 if (!TYPE_VECTOR_SUBPARTS
1370 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1371 || const_nunits > group_size)
1372 matches[0] = false;
1373 else
1375 /* With constant vector elements simulate a mismatch at the
1376 point we need to split. */
1377 unsigned tail = group_size & (const_nunits - 1);
1378 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1380 return false;
1383 return true;
1386 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1387 Note we never remove apart from at destruction time so we do not
1388 need a special value for deleted that differs from empty. */
1389 struct bst_traits
1391 typedef vec <stmt_vec_info> value_type;
1392 typedef vec <stmt_vec_info> compare_type;
1393 static inline hashval_t hash (value_type);
1394 static inline bool equal (value_type existing, value_type candidate);
1395 static inline bool is_empty (value_type x) { return !x.exists (); }
1396 static inline bool is_deleted (value_type x) { return !x.exists (); }
1397 static const bool empty_zero_p = true;
1398 static inline void mark_empty (value_type &x) { x.release (); }
1399 static inline void mark_deleted (value_type &x) { x.release (); }
1400 static inline void remove (value_type &x) { x.release (); }
1402 inline hashval_t
1403 bst_traits::hash (value_type x)
1405 inchash::hash h;
1406 for (unsigned i = 0; i < x.length (); ++i)
1407 h.add_int (gimple_uid (x[i]->stmt));
1408 return h.end ();
1410 inline bool
1411 bst_traits::equal (value_type existing, value_type candidate)
1413 if (existing.length () != candidate.length ())
1414 return false;
1415 for (unsigned i = 0; i < existing.length (); ++i)
1416 if (existing[i] != candidate[i])
1417 return false;
1418 return true;
1421 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1422 but then vec::insert does memmove and that's not compatible with
1423 std::pair. */
1424 struct chain_op_t
1426 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1427 : code (code_), dt (dt_), op (op_) {}
1428 tree_code code;
1429 vect_def_type dt;
1430 tree op;
1433 /* Comparator for sorting associatable chains. */
1435 static int
1436 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1438 auto *op1 = (const chain_op_t *) op1_;
1439 auto *op2 = (const chain_op_t *) op2_;
1440 if (op1->dt != op2->dt)
1441 return (int)op1->dt - (int)op2->dt;
1442 return (int)op1->code - (int)op2->code;
1445 /* Linearize the associatable expression chain at START with the
1446 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1447 filling CHAIN with the result and using WORKLIST as intermediate storage.
1448 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1449 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1450 stmts, starting with START. */
1452 static void
1453 vect_slp_linearize_chain (vec_info *vinfo,
1454 vec<std::pair<tree_code, gimple *> > &worklist,
1455 vec<chain_op_t> &chain,
1456 enum tree_code code, gimple *start,
1457 gimple *&code_stmt, gimple *&alt_code_stmt,
1458 vec<gimple *> *chain_stmts)
1460 /* For each lane linearize the addition/subtraction (or other
1461 uniform associatable operation) expression tree. */
1462 worklist.safe_push (std::make_pair (code, start));
1463 while (!worklist.is_empty ())
1465 auto entry = worklist.pop ();
1466 gassign *stmt = as_a <gassign *> (entry.second);
1467 enum tree_code in_code = entry.first;
1468 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1469 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1470 if (!code_stmt
1471 && gimple_assign_rhs_code (stmt) == code)
1472 code_stmt = stmt;
1473 else if (!alt_code_stmt
1474 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1475 alt_code_stmt = stmt;
1476 if (chain_stmts)
1477 chain_stmts->safe_push (stmt);
1478 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1480 tree op = gimple_op (stmt, opnum);
1481 vect_def_type dt;
1482 stmt_vec_info def_stmt_info;
1483 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1484 gcc_assert (res);
1485 if (dt == vect_internal_def)
1487 stmt_vec_info orig_def_stmt_info = def_stmt_info;
1488 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
1489 if (def_stmt_info != orig_def_stmt_info)
1490 op = gimple_get_lhs (def_stmt_info->stmt);
1492 gimple *use_stmt;
1493 use_operand_p use_p;
1494 if (dt == vect_internal_def
1495 && single_imm_use (op, &use_p, &use_stmt)
1496 && is_gimple_assign (def_stmt_info->stmt)
1497 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1498 || (code == PLUS_EXPR
1499 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1500 == MINUS_EXPR))))
1502 tree_code op_def_code = this_code;
1503 if (op_def_code == MINUS_EXPR && opnum == 1)
1504 op_def_code = PLUS_EXPR;
1505 if (in_code == MINUS_EXPR)
1506 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1507 worklist.safe_push (std::make_pair (op_def_code,
1508 def_stmt_info->stmt));
1510 else
1512 tree_code op_def_code = this_code;
1513 if (op_def_code == MINUS_EXPR && opnum == 1)
1514 op_def_code = PLUS_EXPR;
1515 if (in_code == MINUS_EXPR)
1516 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1517 chain.safe_push (chain_op_t (op_def_code, dt, op));
1523 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1524 simple_hashmap_traits <bst_traits, slp_tree> >
1525 scalar_stmts_to_slp_tree_map_t;
1527 static slp_tree
1528 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1529 vec<stmt_vec_info> stmts, unsigned int group_size,
1530 poly_uint64 *max_nunits,
1531 bool *matches, unsigned *limit, unsigned *tree_size,
1532 scalar_stmts_to_slp_tree_map_t *bst_map);
1534 static slp_tree
1535 vect_build_slp_tree (vec_info *vinfo,
1536 vec<stmt_vec_info> stmts, unsigned int group_size,
1537 poly_uint64 *max_nunits,
1538 bool *matches, unsigned *limit, unsigned *tree_size,
1539 scalar_stmts_to_slp_tree_map_t *bst_map)
1541 if (slp_tree *leader = bst_map->get (stmts))
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1545 !(*leader)->failed ? "" : "failed ", *leader);
1546 if (!(*leader)->failed)
1548 SLP_TREE_REF_COUNT (*leader)++;
1549 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1550 stmts.release ();
1551 return *leader;
1553 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1554 return NULL;
1557 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1558 so we can pick up backedge destinations during discovery. */
1559 slp_tree res = new _slp_tree;
1560 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1561 SLP_TREE_SCALAR_STMTS (res) = stmts;
1562 bst_map->put (stmts.copy (), res);
1564 if (*limit == 0)
1566 if (dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "SLP discovery limit exceeded\n");
1569 /* Mark the node invalid so we can detect those when still in use
1570 as backedge destinations. */
1571 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1572 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1573 res->failed = XNEWVEC (bool, group_size);
1574 memset (res->failed, 0, sizeof (bool) * group_size);
1575 memset (matches, 0, sizeof (bool) * group_size);
1576 return NULL;
1578 --*limit;
1580 if (dump_enabled_p ())
1581 dump_printf_loc (MSG_NOTE, vect_location,
1582 "starting SLP discovery for node %p\n", res);
1584 poly_uint64 this_max_nunits = 1;
1585 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1586 &this_max_nunits,
1587 matches, limit, tree_size, bst_map);
1588 if (!res_)
1590 if (dump_enabled_p ())
1591 dump_printf_loc (MSG_NOTE, vect_location,
1592 "SLP discovery for node %p failed\n", res);
1593 /* Mark the node invalid so we can detect those when still in use
1594 as backedge destinations. */
1595 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1596 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1597 res->failed = XNEWVEC (bool, group_size);
1598 memcpy (res->failed, matches, sizeof (bool) * group_size);
1600 else
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "SLP discovery for node %p succeeded\n", res);
1605 gcc_assert (res_ == res);
1606 res->max_nunits = this_max_nunits;
1607 vect_update_max_nunits (max_nunits, this_max_nunits);
1608 /* Keep a reference for the bst_map use. */
1609 SLP_TREE_REF_COUNT (res)++;
1611 return res_;
1614 /* Helper for building an associated SLP node chain. */
1616 static void
1617 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1618 slp_tree op0, slp_tree op1,
1619 stmt_vec_info oper1, stmt_vec_info oper2,
1620 vec<std::pair<unsigned, unsigned> > lperm)
1622 unsigned group_size = SLP_TREE_LANES (op1);
1624 slp_tree child1 = new _slp_tree;
1625 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1626 SLP_TREE_VECTYPE (child1) = vectype;
1627 SLP_TREE_LANES (child1) = group_size;
1628 SLP_TREE_CHILDREN (child1).create (2);
1629 SLP_TREE_CHILDREN (child1).quick_push (op0);
1630 SLP_TREE_CHILDREN (child1).quick_push (op1);
1631 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1633 slp_tree child2 = new _slp_tree;
1634 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1635 SLP_TREE_VECTYPE (child2) = vectype;
1636 SLP_TREE_LANES (child2) = group_size;
1637 SLP_TREE_CHILDREN (child2).create (2);
1638 SLP_TREE_CHILDREN (child2).quick_push (op0);
1639 SLP_TREE_REF_COUNT (op0)++;
1640 SLP_TREE_CHILDREN (child2).quick_push (op1);
1641 SLP_TREE_REF_COUNT (op1)++;
1642 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1644 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1645 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1646 SLP_TREE_VECTYPE (perm) = vectype;
1647 SLP_TREE_LANES (perm) = group_size;
1648 /* ??? We should set this NULL but that's not expected. */
1649 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1650 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1651 SLP_TREE_CHILDREN (perm).quick_push (child1);
1652 SLP_TREE_CHILDREN (perm).quick_push (child2);
1655 /* Recursively build an SLP tree starting from NODE.
1656 Fail (and return a value not equal to zero) if def-stmts are not
1657 isomorphic, require data permutation or are of unsupported types of
1658 operation. Otherwise, return 0.
1659 The value returned is the depth in the SLP tree where a mismatch
1660 was found. */
1662 static slp_tree
1663 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1664 vec<stmt_vec_info> stmts, unsigned int group_size,
1665 poly_uint64 *max_nunits,
1666 bool *matches, unsigned *limit, unsigned *tree_size,
1667 scalar_stmts_to_slp_tree_map_t *bst_map)
1669 unsigned nops, i, this_tree_size = 0;
1670 poly_uint64 this_max_nunits = *max_nunits;
1672 matches[0] = false;
1674 stmt_vec_info stmt_info = stmts[0];
1675 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1676 nops = gimple_call_num_args (stmt);
1677 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1679 nops = gimple_num_ops (stmt) - 1;
1680 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1681 nops++;
1683 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1684 nops = gimple_phi_num_args (phi);
1685 else
1686 return NULL;
1688 /* If the SLP node is a PHI (induction or reduction), terminate
1689 the recursion. */
1690 bool *skip_args = XALLOCAVEC (bool, nops);
1691 memset (skip_args, 0, sizeof (bool) * nops);
1692 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1693 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1695 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1696 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1697 group_size);
1698 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1699 max_nunits))
1700 return NULL;
1702 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1703 if (def_type == vect_induction_def)
1705 /* Induction PHIs are not cycles but walk the initial
1706 value. Only for inner loops through, for outer loops
1707 we need to pick up the value from the actual PHIs
1708 to more easily support peeling and epilogue vectorization. */
1709 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1710 if (!nested_in_vect_loop_p (loop, stmt_info))
1711 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1712 else
1713 loop = loop->inner;
1714 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1716 else if (def_type == vect_reduction_def
1717 || def_type == vect_double_reduction_def
1718 || def_type == vect_nested_cycle)
1720 /* Else def types have to match. */
1721 stmt_vec_info other_info;
1722 bool all_same = true;
1723 FOR_EACH_VEC_ELT (stmts, i, other_info)
1725 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1726 return NULL;
1727 if (other_info != stmt_info)
1728 all_same = false;
1730 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1731 /* Reduction initial values are not explicitely represented. */
1732 if (!nested_in_vect_loop_p (loop, stmt_info))
1733 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1734 /* Reduction chain backedge defs are filled manually.
1735 ??? Need a better way to identify a SLP reduction chain PHI.
1736 Or a better overall way to SLP match those. */
1737 if (all_same && def_type == vect_reduction_def)
1738 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1740 else if (def_type != vect_internal_def)
1741 return NULL;
1745 bool two_operators = false;
1746 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1747 tree vectype = NULL_TREE;
1748 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1749 &this_max_nunits, matches, &two_operators,
1750 &vectype))
1751 return NULL;
1753 /* If the SLP node is a load, terminate the recursion unless masked. */
1754 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1755 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1757 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1759 /* Masked load. */
1760 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1761 nops = 1;
1763 else
1765 *max_nunits = this_max_nunits;
1766 (*tree_size)++;
1767 node = vect_create_new_slp_node (node, stmts, 0);
1768 SLP_TREE_VECTYPE (node) = vectype;
1769 /* And compute the load permutation. Whether it is actually
1770 a permutation depends on the unrolling factor which is
1771 decided later. */
1772 vec<unsigned> load_permutation;
1773 int j;
1774 stmt_vec_info load_info;
1775 load_permutation.create (group_size);
1776 stmt_vec_info first_stmt_info
1777 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1778 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1780 int load_place = vect_get_place_in_interleaving_chain
1781 (load_info, first_stmt_info);
1782 gcc_assert (load_place != -1);
1783 load_permutation.safe_push (load_place);
1785 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1786 return node;
1789 else if (gimple_assign_single_p (stmt_info->stmt)
1790 && !gimple_vuse (stmt_info->stmt)
1791 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1793 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1794 the same SSA name vector of a compatible type to vectype. */
1795 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1796 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1797 stmt_vec_info estmt_info;
1798 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1800 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1801 tree bfref = gimple_assign_rhs1 (estmt);
1802 HOST_WIDE_INT lane;
1803 if (!known_eq (bit_field_size (bfref),
1804 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1805 || !constant_multiple_p (bit_field_offset (bfref),
1806 bit_field_size (bfref), &lane))
1808 lperm.release ();
1809 return NULL;
1811 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1813 slp_tree vnode = vect_create_new_slp_node (vNULL);
1814 /* ??? We record vectype here but we hide eventually necessary
1815 punning and instead rely on code generation to materialize
1816 VIEW_CONVERT_EXPRs as necessary. We instead should make
1817 this explicit somehow. */
1818 SLP_TREE_VECTYPE (vnode) = vectype;
1819 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1820 /* We are always building a permutation node even if it is an identity
1821 permute to shield the rest of the vectorizer from the odd node
1822 representing an actual vector without any scalar ops.
1823 ??? We could hide it completely with making the permute node
1824 external? */
1825 node = vect_create_new_slp_node (node, stmts, 1);
1826 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1827 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1828 SLP_TREE_VECTYPE (node) = vectype;
1829 SLP_TREE_CHILDREN (node).quick_push (vnode);
1830 return node;
1832 /* When discovery reaches an associatable operation see whether we can
1833 improve that to match up lanes in a way superior to the operand
1834 swapping code which at most looks at two defs.
1835 ??? For BB vectorization we cannot do the brute-force search
1836 for matching as we can succeed by means of builds from scalars
1837 and have no good way to "cost" one build against another. */
1838 else if (is_a <loop_vec_info> (vinfo)
1839 /* ??? We don't handle !vect_internal_def defs below. */
1840 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1841 && is_gimple_assign (stmt_info->stmt)
1842 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1843 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1844 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1845 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1846 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1848 /* See if we have a chain of (mixed) adds or subtracts or other
1849 associatable ops. */
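/* For illustration (hypothetical stmts): a lane computing
   t = ((a - b) + c) - d linearizes to the chain
   { (PLUS, a), (MINUS, b), (PLUS, c), (MINUS, d) } with CODE
   canonicalized to PLUS_EXPR, so lanes mixing adds and subtracts can
   still be matched up element-wise.  */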
1850 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1851 if (code == MINUS_EXPR)
1852 code = PLUS_EXPR;
1853 stmt_vec_info other_op_stmt_info = NULL;
1854 stmt_vec_info op_stmt_info = NULL;
1855 unsigned chain_len = 0;
1856 auto_vec<chain_op_t> chain;
1857 auto_vec<std::pair<tree_code, gimple *> > worklist;
1858 auto_vec<vec<chain_op_t> > chains (group_size);
1859 auto_vec<slp_tree, 4> children;
1860 bool hard_fail = true;
1861 for (unsigned lane = 0; lane < group_size; ++lane)
1863 /* For each lane linearize the addition/subtraction (or other
1864 uniform associatable operation) expression tree. */
1865 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1866 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1867 stmts[lane]->stmt, op_stmt, other_op_stmt,
1868 NULL);
1869 if (!op_stmt_info && op_stmt)
1870 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1871 if (!other_op_stmt_info && other_op_stmt)
1872 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1873 if (chain.length () == 2)
1875 /* In a chain of just two elements resort to the regular
1876 operand swapping scheme. If we run into a length
1877 mismatch still hard-FAIL. */
1878 if (chain_len == 0)
1879 hard_fail = false;
1880 else
1882 matches[lane] = false;
1883 /* ??? We might want to process the other lanes, but
1884 make sure to not give false matching hints to the
1885 caller for lanes we did not process. */
1886 if (lane != group_size - 1)
1887 matches[0] = false;
1889 break;
1891 else if (chain_len == 0)
1892 chain_len = chain.length ();
1893 else if (chain.length () != chain_len)
1895 /* ??? Here we could slip in magic to compensate with
1896 neutral operands. */
1897 matches[lane] = false;
1898 if (lane != group_size - 1)
1899 matches[0] = false;
1900 break;
1902 chains.quick_push (chain.copy ());
1903 chain.truncate (0);
1905 if (chains.length () == group_size
1906 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1907 && op_stmt_info)
1909 /* Now we have a set of chains with the same length. */
1910 /* 1. pre-sort according to def_type and operation. */
1911 for (unsigned lane = 0; lane < group_size; ++lane)
1912 chains[lane].stablesort (dt_sort_cmp, vinfo);
1913 if (dump_enabled_p ())
1915 dump_printf_loc (MSG_NOTE, vect_location,
1916 "pre-sorted chains of %s\n",
1917 get_tree_code_name (code));
1918 for (unsigned lane = 0; lane < group_size; ++lane)
1920 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1921 dump_printf (MSG_NOTE, "%s %T ",
1922 get_tree_code_name (chains[lane][opnum].code),
1923 chains[lane][opnum].op);
1924 dump_printf (MSG_NOTE, "\n");
1927 /* 2. try to build children nodes, associating as necessary. */
1928 for (unsigned n = 0; n < chain_len; ++n)
1930 vect_def_type dt = chains[0][n].dt;
1931 unsigned lane;
1932 for (lane = 0; lane < group_size; ++lane)
1933 if (chains[lane][n].dt != dt)
1935 if (dt == vect_constant_def
1936 && chains[lane][n].dt == vect_external_def)
1937 dt = vect_external_def;
1938 else if (dt == vect_external_def
1939 && chains[lane][n].dt == vect_constant_def)
1941 else
1942 break;
1944 if (lane != group_size)
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_NOTE, vect_location,
1948 "giving up on chain due to mismatched "
1949 "def types\n");
1950 matches[lane] = false;
1951 if (lane != group_size - 1)
1952 matches[0] = false;
1953 goto out;
1955 if (dt == vect_constant_def
1956 || dt == vect_external_def)
1958 /* We can always build those. Might want to sort last
1959 or defer building. */
1960 vec<tree> ops;
1961 ops.create (group_size);
1962 for (lane = 0; lane < group_size; ++lane)
1963 ops.quick_push (chains[lane][n].op);
1964 slp_tree child = vect_create_new_slp_node (ops);
1965 SLP_TREE_DEF_TYPE (child) = dt;
1966 children.safe_push (child);
1968 else if (dt != vect_internal_def)
1970 /* Not sure, we might need something special.
1971 gcc.dg/vect/pr96854.c,
1972 gfortran.dg/vect/fast-math-pr37021.f90
1973 and gfortran.dg/vect/pr61171.f trigger. */
1974 /* Soft-fail for now. */
1975 hard_fail = false;
1976 goto out;
1978 else
1980 vec<stmt_vec_info> op_stmts;
1981 op_stmts.create (group_size);
1982 slp_tree child = NULL;
1983 /* Brute-force our way. We have to consider a lane
1984 failing after fixing an earlier fail up in the
1985 SLP discovery recursion. So track the current
1986 permute per lane. */
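/* For illustration: if lane 2 fails to match for operand N we swap
   chains[2][N] with chains[2][N + perms[2] + 1] and retry discovery;
   perms[2] counts how many candidates lane 2 already tried so the
   search terminates once its chain is exhausted.  */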
1987 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1988 memset (perms, 0, sizeof (unsigned) * group_size);
1991 op_stmts.truncate (0);
1992 for (lane = 0; lane < group_size; ++lane)
1993 op_stmts.quick_push
1994 (vinfo->lookup_def (chains[lane][n].op));
1995 child = vect_build_slp_tree (vinfo, op_stmts,
1996 group_size, &this_max_nunits,
1997 matches, limit,
1998 &this_tree_size, bst_map);
1999 /* ??? We're likely getting too many fatal mismatches
2000 here so maybe we want to ignore them (but then we
2001 have no idea which lanes fatally mismatched). */
2002 if (child || !matches[0])
2003 break;
2004 /* Swap another lane we have not yet matched up into
2005 lanes that did not match. If we run out of
2006 permute possibilities for a lane terminate the
2007 search. */
2008 bool term = false;
2009 for (lane = 1; lane < group_size; ++lane)
2010 if (!matches[lane])
2012 if (n + perms[lane] + 1 == chain_len)
2014 term = true;
2015 break;
2017 std::swap (chains[lane][n],
2018 chains[lane][n + perms[lane] + 1]);
2019 perms[lane]++;
2021 if (term)
2022 break;
2024 while (1);
2025 if (!child)
2027 if (dump_enabled_p ())
2028 dump_printf_loc (MSG_NOTE, vect_location,
2029 "failed to match up op %d\n", n);
2030 op_stmts.release ();
2031 matches[lane] = false;
2032 if (lane != group_size - 1)
2033 matches[0] = false;
2034 goto out;
2036 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "matched up op %d to\n", n);
2040 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2042 children.safe_push (child);
2045 /* 3. build SLP nodes to combine the chain. */
2046 for (unsigned lane = 0; lane < group_size; ++lane)
2047 if (chains[lane][0].code != code)
2049 /* See if there's any alternate all-PLUS entry. */
2050 unsigned n;
2051 for (n = 1; n < chain_len; ++n)
2053 for (lane = 0; lane < group_size; ++lane)
2054 if (chains[lane][n].code != code)
2055 break;
2056 if (lane == group_size)
2057 break;
2059 if (n != chain_len)
2061 /* Swap that in at first position. */
2062 std::swap (children[0], children[n]);
2063 for (lane = 0; lane < group_size; ++lane)
2064 std::swap (chains[lane][0], chains[lane][n]);
2066 else
2068 /* ??? When this triggers and we end up with two
2069 vect_constant/external_def operands up front, things break (ICE)
2070 spectacularly when finding an insertion place for the
2071 all-constant op. We should have a fully
2072 vect_internal_def operand though(?) so we can swap
2073 that into first place and then prepend the all-zero
2074 constant. */
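/* For illustration (hypothetical stmts): with lanes computing
   -a0 + b0 and a1 - b1 no position is an add in every lane, so a
   leading constant zero is prepended and the lanes are combined below
   as (0 - a0) + b0 and (0 + a1) - b1.  */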
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 "inserting constant zero to compensate "
2078 "for (partially) negated first "
2079 "operand\n");
2080 chain_len++;
2081 for (lane = 0; lane < group_size; ++lane)
2082 chains[lane].safe_insert
2083 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2084 vec<tree> zero_ops;
2085 zero_ops.create (group_size);
2086 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2087 for (lane = 1; lane < group_size; ++lane)
2088 zero_ops.quick_push (zero_ops[0]);
2089 slp_tree zero = vect_create_new_slp_node (zero_ops);
2090 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2091 children.safe_insert (0, zero);
2093 break;
2095 for (unsigned i = 1; i < children.length (); ++i)
2097 slp_tree op0 = children[i - 1];
2098 slp_tree op1 = children[i];
2099 bool this_two_op = false;
2100 for (unsigned lane = 0; lane < group_size; ++lane)
2101 if (chains[lane][i].code != chains[0][i].code)
2103 this_two_op = true;
2104 break;
2106 slp_tree child;
2107 if (i == children.length () - 1)
2108 child = vect_create_new_slp_node (node, stmts, 2);
2109 else
2110 child = vect_create_new_slp_node (2, ERROR_MARK);
2111 if (this_two_op)
2113 vec<std::pair<unsigned, unsigned> > lperm;
2114 lperm.create (group_size);
2115 for (unsigned lane = 0; lane < group_size; ++lane)
2116 lperm.quick_push (std::make_pair
2117 (chains[lane][i].code != chains[0][i].code, lane));
2118 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2119 (chains[0][i].code == code
2120 ? op_stmt_info
2121 : other_op_stmt_info),
2122 (chains[0][i].code == code
2123 ? other_op_stmt_info
2124 : op_stmt_info),
2125 lperm);
2127 else
2129 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2130 SLP_TREE_VECTYPE (child) = vectype;
2131 SLP_TREE_LANES (child) = group_size;
2132 SLP_TREE_CHILDREN (child).quick_push (op0);
2133 SLP_TREE_CHILDREN (child).quick_push (op1);
2134 SLP_TREE_REPRESENTATIVE (child)
2135 = (chains[0][i].code == code
2136 ? op_stmt_info : other_op_stmt_info);
2138 children[i] = child;
2140 *tree_size += this_tree_size + 1;
2141 *max_nunits = this_max_nunits;
2142 while (!chains.is_empty ())
2143 chains.pop ().release ();
2144 return node;
2146 out:
2147 while (!children.is_empty ())
2148 vect_free_slp_tree (children.pop ());
2149 while (!chains.is_empty ())
2150 chains.pop ().release ();
2151 /* Hard-fail, otherwise we might run into quadratic processing of the
2152 chains starting one stmt into the chain again. */
2153 if (hard_fail)
2154 return NULL;
2155 /* Fall thru to normal processing. */
2158 /* Get at the operands, verifying they are compatible. */
2159 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2160 slp_oprnd_info oprnd_info;
2161 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2163 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2164 stmts, i, &oprnds_info);
2165 if (res != 0)
2166 matches[(res == -1) ? 0 : i] = false;
2167 if (!matches[0])
2168 break;
2170 for (i = 0; i < group_size; ++i)
2171 if (!matches[i])
2173 vect_free_oprnd_info (oprnds_info);
2174 return NULL;
2176 swap = NULL;
2178 auto_vec<slp_tree, 4> children;
2180 stmt_info = stmts[0];
2182 /* Create SLP_TREE nodes for the definition node/s. */
2183 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2185 slp_tree child;
2186 unsigned int j;
2188 /* We're skipping certain operands from processing, for example
2189 outer loop reduction initial defs. */
2190 if (skip_args[i])
2192 children.safe_push (NULL);
2193 continue;
2196 if (oprnd_info->first_dt == vect_uninitialized_def)
2198 /* COND_EXPRs have one operand too many when the condition
2199 is an SSA name. */
2200 gcc_assert (i == 3 && nops == 4);
2201 continue;
2204 if (is_a <bb_vec_info> (vinfo)
2205 && oprnd_info->first_dt == vect_internal_def
2206 && !oprnd_info->any_pattern)
2208 /* For BB vectorization, if all defs are the same do not
2209 bother to continue the build along the single-lane
2210 graph but use a splat of the scalar value. */
2211 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2212 for (j = 1; j < group_size; ++j)
2213 if (oprnd_info->def_stmts[j] != first_def)
2214 break;
2215 if (j == group_size
2216 /* But avoid doing this for loads where we may be
2217 able to CSE things, unless the stmt is not
2218 vectorizable. */
2219 && (!STMT_VINFO_VECTORIZABLE (first_def)
2220 || !gimple_vuse (first_def->stmt)))
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "Using a splat of the uniform operand\n");
2225 oprnd_info->first_dt = vect_external_def;
2229 if (oprnd_info->first_dt == vect_external_def
2230 || oprnd_info->first_dt == vect_constant_def)
2232 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2233 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2234 oprnd_info->ops = vNULL;
2235 children.safe_push (invnode);
2236 continue;
2239 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2240 group_size, &this_max_nunits,
2241 matches, limit,
2242 &this_tree_size, bst_map)) != NULL)
2244 oprnd_info->def_stmts = vNULL;
2245 children.safe_push (child);
2246 continue;
2249 /* If the SLP build for operand zero failed and operand zero
2250 and one can be commuted try that for the scalar stmts
2251 that failed the match. */
2252 if (i == 0
2253 /* A first scalar stmt mismatch signals a fatal mismatch. */
2254 && matches[0]
2255 /* ??? For COND_EXPRs we can swap the comparison operands
2256 as well as the arms under some constraints. */
2257 && nops == 2
2258 && oprnds_info[1]->first_dt == vect_internal_def
2259 && is_gimple_assign (stmt_info->stmt)
2260 /* Swapping operands for reductions breaks assumptions later on. */
2261 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2262 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2264 /* See whether we can swap the matching or the non-matching
2265 stmt operands. */
2266 bool swap_not_matching = true;
2269 for (j = 0; j < group_size; ++j)
2271 if (matches[j] != !swap_not_matching)
2272 continue;
2273 stmt_vec_info stmt_info = stmts[j];
2274 /* Verify if we can swap operands of this stmt. */
2275 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2276 if (!stmt
2277 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2279 if (!swap_not_matching)
2280 goto fail;
2281 swap_not_matching = false;
2282 break;
2286 while (j != group_size);
2288 /* Swap mismatched definition stmts. */
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_NOTE, vect_location,
2291 "Re-trying with swapped operands of stmts ");
2292 for (j = 0; j < group_size; ++j)
2293 if (matches[j] == !swap_not_matching)
2295 std::swap (oprnds_info[0]->def_stmts[j],
2296 oprnds_info[1]->def_stmts[j]);
2297 std::swap (oprnds_info[0]->ops[j],
2298 oprnds_info[1]->ops[j]);
2299 if (dump_enabled_p ())
2300 dump_printf (MSG_NOTE, "%d ", j);
2302 if (dump_enabled_p ())
2303 dump_printf (MSG_NOTE, "\n");
2304 /* And try again with scratch 'matches' ... */
2305 bool *tem = XALLOCAVEC (bool, group_size);
2306 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2307 group_size, &this_max_nunits,
2308 tem, limit,
2309 &this_tree_size, bst_map)) != NULL)
2311 oprnd_info->def_stmts = vNULL;
2312 children.safe_push (child);
2313 continue;
2316 fail:
2318 /* If the SLP build failed and we analyze a basic-block
2319 simply treat nodes we fail to build as externally defined
2320 (and thus build vectors from the scalar defs).
2321 The cost model will reject outright expensive cases.
2322 ??? This doesn't treat cases where permutation ultimately
2323 fails (or we don't try permutation below). Ideally we'd
2324 even compute a permutation that will end up with the maximum
2325 SLP tree size... */
2326 if (is_a <bb_vec_info> (vinfo)
2327 /* ??? Rejecting patterns this way doesn't work. We'd have to
2328 do extra work to cancel the pattern so the uses see the
2329 scalar version. */
2330 && !is_pattern_stmt_p (stmt_info)
2331 && !oprnd_info->any_pattern)
2333 /* But if there's a leading vector-sized set of matching stmts
2334 fail here so we can split the group. This matches the condition
2335 vect_analyze_slp_instance uses. */
2336 /* ??? We might want to split here and combine the results to support
2337 multiple vector sizes better. */
2338 for (j = 0; j < group_size; ++j)
2339 if (!matches[j])
2340 break;
2341 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "Building vector operands from scalars\n");
2346 this_tree_size++;
2347 child = vect_create_new_slp_node (oprnd_info->ops);
2348 children.safe_push (child);
2349 oprnd_info->ops = vNULL;
2350 continue;
2354 gcc_assert (child == NULL);
2355 FOR_EACH_VEC_ELT (children, j, child)
2356 if (child)
2357 vect_free_slp_tree (child);
2358 vect_free_oprnd_info (oprnds_info);
2359 return NULL;
2362 vect_free_oprnd_info (oprnds_info);
2364 /* If all children of this node are built up from uniform scalars,
2365 or if building it requires more than one possibly expensive vector
2366 construction, throw the node away and build it up from scalars
2367 instead. The exception is the SLP node for the vector store. */
2368 if (is_a <bb_vec_info> (vinfo)
2369 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2370 /* ??? Rejecting patterns this way doesn't work. We'd have to
2371 do extra work to cancel the pattern so the uses see the
2372 scalar version. */
2373 && !is_pattern_stmt_p (stmt_info))
2375 slp_tree child;
2376 unsigned j;
2377 bool all_uniform_p = true;
2378 unsigned n_vector_builds = 0;
2379 FOR_EACH_VEC_ELT (children, j, child)
2381 if (!child)
2383 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2384 all_uniform_p = false;
2385 else if (!vect_slp_tree_uniform_p (child))
2387 all_uniform_p = false;
2388 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2389 n_vector_builds++;
2392 if (all_uniform_p
2393 || n_vector_builds > 1
2394 || (n_vector_builds == children.length ()
2395 && is_a <gphi *> (stmt_info->stmt)))
2397 /* Roll back. */
2398 matches[0] = false;
2399 FOR_EACH_VEC_ELT (children, j, child)
2400 if (child)
2401 vect_free_slp_tree (child);
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_NOTE, vect_location,
2405 "Building parent vector operands from "
2406 "scalars instead\n");
2407 return NULL;
2411 *tree_size += this_tree_size + 1;
2412 *max_nunits = this_max_nunits;
2414 if (two_operators)
2416 /* ??? We'd likely want to either cache in bst_map something like
2417 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2418 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2419 explicit stmts to put in so the keying on 'stmts' doesn't
2420 work (but we have the same issue with nodes that use 'ops'). */
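/* For illustration (hypothetical stmts): for the group
   { a0+b0, a1-b1, a2+b2, a3-b3 } node ONE gets code PLUS_EXPR, node
   TWO gets MINUS_EXPR and the lane permutation built below is
   { (0,0), (1,1), (0,2), (1,3) }, selecting lane i from ONE for the
   adds and from TWO for the subtracts.  */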
2421 slp_tree one = new _slp_tree;
2422 slp_tree two = new _slp_tree;
2423 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2424 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2425 SLP_TREE_VECTYPE (one) = vectype;
2426 SLP_TREE_VECTYPE (two) = vectype;
2427 SLP_TREE_CHILDREN (one).safe_splice (children);
2428 SLP_TREE_CHILDREN (two).safe_splice (children);
2429 slp_tree child;
2430 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2431 SLP_TREE_REF_COUNT (child)++;
2433 /* Here we record the original defs since this
2434 node represents the final lane configuration. */
2435 node = vect_create_new_slp_node (node, stmts, 2);
2436 SLP_TREE_VECTYPE (node) = vectype;
2437 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2438 SLP_TREE_CHILDREN (node).quick_push (one);
2439 SLP_TREE_CHILDREN (node).quick_push (two);
2440 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2441 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2442 enum tree_code ocode = ERROR_MARK;
2443 stmt_vec_info ostmt_info;
2444 unsigned j = 0;
2445 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2447 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2448 if (gimple_assign_rhs_code (ostmt) != code0)
2450 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2451 ocode = gimple_assign_rhs_code (ostmt);
2452 j = i;
2454 else
2455 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2457 SLP_TREE_CODE (one) = code0;
2458 SLP_TREE_CODE (two) = ocode;
2459 SLP_TREE_LANES (one) = stmts.length ();
2460 SLP_TREE_LANES (two) = stmts.length ();
2461 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2462 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2463 return node;
2466 node = vect_create_new_slp_node (node, stmts, nops);
2467 SLP_TREE_VECTYPE (node) = vectype;
2468 SLP_TREE_CHILDREN (node).splice (children);
2469 return node;
2472 /* Dump a single SLP tree NODE. */
2474 static void
2475 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2476 slp_tree node)
2478 unsigned i, j;
2479 slp_tree child;
2480 stmt_vec_info stmt_info;
2481 tree op;
2483 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2484 dump_user_location_t user_loc = loc.get_user_location ();
2485 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2486 SLP_TREE_DEF_TYPE (node) == vect_external_def
2487 ? " (external)"
2488 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2489 ? " (constant)"
2490 : ""), node,
2491 estimated_poly_value (node->max_nunits),
2492 SLP_TREE_REF_COUNT (node));
2493 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2495 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2496 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2497 else
2498 dump_printf_loc (metadata, user_loc, "op template: %G",
2499 SLP_TREE_REPRESENTATIVE (node)->stmt);
2501 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2502 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2503 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2504 else
2506 dump_printf_loc (metadata, user_loc, "\t{ ");
2507 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2508 dump_printf (metadata, "%T%s ", op,
2509 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2510 dump_printf (metadata, "}\n");
2512 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2514 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2515 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2516 dump_printf (dump_kind, " %u", j);
2517 dump_printf (dump_kind, " }\n");
2519 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2521 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2522 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2523 dump_printf (dump_kind, " %u[%u]",
2524 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2525 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2526 dump_printf (dump_kind, " }\n");
2528 if (SLP_TREE_CHILDREN (node).is_empty ())
2529 return;
2530 dump_printf_loc (metadata, user_loc, "\tchildren");
2531 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2532 dump_printf (dump_kind, " %p", (void *)child);
2533 dump_printf (dump_kind, "\n");
2536 DEBUG_FUNCTION void
2537 debug (slp_tree node)
2539 debug_dump_context ctx;
2540 vect_print_slp_tree (MSG_NOTE,
2541 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2542 node);
2545 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2547 static void
2548 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2549 slp_tree node, hash_set<slp_tree> &visited)
2551 unsigned i;
2552 slp_tree child;
2554 if (visited.add (node))
2555 return;
2557 vect_print_slp_tree (dump_kind, loc, node);
2559 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2560 if (child)
2561 vect_print_slp_graph (dump_kind, loc, child, visited);
2564 static void
2565 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2566 slp_tree entry)
2568 hash_set<slp_tree> visited;
2569 vect_print_slp_graph (dump_kind, loc, entry, visited);
2572 /* Mark the tree rooted at NODE with PURE_SLP. */
2574 static void
2575 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2577 int i;
2578 stmt_vec_info stmt_info;
2579 slp_tree child;
2581 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2582 return;
2584 if (visited.add (node))
2585 return;
2587 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2588 STMT_SLP_TYPE (stmt_info) = pure_slp;
2590 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2591 if (child)
2592 vect_mark_slp_stmts (child, visited);
2595 static void
2596 vect_mark_slp_stmts (slp_tree node)
2598 hash_set<slp_tree> visited;
2599 vect_mark_slp_stmts (node, visited);
2602 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2604 static void
2605 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2607 int i;
2608 stmt_vec_info stmt_info;
2609 slp_tree child;
2611 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2612 return;
2614 if (visited.add (node))
2615 return;
2617 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2619 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2620 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2621 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2624 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2625 if (child)
2626 vect_mark_slp_stmts_relevant (child, visited);
2629 static void
2630 vect_mark_slp_stmts_relevant (slp_tree node)
2632 hash_set<slp_tree> visited;
2633 vect_mark_slp_stmts_relevant (node, visited);
2637 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2639 static void
2640 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2641 hash_set<slp_tree> &visited)
2643 if (!node || visited.add (node))
2644 return;
2646 if (SLP_TREE_CHILDREN (node).length () == 0)
2648 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2649 return;
2650 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2651 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2652 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2653 loads.safe_push (node);
2655 else
2657 unsigned i;
2658 slp_tree child;
2659 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2660 vect_gather_slp_loads (loads, child, visited);
2667 /* Find the last scalar stmt in NODE. */
2667 stmt_vec_info
2668 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2670 stmt_vec_info last = NULL;
2671 stmt_vec_info stmt_vinfo;
2673 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2675 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2676 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2679 return last;
2682 /* Find the first stmt in NODE. */
2684 stmt_vec_info
2685 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2687 stmt_vec_info first = NULL;
2688 stmt_vec_info stmt_vinfo;
2690 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2692 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2693 if (!first
2694 || get_later_stmt (stmt_vinfo, first) == first)
2695 first = stmt_vinfo;
2698 return first;
2701 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2702 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2703 (also containing the first GROUP1_SIZE stmts, since stores are
2704 consecutive), the second containing the remainder.
2705 Return the first stmt in the second group. */
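/* For illustration (hypothetical sizes): splitting a group of 8 stores
   with GROUP1_SIZE 4 leaves the first group with elements 0-3 and its
   DR_GROUP_GAP increased by 4 to skip over the second group, while the
   second group covers elements 4-7 with DR_GROUP_GAP set to the
   original gap plus 4.  */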
2707 static stmt_vec_info
2708 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2710 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2711 gcc_assert (group1_size > 0);
2712 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2713 gcc_assert (group2_size > 0);
2714 DR_GROUP_SIZE (first_vinfo) = group1_size;
2716 stmt_vec_info stmt_info = first_vinfo;
2717 for (unsigned i = group1_size; i > 1; i--)
2719 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2720 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2722 /* STMT is now the last element of the first group. */
2723 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2724 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2726 DR_GROUP_SIZE (group2) = group2_size;
2727 for (stmt_info = group2; stmt_info;
2728 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2730 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2731 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2734 /* For the second group, the DR_GROUP_GAP is that before the original group,
2735 plus skipping over the first vector. */
2736 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2738 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2739 DR_GROUP_GAP (first_vinfo) += group2_size;
2741 if (dump_enabled_p ())
2742 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2743 group1_size, group2_size);
2745 return group2;
2748 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2749 statements and a vector of NUNITS elements. */
2751 static poly_uint64
2752 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2754 return exact_div (common_multiple (nunits, group_size), group_size);
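/* For illustration (assuming constant sizes): with NUNITS 4 and
   GROUP_SIZE 6 the common multiple is 12, giving an unrolling factor
   of 12 / 6 = 2; with GROUP_SIZE 8 the common multiple is 8 and the
   factor is 1.  */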
2757 /* Helper that checks to see if a node is a load node. */
2759 static inline bool
2760 vect_is_slp_load_node (slp_tree root)
2762 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2763 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2764 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2768 /* Helper function of optimize_load_redistribution that performs the operation
2769 recursively. */
2771 static slp_tree
2772 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2773 vec_info *vinfo, unsigned int group_size,
2774 hash_map<slp_tree, slp_tree> *load_map,
2775 slp_tree root)
2777 if (slp_tree *leader = load_map->get (root))
2778 return *leader;
2780 slp_tree node;
2781 unsigned i;
2783 /* For now, we don't know anything about externals so do not do anything. */
2784 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2785 return NULL;
2786 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2788 /* First convert this node into a load node and add it to the leaves
2789 list and flatten the permute from a lane to a load one. If it's
2790 unneeded it will be elided later. */
2791 vec<stmt_vec_info> stmts;
2792 stmts.create (SLP_TREE_LANES (root));
2793 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2794 for (unsigned j = 0; j < lane_perm.length (); j++)
2796 std::pair<unsigned, unsigned> perm = lane_perm[j];
2797 node = SLP_TREE_CHILDREN (root)[perm.first];
2799 if (!vect_is_slp_load_node (node)
2800 || SLP_TREE_CHILDREN (node).exists ())
2802 stmts.release ();
2803 goto next;
2806 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2809 if (dump_enabled_p ())
2810 dump_printf_loc (MSG_NOTE, vect_location,
2811 "converting stmts on permute node %p\n", root);
2813 bool *matches = XALLOCAVEC (bool, group_size);
2814 poly_uint64 max_nunits = 1;
2815 unsigned tree_size = 0, limit = 1;
2816 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2817 matches, &limit, &tree_size, bst_map);
2818 if (!node)
2819 stmts.release ();
2821 load_map->put (root, node);
2822 return node;
2825 next:
2826 load_map->put (root, NULL);
2828 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2830 slp_tree value
2831 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2832 node);
2833 if (value)
2835 SLP_TREE_REF_COUNT (value)++;
2836 SLP_TREE_CHILDREN (root)[i] = value;
2837 /* ??? We know the original leafs of the replaced nodes will
2838 be referenced by bst_map, only the permutes created by
2839 pattern matching are not. */
2840 if (SLP_TREE_REF_COUNT (node) == 1)
2841 load_map->remove (node);
2842 vect_free_slp_tree (node);
2846 return NULL;
2849 /* Temporary workaround for loads not being CSEd during SLP build. This
2850 function will traverse the SLP tree rooted in ROOT and find
2851 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2852 same DR such that the final operation is equal to a permuted load. Such
2853 NODES are then directly converted into LOADS themselves. The nodes are
2854 CSEd using BST_MAP. */
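/* For illustration (hypothetical nodes): a VEC_PERM node that selects
   lane 1 of one single-lane load of *a and lane 0 of another load of
   the same *a is rebuilt below as one load node whose load permutation
   encodes the blend directly, which the later permute optimization can
   then elide or merge.  */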
2856 static void
2857 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2858 vec_info *vinfo, unsigned int group_size,
2859 hash_map<slp_tree, slp_tree> *load_map,
2860 slp_tree root)
2862 slp_tree node;
2863 unsigned i;
2865 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2867 slp_tree value
2868 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2869 node);
2870 if (value)
2872 SLP_TREE_REF_COUNT (value)++;
2873 SLP_TREE_CHILDREN (root)[i] = value;
2874 /* ??? We know the original leafs of the replaced nodes will
2875 be referenced by bst_map, only the permutes created by
2876 pattern matching are not. */
2877 if (SLP_TREE_REF_COUNT (node) == 1)
2878 load_map->remove (node);
2879 vect_free_slp_tree (node);
2884 /* Helper function of vect_match_slp_patterns.
2886 Attempts to match patterns against the slp tree rooted in REF_NODE using
2887 VINFO. Patterns are matched in post-order traversal.
2889 If matching is successful the value in REF_NODE is updated in place and
2890 true is returned; otherwise false is returned and REF_NODE is unchanged. */
2892 static bool
2893 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2894 slp_tree_to_load_perm_map_t *perm_cache,
2895 hash_set<slp_tree> *visited)
2897 unsigned i;
2898 slp_tree node = *ref_node;
2899 bool found_p = false;
2900 if (!node || visited->add (node))
2901 return false;
2903 slp_tree child;
2904 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2905 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2906 vinfo, perm_cache, visited);
2908 for (unsigned x = 0; x < num__slp_patterns; x++)
2910 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2911 if (pattern)
2913 pattern->build (vinfo);
2914 delete pattern;
2915 found_p = true;
2919 return found_p;
2922 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2923 vec_info VINFO.
2925 Returns true if any pattern matched; the tree is modified in place.
2926 Patterns are tried in order and multiple patterns may match. */
2928 static bool
2929 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2930 hash_set<slp_tree> *visited,
2931 slp_tree_to_load_perm_map_t *perm_cache)
2933 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2934 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2936 if (dump_enabled_p ())
2937 dump_printf_loc (MSG_NOTE, vect_location,
2938 "Analyzing SLP tree %p for patterns\n",
2939 SLP_INSTANCE_TREE (instance));
2941 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2944 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2945 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2946 Return true if we could use IFN_STORE_LANES instead and if that appears
2947 to be the better approach. */
2949 static bool
2950 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2951 unsigned int group_size,
2952 unsigned int new_group_size)
2954 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2955 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2956 if (!vectype)
2957 return false;
2958 /* Allow the split if one of the two new groups would operate on full
2959 vectors *within* rather than across one scalar loop iteration.
2960 This is purely a heuristic, but it should work well for group
2961 sizes of 3 and 4, where the possible splits are:
2963 3->2+1: OK if the vector has exactly two elements
2964 4->2+2: Likewise
2965 4->3+1: Less clear-cut. */
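/* For illustration (hypothetical sizes): with a 2-element vector a
   4->2+2 split makes both halves operate on whole vectors, so one of
   the multiple_p checks below holds and we return false, preferring
   the split over IFN_STORE_LANES.  */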
2966 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
2967 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2968 return false;
2969 return vect_store_lanes_supported (vectype, group_size, false);
2972 /* Analyze an SLP instance starting from a group of grouped stores. Call
2973 vect_build_slp_tree to build a tree of packed stmts if possible.
2974 Return FALSE if it's impossible to SLP any stmt in the loop. */
2976 static bool
2977 vect_analyze_slp_instance (vec_info *vinfo,
2978 scalar_stmts_to_slp_tree_map_t *bst_map,
2979 stmt_vec_info stmt_info, slp_instance_kind kind,
2980 unsigned max_tree_size, unsigned *limit);
2982 /* Analyze an SLP instance starting from SCALAR_STMTS which form a group
2983 of kind KIND. Return true if successful. */
2985 static bool
2986 vect_build_slp_instance (vec_info *vinfo,
2987 slp_instance_kind kind,
2988 vec<stmt_vec_info> &scalar_stmts,
2989 vec<stmt_vec_info> &root_stmt_infos,
2990 unsigned max_tree_size, unsigned *limit,
2991 scalar_stmts_to_slp_tree_map_t *bst_map,
2992 /* ??? We need stmt_info for group splitting. */
2993 stmt_vec_info stmt_info_)
2995 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_NOTE, vect_location,
2998 "Starting SLP discovery for\n");
2999 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3000 dump_printf_loc (MSG_NOTE, vect_location,
3001 " %G", scalar_stmts[i]->stmt);
3004 /* Build the tree for the SLP instance. */
3005 unsigned int group_size = scalar_stmts.length ();
3006 bool *matches = XALLOCAVEC (bool, group_size);
3007 poly_uint64 max_nunits = 1;
3008 unsigned tree_size = 0;
3009 unsigned i;
3010 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3011 &max_nunits, matches, limit,
3012 &tree_size, bst_map);
3013 if (node != NULL)
3015 /* Calculate the unrolling factor based on the smallest type. */
3016 poly_uint64 unrolling_factor
3017 = calculate_unrolling_factor (max_nunits, group_size);
3019 if (maybe_ne (unrolling_factor, 1U)
3020 && is_a <bb_vec_info> (vinfo))
3022 unsigned HOST_WIDE_INT const_max_nunits;
3023 if (!max_nunits.is_constant (&const_max_nunits)
3024 || const_max_nunits > group_size)
3026 if (dump_enabled_p ())
3027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3028 "Build SLP failed: store group "
3029 "size not a multiple of the vector size "
3030 "in basic block SLP\n");
3031 vect_free_slp_tree (node);
3032 return false;
3034 /* Fatal mismatch. */
3035 if (dump_enabled_p ())
3036 dump_printf_loc (MSG_NOTE, vect_location,
3037 "SLP discovery succeeded but node needs "
3038 "splitting\n");
3039 memset (matches, true, group_size);
3040 matches[group_size / const_max_nunits * const_max_nunits] = false;
3041 vect_free_slp_tree (node);
3043 else
3045 /* Create a new SLP instance. */
3046 slp_instance new_instance = XNEW (class _slp_instance);
3047 SLP_INSTANCE_TREE (new_instance) = node;
3048 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3049 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3050 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3051 SLP_INSTANCE_KIND (new_instance) = kind;
3052 new_instance->reduc_phis = NULL;
3053 new_instance->cost_vec = vNULL;
3054 new_instance->subgraph_entries = vNULL;
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_NOTE, vect_location,
3058 "SLP size %u vs. limit %u.\n",
3059 tree_size, max_tree_size);
3061 /* Fixup SLP reduction chains. */
3062 if (kind == slp_inst_kind_reduc_chain)
3064 /* If this is a reduction chain with a conversion in front
3065 amend the SLP tree with a node for that. */
3066 gimple *scalar_def
3067 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3068 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3070 /* Get at the conversion stmt - we know it's the single use
3071 of the last stmt of the reduction chain. */
3072 use_operand_p use_p;
3073 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3074 &use_p, &scalar_def);
3075 gcc_assert (r);
3076 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3077 next_info = vect_stmt_to_vectorize (next_info);
3078 scalar_stmts = vNULL;
3079 scalar_stmts.create (group_size);
3080 for (unsigned i = 0; i < group_size; ++i)
3081 scalar_stmts.quick_push (next_info);
3082 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3083 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3084 SLP_TREE_CHILDREN (conv).quick_push (node);
3085 SLP_INSTANCE_TREE (new_instance) = conv;
3086 /* We also have to fake this conversion stmt as SLP reduction
3087 group so we don't have to mess with too much code
3088 elsewhere. */
3089 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3090 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3092 /* Fill the backedge child of the PHI SLP node. The
3093 general matching code cannot find it because the
3094 scalar code does not reflect how we vectorize the
3095 reduction. */
3096 use_operand_p use_p;
3097 imm_use_iterator imm_iter;
3098 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3099 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3100 gimple_get_lhs (scalar_def))
3101 /* There are exactly two non-debug uses, the reduction
3102 PHI and the loop-closed PHI node. */
3103 if (!is_gimple_debug (USE_STMT (use_p))
3104 && gimple_bb (USE_STMT (use_p)) == loop->header)
3106 auto_vec<stmt_vec_info, 64> phis (group_size);
3107 stmt_vec_info phi_info
3108 = vinfo->lookup_stmt (USE_STMT (use_p));
3109 for (unsigned i = 0; i < group_size; ++i)
3110 phis.quick_push (phi_info);
3111 slp_tree *phi_node = bst_map->get (phis);
3112 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3113 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3114 = SLP_INSTANCE_TREE (new_instance);
3115 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3119 vinfo->slp_instances.safe_push (new_instance);
3121 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3122 the number of scalar stmts in the root in a few places.
3123 Verify that assumption holds. */
3124 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3125 .length () == group_size);
3127 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_NOTE, vect_location,
3130 "Final SLP tree for instance %p:\n", new_instance);
3131 vect_print_slp_graph (MSG_NOTE, vect_location,
3132 SLP_INSTANCE_TREE (new_instance));
3135 return true;
3138 else
3140 /* Failed to SLP. */
3141 /* Free the allocated memory. */
3142 scalar_stmts.release ();
3145 stmt_vec_info stmt_info = stmt_info_;
3146 /* Try to break the group up into pieces. */
3147 if (kind == slp_inst_kind_store)
3149 /* ??? We could delay all the actual splitting of store-groups
3150 until after SLP discovery of the original group completed.
3151 Then we can recurse to vect_build_slp_instance directly. */
3152 for (i = 0; i < group_size; i++)
3153 if (!matches[i])
3154 break;
3156 /* For basic block SLP, try to break the group up into multiples of
3157 a vector size. */
3158 if (is_a <bb_vec_info> (vinfo)
3159 && (i > 1 && i < group_size))
3161 tree scalar_type
3162 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3163 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3164 1 << floor_log2 (i));
3165 unsigned HOST_WIDE_INT const_nunits;
3166 if (vectype
3167 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3169 /* Split into two groups at the first vector boundary. */
3170 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3171 unsigned group1_size = i & ~(const_nunits - 1);
3173 if (dump_enabled_p ())
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 "Splitting SLP group at stmt %u\n", i);
3176 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3177 group1_size);
3178 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3179 kind, max_tree_size,
3180 limit);
3181 /* Split the rest at the failure point and possibly
3182 re-analyze the remaining matching part if it has
3183 at least two lanes. */
3184 if (group1_size < i
3185 && (i + 1 < group_size
3186 || i - group1_size > 1))
3188 stmt_vec_info rest2 = rest;
3189 rest = vect_split_slp_store_group (rest, i - group1_size);
3190 if (i - group1_size > 1)
3191 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3192 kind, max_tree_size,
3193 limit);
3195 /* Re-analyze the non-matching tail if it has at least
3196 two lanes. */
3197 if (i + 1 < group_size)
3198 res |= vect_analyze_slp_instance (vinfo, bst_map,
3199 rest, kind, max_tree_size,
3200 limit);
3201 return res;
3205 /* For loop vectorization split into arbitrary pieces of size > 1. */
3206 if (is_a <loop_vec_info> (vinfo)
3207 && (i > 1 && i < group_size)
3208 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3210 unsigned group1_size = i;
3212 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_NOTE, vect_location,
3214 "Splitting SLP group at stmt %u\n", i);
3216 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3217 group1_size);
3218 /* Loop vectorization cannot handle gaps in stores, make sure
3219 the split group appears as strided. */
3220 STMT_VINFO_STRIDED_P (rest) = 1;
3221 DR_GROUP_GAP (rest) = 0;
3222 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3223 DR_GROUP_GAP (stmt_info) = 0;
3225 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3226 kind, max_tree_size, limit);
3227 if (i + 1 < group_size)
3228 res |= vect_analyze_slp_instance (vinfo, bst_map,
3229 rest, kind, max_tree_size, limit);
3231 return res;
3234 /* Even though the first vector did not all match, we might be able to SLP
3235 (some) of the remainder. FORNOW ignore this possibility. */
3238 /* Failed to SLP. */
3239 if (dump_enabled_p ())
3240 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3241 return false;
3245 /* Analyze an SLP instance starting from a group of grouped stores. Call
3246 vect_build_slp_tree to build a tree of packed stmts if possible.
3247 Return FALSE if it's impossible to SLP any stmt in the loop. */
3249 static bool
3250 vect_analyze_slp_instance (vec_info *vinfo,
3251 scalar_stmts_to_slp_tree_map_t *bst_map,
3252 stmt_vec_info stmt_info,
3253 slp_instance_kind kind,
3254 unsigned max_tree_size, unsigned *limit)
3256 unsigned int i;
3257 vec<stmt_vec_info> scalar_stmts;
3259 if (is_a <bb_vec_info> (vinfo))
3260 vect_location = stmt_info->stmt;
3262 stmt_vec_info next_info = stmt_info;
3263 if (kind == slp_inst_kind_store)
3265 /* Collect the stores and store them in scalar_stmts. */
3266 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3267 while (next_info)
3269 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3270 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3273 else if (kind == slp_inst_kind_reduc_chain)
3275 /* Collect the reduction stmts and store them in scalar_stmts. */
3276 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3277 while (next_info)
3279 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3280 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3282 /* Mark the first element of the reduction chain as reduction to properly
3283 transform the node. In the reduction analysis phase only the last
3284 element of the chain is marked as reduction. */
3285 STMT_VINFO_DEF_TYPE (stmt_info)
3286 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3287 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3288 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3290 else if (kind == slp_inst_kind_ctor)
3292 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3293 tree val;
3294 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3295 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3297 stmt_vec_info def_info = vinfo->lookup_def (val);
3298 def_info = vect_stmt_to_vectorize (def_info);
3299 scalar_stmts.quick_push (def_info);
3301 if (dump_enabled_p ())
3302 dump_printf_loc (MSG_NOTE, vect_location,
3303 "Analyzing vectorizable constructor: %G\n",
3304 stmt_info->stmt);
3306 else if (kind == slp_inst_kind_reduc_group)
3308 /* Collect reduction statements. */
3309 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
3310 scalar_stmts.create (reductions.length ());
3311 for (i = 0; reductions.iterate (i, &next_info); i++)
3312 if (STMT_VINFO_RELEVANT_P (next_info)
3313 || STMT_VINFO_LIVE_P (next_info))
3314 scalar_stmts.quick_push (next_info);
3315 /* If less than two were relevant/live there's nothing to SLP. */
3316 if (scalar_stmts.length () < 2)
3317 return false;
3319 else
3320 gcc_unreachable ();
3322 vec<stmt_vec_info> roots = vNULL;
3323 if (kind == slp_inst_kind_ctor)
3325 roots.create (1);
3326 roots.quick_push (stmt_info);
3328 /* Build the tree for the SLP instance. */
3329 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3330 roots,
3331 max_tree_size, limit, bst_map,
3332 kind == slp_inst_kind_store
3333 ? stmt_info : NULL);
3334 if (!res)
3335 roots.release ();
3337 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3338 where we should do store group splitting. */
3340 return res;
3343 /* Check if there are stmts in the loop that can be vectorized using SLP.
3344 Build SLP trees of packed scalar stmts if SLP is possible. */
3346 opt_result
3347 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3349 unsigned int i;
3350 stmt_vec_info first_element;
3351 slp_instance instance;
3353 DUMP_VECT_SCOPE ("vect_analyze_slp");
3355 unsigned limit = max_tree_size;
3357 scalar_stmts_to_slp_tree_map_t *bst_map
3358 = new scalar_stmts_to_slp_tree_map_t ();
3360 /* Find SLP sequences starting from groups of grouped stores. */
3361 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3362 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3363 STMT_VINFO_GROUPED_ACCESS (first_element)
3364 ? slp_inst_kind_store : slp_inst_kind_ctor,
3365 max_tree_size, &limit);
3367 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3369 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3371 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3372 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3373 bb_vinfo->roots[i].stmts,
3374 bb_vinfo->roots[i].roots,
3375 max_tree_size, &limit, bst_map, NULL))
3377 bb_vinfo->roots[i].stmts = vNULL;
3378 bb_vinfo->roots[i].roots = vNULL;
3383 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3385 /* Find SLP sequences starting from reduction chains. */
3386 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3387 if (! STMT_VINFO_RELEVANT_P (first_element)
3388 && ! STMT_VINFO_LIVE_P (first_element))
3390 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3391 slp_inst_kind_reduc_chain,
3392 max_tree_size, &limit))
3394 /* Dissolve reduction chain group. */
3395 stmt_vec_info vinfo = first_element;
3396 stmt_vec_info last = NULL;
3397 while (vinfo)
3399 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3400 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3401 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3402 last = vinfo;
3403 vinfo = next;
3405 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3406 /* It can be still vectorized as part of an SLP reduction. */
3407 loop_vinfo->reductions.safe_push (last);
3410 /* Find SLP sequences starting from groups of reductions. */
3411 if (loop_vinfo->reductions.length () > 1)
3412 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3413 slp_inst_kind_reduc_group, max_tree_size,
3414 &limit);
3417 hash_set<slp_tree> visited_patterns;
3418 slp_tree_to_load_perm_map_t perm_cache;
3420 /* See if any patterns can be found in the SLP tree. */
3421 bool pattern_found = false;
3422 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3423 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3424 &visited_patterns, &perm_cache);
3426 /* If any were found optimize permutations of loads. */
3427 if (pattern_found)
3429 hash_map<slp_tree, slp_tree> load_map;
3430 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3432 slp_tree root = SLP_INSTANCE_TREE (instance);
3433 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3434 &load_map, root);
3440 /* The map keeps a reference on SLP nodes built, release that. */
3441 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3442 it != bst_map->end (); ++it)
3443 if ((*it).second)
3444 vect_free_slp_tree ((*it).second);
3445 delete bst_map;
3447 if (pattern_found && dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location,
3450 "Pattern matched SLP tree\n");
3451 hash_set<slp_tree> visited;
3452 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3453 vect_print_slp_graph (MSG_NOTE, vect_location,
3454 SLP_INSTANCE_TREE (instance), visited);
3457 return opt_result::success ();
3460 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3462 static void
3463 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3464 vec<slp_tree> &vertices, vec<int> &leafs)
3466 unsigned i;
3467 slp_tree child;
3469 if (visited.add (node))
3470 return;
3472 node->vertex = vertices.length ();
3473 vertices.safe_push (node);
3475 bool leaf = true;
3476 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3477 if (child)
3479 leaf = false;
3480 vect_slp_build_vertices (visited, child, vertices, leafs);
3482 if (leaf)
3483 leafs.safe_push (node->vertex);
3486 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3488 static void
3489 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
3490 vec<int> &leafs)
3492 hash_set<slp_tree> visited;
3493 unsigned i;
3494 slp_instance instance;
3495 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3497 unsigned n_v = vertices.length ();
3498 unsigned n_l = leafs.length ();
3499 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3500 leafs);
3501 /* If we added vertices but no entries to the reverse graph we've
3502 added a cycle that is not backwards-reachable. Push the entry
3503 to mimic a leaf then. */
3504 if (vertices.length () > n_v
3505 && leafs.length () == n_l)
3506 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
3510 /* Apply (reverse) bijective PERM to VEC. */
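/* For illustration (hypothetical contents): with PERM { 2, 0, 1 } and
   VEC { a, b, c } the forward direction yields { c, a, b } while
   REVERSE yields { b, c, a }.  */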
3512 template <class T>
3513 static void
3514 vect_slp_permute (vec<unsigned> perm,
3515 vec<T> &vec, bool reverse)
3517 auto_vec<T, 64> saved;
3518 saved.create (vec.length ());
3519 for (unsigned i = 0; i < vec.length (); ++i)
3520 saved.quick_push (vec[i]);
3522 if (reverse)
3524 for (unsigned i = 0; i < vec.length (); ++i)
3525 vec[perm[i]] = saved[i];
3526 for (unsigned i = 0; i < vec.length (); ++i)
3527 gcc_assert (vec[perm[i]] == saved[i]);
3529 else
3531 for (unsigned i = 0; i < vec.length (); ++i)
3532 vec[i] = saved[perm[i]];
3533 for (unsigned i = 0; i < vec.length (); ++i)
3534 gcc_assert (vec[i] == saved[perm[i]]);
3538 /* Return whether permutations PERM_A and PERM_B as recorded in the
3539 PERMS vector are equal. */
3541 static bool
3542 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3543 int perm_a, int perm_b)
3545 return (perm_a == perm_b
3546 || (perms[perm_a].length () == perms[perm_b].length ()
3547 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3548 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3551 /* Optimize the SLP graph of VINFO. */
3553 void
3554 vect_optimize_slp (vec_info *vinfo)
3556 if (vinfo->slp_instances.is_empty ())
3557 return;
3559 slp_tree node;
3560 unsigned i;
3561 auto_vec<slp_tree> vertices;
3562 auto_vec<int> leafs;
3563 vect_slp_build_vertices (vinfo, vertices, leafs);
3565 struct graph *slpg = new_graph (vertices.length ());
3566 FOR_EACH_VEC_ELT (vertices, i, node)
3568 unsigned j;
3569 slp_tree child;
3570 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3571 if (child)
3572 add_edge (slpg, i, child->vertex);
3575 /* Compute (reverse) postorder on the inverted graph. */
3576 auto_vec<int> ipo;
3577 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3579 auto_sbitmap n_visited (vertices.length ());
3580 auto_sbitmap n_materialize (vertices.length ());
3581 auto_vec<int> n_perm (vertices.length ());
3582 auto_vec<vec<unsigned> > perms;
3584 bitmap_clear (n_visited);
3585 bitmap_clear (n_materialize);
3586 n_perm.quick_grow_cleared (vertices.length ());
3587 perms.safe_push (vNULL); /* zero is no permute */
3589 /* Produce initial permutations. */
3590 for (i = 0; i < leafs.length (); ++i)
3592 int idx = leafs[i];
3593 slp_tree node = vertices[idx];
3595 /* Handle externals and constants optimistically throughout the
3596 iteration. */
3597 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3598 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3599 continue;
3601 /* Leafs do not change across iterations. Note leafs also double
3602 as entries to the reverse graph. */
3603 if (!slpg->vertices[idx].succ)
3604 bitmap_set_bit (n_visited, idx);
3605 /* Loads are the only thing generating permutes. */
3606 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3607 continue;
3609 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3610 node unpermuted, record this permute. */
3611 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3612 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3613 continue;
3614 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3615 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3616 bool any_permute = false;
3617 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3619 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3620 imin = MIN (imin, idx);
3621 imax = MAX (imax, idx);
3622 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3623 any_permute = true;
3625 /* If there's no permute no need to split one out. */
3626 if (!any_permute)
3627 continue;
3628 /* If the span doesn't match we'd disrupt VF computation, avoid
3629 that for now. */
3630 if (imax - imin + 1 != SLP_TREE_LANES (node))
3631 continue;
3633 /* For now only handle true permutes, like
3634 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3635 when permuting constants and invariants, keeping the permute
3636 bijective. */
3637 auto_sbitmap load_index (SLP_TREE_LANES (node));
3638 bitmap_clear (load_index);
3639 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3640 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3641 unsigned j;
3642 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3643 if (!bitmap_bit_p (load_index, j))
3644 break;
3645 if (j != SLP_TREE_LANES (node))
3646 continue;
3648 vec<unsigned> perm = vNULL;
3649 perm.safe_grow (SLP_TREE_LANES (node), true);
3650 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3651 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
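/* For example a load permutation { 5, 4, 7, 6 } with imin == 4 is recorded
   as the bijective permute { 1, 0, 3, 2 }.  */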
3652 perms.safe_push (perm);
3653 n_perm[idx] = perms.length () - 1;
3656 /* Propagate permutes along the graph and compute materialization points. */
3657 bool changed;
3658 unsigned iteration = 0;
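/* Iterate until no node changes its permute anymore.  After the first
   iteration a node's permute can only be demoted to the identity (index
   zero) on conflicts, so the propagation is guaranteed to terminate.  */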
3661 changed = false;
3662 ++iteration;
3664 for (i = vertices.length (); i > 0 ; --i)
3666 int idx = ipo[i-1];
3667 slp_tree node = vertices[idx];
3668 /* For leafs there's nothing to do - we've seeded permutes
3669 on those above. */
3670 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3671 continue;
3673 bitmap_set_bit (n_visited, idx);
3675 /* We cannot move a permute across a store. */
3676 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3677 && DR_IS_WRITE
3678 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3679 continue;
3681 int perm = -1;
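/* Merge the permutes seen on all successor (child) edges; -1 means no
   constraint yet, index zero means the lanes already appear unpermuted.  */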
3682 for (graph_edge *succ = slpg->vertices[idx].succ;
3683 succ; succ = succ->succ_next)
3685 int succ_idx = succ->dest;
3686 /* Handle unvisited nodes optimistically. */
3687 /* ??? But for constants once we want to handle non-bijective
3688 permutes we have to verify the permute, when unifying lanes,
3689 will not unify different constants. For example see
3690 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3691 if (!bitmap_bit_p (n_visited, succ_idx))
3692 continue;
3693 int succ_perm = n_perm[succ_idx];
3694 /* Once we materialize the successor's permutation its output lanes
3695 appear unpermuted to us. */
3696 if (bitmap_bit_p (n_materialize, succ_idx))
3697 succ_perm = 0;
3698 if (perm == -1)
3699 perm = succ_perm;
3700 else if (succ_perm == 0)
3702 perm = 0;
3703 break;
3705 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3707 perm = 0;
3708 break;
3712 if (perm == -1)
3713 /* Pick up pre-computed leaf values. */
3714 perm = n_perm[idx];
3715 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3717 if (iteration > 1)
3718 /* Make sure we eventually converge. */
3719 gcc_checking_assert (perm == 0);
3720 n_perm[idx] = perm;
3721 if (perm == 0)
3722 bitmap_clear_bit (n_materialize, idx);
3723 changed = true;
3726 if (perm == 0)
3727 continue;
3729 /* Elide pruning at materialization points in the first
3730 iteration so every node is visited at least once. */
3731 if (iteration == 1)
3732 continue;
3734 /* Decide on permute materialization. Look whether there's
3735 a use (pred) edge that is permuted differently than us.
3736 In that case mark ourselves so the permutation is applied.
3737 For VEC_PERM_EXPRs the permutation doesn't carry along
3738 from children to parents so force materialization at the
3739 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3740 are a source of an arbitrary permutation again, similar
3741 to constants/externals - that's something we do not yet
3742 optimally handle. */
3743 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3744 && slpg->vertices[idx].pred != NULL);
3745 if (all_preds_permuted)
3746 for (graph_edge *pred = slpg->vertices[idx].pred;
3747 pred; pred = pred->pred_next)
3749 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3750 int pred_perm = n_perm[pred->src];
3751 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3753 all_preds_permuted = false;
3754 break;
3757 if (!all_preds_permuted)
3759 if (!bitmap_bit_p (n_materialize, idx))
3760 changed = true;
3761 bitmap_set_bit (n_materialize, idx);
3765 while (changed || iteration == 1);
3767 /* Materialize. */
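/* Walk all nodes with a non-trivial permute: first reorder any invariant or
   external operands, then either fold the permute into an existing load or
   lane permutation, split out a separate VEC_PERM node, or simply reorder
   the scalar stmts when the permute does not need to be materialized.  */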
3768 for (i = 0; i < vertices.length (); ++i)
3770 int perm = n_perm[i];
3771 if (perm <= 0)
3772 continue;
3774 slp_tree node = vertices[i];
3776 /* First permute invariant/external original successors. */
3777 unsigned j;
3778 slp_tree child;
3779 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3781 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3782 continue;
3784 /* If the vector is uniform there's nothing to do. */
3785 if (vect_slp_tree_uniform_p (child))
3786 continue;
3788 /* We can end up sharing some externals via two_operator
3789 handling. Be prepared to unshare those. */
3790 if (child->refcnt != 1)
3792 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3793 SLP_TREE_CHILDREN (node)[j] = child
3794 = vect_create_new_slp_node
3795 (SLP_TREE_SCALAR_OPS (child).copy ());
3797 vect_slp_permute (perms[perm],
3798 SLP_TREE_SCALAR_OPS (child), true);
3801 if (bitmap_bit_p (n_materialize, i))
3803 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3804 /* For loads simply drop the permutation; the load permutation
3805 already performs the desired permutation. */
3807 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3809 /* If the node is already a permute node we can apply
3810 the permutation to the lane selection, effectively
3811 materializing it on the incoming vectors. */
3812 if (dump_enabled_p ())
3813 dump_printf_loc (MSG_NOTE, vect_location,
3814 "simplifying permute node %p\n",
3815 node);
3817 for (unsigned k = 0;
3818 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3819 SLP_TREE_LANE_PERMUTATION (node)[k].second
3820 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3822 else
3824 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_NOTE, vect_location,
3826 "inserting permute node in place of %p\n",
3827 node);
3829 /* Make a copy of NODE and in-place change it to a
3830 VEC_PERM node to permute the lanes of the copy. */
3831 slp_tree copy = new _slp_tree;
3832 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3833 SLP_TREE_CHILDREN (node) = vNULL;
3834 SLP_TREE_SCALAR_STMTS (copy)
3835 = SLP_TREE_SCALAR_STMTS (node).copy ();
3836 vect_slp_permute (perms[perm],
3837 SLP_TREE_SCALAR_STMTS (copy), true);
3838 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3839 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3840 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3841 SLP_TREE_LANE_PERMUTATION (copy)
3842 = SLP_TREE_LANE_PERMUTATION (node);
3843 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3844 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3845 copy->refcnt = 1;
3846 copy->max_nunits = node->max_nunits;
3847 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3848 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3849 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3851 /* Now turn NODE into a VEC_PERM. */
3852 SLP_TREE_CHILDREN (node).safe_push (copy);
3853 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3854 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3855 SLP_TREE_LANE_PERMUTATION (node)
3856 .quick_push (std::make_pair (0, perms[perm][j]));
3857 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3860 else
3862 /* Apply the reverse permutation to our stmts. */
3863 vect_slp_permute (perms[perm],
3864 SLP_TREE_SCALAR_STMTS (node), true);
3865 /* And to the load permutation, which we can simply
3866 make regular by design. */
3867 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3869 /* ??? When we handle non-bijective permutes the idea
3870 is that we can force the load-permutation to be
3871 { min, min + 1, min + 2, ... max }. But then the
3872 scalar defs might no longer match the lane content
3873 which means wrong-code with live lane vectorization.
3874 So we possibly have to have NULL entries for those. */
3875 vect_slp_permute (perms[perm],
3876 SLP_TREE_LOAD_PERMUTATION (node), true);
3881 /* Free the perms vector used for propagation. */
3882 while (!perms.is_empty ())
3883 perms.pop ().release ();
3884 free_graph (slpg);
3887 /* Now elide load permutations that are not necessary. */
3888 for (i = 0; i < leafs.length (); ++i)
3890 node = vertices[leafs[i]];
3891 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3892 continue;
3894 /* In basic block vectorization we allow any subchain of an interleaving
3895 chain.
3896 FORNOW: not in loop SLP because of realignment complications. */
3897 if (is_a <bb_vec_info> (vinfo))
3899 bool subchain_p = true;
3900 stmt_vec_info next_load_info = NULL;
3901 stmt_vec_info load_info;
3902 unsigned j;
3903 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3905 if (j != 0
3906 && (next_load_info != load_info
3907 || DR_GROUP_GAP (load_info) != 1))
3909 subchain_p = false;
3910 break;
3912 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3914 if (subchain_p)
3916 SLP_TREE_LOAD_PERMUTATION (node).release ();
3917 continue;
3920 else
3922 stmt_vec_info load_info;
3923 bool this_load_permuted = false;
3924 unsigned j;
3925 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3926 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3928 this_load_permuted = true;
3929 break;
3931 stmt_vec_info first_stmt_info
3932 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3933 if (!this_load_permuted
3934 /* The load requires permutation when unrolling exposes
3935 a gap either because the group is larger than the SLP
3936 group-size or because there is a gap between the groups. */
3937 && (known_eq (LOOP_VINFO_VECT_FACTOR
3938 (as_a <loop_vec_info> (vinfo)), 1U)
3939 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3940 && DR_GROUP_GAP (first_stmt_info) == 0)))
3942 SLP_TREE_LOAD_PERMUTATION (node).release ();
3943 continue;
3948 /* And any permutations of BB reductions. */
3949 if (is_a <bb_vec_info> (vinfo))
3951 for (slp_instance instance : vinfo->slp_instances)
3953 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
3954 continue;
3955 slp_tree old = SLP_INSTANCE_TREE (instance);
3956 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
3957 && SLP_TREE_CHILDREN (old).length () == 1)
3959 slp_tree child = SLP_TREE_CHILDREN (old)[0];
3960 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3962 /* Preserve the special VEC_PERM we use to shield existing
3963 vector defs from the rest. But make it a no-op. */
3964 unsigned i = 0;
3965 for (std::pair<unsigned, unsigned> &p
3966 : SLP_TREE_LANE_PERMUTATION (old))
3967 p.second = i++;
3969 else
3971 SLP_INSTANCE_TREE (instance) = child;
3972 SLP_TREE_REF_COUNT (child)++;
3973 vect_free_slp_tree (old);
3976 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
3977 && SLP_TREE_REF_COUNT (old) == 1)
3979 /* ??? For loads the situation is more complex since
3980 we can't modify the permute in place in case the
3981 node is used multiple times. In fact for loads this
3982 should be somehow handled in the propagation engine. */
3983 auto fn = [] (const void *a, const void *b)
3984 { return *(const int *)a - *(const int *)b; };
3985 SLP_TREE_LOAD_PERMUTATION (old).qsort (fn);
3991 /* Gather loads reachable from the individual SLP graph entries. */
3993 void
3994 vect_gather_slp_loads (vec_info *vinfo)
3996 unsigned i;
3997 slp_instance instance;
3998 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4000 hash_set<slp_tree> visited;
4001 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4002 SLP_INSTANCE_TREE (instance), visited);
4007 /* For each possible SLP instance decide whether to SLP it and calculate the
4008 overall unrolling factor needed to SLP the loop. Return TRUE if we decided to SLP at
4009 least one instance. */
4011 bool
4012 vect_make_slp_decision (loop_vec_info loop_vinfo)
4014 unsigned int i;
4015 poly_uint64 unrolling_factor = 1;
4016 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4017 slp_instance instance;
4018 int decided_to_slp = 0;
4020 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4022 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4024 /* FORNOW: SLP if you can. */
4025 /* All unroll factors have the form:
4027 GET_MODE_SIZE (vinfo->vector_mode) * X
4029 for some rational X, so they must have a common multiple. */
4030 unrolling_factor
4031 = force_common_multiple (unrolling_factor,
4032 SLP_INSTANCE_UNROLLING_FACTOR (instance));
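/* For example instances requiring unrolling factors of two and three force
   an overall unrolling factor of six for the loop.  */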
4034 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4035 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4036 loop-based vectorization. Such stmts will be marked as HYBRID. */
4037 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4038 decided_to_slp++;
4041 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4043 if (decided_to_slp && dump_enabled_p ())
4045 dump_printf_loc (MSG_NOTE, vect_location,
4046 "Decided to SLP %d instances. Unrolling factor ",
4047 decided_to_slp);
4048 dump_dec (MSG_NOTE, unrolling_factor);
4049 dump_printf (MSG_NOTE, "\n");
4052 return (decided_to_slp > 0);
4055 /* Private data for vect_detect_hybrid_slp. */
4056 struct vdhs_data
4058 loop_vec_info loop_vinfo;
4059 vec<stmt_vec_info> *worklist;
4062 /* Walker for walk_gimple_op. */
4064 static tree
4065 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4067 walk_stmt_info *wi = (walk_stmt_info *)data;
4068 vdhs_data *dat = (vdhs_data *)wi->info;
4070 if (wi->is_lhs)
4071 return NULL_TREE;
4073 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4074 if (!def_stmt_info)
4075 return NULL_TREE;
4076 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4077 if (PURE_SLP_STMT (def_stmt_info))
4079 if (dump_enabled_p ())
4080 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4081 def_stmt_info->stmt);
4082 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4083 dat->worklist->safe_push (def_stmt_info);
4086 return NULL_TREE;
4089 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4090 if so; otherwise push it to WORKLIST. */
4092 static void
4093 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4094 vec<stmt_vec_info> &worklist,
4095 stmt_vec_info stmt_info)
4097 if (dump_enabled_p ())
4098 dump_printf_loc (MSG_NOTE, vect_location,
4099 "Processing hybrid candidate : %G", stmt_info->stmt);
4100 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4101 imm_use_iterator iter2;
4102 ssa_op_iter iter1;
4103 use_operand_p use_p;
4104 def_operand_p def_p;
4105 bool any_def = false;
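/* STMT_INFO can only be marked pure_slp if every use of each of its defs is
   itself pure SLP; a single loop_vect use or a use outside of the loop makes
   it a loop_vect sink or use and puts it on the worklist instead.  */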
4106 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4108 any_def = true;
4109 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4111 if (is_gimple_debug (USE_STMT (use_p)))
4112 continue;
4113 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4114 /* An out-of-loop use means this is a loop_vect sink. */
4115 if (!use_info)
4117 if (dump_enabled_p ())
4118 dump_printf_loc (MSG_NOTE, vect_location,
4119 "Found loop_vect sink: %G", stmt_info->stmt);
4120 worklist.safe_push (stmt_info);
4121 return;
4123 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE, vect_location,
4127 "Found loop_vect use: %G", use_info->stmt);
4128 worklist.safe_push (stmt_info);
4129 return;
4133 /* No def means this is a loop_vect sink. */
4134 if (!any_def)
4136 if (dump_enabled_p ())
4137 dump_printf_loc (MSG_NOTE, vect_location,
4138 "Found loop_vect sink: %G", stmt_info->stmt);
4139 worklist.safe_push (stmt_info);
4140 return;
4142 if (dump_enabled_p ())
4143 dump_printf_loc (MSG_NOTE, vect_location,
4144 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4145 STMT_SLP_TYPE (stmt_info) = pure_slp;
4148 /* Find stmts that must be both vectorized and SLPed. */
4150 void
4151 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4153 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4155 /* All stmts participating in SLP are marked pure_slp, all other
4156 stmts are loop_vect.
4157 First collect all loop_vect stmts into a worklist.
4158 SLP patterns cause not all original scalar stmts to appear in
4159 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4160 Rectify this here and do a backward walk over the IL, only considering
4161 stmts as loop_vect when they are used by a loop_vect stmt; otherwise
4162 mark them as pure_slp. */
4163 auto_vec<stmt_vec_info> worklist;
4164 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4166 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4167 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4168 gsi_next (&gsi))
4170 gphi *phi = gsi.phi ();
4171 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4172 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4173 maybe_push_to_hybrid_worklist (loop_vinfo,
4174 worklist, stmt_info);
4176 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4177 gsi_prev (&gsi))
4179 gimple *stmt = gsi_stmt (gsi);
4180 if (is_gimple_debug (stmt))
4181 continue;
4182 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4183 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4185 for (gimple_stmt_iterator gsi2
4186 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4187 !gsi_end_p (gsi2); gsi_next (&gsi2))
4189 stmt_vec_info patt_info
4190 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4191 if (!STMT_SLP_TYPE (patt_info)
4192 && STMT_VINFO_RELEVANT (patt_info))
4193 maybe_push_to_hybrid_worklist (loop_vinfo,
4194 worklist, patt_info);
4196 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4198 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4199 maybe_push_to_hybrid_worklist (loop_vinfo,
4200 worklist, stmt_info);
4204 /* Now that we have a worklist of non-SLP stmts, follow use->def chains and
4205 mark any SLP-vectorized stmt as hybrid.
4206 ??? We're visiting def stmts N times (once for each non-SLP and
4207 once for each hybrid-SLP use). */
4208 walk_stmt_info wi;
4209 vdhs_data dat;
4210 dat.worklist = &worklist;
4211 dat.loop_vinfo = loop_vinfo;
4212 memset (&wi, 0, sizeof (wi));
4213 wi.info = (void *)&dat;
4214 while (!worklist.is_empty ())
4216 stmt_vec_info stmt_info = worklist.pop ();
4217 /* Since SSA operands are not set up for pattern stmts we need
4218 to use walk_gimple_op. */
4219 wi.is_lhs = 0;
4220 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4225 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4227 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4228 : vec_info (vec_info::bb, init_cost (NULL, false), shared),
4229 bbs (_bbs),
4230 roots (vNULL)
4232 for (unsigned i = 0; i < bbs.length (); ++i)
4234 if (i != 0)
4235 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4236 gsi_next (&si))
4238 gphi *phi = si.phi ();
4239 gimple_set_uid (phi, 0);
4240 add_stmt (phi);
4242 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4243 !gsi_end_p (gsi); gsi_next (&gsi))
4245 gimple *stmt = gsi_stmt (gsi);
4246 gimple_set_uid (stmt, 0);
4247 if (is_gimple_debug (stmt))
4248 continue;
4249 add_stmt (stmt);
4255 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4256 stmts in the basic block. */
4258 _bb_vec_info::~_bb_vec_info ()
4260 /* Reset region marker. */
4261 for (unsigned i = 0; i < bbs.length (); ++i)
4263 if (i != 0)
4264 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4265 gsi_next (&si))
4267 gphi *phi = si.phi ();
4268 gimple_set_uid (phi, -1);
4270 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4271 !gsi_end_p (gsi); gsi_next (&gsi))
4273 gimple *stmt = gsi_stmt (gsi);
4274 gimple_set_uid (stmt, -1);
4278 for (unsigned i = 0; i < roots.length (); ++i)
4280 roots[i].stmts.release ();
4281 roots[i].roots.release ();
4283 roots.release ();
4286 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4287 given that its child nodes have already been processed, and that
4288 their def types currently match their SLP node's def type. */
4290 static bool
4291 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4292 slp_instance node_instance,
4293 stmt_vector_for_cost *cost_vec)
4295 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4297 /* Calculate the number of vector statements to be created for the
4298 scalar stmts in this node. For SLP reductions it is equal to the
4299 number of vector statements in the children (which has already been
4300 calculated by the recursive call). Otherwise it is the number of
4301 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4302 VF divided by the number of elements in a vector. */
4303 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
4304 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4306 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4307 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4309 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4310 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4311 break;
4314 else
4316 poly_uint64 vf;
4317 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4318 vf = loop_vinfo->vectorization_factor;
4319 else
4320 vf = 1;
4321 unsigned int group_size = SLP_TREE_LANES (node);
4322 tree vectype = SLP_TREE_VECTYPE (node);
4323 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4324 = vect_get_num_vectors (vf * group_size, vectype);
4327 /* Handle purely internal nodes. */
4328 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4329 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4331 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4332 if (is_a <bb_vec_info> (vinfo)
4333 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
4335 if (dump_enabled_p ())
4336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4337 "desired vector type conflicts with earlier one "
4338 "for %G", stmt_info->stmt);
4339 return false;
4342 bool dummy;
4343 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4344 node, node_instance, cost_vec);
4347 /* Try to build NODE from scalars, returning true on success.
4348 NODE_INSTANCE is the SLP instance that contains NODE. */
4350 static bool
4351 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4352 slp_instance node_instance)
4354 stmt_vec_info stmt_info;
4355 unsigned int i;
4357 if (!is_a <bb_vec_info> (vinfo)
4358 || node == SLP_INSTANCE_TREE (node_instance)
4359 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4360 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4361 return false;
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_NOTE, vect_location,
4365 "Building vector operands of %p from scalars instead\n", node);
4367 /* Don't remove and free the child nodes here, since they could be
4368 referenced by other structures. The analysis and scheduling phases
4369 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4370 unsigned int group_size = SLP_TREE_LANES (node);
4371 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4372 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4373 SLP_TREE_LOAD_PERMUTATION (node).release ();
4374 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4376 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4377 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4379 return true;
4382 /* Compute the prologue cost for invariant or constant operands represented
4383 by NODE. */
4385 static void
4386 vect_prologue_cost_for_slp (slp_tree node,
4387 stmt_vector_for_cost *cost_vec)
4389 /* There's a special case of an existing vector, that costs nothing. */
4390 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4391 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4392 return;
4393 /* Without looking at the actual initializer a vector of
4394 constants can be implemented as load from the constant pool.
4395 When all elements are the same we can use a splat. */
4396 tree vectype = SLP_TREE_VECTYPE (node);
4397 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4398 unsigned num_vects_to_check;
4399 unsigned HOST_WIDE_INT const_nunits;
4400 unsigned nelt_limit;
4401 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4402 && ! multiple_p (const_nunits, group_size))
4404 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4405 nelt_limit = const_nunits;
4407 else
4409 /* If either the vector has variable length or the vectors
4410 are composed of repeated whole groups we only need to
4411 cost construction once. All vectors will be the same. */
4412 num_vects_to_check = 1;
4413 nelt_limit = group_size;
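/* Walk the scalar operands one vector's worth of elements at a time,
   tracking whether all elements of the current vector are identical so
   externals can be costed as a splat instead of a full vector construction
   (constants are costed as a load from the constant pool).  */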
4415 tree elt = NULL_TREE;
4416 unsigned nelt = 0;
4417 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
4419 unsigned si = j % group_size;
4420 if (nelt == 0)
4421 elt = SLP_TREE_SCALAR_OPS (node)[si];
4422 /* ??? We're just tracking whether all operands of a single
4423 vector initializer are the same; ideally we'd check if
4424 we emitted the same one already. */
4425 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
4426 elt = NULL_TREE;
4427 nelt++;
4428 if (nelt == nelt_limit)
4430 record_stmt_cost (cost_vec, 1,
4431 SLP_TREE_DEF_TYPE (node) == vect_external_def
4432 ? (elt ? scalar_to_vec : vec_construct)
4433 : vector_load,
4434 NULL, vectype, 0, vect_prologue);
4435 nelt = 0;
4440 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4441 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
4443 Return true if the operations are supported. */
4445 static bool
4446 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4447 slp_instance node_instance,
4448 hash_set<slp_tree> &visited_set,
4449 vec<slp_tree> &visited_vec,
4450 stmt_vector_for_cost *cost_vec)
4452 int i, j;
4453 slp_tree child;
4455 /* Assume we can code-generate all invariants. */
4456 if (!node
4457 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4458 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4459 return true;
4461 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4463 if (dump_enabled_p ())
4464 dump_printf_loc (MSG_NOTE, vect_location,
4465 "Failed cyclic SLP reference in %p\n", node);
4466 return false;
4468 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4470 /* If we already analyzed the exact same set of scalar stmts we're done.
4471 We share the generated vector stmts for those. */
4472 if (visited_set.add (node))
4473 return true;
4474 visited_vec.safe_push (node);
4476 bool res = true;
4477 unsigned visited_rec_start = visited_vec.length ();
4478 unsigned cost_vec_rec_start = cost_vec->length ();
4479 bool seen_non_constant_child = false;
4480 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4482 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4483 visited_set, visited_vec,
4484 cost_vec);
4485 if (!res)
4486 break;
4487 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4488 seen_non_constant_child = true;
4490 /* We're having difficulties scheduling nodes with just constant
4491 operands and no scalar stmts since we then cannot compute a stmt
4492 insertion place. */
4493 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4495 if (dump_enabled_p ())
4496 dump_printf_loc (MSG_NOTE, vect_location,
4497 "Cannot vectorize all-constant op node %p\n", node);
4498 res = false;
4501 if (res)
4502 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4503 cost_vec);
4504 /* If analysis failed we have to pop all recursive visited nodes
4505 plus ourselves. */
4506 if (!res)
4508 while (visited_vec.length () >= visited_rec_start)
4509 visited_set.remove (visited_vec.pop ());
4510 cost_vec->truncate (cost_vec_rec_start);
4513 /* When the node can be vectorized cost invariant nodes it references.
4514 This is not done in DFS order to allow the referring node's
4515 vectorizable_* calls to nail down the invariant node's vector type
4516 and possibly unshare it if it needs a different vector type than
4517 other referrers. */
4518 if (res)
4519 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4520 if (child
4521 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4522 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4523 /* Perform usual caching, note code-generation still
4524 code-gens these nodes multiple times but we expect
4525 to CSE them later. */
4526 && !visited_set.add (child))
4528 visited_vec.safe_push (child);
4529 /* ??? After auditing more code paths make a "default"
4530 and push the vector type from NODE to all children
4531 if it is not already set. */
4532 /* Compute the number of vectors to be generated. */
4533 tree vector_type = SLP_TREE_VECTYPE (child);
4534 if (!vector_type)
4536 /* For shifts with a scalar argument we don't need
4537 to cost or code-generate anything.
4538 ??? Represent this more explicitly. */
4539 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4540 == shift_vec_info_type)
4541 && j == 1);
4542 continue;
4544 unsigned group_size = SLP_TREE_LANES (child);
4545 poly_uint64 vf = 1;
4546 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4547 vf = loop_vinfo->vectorization_factor;
4548 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4549 = vect_get_num_vectors (vf * group_size, vector_type);
4550 /* And cost them. */
4551 vect_prologue_cost_for_slp (child, cost_vec);
4554 /* If this node or any of its children can't be vectorized, try pruning
4555 the tree here rather than felling the whole thing. */
4556 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4558 /* We'll need to revisit this for invariant costing and number
4559 of vectorized stmt setting. */
4560 res = true;
4563 return res;
4566 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4567 region and that can be vectorized using vectorizable_live_operation
4568 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
4569 the scalar code computing them to be retained. */
4571 static void
4572 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4573 slp_instance instance,
4574 stmt_vector_for_cost *cost_vec,
4575 hash_set<stmt_vec_info> &svisited,
4576 hash_set<slp_tree> &visited)
4578 if (visited.add (node))
4579 return;
4581 unsigned i;
4582 stmt_vec_info stmt_info;
4583 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4584 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4586 if (svisited.contains (stmt_info))
4587 continue;
4588 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4589 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4590 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4591 /* Only the pattern root stmt computes the original scalar value. */
4592 continue;
4593 bool mark_visited = true;
4594 gimple *orig_stmt = orig_stmt_info->stmt;
4595 ssa_op_iter op_iter;
4596 def_operand_p def_p;
4597 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4599 imm_use_iterator use_iter;
4600 gimple *use_stmt;
4601 stmt_vec_info use_stmt_info;
4602 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4603 if (!is_gimple_debug (use_stmt))
4605 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4606 if (!use_stmt_info
4607 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4609 STMT_VINFO_LIVE_P (stmt_info) = true;
4610 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4611 NULL, node, instance, i,
4612 false, cost_vec))
4613 /* ??? So we know we can vectorize the live stmt
4614 from one SLP node. If we cannot do so from all
4615 or none consistently we'd have to record which
4616 SLP node (and lane) we want to use for the live
4617 operation. So make sure we can code-generate
4618 from all nodes. */
4619 mark_visited = false;
4620 else
4621 STMT_VINFO_LIVE_P (stmt_info) = false;
4622 break;
4625 /* We have to verify whether we can insert the lane extract
4626 before all uses. The following is a conservative approximation.
4627 We cannot put this into vectorizable_live_operation because
4628 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4629 doesn't work.
4630 Note that while the fact that we emit code for loads at the
4631 first load should make this a non-problem, leafs we construct
4632 from scalars are vectorized after the last scalar def.
4633 ??? If we'd actually compute the insert location during
4634 analysis we could use something less conservative than the last
4635 scalar stmt in the node for the dominance check. */
4636 /* ??? What remains is "live" uses in vector CTORs in the same
4637 SLP graph which is where those uses can end up code-generated
4638 right after their definition instead of close to their original
4639 use. But that would restrict us to code-generate lane-extracts
4640 from the latest stmt in a node. So we compensate for this
4641 during code-generation, simply not replacing uses for those
4642 hopefully rare cases. */
4643 if (STMT_VINFO_LIVE_P (stmt_info))
4644 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4645 if (!is_gimple_debug (use_stmt)
4646 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4647 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4648 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4652 "Cannot determine insertion place for "
4653 "lane extract\n");
4654 STMT_VINFO_LIVE_P (stmt_info) = false;
4655 mark_visited = true;
4658 if (mark_visited)
4659 svisited.add (stmt_info);
4662 slp_tree child;
4663 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4664 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4665 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4666 cost_vec, svisited, visited);
4669 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4671 static bool
4672 vectorizable_bb_reduc_epilogue (slp_instance instance,
4673 stmt_vector_for_cost *cost_vec)
4675 enum tree_code reduc_code
4676 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
4677 if (reduc_code == MINUS_EXPR)
4678 reduc_code = PLUS_EXPR;
4679 internal_fn reduc_fn;
4680 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4681 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4682 || reduc_fn == IFN_LAST
4683 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
4684 return false;
4686 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4687 cost log2 vector operations plus shuffles. */
4688 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4689 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4690 vectype, 0, vect_body);
4691 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4692 vectype, 0, vect_body);
4693 return true;
4696 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4697 and recurse to children. */
4699 static void
4700 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4701 hash_set<slp_tree> &visited)
4703 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4704 || visited.add (node))
4705 return;
4707 stmt_vec_info stmt;
4708 unsigned i;
4709 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4710 roots.remove (vect_orig_stmt (stmt));
4712 slp_tree child;
4713 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4714 if (child)
4715 vect_slp_prune_covered_roots (child, roots, visited);
4718 /* Analyze statements in SLP instances of VINFO. Return true if the
4719 operations are supported. */
4721 bool
4722 vect_slp_analyze_operations (vec_info *vinfo)
4724 slp_instance instance;
4725 int i;
4727 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4729 hash_set<slp_tree> visited;
4730 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4732 auto_vec<slp_tree> visited_vec;
4733 stmt_vector_for_cost cost_vec;
4734 cost_vec.create (2);
4735 if (is_a <bb_vec_info> (vinfo))
4736 vect_location = instance->location ();
4737 if (!vect_slp_analyze_node_operations (vinfo,
4738 SLP_INSTANCE_TREE (instance),
4739 instance, visited, visited_vec,
4740 &cost_vec)
4741 /* CTOR instances require vectorized defs for the SLP tree root. */
4742 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4743 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4744 != vect_internal_def))
4745 /* Check we can vectorize the reduction. */
4746 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4747 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4749 slp_tree node = SLP_INSTANCE_TREE (instance);
4750 stmt_vec_info stmt_info;
4751 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4752 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4753 else
4754 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4755 if (dump_enabled_p ())
4756 dump_printf_loc (MSG_NOTE, vect_location,
4757 "removing SLP instance operations starting from: %G",
4758 stmt_info->stmt);
4759 vect_free_slp_instance (instance);
4760 vinfo->slp_instances.ordered_remove (i);
4761 cost_vec.release ();
4762 while (!visited_vec.is_empty ())
4763 visited.remove (visited_vec.pop ());
4765 else
4767 i++;
4769 /* For BB vectorization remember the SLP graph entry
4770 cost for later. */
4771 if (is_a <bb_vec_info> (vinfo))
4772 instance->cost_vec = cost_vec;
4773 else
4775 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4776 cost_vec.release ();
4781 /* Now look for SLP instances with a root that are covered by other
4782 instances and remove them. */
4783 hash_set<stmt_vec_info> roots;
4784 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4785 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4786 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
4787 if (!roots.is_empty ())
4789 visited.empty ();
4790 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4791 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
4792 visited);
4793 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4794 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4795 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
4797 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4798 if (dump_enabled_p ())
4799 dump_printf_loc (MSG_NOTE, vect_location,
4800 "removing SLP instance operations starting "
4801 "from: %G", root->stmt);
4802 vect_free_slp_instance (instance);
4803 vinfo->slp_instances.ordered_remove (i);
4805 else
4806 ++i;
4809 /* Compute vectorizable live stmts. */
4810 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4812 hash_set<stmt_vec_info> svisited;
4813 hash_set<slp_tree> visited;
4814 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4816 vect_location = instance->location ();
4817 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4818 instance, &instance->cost_vec, svisited,
4819 visited);
4823 return !vinfo->slp_instances.is_empty ();
4826 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
4827 collapsing any chain of leaders on the way. */
4829 static slp_instance
4830 get_ultimate_leader (slp_instance instance,
4831 hash_map<slp_instance, slp_instance> &instance_leader)
4833 auto_vec<slp_instance *, 8> chain;
4834 slp_instance *tem;
4835 while (*(tem = instance_leader.get (instance)) != instance)
4837 chain.safe_push (tem);
4838 instance = *tem;
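/* Path compression: re-point every leader link visited above directly at
   the ultimate leader.  */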
4840 while (!chain.is_empty ())
4841 *chain.pop () = instance;
4842 return instance;
4845 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4847 static void
4848 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4849 slp_instance instance, slp_tree node,
4850 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4851 hash_map<slp_instance, slp_instance> &instance_leader,
4852 hash_set<slp_tree> &visited)
4854 stmt_vec_info stmt_info;
4855 unsigned i;
4857 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4859 bool existed_p;
4860 slp_instance &stmt_instance
4861 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4862 if (!existed_p)
4864 else if (stmt_instance != instance)
4866 /* If we're running into a previously marked stmt make us the
4867 leader of the current ultimate leader. This keeps the
4868 leader chain acyclic and works even when the current instance
4869 connects two previously independent graph parts. */
4870 slp_instance stmt_leader
4871 = get_ultimate_leader (stmt_instance, instance_leader);
4872 if (stmt_leader != instance)
4873 instance_leader.put (stmt_leader, instance);
4875 stmt_instance = instance;
4878 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
4879 return;
4881 slp_tree child;
4882 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4883 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4884 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4885 instance_leader, visited);
4888 /* Partition the SLP graph into pieces that can be costed independently. */
4890 static void
4891 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4893 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4895 /* First walk the SLP graph assigning each involved scalar stmt a
4896 corresponding SLP graph entry and upon visiting a previously
4897 marked stmt, make the stmts leader the current SLP graph entry. */
4898 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4899 hash_map<slp_instance, slp_instance> instance_leader;
4900 hash_set<slp_tree> visited;
4901 slp_instance instance;
4902 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4904 instance_leader.put (instance, instance);
4905 vect_bb_partition_graph_r (bb_vinfo,
4906 instance, SLP_INSTANCE_TREE (instance),
4907 stmt_to_instance, instance_leader,
4908 visited);
4911 /* Then collect entries to each independent subgraph. */
4912 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4914 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4915 leader->subgraph_entries.safe_push (instance);
4916 if (dump_enabled_p ()
4917 && leader != instance)
4918 dump_printf_loc (MSG_NOTE, vect_location,
4919 "instance %p is leader of %p\n",
4920 leader, instance);
4924 /* Compute the scalar cost of the SLP node NODE and its children
4925 and record it in COST_VEC. Do not account defs that are marked in LIFE and
4926 update LIFE according to uses of NODE. */
4928 static void
4929 vect_bb_slp_scalar_cost (vec_info *vinfo,
4930 slp_tree node, vec<bool, va_heap> *life,
4931 stmt_vector_for_cost *cost_vec,
4932 hash_set<slp_tree> &visited)
4934 unsigned i;
4935 stmt_vec_info stmt_info;
4936 slp_tree child;
4938 if (visited.add (node))
4939 return;
4941 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4943 ssa_op_iter op_iter;
4944 def_operand_p def_p;
4946 if ((*life)[i])
4947 continue;
4949 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4950 gimple *orig_stmt = orig_stmt_info->stmt;
4952 /* If there is a non-vectorized use of the defs then the scalar
4953 stmt is kept live in which case we do not account it or any
4954 required defs in the SLP children in the scalar cost. This
4955 way we make the vectorization more costly when compared to
4956 the scalar cost. */
4957 if (!STMT_VINFO_LIVE_P (stmt_info))
4959 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4961 imm_use_iterator use_iter;
4962 gimple *use_stmt;
4963 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4964 if (!is_gimple_debug (use_stmt))
4966 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4967 if (!use_stmt_info
4968 || !PURE_SLP_STMT
4969 (vect_stmt_to_vectorize (use_stmt_info)))
4971 (*life)[i] = true;
4972 break;
4976 if ((*life)[i])
4977 continue;
4980 /* Count scalar stmts only once. */
4981 if (gimple_visited_p (orig_stmt))
4982 continue;
4983 gimple_set_visited (orig_stmt, true);
4985 vect_cost_for_stmt kind;
4986 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4988 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4989 kind = scalar_load;
4990 else
4991 kind = scalar_store;
4993 else if (vect_nop_conversion_p (orig_stmt_info))
4994 continue;
4995 /* For single-argument PHIs assume coalescing which means zero cost
4996 for the scalar and the vector PHIs. This avoids artificially
4997 favoring the vector path (but may pessimize it in some cases). */
4998 else if (is_a <gphi *> (orig_stmt_info->stmt)
4999 && gimple_phi_num_args
5000 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5001 continue;
5002 else
5003 kind = scalar_stmt;
5004 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5005 SLP_TREE_VECTYPE (node), 0, vect_body);
5008 auto_vec<bool, 20> subtree_life;
5009 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5011 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5013 /* Do not directly pass LIFE to the recursive call; copy it to
5014 confine changes in the callee to the current child/subtree. */
5015 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5017 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5018 for (unsigned j = 0;
5019 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5021 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5022 if (perm.first == i)
5023 subtree_life[perm.second] = (*life)[j];
5026 else
5028 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5029 subtree_life.safe_splice (*life);
5031 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5032 visited);
5033 subtree_life.truncate (0);
5038 /* Comparator for the loop-index sorted cost vectors. */
5040 static int
5041 li_cost_vec_cmp (const void *a_, const void *b_)
5043 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5044 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5045 if (a->first < b->first)
5046 return -1;
5047 else if (a->first == b->first)
5048 return 0;
5049 return 1;
5052 /* Check if vectorization of the basic block is profitable for the
5053 subgraph denoted by SLP_INSTANCES. */
5055 static bool
5056 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5057 vec<slp_instance> slp_instances)
5059 slp_instance instance;
5060 int i;
5061 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5062 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5064 if (dump_enabled_p ())
5066 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5067 hash_set<slp_tree> visited;
5068 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5069 vect_print_slp_graph (MSG_NOTE, vect_location,
5070 SLP_INSTANCE_TREE (instance), visited);
5073 /* Calculate scalar cost and sum the cost for the vector stmts
5074 previously collected. */
5075 stmt_vector_for_cost scalar_costs = vNULL;
5076 stmt_vector_for_cost vector_costs = vNULL;
5077 hash_set<slp_tree> visited;
5078 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5080 auto_vec<bool, 20> life;
5081 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5082 true);
5083 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5084 record_stmt_cost (&scalar_costs,
5085 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5086 scalar_stmt,
5087 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5088 vect_bb_slp_scalar_cost (bb_vinfo,
5089 SLP_INSTANCE_TREE (instance),
5090 &life, &scalar_costs, visited);
5091 vector_costs.safe_splice (instance->cost_vec);
5092 instance->cost_vec.release ();
5094 /* Unset visited flag. */
5095 stmt_info_for_cost *cost;
5096 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5097 gimple_set_visited (cost->stmt_info->stmt, false);
5099 if (dump_enabled_p ())
5100 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5102 /* When costing non-loop vectorization we need to consider each covered
5103 loop independently and make sure vectorization is profitable. For
5104 now we assume a loop may not be entered or may be executed an arbitrary
5105 number of iterations (??? static information can provide more
5106 precise info here) which means we can simply cost each containing
5107 loop's stmts separately. */
5109 /* First produce cost vectors sorted by loop index. */
5110 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5111 li_scalar_costs (scalar_costs.length ());
5112 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5113 li_vector_costs (vector_costs.length ());
5114 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5116 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5117 li_scalar_costs.quick_push (std::make_pair (l, cost));
5119 /* Use a random used loop as fallback in case the first vector_costs
5120 entry does not have a stmt_info associated with it. */
5121 unsigned l = li_scalar_costs[0].first;
5122 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5124 /* We inherit the loop from the previous COST; invariants, externals and
5125 extracts immediately follow the cost for the related stmt. */
5126 if (cost->stmt_info)
5127 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5128 li_vector_costs.quick_push (std::make_pair (l, cost));
5130 li_scalar_costs.qsort (li_cost_vec_cmp);
5131 li_vector_costs.qsort (li_cost_vec_cmp);
5133 /* Now cost the portions individually. */
5134 unsigned vi = 0;
5135 unsigned si = 0;
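/* Merge the two cost vectors, which are sorted by loop index, and finish
   the scalar and vector costs of each loop part before comparing them.  */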
5136 while (si < li_scalar_costs.length ()
5137 && vi < li_vector_costs.length ())
5139 unsigned sl = li_scalar_costs[si].first;
5140 unsigned vl = li_vector_costs[vi].first;
5141 if (sl != vl)
5143 if (dump_enabled_p ())
5144 dump_printf_loc (MSG_NOTE, vect_location,
5145 "Scalar %d and vector %d loop part do not "
5146 "match up, skipping scalar part\n", sl, vl);
5147 /* Skip the scalar part, assuming zero cost on the vector side. */
5150 si++;
5152 while (si < li_scalar_costs.length ()
5153 && li_scalar_costs[si].first == sl);
5154 continue;
5157 void *scalar_target_cost_data = init_cost (NULL, true);
5160 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
5161 li_scalar_costs[si].second);
5162 si++;
5164 while (si < li_scalar_costs.length ()
5165 && li_scalar_costs[si].first == sl);
5166 unsigned dummy;
5167 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
5168 destroy_cost_data (scalar_target_cost_data);
5170 /* Complete the target-specific vector cost calculation. */
5171 void *vect_target_cost_data = init_cost (NULL, false);
5174 add_stmt_cost (bb_vinfo, vect_target_cost_data,
5175 li_vector_costs[vi].second);
5176 vi++;
5178 while (vi < li_vector_costs.length ()
5179 && li_vector_costs[vi].first == vl);
5180 finish_cost (vect_target_cost_data, &vec_prologue_cost,
5181 &vec_inside_cost, &vec_epilogue_cost);
5182 destroy_cost_data (vect_target_cost_data);
5184 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5186 if (dump_enabled_p ())
5188 dump_printf_loc (MSG_NOTE, vect_location,
5189 "Cost model analysis for part in loop %d:\n", sl);
5190 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5191 vec_inside_cost + vec_outside_cost);
5192 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5195 /* Vectorization is profitable if its cost is not more than the cost of the
5196 scalar version. Note that we err on the vector side for equal cost because
5197 the cost estimate is otherwise quite pessimistic (constant uses are
5198 free on the scalar side but cost a load on the vector side for
5199 example). */
5200 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5202 scalar_costs.release ();
5203 vector_costs.release ();
5204 return false;
5207 if (vi < li_vector_costs.length ())
5209 if (dump_enabled_p ())
5210 dump_printf_loc (MSG_NOTE, vect_location,
5211 "Excess vector cost for part in loop %d:\n",
5212 li_vector_costs[vi].first);
5213 scalar_costs.release ();
5214 vector_costs.release ();
5215 return false;
5218 scalar_costs.release ();
5219 vector_costs.release ();
5220 return true;
5223 /* qsort comparator for lane defs. */
5225 static int
5226 vld_cmp (const void *a_, const void *b_)
5228 auto *a = (const std::pair<unsigned, tree> *)a_;
5229 auto *b = (const std::pair<unsigned, tree> *)b_;
5230 return a->first - b->first;
5233 /* Return true if USE_STMT is a vector lane insert into VEC and set
5234 *THIS_LANE to the lane number that is set. */
5236 static bool
5237 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5239 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5240 if (!use_ass
5241 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5242 || (vec
5243 ? gimple_assign_rhs1 (use_ass) != vec
5244 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5245 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5246 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5247 || !constant_multiple_p
5248 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5249 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5250 this_lane))
5251 return false;
5252 return true;
5255 /* Find any vectorizable constructors and add them to the grouped_store
5256 array. */
5258 static void
5259 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5261 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5262 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5263 !gsi_end_p (gsi); gsi_next (&gsi))
5265 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5266 if (!assign)
5267 continue;
5269 tree rhs = gimple_assign_rhs1 (assign);
5270 enum tree_code code = gimple_assign_rhs_code (assign);
5271 use_operand_p use_p;
5272 gimple *use_stmt;
5273 if (code == CONSTRUCTOR)
5275 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5276 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5277 CONSTRUCTOR_NELTS (rhs))
5278 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5279 || uniform_vector_p (rhs))
5280 continue;
5282 unsigned j;
5283 tree val;
5284 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5285 if (TREE_CODE (val) != SSA_NAME
5286 || !bb_vinfo->lookup_def (val))
5287 break;
5288 if (j != CONSTRUCTOR_NELTS (rhs))
5289 continue;
5291 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5292 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5294 else if (code == BIT_INSERT_EXPR
5295 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5296 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5297 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5298 && integer_zerop (gimple_assign_rhs3 (assign))
5299 && useless_type_conversion_p
5300 (TREE_TYPE (TREE_TYPE (rhs)),
5301 TREE_TYPE (gimple_assign_rhs2 (assign)))
5302 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5304 /* We start to match on insert to lane zero but since the
5305 inserts need not be ordered we'd have to search both
5306 the def and the use chains. */
5307 tree vectype = TREE_TYPE (rhs);
5308 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5309 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5310 auto_sbitmap lanes (nlanes);
5311 bitmap_clear (lanes);
5312 bitmap_set_bit (lanes, 0);
5313 tree def = gimple_assign_lhs (assign);
5314 lane_defs.quick_push
5315 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5316 unsigned lanes_found = 1;
5317 /* Start with the use chains, the last stmt will be the root. */
5318 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5319 vec<stmt_vec_info> roots = vNULL;
5320 roots.safe_push (last);
5323 use_operand_p use_p;
5324 gimple *use_stmt;
5325 if (!single_imm_use (def, &use_p, &use_stmt))
5326 break;
5327 unsigned this_lane;
5328 if (!bb_vinfo->lookup_stmt (use_stmt)
5329 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5330 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5331 break;
5332 if (bitmap_bit_p (lanes, this_lane))
5333 break;
5334 lanes_found++;
5335 bitmap_set_bit (lanes, this_lane);
5336 gassign *use_ass = as_a <gassign *> (use_stmt);
5337 lane_defs.quick_push (std::make_pair
5338 (this_lane, gimple_assign_rhs2 (use_ass)));
5339 last = bb_vinfo->lookup_stmt (use_ass);
5340 roots.safe_push (last);
5341 def = gimple_assign_lhs (use_ass);
5343 while (lanes_found < nlanes);
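/* The insert found last in the use chain produces the vector that is
   ultimately used, so move it to index zero where the SLP instance root
   stmt is expected.  */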
5344 if (roots.length () > 1)
5345 std::swap (roots[0], roots[roots.length () - 1]);
5346 if (lanes_found < nlanes)
5348 /* Now search the def chain. */
5349 def = gimple_assign_rhs1 (assign);
5352 if (TREE_CODE (def) != SSA_NAME
5353 || !has_single_use (def))
5354 break;
5355 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5356 unsigned this_lane;
5357 if (!bb_vinfo->lookup_stmt (def_stmt)
5358 || !vect_slp_is_lane_insert (def_stmt,
5359 NULL_TREE, &this_lane)
5360 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5361 break;
5362 if (bitmap_bit_p (lanes, this_lane))
5363 break;
5364 lanes_found++;
5365 bitmap_set_bit (lanes, this_lane);
5366 lane_defs.quick_push (std::make_pair
5367 (this_lane,
5368 gimple_assign_rhs2 (def_stmt)));
5369 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5370 def = gimple_assign_rhs1 (def_stmt);
5372 while (lanes_found < nlanes);
5374 if (lanes_found == nlanes)
5376 	      /* Sort lane_defs by lane index and register the root.  */
5377 lane_defs.qsort (vld_cmp);
5378 vec<stmt_vec_info> stmts;
5379 stmts.create (nlanes);
5380 for (unsigned i = 0; i < nlanes; ++i)
5381 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5382 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5383 stmts, roots));
5385 else
5386 roots.release ();
5388 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5389 && (associative_tree_code (code) || code == MINUS_EXPR)
5390 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5391 checks pessimize a two-element reduction. PR54400.
5392 ??? In-order reduction could be handled if we only
5393 traverse one operand chain in vect_slp_linearize_chain. */
5394 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5395 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5396 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5397 /* Ops with constants at the tail can be stripped here. */
5398 && TREE_CODE (rhs) == SSA_NAME
5399 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5400 /* Should be the chain end. */
5401 && (!single_imm_use (gimple_assign_lhs (assign),
5402 &use_p, &use_stmt)
5403 || !is_gimple_assign (use_stmt)
5404 || (gimple_assign_rhs_code (use_stmt) != code
5405 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5406 || (gimple_assign_rhs_code (use_stmt)
5407 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5409 /* We start the match at the end of a possible association
5410 chain. */
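/* E.g. (illustrative only), for
     t1 = a + b;  t2 = t1 + c;  s = t2 + d;
   the match starts at the chain end S; vect_slp_linearize_chain then
   collects the leaves {a, b, c, d}, which become the lanes of a
   slp_inst_kind_bb_reduc instance with the chain stmts as roots.  */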
5411 auto_vec<chain_op_t> chain;
5412 auto_vec<std::pair<tree_code, gimple *> > worklist;
5413 auto_vec<gimple *> chain_stmts;
5414 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5415 if (code == MINUS_EXPR)
5416 code = PLUS_EXPR;
5417 internal_fn reduc_fn;
5418 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5419 || reduc_fn == IFN_LAST)
5420 continue;
5421 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5422 /* ??? */
5423 code_stmt, alt_code_stmt, &chain_stmts);
5424 if (chain.length () > 1)
5426 /* Sort the chain according to def_type and operation. */
5427 chain.sort (dt_sort_cmp, bb_vinfo);
5428 /* ??? Now we'd want to strip externals and constants
5429 but record those to be handled in the epilogue. */
5430 /* ??? For now do not allow mixing ops or externs/constants. */
5431 bool invalid = false;
5432 for (unsigned i = 0; i < chain.length (); ++i)
5433 if (chain[i].dt != vect_internal_def
5434 || chain[i].code != code)
5435 invalid = true;
5436 if (!invalid)
5438 vec<stmt_vec_info> stmts;
5439 stmts.create (chain.length ());
5440 for (unsigned i = 0; i < chain.length (); ++i)
5441 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5442 vec<stmt_vec_info> roots;
5443 roots.create (chain_stmts.length ());
5444 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5445 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5446 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5447 stmts, roots));
5454 /* Walk the grouped store chains and replace entries with their
5455 pattern variant if any. */
5457 static void
5458 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5460 stmt_vec_info first_element;
5461 unsigned i;
5463 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5465 /* We also have CTORs in this array. */
5466 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5467 continue;
5468 if (STMT_VINFO_IN_PATTERN_P (first_element))
5470 stmt_vec_info orig = first_element;
5471 first_element = STMT_VINFO_RELATED_STMT (first_element);
5472 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5473 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5474 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5475 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5476 vinfo->grouped_stores[i] = first_element;
5478 stmt_vec_info prev = first_element;
5479 while (DR_GROUP_NEXT_ELEMENT (prev))
5481 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5482 if (STMT_VINFO_IN_PATTERN_P (elt))
5484 stmt_vec_info orig = elt;
5485 elt = STMT_VINFO_RELATED_STMT (elt);
5486 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5487 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5488 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5490 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5491 prev = elt;
5496 /* Check if the region described by BB_VINFO can be vectorized, returning
5497 true if so. When returning false, set FATAL to true if the same failure
5498 would prevent vectorization at other vector sizes, false if it is still
5499 worth trying other sizes. N_STMTS is the number of statements in the
5500 region. */
5502 static bool
5503 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5504 vec<int> *dataref_groups)
5506 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5508 slp_instance instance;
5509 int i;
5510 poly_uint64 min_vf = 2;
5512 /* The first group of checks is independent of the vector size. */
5513 fatal = true;
5515 /* Analyze the data references. */
5517 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5519 if (dump_enabled_p ())
5520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5521 "not vectorized: unhandled data-ref in basic "
5522 "block.\n");
5523 return false;
5526 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5528 if (dump_enabled_p ())
5529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5530 "not vectorized: unhandled data access in "
5531 "basic block.\n");
5532 return false;
5535 vect_slp_check_for_constructors (bb_vinfo);
5537 /* If there are no grouped stores and no constructors in the region
5538 there is no need to continue with pattern recog as vect_analyze_slp
5539 will fail anyway. */
5540 if (bb_vinfo->grouped_stores.is_empty ()
5541 && bb_vinfo->roots.is_empty ())
5543 if (dump_enabled_p ())
5544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5545 "not vectorized: no grouped stores in "
5546 "basic block.\n");
5547 return false;
5550   /* While the rest of the analysis below depends on the vector size in some way.  */
5551 fatal = false;
5553 vect_pattern_recog (bb_vinfo);
5555 /* Update store groups from pattern processing. */
5556 vect_fixup_store_groups_with_patterns (bb_vinfo);
5558 /* Check the SLP opportunities in the basic block, analyze and build SLP
5559 trees. */
5560 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5562 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "Failed to SLP the basic block.\n");
5566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5567 "not vectorized: failed to find SLP opportunities "
5568 "in basic block.\n");
5570 return false;
5573 /* Optimize permutations. */
5574 vect_optimize_slp (bb_vinfo);
5576 /* Gather the loads reachable from the SLP graph entries. */
5577 vect_gather_slp_loads (bb_vinfo);
5579 vect_record_base_alignments (bb_vinfo);
5581 /* Analyze and verify the alignment of data references and the
5582 dependence in the SLP instances. */
5583 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5585 vect_location = instance->location ();
5586 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5587 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5589 slp_tree node = SLP_INSTANCE_TREE (instance);
5590 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5591 if (dump_enabled_p ())
5592 dump_printf_loc (MSG_NOTE, vect_location,
5593 "removing SLP instance operations starting from: %G",
5594 stmt_info->stmt);
5595 vect_free_slp_instance (instance);
5596 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5597 continue;
5600 /* Mark all the statements that we want to vectorize as pure SLP and
5601 relevant. */
5602 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5603 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5604 unsigned j;
5605 stmt_vec_info root;
5606 /* Likewise consider instance root stmts as vectorized. */
5607 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5608 STMT_SLP_TYPE (root) = pure_slp;
5610 i++;
5612 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5613 return false;
5615 if (!vect_slp_analyze_operations (bb_vinfo))
5617 if (dump_enabled_p ())
5618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5619 "not vectorized: bad operation in basic block.\n");
5620 return false;
5623 vect_bb_partition_graph (bb_vinfo);
5625 return true;
5628 /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
5629 basic blocks in BBS, returning true on success.
5630 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5632 static bool
5633 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5634 vec<int> *dataref_groups, unsigned int n_stmts)
5636 bb_vec_info bb_vinfo;
5637 auto_vector_modes vector_modes;
5639 /* Autodetect first vector size we try. */
5640 machine_mode next_vector_mode = VOIDmode;
5641 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5642 unsigned int mode_i = 0;
5644 vec_info_shared shared;
5646 machine_mode autodetected_vector_mode = VOIDmode;
5647 while (1)
5649 bool vectorized = false;
5650 bool fatal = false;
5651 bb_vinfo = new _bb_vec_info (bbs, &shared);
5653 bool first_time_p = shared.datarefs.is_empty ();
5654 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5655 if (first_time_p)
5656 bb_vinfo->shared->save_datarefs ();
5657 else
5658 bb_vinfo->shared->check_datarefs ();
5659 bb_vinfo->vector_mode = next_vector_mode;
5661 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5663 if (dump_enabled_p ())
5665 dump_printf_loc (MSG_NOTE, vect_location,
5666 "***** Analysis succeeded with vector mode"
5667 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5668 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5671 bb_vinfo->shared->check_datarefs ();
5673 unsigned i;
5674 slp_instance instance;
5675 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
5677 if (instance->subgraph_entries.is_empty ())
5678 continue;
5680 vect_location = instance->location ();
5681 if (!unlimited_cost_model (NULL)
5682 && !vect_bb_vectorization_profitable_p
5683 (bb_vinfo, instance->subgraph_entries))
5685 if (dump_enabled_p ())
5686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5687 "not vectorized: vectorization is not "
5688 "profitable.\n");
5689 continue;
5692 if (!dbg_cnt (vect_slp))
5693 continue;
5695 if (!vectorized && dump_enabled_p ())
5696 dump_printf_loc (MSG_NOTE, vect_location,
5697 "Basic block will be vectorized "
5698 "using SLP\n");
5699 vectorized = true;
5701 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5703 unsigned HOST_WIDE_INT bytes;
5704 if (dump_enabled_p ())
5706 if (GET_MODE_SIZE
5707 (bb_vinfo->vector_mode).is_constant (&bytes))
5708 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5709 "basic block part vectorized using %wu "
5710 "byte vectors\n", bytes);
5711 else
5712 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5713 "basic block part vectorized using "
5714 "variable length vectors\n");
5718 else
5720 if (dump_enabled_p ())
5721 dump_printf_loc (MSG_NOTE, vect_location,
5722 "***** Analysis failed with vector mode %s\n",
5723 GET_MODE_NAME (bb_vinfo->vector_mode));
5726 if (mode_i == 0)
5727 autodetected_vector_mode = bb_vinfo->vector_mode;
5729 if (!fatal)
5730 while (mode_i < vector_modes.length ()
5731 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_NOTE, vect_location,
5735 "***** The result for vector mode %s would"
5736 " be the same\n",
5737 GET_MODE_NAME (vector_modes[mode_i]));
5738 mode_i += 1;
5741 delete bb_vinfo;
5743 if (mode_i < vector_modes.length ()
5744 && VECTOR_MODE_P (autodetected_vector_mode)
5745 && (related_vector_mode (vector_modes[mode_i],
5746 GET_MODE_INNER (autodetected_vector_mode))
5747 == autodetected_vector_mode)
5748 && (related_vector_mode (autodetected_vector_mode,
5749 GET_MODE_INNER (vector_modes[mode_i]))
5750 == vector_modes[mode_i]))
5752 if (dump_enabled_p ())
5753 dump_printf_loc (MSG_NOTE, vect_location,
5754 "***** Skipping vector mode %s, which would"
5755 " repeat the analysis for %s\n",
5756 GET_MODE_NAME (vector_modes[mode_i]),
5757 GET_MODE_NAME (autodetected_vector_mode));
5758 mode_i += 1;
5761 if (vectorized
5762 || mode_i == vector_modes.length ()
5763 || autodetected_vector_mode == VOIDmode
5764 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5765 vector sizes will fail do not bother iterating. */
5766 || fatal)
5767 return vectorized;
5769 /* Try the next biggest vector size. */
5770 next_vector_mode = vector_modes[mode_i++];
5771 if (dump_enabled_p ())
5772 dump_printf_loc (MSG_NOTE, vect_location,
5773 "***** Re-trying analysis with vector mode %s\n",
5774 GET_MODE_NAME (next_vector_mode));
5779 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5780 true if anything in the basic-block was vectorized. */
5782 static bool
5783 vect_slp_bbs (vec<basic_block> bbs)
5785 vec<data_reference_p> datarefs = vNULL;
5786 auto_vec<int> dataref_groups;
5787 int insns = 0;
5788 int current_group = 0;
5790 for (unsigned i = 0; i < bbs.length (); i++)
5792 basic_block bb = bbs[i];
5793 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5794 gsi_next (&gsi))
5796 gimple *stmt = gsi_stmt (gsi);
5797 if (is_gimple_debug (stmt))
5798 continue;
5800 insns++;
5802 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5803 vect_location = stmt;
5805 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5806 &dataref_groups, current_group))
5807 ++current_group;
5811 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5814 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5815 true if anything in the basic-block was vectorized. */
5817 bool
5818 vect_slp_bb (basic_block bb)
5820 auto_vec<basic_block> bbs;
5821 bbs.safe_push (bb);
5822 return vect_slp_bbs (bbs);
5825 /* Main entry for the BB vectorizer.  Analyze and transform the basic blocks
5826    of FUN, returning true if anything in the function was vectorized.  */
5828 bool
5829 vect_slp_function (function *fun)
5831 bool r = false;
5832 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5833 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5835 /* For the moment split the function into pieces to avoid making
5836      the iteration on the vector mode moot.  Split at points we know
5837      we do not handle well, which are CFG merges (SLP discovery doesn't
5838 handle non-loop-header PHIs) and loop exits. Since pattern
5839 recog requires reverse iteration to visit uses before defs
5840 simply chop RPO into pieces. */
5841 auto_vec<basic_block> bbs;
5842 for (unsigned i = 0; i < n; i++)
5844 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5845 bool split = false;
5847 /* Split when a BB is not dominated by the first block. */
5848 if (!bbs.is_empty ()
5849 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5851 if (dump_enabled_p ())
5852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5853 "splitting region at dominance boundary bb%d\n",
5854 bb->index);
5855 split = true;
5857 /* Split when the loop determined by the first block
5858 is exited. This is because we eventually insert
5859 invariants at region begin. */
5860 else if (!bbs.is_empty ()
5861 && bbs[0]->loop_father != bb->loop_father
5862 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5864 if (dump_enabled_p ())
5865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5866 "splitting region at loop %d exit at bb%d\n",
5867 bbs[0]->loop_father->num, bb->index);
5868 split = true;
5871 if (split && !bbs.is_empty ())
5873 r |= vect_slp_bbs (bbs);
5874 bbs.truncate (0);
5875 bbs.quick_push (bb);
5877 else
5878 bbs.safe_push (bb);
5880       /* When we have a stmt that ends this block and defines a
5881 	 value, inserting after it for a vector containing its definition
5882 	 would require inserting on edges.  Avoid this for now.  */
5883 if (gimple *last = last_stmt (bb))
5884 if (gimple_get_lhs (last)
5885 && is_ctrl_altering_stmt (last))
5887 if (dump_enabled_p ())
5888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5889 "splitting region at control altering "
5890 "definition %G", last);
5891 r |= vect_slp_bbs (bbs);
5892 bbs.truncate (0);
5896 if (!bbs.is_empty ())
5897 r |= vect_slp_bbs (bbs);
5899 free (rpo);
5901 return r;
5904 /* Build a variable-length vector in which the elements in ELTS are repeated
5905    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
5906 RESULTS and add any new instructions to SEQ.
5908 The approach we use is:
5910 (1) Find a vector mode VM with integer elements of mode IM.
5912 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5913 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5914 from small vectors to IM.
5916 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5918 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5919 correct byte contents.
5921 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5923 We try to find the largest IM for which this sequence works, in order
5924 to cut down on the number of interleaves. */
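/* Illustrative sketch (not from the sources): suppose ELTS is {a, b, c, d}
   with 32-bit integer elements and can_duplicate_and_interleave_p settles on
   IM = DImode, so NVECTORS = 2 and each ELTS'[I] covers two elements:

     (2) ELTS' = { VIEW_CONVERT <DI> {a, b}, VIEW_CONVERT <DI> {c, d} }
     (3) pieces = { {ab, ab, ...}, {cd, cd, ...} }   (DImode lanes)
     (4) a VEC_PERM_EXPR interleaving pieces[0] and pieces[1] gives
         {ab, cd, ab, cd, ...}, which viewed as 32-bit lanes is
         {a, b, c, d, a, b, c, d, ...}
     (5) VIEW_CONVERT_EXPR back to VECTOR_TYPE yields the repeated ELTS.  */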
5926 void
5927 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5928 vec<tree> elts, unsigned int nresults,
5929 vec<tree> &results)
5931 unsigned int nelts = elts.length ();
5932 tree element_type = TREE_TYPE (vector_type);
5934 /* (1) Find a vector mode VM with integer elements of mode IM. */
5935 unsigned int nvectors = 1;
5936 tree new_vector_type;
5937 tree permutes[2];
5938 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5939 &nvectors, &new_vector_type,
5940 permutes))
5941 gcc_unreachable ();
5943 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5944 unsigned int partial_nelts = nelts / nvectors;
5945 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5947 tree_vector_builder partial_elts;
5948 auto_vec<tree, 32> pieces (nvectors * 2);
5949 pieces.quick_grow_cleared (nvectors * 2);
5950 for (unsigned int i = 0; i < nvectors; ++i)
5952 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5953 ELTS' has mode IM. */
5954 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5955 for (unsigned int j = 0; j < partial_nelts; ++j)
5956 partial_elts.quick_push (elts[i * partial_nelts + j]);
5957 tree t = gimple_build_vector (seq, &partial_elts);
5958 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5959 TREE_TYPE (new_vector_type), t);
5961 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5962 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5965 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5966 correct byte contents.
5968 Conceptually, we need to repeat the following operation log2(nvectors)
5969 times, where hi_start = nvectors / 2:
5971 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5972 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5974 However, if each input repeats every N elements and the VF is
5975 a multiple of N * 2, the HI result is the same as the LO result.
5976 This will be true for the first N1 iterations of the outer loop,
5977 followed by N2 iterations for which both the LO and HI results
5978 are needed. I.e.:
5980 N1 + N2 = log2(nvectors)
5982 Each "N1 iteration" doubles the number of redundant vectors and the
5983 effect of the process as a whole is to have a sequence of nvectors/2**N1
5984 vectors that repeats 2**N1 times. Rather than generate these redundant
5985 vectors, we halve the number of vectors for each N1 iteration. */
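/* A possible run of the loop below (illustrative numbers only): with
   NVECTORS = 4 and inputs that repeat every element, the first iteration
   (IN_REPEAT = 1) finds HI == LO, keeps only the LO outputs and halves
   NEW_NVECTORS to 2 (an "N1 iteration"); the second iteration
   (IN_REPEAT = 2) needs both the LO and HI results (assuming the number
   of subparts is not known to be a multiple of 4), so NEW_NVECTORS stays 2
   (an "N2 iteration").  The two remaining vectors repeat 2**N1 = 2 times,
   which step (5) below exploits by reusing RESULTS[I - NEW_NVECTORS].  */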
5986 unsigned int in_start = 0;
5987 unsigned int out_start = nvectors;
5988 unsigned int new_nvectors = nvectors;
5989 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5991 unsigned int hi_start = new_nvectors / 2;
5992 unsigned int out_i = 0;
5993 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5995 if ((in_i & 1) != 0
5996 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5997 2 * in_repeat))
5998 continue;
6000 tree output = make_ssa_name (new_vector_type);
6001 tree input1 = pieces[in_start + (in_i / 2)];
6002 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6003 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6004 input1, input2,
6005 permutes[in_i & 1]);
6006 gimple_seq_add_stmt (seq, stmt);
6007 pieces[out_start + out_i] = output;
6008 out_i += 1;
6010 std::swap (in_start, out_start);
6011 new_nvectors = out_i;
6014 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6015 results.reserve (nresults);
6016 for (unsigned int i = 0; i < nresults; ++i)
6017 if (i < new_nvectors)
6018 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6019 pieces[in_start + i]));
6020 else
6021 results.quick_push (results[i - new_nvectors]);
6025 /* For constant and loop invariant defs in OP_NODE this function creates
6026 vector defs that will be used in the vectorized stmts and stores them
6027 to SLP_TREE_VEC_DEFS of OP_NODE. */
6029 static void
6030 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6032 unsigned HOST_WIDE_INT nunits;
6033 tree vec_cst;
6034 unsigned j, number_of_places_left_in_vector;
6035 tree vector_type;
6036 tree vop;
6037 int group_size = op_node->ops.length ();
6038 unsigned int vec_num, i;
6039 unsigned number_of_copies = 1;
6040 bool constant_p;
6041 gimple_seq ctor_seq = NULL;
6042 auto_vec<tree, 16> permute_results;
6044 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6045 vector_type = SLP_TREE_VECTYPE (op_node);
6047 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6048 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6049 auto_vec<tree> voprnds (number_of_vectors);
6051 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6052 created vectors. It is greater than 1 if unrolling is performed.
6054 For example, we have two scalar operands, s1 and s2 (e.g., group of
6055 strided accesses of size two), while NUNITS is four (i.e., four scalars
6056 of this type can be packed in a vector). The output vector will contain
6057 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6058 will be 2).
6060 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6061 containing the operands.
6063 For example, NUNITS is four as before, and the group size is 8
6064 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6065 {s5, s6, s7, s8}. */
6067 /* When using duplicate_and_interleave, we just need one element for
6068 each scalar statement. */
6069 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6070 nunits = group_size;
6072 number_of_copies = nunits * number_of_vectors / group_size;
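/* For example, with the numbers from the comment above: NUNITS == 4,
   GROUP_SIZE == 2 and a single vector to fill gives NUMBER_OF_COPIES
   == 4 * 1 / 2 == 2, i.e. {s1, s2, s1, s2}; with GROUP_SIZE == 8 and two
   vectors it is 4 * 2 / 8 == 1, i.e. {s1, ..., s4} and {s5, ..., s8}.  */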
6074 number_of_places_left_in_vector = nunits;
6075 constant_p = true;
6076 tree_vector_builder elts (vector_type, nunits, 1);
6077 elts.quick_grow (nunits);
6078 stmt_vec_info insert_after = NULL;
6079 for (j = 0; j < number_of_copies; j++)
6081 tree op;
6082 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6084 /* Create 'vect_ = {op0,op1,...,opn}'. */
6085 number_of_places_left_in_vector--;
6086 tree orig_op = op;
6087 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6089 if (CONSTANT_CLASS_P (op))
6091 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6093 /* Can't use VIEW_CONVERT_EXPR for booleans because
6094 of possibly different sizes of scalar value and
6095 vector element. */
6096 if (integer_zerop (op))
6097 op = build_int_cst (TREE_TYPE (vector_type), 0);
6098 else if (integer_onep (op))
6099 op = build_all_ones_cst (TREE_TYPE (vector_type));
6100 else
6101 gcc_unreachable ();
6103 else
6104 op = fold_unary (VIEW_CONVERT_EXPR,
6105 TREE_TYPE (vector_type), op);
6106 gcc_assert (op && CONSTANT_CLASS_P (op));
6108 else
6110 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6111 gimple *init_stmt;
6112 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6114 tree true_val
6115 = build_all_ones_cst (TREE_TYPE (vector_type));
6116 tree false_val
6117 = build_zero_cst (TREE_TYPE (vector_type));
6118 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6119 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6120 op, true_val,
6121 false_val);
6123 else
6125 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6126 op);
6127 init_stmt
6128 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6129 op);
6131 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6132 op = new_temp;
6135 elts[number_of_places_left_in_vector] = op;
6136 if (!CONSTANT_CLASS_P (op))
6137 constant_p = false;
6138 /* For BB vectorization we have to compute an insert location
6139 when a def is inside the analyzed region since we cannot
6140 simply insert at the BB start in this case. */
6141 stmt_vec_info opdef;
6142 if (TREE_CODE (orig_op) == SSA_NAME
6143 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6144 && is_a <bb_vec_info> (vinfo)
6145 && (opdef = vinfo->lookup_def (orig_op)))
6147 if (!insert_after)
6148 insert_after = opdef;
6149 else
6150 insert_after = get_later_stmt (insert_after, opdef);
6153 if (number_of_places_left_in_vector == 0)
6155 if (constant_p
6156 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6157 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6158 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6159 else
6161 if (permute_results.is_empty ())
6162 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6163 elts, number_of_vectors,
6164 permute_results);
6165 vec_cst = permute_results[number_of_vectors - j - 1];
6167 if (!gimple_seq_empty_p (ctor_seq))
6169 if (insert_after)
6171 gimple_stmt_iterator gsi;
6172 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6174 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6175 gsi_insert_seq_before (&gsi, ctor_seq,
6176 GSI_CONTINUE_LINKING);
6178 else if (!stmt_ends_bb_p (insert_after->stmt))
6180 gsi = gsi_for_stmt (insert_after->stmt);
6181 gsi_insert_seq_after (&gsi, ctor_seq,
6182 GSI_CONTINUE_LINKING);
6184 else
6186 /* When we want to insert after a def where the
6187 defining stmt throws then insert on the fallthru
6188 edge. */
6189 edge e = find_fallthru_edge
6190 (gimple_bb (insert_after->stmt)->succs);
6191 basic_block new_bb
6192 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6193 gcc_assert (!new_bb);
6196 else
6197 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6198 ctor_seq = NULL;
6200 voprnds.quick_push (vec_cst);
6201 insert_after = NULL;
6202 number_of_places_left_in_vector = nunits;
6203 constant_p = true;
6204 elts.new_vector (vector_type, nunits, 1);
6205 elts.quick_grow (nunits);
6210 /* Since the vectors are created in the reverse order, we should invert
6211 them. */
6212 vec_num = voprnds.length ();
6213 for (j = vec_num; j != 0; j--)
6215 vop = voprnds[j - 1];
6216 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6219   /* In case the VF is greater than the unrolling factor needed for the SLP
6220 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6221 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6222 to replicate the vectors. */
6223 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6224 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6225 i++)
6226 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6229 /* Get the Ith vectorized definition from SLP_NODE. */
6231 tree
6232 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6234 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6235 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6236 else
6237 return SLP_TREE_VEC_DEFS (slp_node)[i];
6240 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6242 void
6243 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6245 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6246 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6248 unsigned j;
6249 gimple *vec_def_stmt;
6250 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6251 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6253 else
6254 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6257 /* Get N vectorized definitions for SLP_NODE. */
6259 void
6260 vect_get_slp_defs (vec_info *,
6261 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6263 if (n == -1U)
6264 n = SLP_TREE_CHILDREN (slp_node).length ();
6266 for (unsigned i = 0; i < n; ++i)
6268 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6269 vec<tree> vec_defs = vNULL;
6270 vect_get_slp_defs (child, &vec_defs);
6271 vec_oprnds->quick_push (vec_defs);
6275 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6276 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6277 permute statements for the SLP node NODE. Store the number of vector
6278 permute instructions in *N_PERMS and the number of vector load
6279 instructions in *N_LOADS. */
6281 bool
6282 vect_transform_slp_perm_load (vec_info *vinfo,
6283 slp_tree node, vec<tree> dr_chain,
6284 gimple_stmt_iterator *gsi, poly_uint64 vf,
6285 bool analyze_only, unsigned *n_perms,
6286 unsigned int *n_loads)
6288 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6289 int vec_index = 0;
6290 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6291 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6292 unsigned int mask_element;
6293 machine_mode mode;
6295 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6296 return false;
6298 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6300 mode = TYPE_MODE (vectype);
6301 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6303 /* Initialize the vect stmts of NODE to properly insert the generated
6304 stmts later. */
6305 if (! analyze_only)
6306 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6307 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6308 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6310 /* Generate permutation masks for every NODE. Number of masks for each NODE
6311 is equal to GROUP_SIZE.
6312 E.g., we have a group of three nodes with three loads from the same
6313      location in each node, and the vector size is 4.  I.e., we have an
6314 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6315 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6316 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6319 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6320 The last mask is illegal since we assume two operands for permute
6321 operation, and the mask element values can't be outside that range.
6322 Hence, the last mask must be converted into {2,5,5,5}.
6323 For the first two permutations we need the first and the second input
6324 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6325 we need the second and the third vectors: {b1,c1,a2,b2} and
6326 {c2,a3,b3,c3}. */
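/* To relate this to the code below (a sketch, not additional sources): for
   the a's the load permutation entries are all 0 and DR_GROUP_SIZE is 3, so
   the input lane is I = ITER_NUM * 3 + 0, i.e. 0, 0, 0, 3 for the first
   four output elements (J = 0..3); with CONST_NUNITS == 4 these all stay in
   the first input vector and yield the mask {0, 0, 0, 3} quoted above.  */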
6328 int vect_stmts_counter = 0;
6329 unsigned int index = 0;
6330 int first_vec_index = -1;
6331 int second_vec_index = -1;
6332 bool noop_p = true;
6333 *n_perms = 0;
6335 vec_perm_builder mask;
6336 unsigned int nelts_to_build;
6337 unsigned int nvectors_per_build;
6338 unsigned int in_nlanes;
6339 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6340 && multiple_p (nunits, group_size));
6341 if (repeating_p)
6343 /* A single vector contains a whole number of copies of the node, so:
6344 (a) all permutes can use the same mask; and
6345 (b) the permutes only need a single vector input. */
6346 mask.new_vector (nunits, group_size, 3);
6347 nelts_to_build = mask.encoded_nelts ();
6348 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6349 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6351 else
6353 /* We need to construct a separate mask for each vector statement. */
6354 unsigned HOST_WIDE_INT const_nunits, const_vf;
6355 if (!nunits.is_constant (&const_nunits)
6356 || !vf.is_constant (&const_vf))
6357 return false;
6358 mask.new_vector (const_nunits, const_nunits, 1);
6359 nelts_to_build = const_vf * group_size;
6360 nvectors_per_build = 1;
6361 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6363 auto_sbitmap used_in_lanes (in_nlanes);
6364 bitmap_clear (used_in_lanes);
6366 unsigned int count = mask.encoded_nelts ();
6367 mask.quick_grow (count);
6368 vec_perm_indices indices;
6370 for (unsigned int j = 0; j < nelts_to_build; j++)
6372 unsigned int iter_num = j / group_size;
6373 unsigned int stmt_num = j % group_size;
6374 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6375 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6376 bitmap_set_bit (used_in_lanes, i);
6377 if (repeating_p)
6379 first_vec_index = 0;
6380 mask_element = i;
6382 else
6384 /* Enforced before the loop when !repeating_p. */
6385 unsigned int const_nunits = nunits.to_constant ();
6386 vec_index = i / const_nunits;
6387 mask_element = i % const_nunits;
6388 if (vec_index == first_vec_index
6389 || first_vec_index == -1)
6391 first_vec_index = vec_index;
6393 else if (vec_index == second_vec_index
6394 || second_vec_index == -1)
6396 second_vec_index = vec_index;
6397 mask_element += const_nunits;
6399 else
6401 if (dump_enabled_p ())
6402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6403 "permutation requires at "
6404 "least three vectors %G",
6405 stmt_info->stmt);
6406 gcc_assert (analyze_only);
6407 return false;
6410 gcc_assert (mask_element < 2 * const_nunits);
6413 if (mask_element != index)
6414 noop_p = false;
6415 mask[index++] = mask_element;
6417 if (index == count && !noop_p)
6419 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6420 if (!can_vec_perm_const_p (mode, indices))
6422 if (dump_enabled_p ())
6424 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6425 vect_location,
6426 "unsupported vect permute { ");
6427 for (i = 0; i < count; ++i)
6429 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6430 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6432 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6434 gcc_assert (analyze_only);
6435 return false;
6438 ++*n_perms;
6441 if (index == count)
6443 if (!analyze_only)
6445 tree mask_vec = NULL_TREE;
6447 if (! noop_p)
6448 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6450 if (second_vec_index == -1)
6451 second_vec_index = first_vec_index;
6453 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6455 /* Generate the permute statement if necessary. */
6456 tree first_vec = dr_chain[first_vec_index + ri];
6457 tree second_vec = dr_chain[second_vec_index + ri];
6458 gimple *perm_stmt;
6459 if (! noop_p)
6461 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6462 tree perm_dest
6463 = vect_create_destination_var (gimple_assign_lhs (stmt),
6464 vectype);
6465 perm_dest = make_ssa_name (perm_dest);
6466 perm_stmt
6467 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6468 first_vec, second_vec,
6469 mask_vec);
6470 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6471 gsi);
6473 else
6474 /* If mask was NULL_TREE generate the requested
6475 identity transform. */
6476 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6478 /* Store the vector statement in NODE. */
6479 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6483 index = 0;
6484 first_vec_index = -1;
6485 second_vec_index = -1;
6486 noop_p = true;
6490 if (n_loads)
6492 if (repeating_p)
6493 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6494 else
6496 /* Enforced above when !repeating_p. */
6497 unsigned int const_nunits = nunits.to_constant ();
6498 *n_loads = 0;
6499 bool load_seen = false;
6500 for (unsigned i = 0; i < in_nlanes; ++i)
6502 if (i % const_nunits == 0)
6504 if (load_seen)
6505 *n_loads += 1;
6506 load_seen = false;
6508 if (bitmap_bit_p (used_in_lanes, i))
6509 load_seen = true;
6511 if (load_seen)
6512 *n_loads += 1;
6516 return true;
6519 /* Produce the next vector result for SLP permutation NODE by adding a vector
6520 statement at GSI. If MASK_VEC is nonnull, add:
6522 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6524 otherwise add:
6526 <new SSA name> = FIRST_DEF. */
6528 static void
6529 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6530 slp_tree node, tree first_def, tree second_def,
6531 tree mask_vec)
6533 tree vectype = SLP_TREE_VECTYPE (node);
6535 /* ??? We SLP match existing vector element extracts but
6536      allow punning, which we need to re-instantiate at uses
6537 but have no good way of explicitly representing. */
6538 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6540 gassign *conv_stmt
6541 = gimple_build_assign (make_ssa_name (vectype),
6542 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6543 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6544 first_def = gimple_assign_lhs (conv_stmt);
6546 gassign *perm_stmt;
6547 tree perm_dest = make_ssa_name (vectype);
6548 if (mask_vec)
6550 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6552 gassign *conv_stmt
6553 = gimple_build_assign (make_ssa_name (vectype),
6554 build1 (VIEW_CONVERT_EXPR,
6555 vectype, second_def));
6556 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6557 second_def = gimple_assign_lhs (conv_stmt);
6559 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6560 first_def, second_def,
6561 mask_vec);
6563 else
6564 /* We need a copy here in case the def was external. */
6565 perm_stmt = gimple_build_assign (perm_dest, first_def);
6566 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6567 /* Store the vector statement in NODE. */
6568 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6571 /* Vectorize the SLP permutations in NODE as specified
6572 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6573 child number and lane number.
6574 Interleaving of two two-lane two-child SLP subtrees (not supported):
6575 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6576 A blend of two four-lane two-child SLP subtrees:
6577 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6578 Highpart of a four-lane one-child SLP subtree (not supported):
6579 [ { 0, 2 }, { 0, 3 } ]
6580 Where currently only a subset is supported by code generating below. */
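/* As a worked example (a sketch, not additional sources): for the supported
   blend above, [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ], with 4-lane
   children and 4-element vectors the code below ends up with child 0 as the
   first input and child 1 as the second, and builds the permute mask
   { 0, 1 + 4, 2, 3 + 4 } = { 0, 5, 2, 7 }, selecting lanes 0 and 2 from the
   first vector and lanes 1 and 3 from the second.  */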
6582 static bool
6583 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6584 slp_tree node, stmt_vector_for_cost *cost_vec)
6586 tree vectype = SLP_TREE_VECTYPE (node);
6588   /* ??? We currently only support identical vector input and output types,
6589 while the SLP IL should really do a concat + select and thus accept
6590 arbitrary mismatches. */
6591 slp_tree child;
6592 unsigned i;
6593 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6594 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6595 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6597 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6598 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6600 if (dump_enabled_p ())
6601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6602 "Unsupported lane permutation\n");
6603 return false;
6605 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6606 repeating_p = false;
6609 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6610 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6611 if (dump_enabled_p ())
6613 dump_printf_loc (MSG_NOTE, vect_location,
6614 "vectorizing permutation");
6615 for (unsigned i = 0; i < perm.length (); ++i)
6616 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6617 if (repeating_p)
6618 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6619 dump_printf (MSG_NOTE, "\n");
6622 /* REPEATING_P is true if every output vector is guaranteed to use the
6623 same permute vector. We can handle that case for both variable-length
6624 and constant-length vectors, but we only handle other cases for
6625 constant-length vectors.
6627 Set:
6629 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6630 mask vector that we want to build.
6632 - NCOPIES to the number of copies of PERM that we need in order
6633 to build the necessary permute mask vectors.
6635 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6636 for each permute mask vector. This is only relevant when GSI is
6637 nonnull. */
6638 uint64_t npatterns;
6639 unsigned nelts_per_pattern;
6640 uint64_t ncopies;
6641 unsigned noutputs_per_mask;
6642 if (repeating_p)
6644 /* We need a single permute mask vector that has the form:
6646 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6648 In other words, the original n-element permute in PERM is
6649 "unrolled" to fill a full vector. The stepped vector encoding
6650 that we use for permutes requires 3n elements. */
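/* For instance (illustrative only): a two-lane node whose permute swaps the
   lanes of its single child would use the encoded mask { 1, 0, 3, 2, 5, 4 }
   (NPATTERNS == 2, NELTS_PER_PATTERN == 3), which the stepped encoding
   extends to { 1, 0, 3, 2, 5, 4, 7, 6, ... } for any number of units.  */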
6651 npatterns = SLP_TREE_LANES (node);
6652 nelts_per_pattern = ncopies = 3;
6653 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6655 else
6657 /* Calculate every element of every permute mask vector explicitly,
6658 instead of relying on the pattern described above. */
6659 if (!nunits.is_constant (&npatterns))
6660 return false;
6661 nelts_per_pattern = ncopies = 1;
6662 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6663 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6664 return false;
6665 noutputs_per_mask = 1;
6667 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6668 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6670 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6671 from the { SLP operand, scalar lane } permutation as recorded in the
6672 SLP node as intermediate step. This part should already work
6673 with SLP children with arbitrary number of lanes. */
6674 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6675 auto_vec<unsigned> active_lane;
6676 vperm.create (olanes);
6677 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6678 for (unsigned i = 0; i < ncopies; ++i)
6680 for (unsigned pi = 0; pi < perm.length (); ++pi)
6682 std::pair<unsigned, unsigned> p = perm[pi];
6683 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6684 if (repeating_p)
6685 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6686 else
6688 /* We checked above that the vectors are constant-length. */
6689 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6690 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6691 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6692 vperm.quick_push ({{p.first, vi}, vl});
6695 /* Advance to the next group. */
6696 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6697 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6700 if (dump_enabled_p ())
6702 dump_printf_loc (MSG_NOTE, vect_location, "as");
6703 for (unsigned i = 0; i < vperm.length (); ++i)
6705 if (i != 0
6706 && (repeating_p
6707 ? multiple_p (i, npatterns)
6708 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6709 dump_printf (MSG_NOTE, ",");
6710 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6711 vperm[i].first.first, vperm[i].first.second,
6712 vperm[i].second);
6714 dump_printf (MSG_NOTE, "\n");
6717 /* We can only handle two-vector permutes, everything else should
6718 be lowered on the SLP level. The following is closely inspired
6719 by vect_transform_slp_perm_load and is supposed to eventually
6720 replace it.
6721 ??? As intermediate step do code-gen in the SLP tree representation
6722 somehow? */
6723 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6724 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6725 unsigned int index = 0;
6726 poly_uint64 mask_element;
6727 vec_perm_builder mask;
6728 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6729 unsigned int count = mask.encoded_nelts ();
6730 mask.quick_grow (count);
6731 vec_perm_indices indices;
6732 unsigned nperms = 0;
6733 for (unsigned i = 0; i < vperm.length (); ++i)
6735 mask_element = vperm[i].second;
6736 if (first_vec.first == -1U
6737 || first_vec == vperm[i].first)
6738 first_vec = vperm[i].first;
6739 else if (second_vec.first == -1U
6740 || second_vec == vperm[i].first)
6742 second_vec = vperm[i].first;
6743 mask_element += nunits;
6745 else
6747 if (dump_enabled_p ())
6748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6749 "permutation requires at "
6750 "least three vectors\n");
6751 gcc_assert (!gsi);
6752 return false;
6755 mask[index++] = mask_element;
6757 if (index == count)
6759 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
6760 bool identity_p = indices.series_p (0, 1, 0, 1);
6761 if (!identity_p
6762 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6764 if (dump_enabled_p ())
6766 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6767 vect_location,
6768 "unsupported vect permute { ");
6769 for (i = 0; i < count; ++i)
6771 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6772 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6774 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6776 gcc_assert (!gsi);
6777 return false;
6780 if (!identity_p)
6781 nperms++;
6782 if (gsi)
6784 if (second_vec.first == -1U)
6785 second_vec = first_vec;
6787 slp_tree
6788 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
6789 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
6791 tree mask_vec = NULL_TREE;
6792 if (!identity_p)
6793 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6795 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
6797 tree first_def
6798 = vect_get_slp_vect_def (first_node,
6799 first_vec.second + vi);
6800 tree second_def
6801 = vect_get_slp_vect_def (second_node,
6802 second_vec.second + vi);
6803 vect_add_slp_permutation (vinfo, gsi, node, first_def,
6804 second_def, mask_vec);
6808 index = 0;
6809 first_vec = std::make_pair (-1U, -1U);
6810 second_vec = std::make_pair (-1U, -1U);
6814 if (!gsi)
6815 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6817 return true;
6820 /* Vectorize SLP NODE. */
6822 static void
6823 vect_schedule_slp_node (vec_info *vinfo,
6824 slp_tree node, slp_instance instance)
6826 gimple_stmt_iterator si;
6827 int i;
6828 slp_tree child;
6830 /* For existing vectors there's nothing to do. */
6831 if (SLP_TREE_VEC_DEFS (node).exists ())
6832 return;
6834 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
6836 /* Vectorize externals and constants. */
6837 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6838 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6840 /* ??? vectorizable_shift can end up using a scalar operand which is
6841 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6842 node in this case. */
6843 if (!SLP_TREE_VECTYPE (node))
6844 return;
6846 vect_create_constant_vectors (vinfo, node);
6847 return;
6850 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6852 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6853 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6855 if (dump_enabled_p ())
6856 dump_printf_loc (MSG_NOTE, vect_location,
6857 "------>vectorizing SLP node starting from: %G",
6858 stmt_info->stmt);
6860 if (STMT_VINFO_DATA_REF (stmt_info)
6861 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6863       /* Vectorized loads go before the first scalar load to make it
6864 	 ready early, while vectorized stores go before the last scalar
6865 stmt which is where all uses are ready. */
6866 stmt_vec_info last_stmt_info = NULL;
6867 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6868 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6869 else /* DR_IS_WRITE */
6870 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6871 si = gsi_for_stmt (last_stmt_info->stmt);
6873 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6874 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6875 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6876 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6878 /* For PHI node vectorization we do not use the insertion iterator. */
6879 si = gsi_none ();
6881 else
6883 /* Emit other stmts after the children vectorized defs which is
6884 earliest possible. */
6885 gimple *last_stmt = NULL;
6886 bool seen_vector_def = false;
6887 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6888 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6890 /* For fold-left reductions we are retaining the scalar
6891 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6892 set so the representation isn't perfect. Resort to the
6893 last scalar def here. */
6894 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6896 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6897 == cycle_phi_info_type);
6898 gphi *phi = as_a <gphi *>
6899 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6900 if (!last_stmt
6901 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6902 last_stmt = phi;
6904 /* We are emitting all vectorized stmts in the same place and
6905 the last one is the last.
6906 ??? Unless we have a load permutation applied and that
6907 figures to re-use an earlier generated load. */
6908 unsigned j;
6909 gimple *vstmt;
6910 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6911 if (!last_stmt
6912 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6913 last_stmt = vstmt;
6915 else if (!SLP_TREE_VECTYPE (child))
6917 	    /* For externals we use the unvectorized scalar defs, so look at all of them.  */
6918 unsigned j;
6919 tree def;
6920 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6921 if (TREE_CODE (def) == SSA_NAME
6922 && !SSA_NAME_IS_DEFAULT_DEF (def))
6924 gimple *stmt = SSA_NAME_DEF_STMT (def);
6925 if (!last_stmt
6926 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6927 last_stmt = stmt;
6930 else
6932 /* For externals we have to look at all defs since their
6933 insertion place is decided per vector. But beware
6934 of pre-existing vectors where we need to make sure
6935 we do not insert before the region boundary. */
6936 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6937 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6938 seen_vector_def = true;
6939 else
6941 unsigned j;
6942 tree vdef;
6943 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6944 if (TREE_CODE (vdef) == SSA_NAME
6945 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6947 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6948 if (!last_stmt
6949 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6950 last_stmt = vstmt;
6954 /* This can happen when all children are pre-existing vectors or
6955 constants. */
6956 if (!last_stmt)
6957 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6958 if (!last_stmt)
6960 gcc_assert (seen_vector_def);
6961 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6963 else if (is_a <gphi *> (last_stmt))
6964 si = gsi_after_labels (gimple_bb (last_stmt));
6965 else
6967 si = gsi_for_stmt (last_stmt);
6968 gsi_next (&si);
6972 bool done_p = false;
6974 /* Handle purely internal nodes. */
6975 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6977       /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
6978 be shared with different SLP nodes (but usually it's the same
6979 operation apart from the case the stmt is only there for denoting
6980 the actual scalar lane defs ...). So do not call vect_transform_stmt
6981 but open-code it here (partly). */
6982 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6983 gcc_assert (done);
6984 done_p = true;
6986 if (!done_p)
6987 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6990 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6991 For loop vectorization this is done in vectorizable_call, but for SLP
6992    it needs to be deferred until the end of vect_schedule_slp, because multiple
6993 SLP instances may refer to the same scalar stmt. */
6995 static void
6996 vect_remove_slp_scalar_calls (vec_info *vinfo,
6997 slp_tree node, hash_set<slp_tree> &visited)
6999 gimple *new_stmt;
7000 gimple_stmt_iterator gsi;
7001 int i;
7002 slp_tree child;
7003 tree lhs;
7004 stmt_vec_info stmt_info;
7006 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7007 return;
7009 if (visited.add (node))
7010 return;
7012 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7013 vect_remove_slp_scalar_calls (vinfo, child, visited);
7015 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7017 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7018 if (!stmt || gimple_bb (stmt) == NULL)
7019 continue;
7020 if (is_pattern_stmt_p (stmt_info)
7021 || !PURE_SLP_STMT (stmt_info))
7022 continue;
7023 lhs = gimple_call_lhs (stmt);
7024 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7025 gsi = gsi_for_stmt (stmt);
7026 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7027 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7031 static void
7032 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7034 hash_set<slp_tree> visited;
7035 vect_remove_slp_scalar_calls (vinfo, node, visited);
7038 /* Vectorize the instance root. */
7040 void
7041 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7043 gassign *rstmt = NULL;
7045 if (instance->kind == slp_inst_kind_ctor)
7047 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7049 gimple *child_stmt;
7050 int j;
7052 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7054 tree vect_lhs = gimple_get_lhs (child_stmt);
7055 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7056 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7057 TREE_TYPE (vect_lhs)))
7058 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7059 vect_lhs);
7060 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7061 break;
7064 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7066 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7067 gimple *child_stmt;
7068 int j;
7069 vec<constructor_elt, va_gc> *v;
7070 vec_alloc (v, nelts);
7072 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7073 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7074 gimple_get_lhs (child_stmt));
7075 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7076 tree rtype
7077 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7078 tree r_constructor = build_constructor (rtype, v);
7079 rstmt = gimple_build_assign (lhs, r_constructor);
7082 else if (instance->kind == slp_inst_kind_bb_reduc)
7084 /* Largely inspired by reduction chain epilogue handling in
7085 vect_create_epilog_for_reduction. */
7086 vec<tree> vec_defs = vNULL;
7087 vect_get_slp_defs (node, &vec_defs);
7088 enum tree_code reduc_code
7089 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7090 /* ??? We actually have to reflect signs somewhere. */
7091 if (reduc_code == MINUS_EXPR)
7092 reduc_code = PLUS_EXPR;
7093 gimple_seq epilogue = NULL;
7094       /* We may end up with more than one vector result; reduce them
7095 to one vector. */
7096 tree vec_def = vec_defs[0];
7097 for (unsigned i = 1; i < vec_defs.length (); ++i)
7098 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7099 vec_def, vec_defs[i]);
7100 vec_defs.release ();
7101 /* ??? Support other schemes than direct internal fn. */
7102 internal_fn reduc_fn;
7103 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7104 || reduc_fn == IFN_LAST)
7105 gcc_unreachable ();
7106 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7107 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7109 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7110 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7111 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7112 update_stmt (gsi_stmt (rgsi));
7113 return;
7115 else
7116 gcc_unreachable ();
7118 gcc_assert (rstmt);
7120 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7121 gsi_replace (&rgsi, rstmt, true);
7124 struct slp_scc_info
7126 bool on_stack;
7127 int dfs;
7128 int lowlink;
7131 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
7133 static void
7134 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7135 hash_map<slp_tree, slp_scc_info> &scc_info,
7136 int &maxdfs, vec<slp_tree> &stack)
7138 bool existed_p;
7139 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7140 gcc_assert (!existed_p);
7141 info->dfs = maxdfs;
7142 info->lowlink = maxdfs;
7143 maxdfs++;
7145 /* Leaf. */
7146 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7148 info->on_stack = false;
7149 vect_schedule_slp_node (vinfo, node, instance);
7150 return;
7153 info->on_stack = true;
7154 stack.safe_push (node);
7156 unsigned i;
7157 slp_tree child;
7158 /* DFS recurse. */
7159 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7161 if (!child)
7162 continue;
7163 slp_scc_info *child_info = scc_info.get (child);
7164 if (!child_info)
7166 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7167 /* Recursion might have re-allocated the hash map; re-fetch the entries. */
7168 info = scc_info.get (node);
7169 child_info = scc_info.get (child);
7170 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7172 else if (child_info->on_stack)
7173 info->lowlink = MIN (info->lowlink, child_info->dfs);
7175 if (info->lowlink != info->dfs)
7176 return;
7178 auto_vec<slp_tree, 4> phis_to_fixup;
7180 /* Singleton. */
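/* The node is on top of the stack and forms an SCC by itself, so no
   cycle runs through it and it can be scheduled directly.  */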
7181 if (stack.last () == node)
7183 stack.pop ();
7184 info->on_stack = false;
7185 vect_schedule_slp_node (vinfo, node, instance);
7186 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7187 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7188 phis_to_fixup.quick_push (node);
7190 else
7192 /* SCC. */
7193 int last_idx = stack.length () - 1;
7194 while (stack[last_idx] != node)
7195 last_idx--;
7196 /* We can break the cycle at PHIs that have at least one child
7197 already code generated. Then we could re-start the DFS walk
7198 until all nodes in the SCC are covered (we might have new entries
7199 for only back-reachable nodes). But it's simpler to just
7200 iterate and schedule those that are ready. */
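/* A non-PHI node is ready once none of its children remain on the
   stack; a PHI is ready as soon as one child is already code generated
   (or missing), since its remaining backedge arguments are filled in
   after the whole SCC has been emitted.  */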
7201 unsigned todo = stack.length () - last_idx;
7204 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7206 slp_tree entry = stack[idx];
7207 if (!entry)
7208 continue;
7209 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7210 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7211 bool ready = !phi;
7212 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7213 if (!child)
7215 gcc_assert (phi);
7216 ready = true;
7217 break;
7219 else if (scc_info.get (child)->on_stack)
7221 if (!phi)
7223 ready = false;
7224 break;
7227 else
7229 if (phi)
7231 ready = true;
7232 break;
7235 if (ready)
7237 vect_schedule_slp_node (vinfo, entry, instance);
7238 scc_info.get (entry)->on_stack = false;
7239 stack[idx] = NULL;
7240 todo--;
7241 if (phi)
7242 phis_to_fixup.safe_push (entry);
7246 while (todo != 0);
7248 /* Pop the SCC. */
7249 stack.truncate (last_idx);
7252 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
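/* For every incoming edge add the vector def of the corresponding
   child to each vector PHI, now that all defs in the SCC exist.  */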
7253 slp_tree phi_node;
7254 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7256 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7257 edge_iterator ei;
7258 edge e;
7259 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7261 unsigned dest_idx = e->dest_idx;
7262 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7263 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7264 continue;
7265 /* Simply fill in the argument for this edge on all vector PHIs. */
7266 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7267 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7268 vect_get_slp_vect_def (child, i),
7269 e, gimple_phi_arg_location (phi, dest_idx));
7274 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
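/* This runs in two passes: first each instance's SLP tree is code
   generated via vect_schedule_scc (handling cycles through PHIs and
   any root statement), then scalar stmts made redundant by vectorized
   stores (and, for loops, vectorized calls) are removed.  */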
7276 void
7277 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
7279 slp_instance instance;
7280 unsigned int i;
7282 hash_map<slp_tree, slp_scc_info> scc_info;
7283 int maxdfs = 0;
7284 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7286 slp_tree node = SLP_INSTANCE_TREE (instance);
7287 if (dump_enabled_p ())
7289 dump_printf_loc (MSG_NOTE, vect_location,
7290 "Vectorizing SLP tree:\n");
7291 /* ??? Dump all? */
7292 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7293 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7294 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7295 vect_print_slp_graph (MSG_NOTE, vect_location,
7296 SLP_INSTANCE_TREE (instance));
7298 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
7299 makes a PHI the node breaking the cycle. */
7300 auto_vec<slp_tree> stack;
7301 if (!scc_info.get (node))
7302 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7304 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7305 vectorize_slp_instance_root_stmt (node, instance);
7307 if (dump_enabled_p ())
7308 dump_printf_loc (MSG_NOTE, vect_location,
7309 "vectorizing stmts using SLP.\n");
7312 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7314 slp_tree root = SLP_INSTANCE_TREE (instance);
7315 stmt_vec_info store_info;
7316 unsigned int j;
7318 /* Remove scalar call stmts. Do not do this for basic-block
7319 vectorization as not all uses may be vectorized.
7320 ??? Why should this be necessary? DCE should be able to
7321 remove the stmts itself.
7322 ??? For BB vectorization we could also remove scalar
7323 stmts starting from the SLP tree root if they have no
7324 uses. */
7325 if (is_a <loop_vec_info> (vinfo))
7326 vect_remove_slp_scalar_calls (vinfo, root);
7328 /* Remove the original scalar stmts of vectorized stores. */
7329 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7331 if (!STMT_VINFO_DATA_REF (store_info)
7332 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7333 break;
7335 store_info = vect_orig_stmt (store_info);
7336 /* Free the attached stmt_vec_info and remove the stmt. */
7337 vinfo->remove_stmt (store_info);
7339 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
7340 so that vect_free_slp_tree does not crash later. */
7341 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7342 SLP_TREE_REPRESENTATIVE (root) = NULL;