gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
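/* Memory pool from which all SLP nodes are allocated, plus the head of a
   doubly-linked list of live nodes maintained by the _slp_tree constructor
   and destructor so vect_slp_fini can release whatever is left over.  */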
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
60 void
61 vect_slp_init (void)
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
66 void
67 vect_slp_fini (void)
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
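/* Allocate and free SLP nodes from the dedicated object pool rather than
   the general heap.  */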
75 void *
76 _slp_tree::operator new (size_t n)
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
82 void
83 _slp_tree::operator delete (void *node, size_t n)
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
90 /* Initialize a SLP node. */
92 _slp_tree::_slp_tree ()
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
117 /* Tear down a SLP node. */
119 _slp_tree::~_slp_tree ()
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
140 void
141 vect_free_slp_tree (slp_tree node)
143 int i;
144 slp_tree child;
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
153 /* If the node defines any SLP-only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
163 delete node;
166 /* Return a location suitable for dumps related to the SLP instance. */
168 dump_user_location_t
169 _slp_instance::location () const
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
178 /* Free the memory allocated for the SLP instance. */
180 void
181 vect_free_slp_instance (slp_instance instance)
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
192 /* Create an SLP node for operation CODE with room for NOPS children. */
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
204 /* Create an SLP node for SCALAR_STMTS. */
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
226 /* Create an SLP node for OPS. */
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
237 /* Create an SLP node for OPS. */
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
242 return vect_create_new_slp_node (new _slp_tree, ops);
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
285 return oprnds_info;
289 /* Free operands info. */
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
294 int i;
295 slp_oprnd_info oprnd_info;
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
304 oprnds_info.release ();
308 /* Return true if STMTS contains a pattern statement. */
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
330 /* Pre-existing vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
342 return true;
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
367 while (next_stmt_info);
369 return -1;
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull). */
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
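/* SEL1 becomes an interleave-low selector {0, nelts, 1, nelts + 1, ...}
   and SEL2 an interleave-high selector {half_nelts, half_nelts + nelts,
   half_nelts + 1, ...}.  */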
414 for (unsigned int i = 0; i < 3; ++i)
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
426 if (nvectors_out)
427 *nvectors_out = nvectors;
428 if (vector_type_out)
429 *vector_type_out = vector_type;
430 if (permutes)
432 permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 indices1);
434 permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 indices2);
437 return true;
441 if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 return false;
443 nvectors *= 2;
447 /* Return true if DTA and DTB match. */
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
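/* Operand maps used by vect_get_operand_map below.  The first entry is the
   number of SLP children, followed by the stmt/call argument index each
   child is taken from; for example arg1_arg4_map describes a node with two
   children taken from call arguments 1 and 4.  The three cond_expr_maps
   variants correspond to the unswapped COND_EXPR, to swapped comparison
   operands and to swapped then/else arms.  */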
457 static const int cond_expr_maps[3][5] = {
458 { 4, -1, -2, 1, 2 },
459 { 4, -2, -1, 1, 2 },
460 { 4, -1, -2, 2, 1 }
462 static const int arg1_map[] = { 1, 1 };
463 static const int arg2_map[] = { 1, 2 };
464 static const int arg1_arg4_map[] = { 2, 1, 4 };
466 /* For most SLP statements, there is a one-to-one mapping between
467 gimple arguments and child nodes. If that is not true for STMT,
468 return an array that contains:
470 - the number of child nodes, followed by
471 - for each child node, the index of the argument associated with that node.
472 The special index -1 is the first operand of an embedded comparison and
473 the special index -2 is the second operand of an embedded comparison.
475 SWAP is as for vect_get_and_check_slp_defs. */
477 static const int *
478 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
480 if (auto assign = dyn_cast<const gassign *> (stmt))
482 if (gimple_assign_rhs_code (assign) == COND_EXPR
483 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
484 return cond_expr_maps[swap];
486 gcc_assert (!swap);
487 if (auto call = dyn_cast<const gcall *> (stmt))
489 if (gimple_call_internal_p (call))
490 switch (gimple_call_internal_fn (call))
492 case IFN_MASK_LOAD:
493 return arg2_map;
495 case IFN_GATHER_LOAD:
496 return arg1_map;
498 case IFN_MASK_GATHER_LOAD:
499 return arg1_arg4_map;
501 default:
502 break;
505 return nullptr;
508 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
509 they are of a valid type and that they match the defs of the first stmt of
510 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
511 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
512 indicates swap is required for cond_expr stmts. Specifically, SWAP
513 is 1 if STMT is cond and operands of comparison need to be swapped;
514 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
516 If there was a fatal error return -1; if the error could be corrected by
517 swapping operands of the parent node of this one, return 1; if everything is
518 ok return 0. */
519 static int
520 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
521 bool *skip_args,
522 vec<stmt_vec_info> stmts, unsigned stmt_num,
523 vec<slp_oprnd_info> *oprnds_info)
525 stmt_vec_info stmt_info = stmts[stmt_num];
526 tree oprnd;
527 unsigned int i, number_of_oprnds;
528 enum vect_def_type dt = vect_uninitialized_def;
529 slp_oprnd_info oprnd_info;
530 unsigned int commutative_op = -1U;
531 bool first = stmt_num == 0;
533 if (!is_a<gcall *> (stmt_info->stmt)
534 && !is_a<gassign *> (stmt_info->stmt)
535 && !is_a<gphi *> (stmt_info->stmt))
536 return -1;
538 number_of_oprnds = gimple_num_args (stmt_info->stmt);
539 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
540 if (map)
541 number_of_oprnds = *map++;
542 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
544 if (gimple_call_internal_p (stmt))
546 internal_fn ifn = gimple_call_internal_fn (stmt);
547 commutative_op = first_commutative_argument (ifn);
550 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
552 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
553 commutative_op = 0;
556 bool swapped = (swap != 0);
557 bool backedge = false;
558 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
559 for (i = 0; i < number_of_oprnds; i++)
561 int opno = map ? map[i] : int (i);
562 if (opno < 0)
563 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
564 else
566 oprnd = gimple_arg (stmt_info->stmt, opno);
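/* For PHIs, treat the argument as coming over a backedge when the source
   block of its incoming edge is dominated by the PHI's block.  */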
567 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
568 backedge = dominated_by_p (CDI_DOMINATORS,
569 gimple_phi_arg_edge (stmt, opno)->src,
570 gimple_bb (stmt_info->stmt));
572 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
573 oprnd = TREE_OPERAND (oprnd, 0);
575 oprnd_info = (*oprnds_info)[i];
577 stmt_vec_info def_stmt_info;
578 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582 "Build SLP failed: can't analyze def for %T\n",
583 oprnd);
585 return -1;
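/* Operands flagged in SKIP_ARGS only get placeholder entries so the
   per-lane def_stmts and ops vectors stay aligned.  */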
588 if (skip_args[i])
590 oprnd_info->def_stmts.quick_push (NULL);
591 oprnd_info->ops.quick_push (NULL_TREE);
592 oprnd_info->first_dt = vect_uninitialized_def;
593 continue;
596 oprnd_info->def_stmts.quick_push (def_stmt_info);
597 oprnd_info->ops.quick_push (oprnd);
599 if (def_stmt_info
600 && is_pattern_stmt_p (def_stmt_info))
602 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
603 != def_stmt_info)
604 oprnd_info->any_pattern = true;
605 else
606 /* If we promote this to external, use the original stmt def. */
607 oprnd_info->ops.last ()
608 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
611 /* If there's an extern def on a backedge make sure we can
612 code-generate at the region start.
613 ??? This is another case that could be fixed by adjusting
614 how we split the function but at the moment we'd have conflicting
615 goals there. */
616 if (backedge
617 && dts[i] == vect_external_def
618 && is_a <bb_vec_info> (vinfo)
619 && TREE_CODE (oprnd) == SSA_NAME
620 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
621 && !dominated_by_p (CDI_DOMINATORS,
622 as_a <bb_vec_info> (vinfo)->bbs[0],
623 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "Build SLP failed: extern def %T only defined "
628 "on backedge\n", oprnd);
629 return -1;
632 if (first)
634 tree type = TREE_TYPE (oprnd);
635 dt = dts[i];
636 if ((dt == vect_constant_def
637 || dt == vect_external_def)
638 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
639 && (TREE_CODE (type) == BOOLEAN_TYPE
640 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
641 type)))
643 if (dump_enabled_p ())
644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 "Build SLP failed: invalid type of def "
646 "for variable-length SLP %T\n", oprnd);
647 return -1;
650 /* For the swapping logic below force vect_reduction_def
651 for the reduction op in a SLP reduction group. */
652 if (!STMT_VINFO_DATA_REF (stmt_info)
653 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
654 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
655 && def_stmt_info)
656 dts[i] = dt = vect_reduction_def;
658 /* Check the types of the definition. */
659 switch (dt)
661 case vect_external_def:
662 case vect_constant_def:
663 case vect_internal_def:
664 case vect_reduction_def:
665 case vect_induction_def:
666 case vect_nested_cycle:
667 break;
669 default:
670 /* FORNOW: Not supported. */
671 if (dump_enabled_p ())
672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
673 "Build SLP failed: illegal type of def %T\n",
674 oprnd);
675 return -1;
678 oprnd_info->first_dt = dt;
679 oprnd_info->first_op_type = type;
682 if (first)
683 return 0;
685 /* Now match the operand definition types to that of the first stmt. */
686 for (i = 0; i < number_of_oprnds;)
688 if (skip_args[i])
690 ++i;
691 continue;
694 oprnd_info = (*oprnds_info)[i];
695 dt = dts[i];
696 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
697 oprnd = oprnd_info->ops[stmt_num];
698 tree type = TREE_TYPE (oprnd);
700 if (!types_compatible_p (oprnd_info->first_op_type, type))
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Build SLP failed: different operand types\n");
705 return 1;
708 /* Not first stmt of the group, check that the def-stmt/s match
709 the def-stmt/s of the first stmt. Allow different definition
710 types for reduction chains: the first stmt must be a
711 vect_reduction_def (a phi node), and the rest
712 end in the reduction chain. */
713 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
714 && !(oprnd_info->first_dt == vect_reduction_def
715 && !STMT_VINFO_DATA_REF (stmt_info)
716 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
717 && def_stmt_info
718 && !STMT_VINFO_DATA_REF (def_stmt_info)
719 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
720 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
721 || (!STMT_VINFO_DATA_REF (stmt_info)
722 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
723 && ((!def_stmt_info
724 || STMT_VINFO_DATA_REF (def_stmt_info)
725 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
726 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
727 != (oprnd_info->first_dt != vect_reduction_def))))
729 /* Try swapping operands if we got a mismatch. For BB
730 vectorization only in case it will clearly improve things. */
731 if (i == commutative_op && !swapped
732 && (!is_a <bb_vec_info> (vinfo)
733 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
734 dts[i+1])
735 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
736 || vect_def_types_match
737 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "trying swapped operands\n");
742 std::swap (dts[i], dts[i+1]);
743 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
744 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
745 std::swap ((*oprnds_info)[i]->ops[stmt_num],
746 (*oprnds_info)[i+1]->ops[stmt_num]);
747 swapped = true;
748 continue;
751 if (is_a <bb_vec_info> (vinfo)
752 && !oprnd_info->any_pattern)
754 /* Now for commutative ops we should see whether we can
755 make the other operand match. */
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "treating operand as external\n");
759 oprnd_info->first_dt = dt = vect_external_def;
761 else
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 "Build SLP failed: different types\n");
766 return 1;
770 /* Make sure to demote the overall operand to external. */
771 if (dt == vect_external_def)
772 oprnd_info->first_dt = vect_external_def;
773 /* For a SLP reduction chain we want to duplicate the reduction to
774 each of the chain members. That gets us a sane SLP graph (still
775 the stmts are not 100% correct wrt the initial values). */
776 else if ((dt == vect_internal_def
777 || dt == vect_reduction_def)
778 && oprnd_info->first_dt == vect_reduction_def
779 && !STMT_VINFO_DATA_REF (stmt_info)
780 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
781 && !STMT_VINFO_DATA_REF (def_stmt_info)
782 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
783 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
785 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
786 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
789 ++i;
792 /* Swap operands. */
793 if (swapped)
795 if (dump_enabled_p ())
796 dump_printf_loc (MSG_NOTE, vect_location,
797 "swapped operands to match def types in %G",
798 stmt_info->stmt);
801 return 0;
804 /* Return true if call statements CALL1 and CALL2 are similar enough
805 to be combined into the same SLP group. */
807 bool
808 compatible_calls_p (gcall *call1, gcall *call2)
810 unsigned int nargs = gimple_call_num_args (call1);
811 if (nargs != gimple_call_num_args (call2))
812 return false;
814 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
815 return false;
817 if (gimple_call_internal_p (call1))
819 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
820 TREE_TYPE (gimple_call_lhs (call2))))
821 return false;
822 for (unsigned int i = 0; i < nargs; ++i)
823 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
824 TREE_TYPE (gimple_call_arg (call2, i))))
825 return false;
827 else
829 if (!operand_equal_p (gimple_call_fn (call1),
830 gimple_call_fn (call2), 0))
831 return false;
833 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
834 return false;
837 /* Check that any unvectorized arguments are equal. */
838 if (const int *map = vect_get_operand_map (call1))
840 unsigned int nkept = *map++;
841 unsigned int mapi = 0;
842 for (unsigned int i = 0; i < nargs; ++i)
843 if (mapi < nkept && map[mapi] == int (i))
844 mapi += 1;
845 else if (!operand_equal_p (gimple_call_arg (call1, i),
846 gimple_call_arg (call2, i)))
847 return false;
850 return true;
853 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
854 caller's attempt to find the vector type in STMT_INFO with the narrowest
855 element type. Return true if VECTYPE is nonnull and if it is valid
856 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
857 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
858 vect_build_slp_tree. */
860 static bool
861 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
862 unsigned int group_size,
863 tree vectype, poly_uint64 *max_nunits)
865 if (!vectype)
867 if (dump_enabled_p ())
868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
869 "Build SLP failed: unsupported data-type in %G\n",
870 stmt_info->stmt);
871 /* Fatal mismatch. */
872 return false;
875 /* If populating the vector type requires unrolling then fail
876 before adjusting *max_nunits for basic-block vectorization. */
877 if (is_a <bb_vec_info> (vinfo)
878 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 "Build SLP failed: unrolling required "
883 "in basic block SLP\n");
884 /* Fatal mismatch. */
885 return false;
888 /* In case of multiple types we need to detect the smallest type. */
889 vect_update_max_nunits (max_nunits, vectype);
890 return true;
893 /* Verify whether the scalar stmts STMTS are isomorphic, whether they require
894 data permutation and whether they use unsupported types of operation. Return
895 true if they are isomorphic, otherwise return false and indicate in *MATCHES
896 which stmts are not isomorphic to the first one. If MATCHES[0]
897 is false then this indicates the comparison could not be
898 carried out or the stmts will never be vectorized by SLP.
900 Note COND_EXPR is possibly isomorphic to another one after swapping its
901 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
902 the first stmt by swapping the two operands of comparison; set SWAP[i]
903 to 2 if stmt I is isomorphic to the first stmt by inverting the code
904 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
905 to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1. */
907 static bool
908 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
909 vec<stmt_vec_info> stmts, unsigned int group_size,
910 poly_uint64 *max_nunits, bool *matches,
911 bool *two_operators, tree *node_vectype)
913 unsigned int i;
914 stmt_vec_info first_stmt_info = stmts[0];
915 code_helper first_stmt_code = ERROR_MARK;
916 code_helper alt_stmt_code = ERROR_MARK;
917 code_helper rhs_code = ERROR_MARK;
918 code_helper first_cond_code = ERROR_MARK;
919 tree lhs;
920 bool need_same_oprnds = false;
921 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
922 stmt_vec_info first_load = NULL, prev_first_load = NULL;
923 bool first_stmt_load_p = false, load_p = false;
924 bool first_stmt_phi_p = false, phi_p = false;
925 bool maybe_soft_fail = false;
926 tree soft_fail_nunits_vectype = NULL_TREE;
928 /* For every stmt in NODE find its def stmt/s. */
929 stmt_vec_info stmt_info;
930 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
932 gimple *stmt = stmt_info->stmt;
933 swap[i] = 0;
934 matches[i] = false;
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
939 /* Fail to vectorize statements marked as unvectorizable, statements
940 that may throw internally or that have volatile operands. */
941 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
942 || stmt_can_throw_internal (cfun, stmt)
943 || gimple_has_volatile_ops (stmt))
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "Build SLP failed: unvectorizable statement %G",
948 stmt);
949 /* ??? For BB vectorization we want to commute operands in a way
950 to shuffle all unvectorizable defs into one operand and have
951 the other still vectorized. The following doesn't reliably
952 work for this, but it's the easiest we can do here. */
953 if (is_a <bb_vec_info> (vinfo) && i != 0)
954 continue;
955 /* Fatal mismatch. */
956 matches[0] = false;
957 return false;
960 lhs = gimple_get_lhs (stmt);
961 if (lhs == NULL_TREE)
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Build SLP failed: not GIMPLE_ASSIGN nor "
966 "GIMPLE_CALL %G", stmt);
967 if (is_a <bb_vec_info> (vinfo) && i != 0)
968 continue;
969 /* Fatal mismatch. */
970 matches[0] = false;
971 return false;
974 tree nunits_vectype;
975 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
976 &nunits_vectype, group_size))
978 if (is_a <bb_vec_info> (vinfo) && i != 0)
979 continue;
980 /* Fatal mismatch. */
981 matches[0] = false;
982 return false;
984 /* Record nunits required but continue analysis, producing matches[]
985 as if nunits was not an issue. This allows splitting of groups
986 to happen. */
987 if (nunits_vectype
988 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
989 nunits_vectype, max_nunits))
991 gcc_assert (is_a <bb_vec_info> (vinfo));
992 maybe_soft_fail = true;
993 soft_fail_nunits_vectype = nunits_vectype;
996 gcc_assert (vectype);
998 gcall *call_stmt = dyn_cast <gcall *> (stmt);
999 if (call_stmt)
1001 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1002 if (cfn != CFN_LAST)
1003 rhs_code = cfn;
1004 else
1005 rhs_code = CALL_EXPR;
1007 if (cfn == CFN_MASK_LOAD
1008 || cfn == CFN_GATHER_LOAD
1009 || cfn == CFN_MASK_GATHER_LOAD)
1010 load_p = true;
1011 else if ((internal_fn_p (cfn)
1012 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1013 || gimple_call_tail_p (call_stmt)
1014 || gimple_call_noreturn_p (call_stmt)
1015 || gimple_call_chain (call_stmt))
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported call type %G",
1020 call_stmt);
1021 if (is_a <bb_vec_info> (vinfo) && i != 0)
1022 continue;
1023 /* Fatal mismatch. */
1024 matches[0] = false;
1025 return false;
1028 else if (gimple_code (stmt) == GIMPLE_PHI)
1030 rhs_code = ERROR_MARK;
1031 phi_p = true;
1033 else
1035 rhs_code = gimple_assign_rhs_code (stmt);
1036 load_p = gimple_vuse (stmt);
1039 /* Check the operation. */
1040 if (i == 0)
1042 *node_vectype = vectype;
1043 first_stmt_code = rhs_code;
1044 first_stmt_load_p = load_p;
1045 first_stmt_phi_p = phi_p;
1047 /* Shift arguments should be equal in all the packed stmts for a
1048 vector shift with a scalar shift operand. */
1049 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1050 || rhs_code == LROTATE_EXPR
1051 || rhs_code == RROTATE_EXPR)
1053 /* First see if we have a vector/vector shift. */
1054 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1056 /* No vector/vector shift, try for a vector/scalar shift. */
1057 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1059 if (dump_enabled_p ())
1060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 "Build SLP failed: "
1062 "op not supported by target.\n");
1063 if (is_a <bb_vec_info> (vinfo) && i != 0)
1064 continue;
1065 /* Fatal mismatch. */
1066 matches[0] = false;
1067 return false;
1069 need_same_oprnds = true;
1070 first_op1 = gimple_assign_rhs2 (stmt);
1073 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1075 need_same_oprnds = true;
1076 first_op1 = gimple_assign_rhs2 (stmt);
1078 else if (!load_p
1079 && rhs_code == BIT_FIELD_REF)
1081 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1082 if (!is_a <bb_vec_info> (vinfo)
1083 || TREE_CODE (vec) != SSA_NAME
1084 || !operand_equal_p (TYPE_SIZE (vectype),
1085 TYPE_SIZE (TREE_TYPE (vec))))
1087 if (dump_enabled_p ())
1088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089 "Build SLP failed: "
1090 "BIT_FIELD_REF not supported\n");
1091 /* Fatal mismatch. */
1092 matches[0] = false;
1093 return false;
1096 else if (rhs_code == CFN_DIV_POW2)
1098 need_same_oprnds = true;
1099 first_op1 = gimple_call_arg (call_stmt, 1);
1102 else
1104 if (first_stmt_code != rhs_code
1105 && alt_stmt_code == ERROR_MARK)
1106 alt_stmt_code = rhs_code;
1107 if ((first_stmt_code != rhs_code
1108 && (first_stmt_code != IMAGPART_EXPR
1109 || rhs_code != REALPART_EXPR)
1110 && (first_stmt_code != REALPART_EXPR
1111 || rhs_code != IMAGPART_EXPR)
1112 /* Handle mismatches in plus/minus by computing both
1113 and merging the results. */
1114 && !((first_stmt_code == PLUS_EXPR
1115 || first_stmt_code == MINUS_EXPR)
1116 && (alt_stmt_code == PLUS_EXPR
1117 || alt_stmt_code == MINUS_EXPR)
1118 && rhs_code == alt_stmt_code)
1119 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120 && (first_stmt_code == ARRAY_REF
1121 || first_stmt_code == BIT_FIELD_REF
1122 || first_stmt_code == INDIRECT_REF
1123 || first_stmt_code == COMPONENT_REF
1124 || first_stmt_code == MEM_REF)
1125 && (rhs_code == ARRAY_REF
1126 || rhs_code == BIT_FIELD_REF
1127 || rhs_code == INDIRECT_REF
1128 || rhs_code == COMPONENT_REF
1129 || rhs_code == MEM_REF)))
1130 || first_stmt_load_p != load_p
1131 || first_stmt_phi_p != phi_p)
1133 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 "Build SLP failed: different operation "
1137 "in stmt %G", stmt);
1138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139 "original stmt %G", first_stmt_info->stmt);
1141 /* Mismatch. */
1142 continue;
1145 if (!load_p
1146 && first_stmt_code == BIT_FIELD_REF
1147 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1148 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1150 if (dump_enabled_p ())
1151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 "Build SLP failed: different BIT_FIELD_REF "
1153 "arguments in %G", stmt);
1154 /* Mismatch. */
1155 continue;
1158 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1160 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1161 call_stmt))
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "Build SLP failed: different calls in %G",
1166 stmt);
1167 /* Mismatch. */
1168 continue;
1172 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1173 && (gimple_bb (first_stmt_info->stmt)
1174 != gimple_bb (stmt_info->stmt)))
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "Build SLP failed: different BB for PHI "
1179 "or possibly trapping operation in %G", stmt);
1180 /* Mismatch. */
1181 continue;
1184 if (need_same_oprnds)
1186 tree other_op1 = gimple_arg (stmt, 1);
1187 if (!operand_equal_p (first_op1, other_op1, 0))
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "Build SLP failed: different shift "
1192 "arguments in %G", stmt);
1193 /* Mismatch. */
1194 continue;
1198 if (!types_compatible_p (vectype, *node_vectype))
1200 if (dump_enabled_p ())
1201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 "Build SLP failed: different vector type "
1203 "in %G", stmt);
1204 /* Mismatch. */
1205 continue;
1209 /* Grouped store or load. */
1210 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 if (REFERENCE_CLASS_P (lhs))
1214 /* Store. */
1217 else
1219 /* Load. */
1220 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1221 if (prev_first_load)
1223 /* Check that there are no loads from different interleaving
1224 chains in the same node. */
1225 if (prev_first_load != first_load)
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1229 vect_location,
1230 "Build SLP failed: different "
1231 "interleaving chains in one node %G",
1232 stmt);
1233 /* Mismatch. */
1234 continue;
1237 else
1238 prev_first_load = first_load;
1240 } /* Grouped access. */
1241 else
1243 if (load_p
1244 && rhs_code != CFN_GATHER_LOAD
1245 && rhs_code != CFN_MASK_GATHER_LOAD)
1247 /* Non-grouped load. */
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "Build SLP failed: not grouped load %G", stmt);
1252 /* FORNOW: Non-grouped loads are not supported. */
1253 if (is_a <bb_vec_info> (vinfo) && i != 0)
1254 continue;
1255 /* Fatal mismatch. */
1256 matches[0] = false;
1257 return false;
1260 /* Not a memory operation. */
1261 if (!phi_p
1262 && rhs_code.is_tree_code ()
1263 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1264 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1265 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1266 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1267 && rhs_code != VIEW_CONVERT_EXPR
1268 && rhs_code != CALL_EXPR
1269 && rhs_code != BIT_FIELD_REF)
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273 "Build SLP failed: operation unsupported %G",
1274 stmt);
1275 if (is_a <bb_vec_info> (vinfo) && i != 0)
1276 continue;
1277 /* Fatal mismatch. */
1278 matches[0] = false;
1279 return false;
1282 if (rhs_code == COND_EXPR)
1284 tree cond_expr = gimple_assign_rhs1 (stmt);
1285 enum tree_code cond_code = TREE_CODE (cond_expr);
1286 enum tree_code swap_code = ERROR_MARK;
1287 enum tree_code invert_code = ERROR_MARK;
1289 if (i == 0)
1290 first_cond_code = TREE_CODE (cond_expr);
1291 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1293 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1294 swap_code = swap_tree_comparison (cond_code);
1295 invert_code = invert_tree_comparison (cond_code, honor_nans);
1298 if (first_cond_code == cond_code)
1300 /* Isomorphism can be achieved by swapping. */
1301 else if (first_cond_code == swap_code)
1302 swap[i] = 1;
1303 /* Isomorphism can be achieved by inverting. */
1304 else if (first_cond_code == invert_code)
1305 swap[i] = 2;
1306 else
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 "Build SLP failed: different"
1311 " operation %G", stmt);
1312 /* Mismatch. */
1313 continue;
1318 matches[i] = true;
1321 for (i = 0; i < group_size; ++i)
1322 if (!matches[i])
1323 return false;
1325 /* If we allowed a two-operation SLP node verify the target can cope
1326 with the permute we are going to use. */
1327 if (alt_stmt_code != ERROR_MARK
1328 && (!alt_stmt_code.is_tree_code ()
1329 || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
1331 *two_operators = true;
1334 if (maybe_soft_fail)
1336 unsigned HOST_WIDE_INT const_nunits;
1337 if (!TYPE_VECTOR_SUBPARTS
1338 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1339 || const_nunits > group_size)
1340 matches[0] = false;
1341 else
1343 /* With constant vector elements simulate a mismatch at the
1344 point we need to split. */
1345 unsigned tail = group_size & (const_nunits - 1);
1346 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1348 return false;
1351 return true;
1354 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1355 Note we never remove entries apart from at destruction time, so we do not
1356 need a special value for deleted that differs from empty. */
1357 struct bst_traits
1359 typedef vec <stmt_vec_info> value_type;
1360 typedef vec <stmt_vec_info> compare_type;
1361 static inline hashval_t hash (value_type);
1362 static inline bool equal (value_type existing, value_type candidate);
1363 static inline bool is_empty (value_type x) { return !x.exists (); }
1364 static inline bool is_deleted (value_type x) { return !x.exists (); }
1365 static const bool empty_zero_p = true;
1366 static inline void mark_empty (value_type &x) { x.release (); }
1367 static inline void mark_deleted (value_type &x) { x.release (); }
1368 static inline void remove (value_type &x) { x.release (); }
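/* Hash a stmt vector by the gimple UIDs of its elements; two vectors
   compare equal when they contain the same stmts in the same order.  */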
1370 inline hashval_t
1371 bst_traits::hash (value_type x)
1373 inchash::hash h;
1374 for (unsigned i = 0; i < x.length (); ++i)
1375 h.add_int (gimple_uid (x[i]->stmt));
1376 return h.end ();
1378 inline bool
1379 bst_traits::equal (value_type existing, value_type candidate)
1381 if (existing.length () != candidate.length ())
1382 return false;
1383 for (unsigned i = 0; i < existing.length (); ++i)
1384 if (existing[i] != candidate[i])
1385 return false;
1386 return true;
1389 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1390 but then vec::insert does memmove and that's not compatible with
1391 std::pair. */
1392 struct chain_op_t
1394 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1395 : code (code_), dt (dt_), op (op_) {}
1396 tree_code code;
1397 vect_def_type dt;
1398 tree op;
1401 /* Comparator for sorting associatable chains. */
1403 static int
1404 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1406 auto *op1 = (const chain_op_t *) op1_;
1407 auto *op2 = (const chain_op_t *) op2_;
1408 if (op1->dt != op2->dt)
1409 return (int)op1->dt - (int)op2->dt;
1410 return (int)op1->code - (int)op2->code;
1413 /* Linearize the associatable expression chain at START with the
1414 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1415 filling CHAIN with the result and using WORKLIST as intermediate storage.
1416 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1417 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1418 stmts, starting with START. */
1420 static void
1421 vect_slp_linearize_chain (vec_info *vinfo,
1422 vec<std::pair<tree_code, gimple *> > &worklist,
1423 vec<chain_op_t> &chain,
1424 enum tree_code code, gimple *start,
1425 gimple *&code_stmt, gimple *&alt_code_stmt,
1426 vec<gimple *> *chain_stmts)
1428 /* For each lane linearize the addition/subtraction (or other
1429 uniform associatable operation) expression tree. */
1430 worklist.safe_push (std::make_pair (code, start));
1431 while (!worklist.is_empty ())
1433 auto entry = worklist.pop ();
1434 gassign *stmt = as_a <gassign *> (entry.second);
1435 enum tree_code in_code = entry.first;
1436 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1437 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1438 if (!code_stmt
1439 && gimple_assign_rhs_code (stmt) == code)
1440 code_stmt = stmt;
1441 else if (!alt_code_stmt
1442 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1443 alt_code_stmt = stmt;
1444 if (chain_stmts)
1445 chain_stmts->safe_push (stmt);
1446 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1448 tree op = gimple_op (stmt, opnum);
1449 vect_def_type dt;
1450 stmt_vec_info def_stmt_info;
1451 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1452 gcc_assert (res);
1453 if (dt == vect_internal_def
1454 && is_pattern_stmt_p (def_stmt_info))
1455 op = gimple_get_lhs (def_stmt_info->stmt);
1456 gimple *use_stmt;
1457 use_operand_p use_p;
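/* Continue linearizing into a single-use def with the same code (or the
   mixed PLUS_EXPR/MINUS_EXPR case); otherwise record the operand as a
   leaf of the chain.  */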
1458 if (dt == vect_internal_def
1459 && single_imm_use (op, &use_p, &use_stmt)
1460 && is_gimple_assign (def_stmt_info->stmt)
1461 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1462 || (code == PLUS_EXPR
1463 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1464 == MINUS_EXPR))))
1466 tree_code op_def_code = this_code;
1467 if (op_def_code == MINUS_EXPR && opnum == 1)
1468 op_def_code = PLUS_EXPR;
1469 if (in_code == MINUS_EXPR)
1470 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1471 worklist.safe_push (std::make_pair (op_def_code,
1472 def_stmt_info->stmt));
1474 else
1476 tree_code op_def_code = this_code;
1477 if (op_def_code == MINUS_EXPR && opnum == 1)
1478 op_def_code = PLUS_EXPR;
1479 if (in_code == MINUS_EXPR)
1480 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1481 chain.safe_push (chain_op_t (op_def_code, dt, op));
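/* Map from a vector of scalar stmts to the SLP node discovered for it,
   used to cache (also failed) discovery attempts and to pick up backedge
   destinations during SLP build.  */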
1487 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1488 simple_hashmap_traits <bst_traits, slp_tree> >
1489 scalar_stmts_to_slp_tree_map_t;
1491 static slp_tree
1492 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1493 vec<stmt_vec_info> stmts, unsigned int group_size,
1494 poly_uint64 *max_nunits,
1495 bool *matches, unsigned *limit, unsigned *tree_size,
1496 scalar_stmts_to_slp_tree_map_t *bst_map);
1498 static slp_tree
1499 vect_build_slp_tree (vec_info *vinfo,
1500 vec<stmt_vec_info> stmts, unsigned int group_size,
1501 poly_uint64 *max_nunits,
1502 bool *matches, unsigned *limit, unsigned *tree_size,
1503 scalar_stmts_to_slp_tree_map_t *bst_map)
1505 if (slp_tree *leader = bst_map->get (stmts))
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1509 !(*leader)->failed ? "" : "failed ", *leader);
1510 if (!(*leader)->failed)
1512 SLP_TREE_REF_COUNT (*leader)++;
1513 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1514 stmts.release ();
1515 return *leader;
1517 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1518 return NULL;
1521 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1522 so we can pick up backedge destinations during discovery. */
1523 slp_tree res = new _slp_tree;
1524 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1525 SLP_TREE_SCALAR_STMTS (res) = stmts;
1526 bst_map->put (stmts.copy (), res);
1528 if (*limit == 0)
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_NOTE, vect_location,
1532 "SLP discovery limit exceeded\n");
1533 /* Mark the node invalid so we can detect those when still in use
1534 as backedge destinations. */
1535 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1536 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1537 res->failed = XNEWVEC (bool, group_size);
1538 memset (res->failed, 0, sizeof (bool) * group_size);
1539 memset (matches, 0, sizeof (bool) * group_size);
1540 return NULL;
1542 --*limit;
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_NOTE, vect_location,
1546 "starting SLP discovery for node %p\n", res);
1548 poly_uint64 this_max_nunits = 1;
1549 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1550 &this_max_nunits,
1551 matches, limit, tree_size, bst_map);
1552 if (!res_)
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_NOTE, vect_location,
1556 "SLP discovery for node %p failed\n", res);
1557 /* Mark the node invalid so we can detect those when still in use
1558 as backedge destinations. */
1559 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1560 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1561 res->failed = XNEWVEC (bool, group_size);
1562 if (flag_checking)
1564 unsigned i;
1565 for (i = 0; i < group_size; ++i)
1566 if (!matches[i])
1567 break;
1568 gcc_assert (i < group_size);
1570 memcpy (res->failed, matches, sizeof (bool) * group_size);
1572 else
1574 if (dump_enabled_p ())
1575 dump_printf_loc (MSG_NOTE, vect_location,
1576 "SLP discovery for node %p succeeded\n", res);
1577 gcc_assert (res_ == res);
1578 res->max_nunits = this_max_nunits;
1579 vect_update_max_nunits (max_nunits, this_max_nunits);
1580 /* Keep a reference for the bst_map use. */
1581 SLP_TREE_REF_COUNT (res)++;
1583 return res_;
1586 /* Helper for building an associated SLP node chain. */
1588 static void
1589 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1590 slp_tree op0, slp_tree op1,
1591 stmt_vec_info oper1, stmt_vec_info oper2,
1592 vec<std::pair<unsigned, unsigned> > lperm)
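/* CHILD1 and CHILD2 compute OPER1 resp. OPER2 on all lanes of OP0 and OP1;
   PERM becomes a VEC_PERM_EXPR node that selects, per lane, which of the
   two results to use according to LPERM.  */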
1594 unsigned group_size = SLP_TREE_LANES (op1);
1596 slp_tree child1 = new _slp_tree;
1597 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1598 SLP_TREE_VECTYPE (child1) = vectype;
1599 SLP_TREE_LANES (child1) = group_size;
1600 SLP_TREE_CHILDREN (child1).create (2);
1601 SLP_TREE_CHILDREN (child1).quick_push (op0);
1602 SLP_TREE_CHILDREN (child1).quick_push (op1);
1603 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1605 slp_tree child2 = new _slp_tree;
1606 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1607 SLP_TREE_VECTYPE (child2) = vectype;
1608 SLP_TREE_LANES (child2) = group_size;
1609 SLP_TREE_CHILDREN (child2).create (2);
1610 SLP_TREE_CHILDREN (child2).quick_push (op0);
1611 SLP_TREE_REF_COUNT (op0)++;
1612 SLP_TREE_CHILDREN (child2).quick_push (op1);
1613 SLP_TREE_REF_COUNT (op1)++;
1614 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1616 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1617 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1618 SLP_TREE_VECTYPE (perm) = vectype;
1619 SLP_TREE_LANES (perm) = group_size;
1620 /* ??? We should set this NULL but that's not expected. */
1621 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1622 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1623 SLP_TREE_CHILDREN (perm).quick_push (child1);
1624 SLP_TREE_CHILDREN (perm).quick_push (child2);
1627 /* Recursively build an SLP tree starting from NODE.
1628 Fail (and return a value not equal to zero) if def-stmts are not
1629 isomorphic, require data permutation or are of unsupported types of
1630 operation. Otherwise, return 0.
1631 The value returned is the depth in the SLP tree where a mismatch
1632 was found. */
1634 static slp_tree
1635 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1636 vec<stmt_vec_info> stmts, unsigned int group_size,
1637 poly_uint64 *max_nunits,
1638 bool *matches, unsigned *limit, unsigned *tree_size,
1639 scalar_stmts_to_slp_tree_map_t *bst_map)
1641 unsigned nops, i, this_tree_size = 0;
1642 poly_uint64 this_max_nunits = *max_nunits;
1644 matches[0] = false;
1646 stmt_vec_info stmt_info = stmts[0];
1647 if (!is_a<gcall *> (stmt_info->stmt)
1648 && !is_a<gassign *> (stmt_info->stmt)
1649 && !is_a<gphi *> (stmt_info->stmt))
1650 return NULL;
1652 nops = gimple_num_args (stmt_info->stmt);
1653 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1654 nops = map[0];
1656 /* If the SLP node is a PHI (induction or reduction), terminate
1657 the recursion. */
1658 bool *skip_args = XALLOCAVEC (bool, nops);
1659 memset (skip_args, 0, sizeof (bool) * nops);
1660 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1661 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1663 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1664 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1665 group_size);
1666 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1667 max_nunits))
1668 return NULL;
1670 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1671 if (def_type == vect_induction_def)
1673 /* Induction PHIs are not cycles, but we walk the initial
1674 value. Only for inner loops though, for outer loops
1675 we need to pick up the value from the actual PHIs
1676 to more easily support peeling and epilogue vectorization. */
1677 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 if (!nested_in_vect_loop_p (loop, stmt_info))
1679 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1680 else
1681 loop = loop->inner;
1682 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1684 else if (def_type == vect_reduction_def
1685 || def_type == vect_double_reduction_def
1686 || def_type == vect_nested_cycle)
1688 /* Else def types have to match. */
1689 stmt_vec_info other_info;
1690 bool all_same = true;
1691 FOR_EACH_VEC_ELT (stmts, i, other_info)
1693 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1694 return NULL;
1695 if (other_info != stmt_info)
1696 all_same = false;
1698 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1699 /* Reduction initial values are not explicitly represented. */
1700 if (!nested_in_vect_loop_p (loop, stmt_info))
1701 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1702 /* Reduction chain backedge defs are filled manually.
1703 ??? Need a better way to identify a SLP reduction chain PHI.
1704 Or a better overall way to SLP match those. */
1705 if (all_same && def_type == vect_reduction_def)
1706 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1708 else if (def_type != vect_internal_def)
1709 return NULL;
1713 bool two_operators = false;
1714 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1715 tree vectype = NULL_TREE;
1716 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1717 &this_max_nunits, matches, &two_operators,
1718 &vectype))
1719 return NULL;
1721 /* If the SLP node is a load, terminate the recursion unless masked. */
1722 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1723 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1725 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1726 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1727 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1728 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1729 else
1731 *max_nunits = this_max_nunits;
1732 (*tree_size)++;
1733 node = vect_create_new_slp_node (node, stmts, 0);
1734 SLP_TREE_VECTYPE (node) = vectype;
1735 /* And compute the load permutation. Whether it is actually
1736 a permutation depends on the unrolling factor which is
1737 decided later. */
1738 vec<unsigned> load_permutation;
1739 int j;
1740 stmt_vec_info load_info;
1741 load_permutation.create (group_size);
1742 stmt_vec_info first_stmt_info
1743 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1744 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1746 int load_place = vect_get_place_in_interleaving_chain
1747 (load_info, first_stmt_info);
1748 gcc_assert (load_place != -1);
1749 load_permutation.safe_push (load_place);
1751 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1752 return node;
1755 else if (gimple_assign_single_p (stmt_info->stmt)
1756 && !gimple_vuse (stmt_info->stmt)
1757 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1759 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1760 the same SSA name vector of a type compatible with vectype. */
1761 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1762 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1763 stmt_vec_info estmt_info;
1764 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1766 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1767 tree bfref = gimple_assign_rhs1 (estmt);
1768 HOST_WIDE_INT lane;
1769 if (!known_eq (bit_field_size (bfref),
1770 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1771 || !constant_multiple_p (bit_field_offset (bfref),
1772 bit_field_size (bfref), &lane))
1774 lperm.release ();
1775 matches[0] = false;
1776 return NULL;
1778 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1780 slp_tree vnode = vect_create_new_slp_node (vNULL);
1781 /* ??? We record vectype here but we hide the eventually necessary
1782 punning and instead rely on code generation to materialize
1783 VIEW_CONVERT_EXPRs as necessary. We should instead make
1784 this explicit somehow. */
1785 SLP_TREE_VECTYPE (vnode) = vectype;
1786 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1787 /* We are always building a permutation node even if it is an identity
1788 permute to shield the rest of the vectorizer from the odd node
1789 representing an actual vector without any scalar ops.
1790 ??? We could hide it completely by making the permute node
1791 external? */
1792 node = vect_create_new_slp_node (node, stmts, 1);
1793 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1794 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1795 SLP_TREE_VECTYPE (node) = vectype;
1796 SLP_TREE_CHILDREN (node).quick_push (vnode);
1797 return node;
1799 /* When discovery reaches an associatable operation, see whether we can
1800 improve that to match up lanes in a way superior to the operand
1801 swapping code which at most looks at two defs.
1802 ??? For BB vectorization we cannot do the brute-force search
1803 for matching as we can succeed by means of builds from scalars
1804 and have no good way to "cost" one build against another. */
1805 else if (is_a <loop_vec_info> (vinfo)
1806 /* ??? We don't handle !vect_internal_def defs below. */
1807 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1808 && is_gimple_assign (stmt_info->stmt)
1809 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1810 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1811 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1812 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1813 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1815 /* See if we have a chain of (mixed) adds or subtracts or other
1816 associatable ops. */
1817 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1818 if (code == MINUS_EXPR)
1819 code = PLUS_EXPR;
1820 stmt_vec_info other_op_stmt_info = NULL;
1821 stmt_vec_info op_stmt_info = NULL;
1822 unsigned chain_len = 0;
1823 auto_vec<chain_op_t> chain;
1824 auto_vec<std::pair<tree_code, gimple *> > worklist;
1825 auto_vec<vec<chain_op_t> > chains (group_size);
1826 auto_vec<slp_tree, 4> children;
1827 bool hard_fail = true;
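/* Rough sketch of the approach, with illustrative operand names: each
   lane's stmt is linearized into a chain of (code, def) entries, e.g.
   x0 = a0 - b0 + c0 becomes { +a0, -b0, +c0 }.  The chains are pre-sorted
   by def type and code, a child SLP node is built for each chain position
   across all lanes, and the children are finally combined with binary
   plus/minus nodes, the last of which carries the original scalar stmts.  */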
1828 for (unsigned lane = 0; lane < group_size; ++lane)
1830 /* For each lane linearize the addition/subtraction (or other
1831 uniform associatable operation) expression tree. */
1832 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1833 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1834 stmts[lane]->stmt, op_stmt, other_op_stmt,
1835 NULL);
1836 if (!op_stmt_info && op_stmt)
1837 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1838 if (!other_op_stmt_info && other_op_stmt)
1839 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1840 if (chain.length () == 2)
1842 /* In a chain of just two elements, resort to the regular
1843 operand swapping scheme. If we run into a length
1844 mismatch, still hard-FAIL. */
1845 if (chain_len == 0)
1846 hard_fail = false;
1847 else
1849 matches[lane] = false;
1850 /* ??? We might want to process the other lanes, but
1851 make sure to not give false matching hints to the
1852 caller for lanes we did not process. */
1853 if (lane != group_size - 1)
1854 matches[0] = false;
1856 break;
1858 else if (chain_len == 0)
1859 chain_len = chain.length ();
1860 else if (chain.length () != chain_len)
1862 /* ??? Here we could slip in magic to compensate with
1863 neutral operands. */
1864 matches[lane] = false;
1865 if (lane != group_size - 1)
1866 matches[0] = false;
1867 break;
1869 chains.quick_push (chain.copy ());
1870 chain.truncate (0);
1872 if (chains.length () == group_size)
1874 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1875 if (!op_stmt_info)
1877 hard_fail = false;
1878 goto out;
1880 /* Now we have a set of chains with the same length. */
1881 /* 1. pre-sort according to def_type and operation. */
1882 for (unsigned lane = 0; lane < group_size; ++lane)
1883 chains[lane].stablesort (dt_sort_cmp, vinfo);
1884 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "pre-sorted chains of %s\n",
1888 get_tree_code_name (code));
1889 for (unsigned lane = 0; lane < group_size; ++lane)
1891 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1892 dump_printf (MSG_NOTE, "%s %T ",
1893 get_tree_code_name (chains[lane][opnum].code),
1894 chains[lane][opnum].op);
1895 dump_printf (MSG_NOTE, "\n");
1898 /* 2. try to build children nodes, associating as necessary. */
1899 for (unsigned n = 0; n < chain_len; ++n)
1901 vect_def_type dt = chains[0][n].dt;
1902 unsigned lane;
1903 for (lane = 0; lane < group_size; ++lane)
1904 if (chains[lane][n].dt != dt)
1906 if (dt == vect_constant_def
1907 && chains[lane][n].dt == vect_external_def)
1908 dt = vect_external_def;
1909 else if (dt == vect_external_def
1910 && chains[lane][n].dt == vect_constant_def)
1912 else
1913 break;
1915 if (lane != group_size)
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE, vect_location,
1919 "giving up on chain due to mismatched "
1920 "def types\n");
1921 matches[lane] = false;
1922 if (lane != group_size - 1)
1923 matches[0] = false;
1924 goto out;
1926 if (dt == vect_constant_def
1927 || dt == vect_external_def)
1929 /* Check whether we can build the invariant. If we can't,
1930 we never will be able to. */
1931 tree type = TREE_TYPE (chains[0][n].op);
1932 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1933 && (TREE_CODE (type) == BOOLEAN_TYPE
1934 || !can_duplicate_and_interleave_p (vinfo, group_size,
1935 type)))
1937 matches[0] = false;
1938 goto out;
1940 vec<tree> ops;
1941 ops.create (group_size);
1942 for (lane = 0; lane < group_size; ++lane)
1943 ops.quick_push (chains[lane][n].op);
1944 slp_tree child = vect_create_new_slp_node (ops);
1945 SLP_TREE_DEF_TYPE (child) = dt;
1946 children.safe_push (child);
1948 else if (dt != vect_internal_def)
1950 /* Not sure; we might need something special.
1951 gcc.dg/vect/pr96854.c,
1952 gfortran.dg/vect/fast-math-pr37021.f90
1953 and gfortran.dg/vect/pr61171.f trigger. */
1954 /* Soft-fail for now. */
1955 hard_fail = false;
1956 goto out;
1958 else
1960 vec<stmt_vec_info> op_stmts;
1961 op_stmts.create (group_size);
1962 slp_tree child = NULL;
1963 /* Brute-force our way. We have to consider a lane
1964 failing after we fixed an earlier failure up in the
1965 SLP discovery recursion, so track the current
1966 permute per lane. */
1967 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1968 memset (perms, 0, sizeof (unsigned) * group_size);
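/* For example, if discovery fails because lane 2 mismatches at position n,
   chains[2][n] is swapped with chains[2][n + 1] and discovery is retried;
   perms[2] counts how many alternative entries have been tried so the
   search terminates once lane 2 runs out of candidates.  */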
1971 op_stmts.truncate (0);
1972 for (lane = 0; lane < group_size; ++lane)
1973 op_stmts.quick_push
1974 (vinfo->lookup_def (chains[lane][n].op));
1975 child = vect_build_slp_tree (vinfo, op_stmts,
1976 group_size, &this_max_nunits,
1977 matches, limit,
1978 &this_tree_size, bst_map);
1979 /* ??? We're likely getting too many fatal mismatches
1980 here so maybe we want to ignore them (but then we
1981 have no idea which lanes fatally mismatched). */
1982 if (child || !matches[0])
1983 break;
1984 /* Swap another lane we have not yet matched up into
1985 lanes that did not match. If we run out of
1986 permute possibilities for a lane terminate the
1987 search. */
1988 bool term = false;
1989 for (lane = 1; lane < group_size; ++lane)
1990 if (!matches[lane])
1992 if (n + perms[lane] + 1 == chain_len)
1994 term = true;
1995 break;
1997 std::swap (chains[lane][n],
1998 chains[lane][n + perms[lane] + 1]);
1999 perms[lane]++;
2001 if (term)
2002 break;
2004 while (1);
2005 if (!child)
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_NOTE, vect_location,
2009 "failed to match up op %d\n", n);
2010 op_stmts.release ();
2011 if (lane != group_size - 1)
2012 matches[0] = false;
2013 else
2014 matches[lane] = false;
2015 goto out;
2017 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_NOTE, vect_location,
2020 "matched up op %d to\n", n);
2021 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2023 children.safe_push (child);
2026 /* 3. build SLP nodes to combine the chain. */
2027 for (unsigned lane = 0; lane < group_size; ++lane)
2028 if (chains[lane][0].code != code)
2030 /* See if there's any alternate all-PLUS entry. */
2031 unsigned n;
2032 for (n = 1; n < chain_len; ++n)
2034 for (lane = 0; lane < group_size; ++lane)
2035 if (chains[lane][n].code != code)
2036 break;
2037 if (lane == group_size)
2038 break;
2040 if (n != chain_len)
2042 /* Swap that in at first position. */
2043 std::swap (children[0], children[n]);
2044 for (lane = 0; lane < group_size; ++lane)
2045 std::swap (chains[lane][0], chains[lane][n]);
2047 else
2049 /* ??? When this triggers and we end up with two
2050 vect_constant/external_def operands up front, things break (ICE)
2051 spectacularly when finding an insertion place for the
2052 all-constant op. We should have a fully
2053 vect_internal_def operand though(?), so we can swap
2054 that into first place and then prepend the all-zero
2055 constant. */
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "inserting constant zero to compensate "
2059 "for (partially) negated first "
2060 "operand\n");
2061 chain_len++;
2062 for (lane = 0; lane < group_size; ++lane)
2063 chains[lane].safe_insert
2064 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
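/* E.g. lanes computing -a0 + b0 and -a1 + b1 (illustrative names) are
   handled as 0 - a0 + b0 and 0 - a1 + b1, with the all-zero invariant
   becoming the new first child.  */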
2065 vec<tree> zero_ops;
2066 zero_ops.create (group_size);
2067 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2068 for (lane = 1; lane < group_size; ++lane)
2069 zero_ops.quick_push (zero_ops[0]);
2070 slp_tree zero = vect_create_new_slp_node (zero_ops);
2071 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2072 children.safe_insert (0, zero);
2074 break;
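/* The children are now folded left to right, e.g. four children c0..c3
   are combined as t1 = c0 op c1, t2 = t1 op c2, t3 = t2 op c3, where the
   last node is created over the original scalar stmts; positions where the
   lanes mix plus and minus get a two-operator node with a lane permutation
   selecting between the two variants.  */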
2076 for (unsigned i = 1; i < children.length (); ++i)
2078 slp_tree op0 = children[i - 1];
2079 slp_tree op1 = children[i];
2080 bool this_two_op = false;
2081 for (unsigned lane = 0; lane < group_size; ++lane)
2082 if (chains[lane][i].code != chains[0][i].code)
2084 this_two_op = true;
2085 break;
2087 slp_tree child;
2088 if (i == children.length () - 1)
2089 child = vect_create_new_slp_node (node, stmts, 2);
2090 else
2091 child = vect_create_new_slp_node (2, ERROR_MARK);
2092 if (this_two_op)
2094 vec<std::pair<unsigned, unsigned> > lperm;
2095 lperm.create (group_size);
2096 for (unsigned lane = 0; lane < group_size; ++lane)
2097 lperm.quick_push (std::make_pair
2098 (chains[lane][i].code != chains[0][i].code, lane));
2099 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2100 (chains[0][i].code == code
2101 ? op_stmt_info
2102 : other_op_stmt_info),
2103 (chains[0][i].code == code
2104 ? other_op_stmt_info
2105 : op_stmt_info),
2106 lperm);
2108 else
2110 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2111 SLP_TREE_VECTYPE (child) = vectype;
2112 SLP_TREE_LANES (child) = group_size;
2113 SLP_TREE_CHILDREN (child).quick_push (op0);
2114 SLP_TREE_CHILDREN (child).quick_push (op1);
2115 SLP_TREE_REPRESENTATIVE (child)
2116 = (chains[0][i].code == code
2117 ? op_stmt_info : other_op_stmt_info);
2119 children[i] = child;
2121 *tree_size += this_tree_size + 1;
2122 *max_nunits = this_max_nunits;
2123 while (!chains.is_empty ())
2124 chains.pop ().release ();
2125 return node;
2127 out:
2128 while (!children.is_empty ())
2129 vect_free_slp_tree (children.pop ());
2130 while (!chains.is_empty ())
2131 chains.pop ().release ();
2132 /* Hard-fail; otherwise we might run into quadratic processing of the
2133 chains, starting again one stmt into the chain. */
2134 if (hard_fail)
2135 return NULL;
2136 /* Fall through to normal processing. */
2139 /* Get at the operands, verifying they are compatible. */
2140 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2141 slp_oprnd_info oprnd_info;
2142 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2144 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2145 stmts, i, &oprnds_info);
2146 if (res != 0)
2147 matches[(res == -1) ? 0 : i] = false;
2148 if (!matches[0])
2149 break;
2151 for (i = 0; i < group_size; ++i)
2152 if (!matches[i])
2154 vect_free_oprnd_info (oprnds_info);
2155 return NULL;
2157 swap = NULL;
2159 auto_vec<slp_tree, 4> children;
2161 stmt_info = stmts[0];
2163 /* Create SLP_TREE nodes for the definition node/s. */
2164 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2166 slp_tree child;
2167 unsigned int j;
2169 /* We're skipping certain operands from processing, for example
2170 outer loop reduction initial defs. */
2171 if (skip_args[i])
2173 children.safe_push (NULL);
2174 continue;
2177 if (oprnd_info->first_dt == vect_uninitialized_def)
2179 /* COND_EXPRs may have one operand too many if the condition
2180 is an SSA name. */
2181 gcc_assert (i == 3 && nops == 4);
2182 continue;
2185 if (is_a <bb_vec_info> (vinfo)
2186 && oprnd_info->first_dt == vect_internal_def
2187 && !oprnd_info->any_pattern)
2189 /* For BB vectorization, if all defs are the same, do not
2190 bother to continue the build along the single-lane
2191 graph but use a splat of the scalar value. */
2192 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2193 for (j = 1; j < group_size; ++j)
2194 if (oprnd_info->def_stmts[j] != first_def)
2195 break;
2196 if (j == group_size
2197 /* But avoid doing this for loads where we may be
2198 able to CSE things, unless the stmt is not
2199 vectorizable. */
2200 && (!STMT_VINFO_VECTORIZABLE (first_def)
2201 || !gimple_vuse (first_def->stmt)))
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE, vect_location,
2205 "Using a splat of the uniform operand %G",
2206 first_def->stmt);
2207 oprnd_info->first_dt = vect_external_def;
2211 if (oprnd_info->first_dt == vect_external_def
2212 || oprnd_info->first_dt == vect_constant_def)
2214 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2215 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2216 oprnd_info->ops = vNULL;
2217 children.safe_push (invnode);
2218 continue;
2221 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2222 group_size, &this_max_nunits,
2223 matches, limit,
2224 &this_tree_size, bst_map)) != NULL)
2226 oprnd_info->def_stmts = vNULL;
2227 children.safe_push (child);
2228 continue;
2231 /* If the SLP build for operand zero failed and operand zero
2232 and one can be commuted, try that for the scalar stmts
2233 that failed the match. */
2234 if (i == 0
2235 /* A first scalar stmt mismatch signals a fatal mismatch. */
2236 && matches[0]
2237 /* ??? For COND_EXPRs we can swap the comparison operands
2238 as well as the arms under some constraints. */
2239 && nops == 2
2240 && oprnds_info[1]->first_dt == vect_internal_def
2241 && is_gimple_assign (stmt_info->stmt)
2242 /* Swapping operands for reductions breaks assumptions later on. */
2243 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2244 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2246 /* See whether we can swap the matching or the non-matching
2247 stmt operands. */
2248 bool swap_not_matching = true;
2251 for (j = 0; j < group_size; ++j)
2253 if (matches[j] != !swap_not_matching)
2254 continue;
2255 stmt_vec_info stmt_info = stmts[j];
2256 /* Verify if we can swap operands of this stmt. */
2257 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2258 if (!stmt
2259 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2261 if (!swap_not_matching)
2262 goto fail;
2263 swap_not_matching = false;
2264 break;
2268 while (j != group_size);
2270 /* Swap mismatched definition stmts. */
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "Re-trying with swapped operands of stmts ");
2274 for (j = 0; j < group_size; ++j)
2275 if (matches[j] == !swap_not_matching)
2277 std::swap (oprnds_info[0]->def_stmts[j],
2278 oprnds_info[1]->def_stmts[j]);
2279 std::swap (oprnds_info[0]->ops[j],
2280 oprnds_info[1]->ops[j]);
2281 if (dump_enabled_p ())
2282 dump_printf (MSG_NOTE, "%d ", j);
2284 if (dump_enabled_p ())
2285 dump_printf (MSG_NOTE, "\n");
2286 /* After swapping some operands we lost track of whether an
2287 operand has any pattern defs, so be conservative here. */
2288 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2289 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2290 /* And try again with scratch 'matches' ... */
2291 bool *tem = XALLOCAVEC (bool, group_size);
2292 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2293 group_size, &this_max_nunits,
2294 tem, limit,
2295 &this_tree_size, bst_map)) != NULL)
2297 oprnd_info->def_stmts = vNULL;
2298 children.safe_push (child);
2299 continue;
2302 fail:
2304 /* If the SLP build failed and we are analyzing a basic block,
2305 simply treat nodes we failed to build as externally defined
2306 (and thus build vectors from the scalar defs).
2307 The cost model will reject outright expensive cases.
2308 ??? This doesn't handle cases where permutation ultimately
2309 fails (or we don't try permutation below). Ideally we'd
2310 even compute a permutation that will end up with the maximum
2311 SLP tree size... */
2312 if (is_a <bb_vec_info> (vinfo)
2313 /* ??? Rejecting patterns this way doesn't work. We'd have to
2314 do extra work to cancel the pattern so the uses see the
2315 scalar version. */
2316 && !is_pattern_stmt_p (stmt_info)
2317 && !oprnd_info->any_pattern)
2319 /* But if there's a leading vector-sized set of matching stmts,
2320 fail here so we can split the group. This matches the condition
2321 vect_analyze_slp_instance uses. */
2322 /* ??? We might want to split here and combine the results to support
2323 multiple vector sizes better. */
2324 for (j = 0; j < group_size; ++j)
2325 if (!matches[j])
2326 break;
2327 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2329 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_NOTE, vect_location,
2331 "Building vector operands from scalars\n");
2332 this_tree_size++;
2333 child = vect_create_new_slp_node (oprnd_info->ops);
2334 children.safe_push (child);
2335 oprnd_info->ops = vNULL;
2336 continue;
2340 gcc_assert (child == NULL);
2341 FOR_EACH_VEC_ELT (children, j, child)
2342 if (child)
2343 vect_free_slp_tree (child);
2344 vect_free_oprnd_info (oprnds_info);
2345 return NULL;
2348 vect_free_oprnd_info (oprnds_info);
2350 /* If all children of a node are built up from uniform scalars, or
2351 building it requires more than one possibly expensive vector
2352 construction, just throw that node away, causing it to be built
2353 up from scalars. The exception is the SLP node for the vector store. */
2354 if (is_a <bb_vec_info> (vinfo)
2355 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2356 /* ??? Rejecting patterns this way doesn't work. We'd have to
2357 do extra work to cancel the pattern so the uses see the
2358 scalar version. */
2359 && !is_pattern_stmt_p (stmt_info))
2361 slp_tree child;
2362 unsigned j;
2363 bool all_uniform_p = true;
2364 unsigned n_vector_builds = 0;
2365 FOR_EACH_VEC_ELT (children, j, child)
2367 if (!child)
2369 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2370 all_uniform_p = false;
2371 else if (!vect_slp_tree_uniform_p (child))
2373 all_uniform_p = false;
2374 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2375 n_vector_builds++;
2378 if (all_uniform_p
2379 || n_vector_builds > 1
2380 || (n_vector_builds == children.length ()
2381 && is_a <gphi *> (stmt_info->stmt)))
2383 /* Roll back. */
2384 matches[0] = false;
2385 FOR_EACH_VEC_ELT (children, j, child)
2386 if (child)
2387 vect_free_slp_tree (child);
2389 if (dump_enabled_p ())
2390 dump_printf_loc (MSG_NOTE, vect_location,
2391 "Building parent vector operands from "
2392 "scalars instead\n");
2393 return NULL;
2397 *tree_size += this_tree_size + 1;
2398 *max_nunits = this_max_nunits;
2400 if (two_operators)
2402 /* ??? We'd likely want to either cache in bst_map something like
2403 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2404 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2405 explicit stmts to put in so the keying on 'stmts' doesn't
2406 work (but we have the same issue with nodes that use 'ops'). */
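/* For example, for the stmt group { a+b, a-b, c+d, c-d } node 'one'
   computes all lanes with PLUS_EXPR and node 'two' all lanes with
   MINUS_EXPR; the VEC_PERM_EXPR node built below selects lane i from
   'one' or 'two' according to each scalar stmt's code, giving the lane
   permutation { (0,0) (1,1) (0,2) (1,3) }.  */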
2407 slp_tree one = new _slp_tree;
2408 slp_tree two = new _slp_tree;
2409 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2410 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2411 SLP_TREE_VECTYPE (one) = vectype;
2412 SLP_TREE_VECTYPE (two) = vectype;
2413 SLP_TREE_CHILDREN (one).safe_splice (children);
2414 SLP_TREE_CHILDREN (two).safe_splice (children);
2415 slp_tree child;
2416 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2417 SLP_TREE_REF_COUNT (child)++;
2419 /* Here we record the original defs since this
2420 node represents the final lane configuration. */
2421 node = vect_create_new_slp_node (node, stmts, 2);
2422 SLP_TREE_VECTYPE (node) = vectype;
2423 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2424 SLP_TREE_CHILDREN (node).quick_push (one);
2425 SLP_TREE_CHILDREN (node).quick_push (two);
2426 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2427 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2428 enum tree_code ocode = ERROR_MARK;
2429 stmt_vec_info ostmt_info;
2430 unsigned j = 0;
2431 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2433 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2434 if (gimple_assign_rhs_code (ostmt) != code0)
2436 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2437 ocode = gimple_assign_rhs_code (ostmt);
2438 j = i;
2440 else
2441 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2443 SLP_TREE_CODE (one) = code0;
2444 SLP_TREE_CODE (two) = ocode;
2445 SLP_TREE_LANES (one) = stmts.length ();
2446 SLP_TREE_LANES (two) = stmts.length ();
2447 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2448 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2449 return node;
2452 node = vect_create_new_slp_node (node, stmts, nops);
2453 SLP_TREE_VECTYPE (node) = vectype;
2454 SLP_TREE_CHILDREN (node).splice (children);
2455 return node;
2458 /* Dump a single SLP tree NODE. */
2460 static void
2461 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2462 slp_tree node)
2464 unsigned i, j;
2465 slp_tree child;
2466 stmt_vec_info stmt_info;
2467 tree op;
2469 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2470 dump_user_location_t user_loc = loc.get_user_location ();
2471 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)",
2472 SLP_TREE_DEF_TYPE (node) == vect_external_def
2473 ? " (external)"
2474 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2475 ? " (constant)"
2476 : ""), node,
2477 estimated_poly_value (node->max_nunits),
2478 SLP_TREE_REF_COUNT (node));
2479 if (SLP_TREE_VECTYPE (node))
2480 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2481 dump_printf (metadata, "\n");
2482 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2484 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2485 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2486 else
2487 dump_printf_loc (metadata, user_loc, "op template: %G",
2488 SLP_TREE_REPRESENTATIVE (node)->stmt);
2490 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2491 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2492 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2493 else
2495 dump_printf_loc (metadata, user_loc, "\t{ ");
2496 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2497 dump_printf (metadata, "%T%s ", op,
2498 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2499 dump_printf (metadata, "}\n");
2501 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2503 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2504 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2505 dump_printf (dump_kind, " %u", j);
2506 dump_printf (dump_kind, " }\n");
2508 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2510 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2511 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2512 dump_printf (dump_kind, " %u[%u]",
2513 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2514 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2515 dump_printf (dump_kind, " }\n");
2517 if (SLP_TREE_CHILDREN (node).is_empty ())
2518 return;
2519 dump_printf_loc (metadata, user_loc, "\tchildren");
2520 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2521 dump_printf (dump_kind, " %p", (void *)child);
2522 dump_printf (dump_kind, "\n");
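/* Dump SLP tree NODE to stderr; intended to be called from a debugger.  */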
2525 DEBUG_FUNCTION void
2526 debug (slp_tree node)
2528 debug_dump_context ctx;
2529 vect_print_slp_tree (MSG_NOTE,
2530 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2531 node);
2534 /* Recursive helper for the dot producer below. */
2536 static void
2537 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2539 if (visited.add (node))
2540 return;
2542 fprintf (f, "\"%p\" [label=\"", (void *)node);
2543 vect_print_slp_tree (MSG_NOTE,
2544 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2545 node);
2546 fprintf (f, "\"];\n");
2549 for (slp_tree child : SLP_TREE_CHILDREN (node))
2550 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2552 for (slp_tree child : SLP_TREE_CHILDREN (node))
2553 if (child)
2554 dot_slp_tree (f, child, visited);
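/* Write a GraphViz representation of the SLP graph rooted at NODE to the
   file FNAME, intended to be called from a debugger; the result can be
   rendered with e.g. "dot -Tpdf".  */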
2557 DEBUG_FUNCTION void
2558 dot_slp_tree (const char *fname, slp_tree node)
2560 FILE *f = fopen (fname, "w");
2561 fprintf (f, "digraph {\n");
2562 fflush (f);
2564 debug_dump_context ctx (f);
2565 hash_set<slp_tree> visited;
2566 dot_slp_tree (f, node, visited);
2568 fflush (f);
2569 fprintf (f, "}\n");
2570 fclose (f);
2573 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2575 static void
2576 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2577 slp_tree node, hash_set<slp_tree> &visited)
2579 unsigned i;
2580 slp_tree child;
2582 if (visited.add (node))
2583 return;
2585 vect_print_slp_tree (dump_kind, loc, node);
2587 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2588 if (child)
2589 vect_print_slp_graph (dump_kind, loc, child, visited);
2592 static void
2593 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2594 slp_tree entry)
2596 hash_set<slp_tree> visited;
2597 vect_print_slp_graph (dump_kind, loc, entry, visited);
2600 /* Mark the tree rooted at NODE with PURE_SLP. */
2602 static void
2603 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2605 int i;
2606 stmt_vec_info stmt_info;
2607 slp_tree child;
2609 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2610 return;
2612 if (visited.add (node))
2613 return;
2615 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2616 STMT_SLP_TYPE (stmt_info) = pure_slp;
2618 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2619 if (child)
2620 vect_mark_slp_stmts (child, visited);
2623 static void
2624 vect_mark_slp_stmts (slp_tree node)
2626 hash_set<slp_tree> visited;
2627 vect_mark_slp_stmts (node, visited);
2630 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2632 static void
2633 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2635 int i;
2636 stmt_vec_info stmt_info;
2637 slp_tree child;
2639 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2640 return;
2642 if (visited.add (node))
2643 return;
2645 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2647 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2648 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2649 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2652 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2653 if (child)
2654 vect_mark_slp_stmts_relevant (child, visited);
2657 static void
2658 vect_mark_slp_stmts_relevant (slp_tree node)
2660 hash_set<slp_tree> visited;
2661 vect_mark_slp_stmts_relevant (node, visited);
2665 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2667 static void
2668 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2669 hash_set<slp_tree> &visited)
2671 if (!node || visited.add (node))
2672 return;
2674 if (SLP_TREE_CHILDREN (node).length () == 0)
2676 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2677 return;
2678 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2679 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2680 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2681 loads.safe_push (node);
2683 else
2685 unsigned i;
2686 slp_tree child;
2687 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2688 vect_gather_slp_loads (loads, child, visited);
2693 /* Find the last scalar stmt in NODE. */
2695 stmt_vec_info
2696 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2698 stmt_vec_info last = NULL;
2699 stmt_vec_info stmt_vinfo;
2701 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2703 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2704 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2707 return last;
2710 /* Find the first stmt in NODE. */
2712 stmt_vec_info
2713 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2715 stmt_vec_info first = NULL;
2716 stmt_vec_info stmt_vinfo;
2718 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2720 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2721 if (!first
2722 || get_later_stmt (stmt_vinfo, first) == first)
2723 first = stmt_vinfo;
2726 return first;
2729 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2730 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2731 (also containing the first GROUP1_SIZE stmts, since stores are
2732 consecutive), the second containing the remainder.
2733 Return the first stmt in the second group. */
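/* For example, splitting a group of 8 consecutive stores with GROUP1_SIZE 4
   and an original DR_GROUP_GAP of 0 leaves the first group with stmts 0-3
   and a gap of 4 (skipping over the second group) and creates a second
   group with stmts 4-7 and a gap of 4 (skipping over the first part of the
   original group).  */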
2735 static stmt_vec_info
2736 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2738 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2739 gcc_assert (group1_size > 0);
2740 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2741 gcc_assert (group2_size > 0);
2742 DR_GROUP_SIZE (first_vinfo) = group1_size;
2744 stmt_vec_info stmt_info = first_vinfo;
2745 for (unsigned i = group1_size; i > 1; i--)
2747 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2748 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2750 /* STMT is now the last element of the first group. */
2751 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2752 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2754 DR_GROUP_SIZE (group2) = group2_size;
2755 for (stmt_info = group2; stmt_info;
2756 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2758 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2759 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2762 /* For the second group, the DR_GROUP_GAP is that before the original group,
2763 plus skipping over the first vector. */
2764 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2766 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2767 DR_GROUP_GAP (first_vinfo) += group2_size;
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2771 group1_size, group2_size);
2773 return group2;
2776 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2777 statements and a vector of NUNITS elements. */
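/* For example, NUNITS 4 and GROUP_SIZE 6 yield a least common multiple of
   12 and thus an unrolling factor of 2, while GROUP_SIZE 8 yields an
   unrolling factor of 1.  */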
2779 static poly_uint64
2780 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2782 return exact_div (common_multiple (nunits, group_size), group_size);
2785 /* Helper that checks whether ROOT is a load node. */
2787 static inline bool
2788 vect_is_slp_load_node (slp_tree root)
2790 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2791 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2792 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2796 /* Helper function of optimize_load_redistribution that performs the operation
2797 recursively. */
2799 static slp_tree
2800 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2801 vec_info *vinfo, unsigned int group_size,
2802 hash_map<slp_tree, slp_tree> *load_map,
2803 slp_tree root)
2805 if (slp_tree *leader = load_map->get (root))
2806 return *leader;
2808 slp_tree node;
2809 unsigned i;
2811 /* For now, we don't know anything about externals so do not do anything. */
2812 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2813 return NULL;
2814 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2816 /* First convert this node into a load node, add it to the leaves
2817 list and flatten the permute from a lane permutation to a load
2818 permutation. If it's unneeded it will be elided later. */
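/* For example, a VEC_PERM_EXPR node whose lane permutation selects lane 1
   of load node A and lane 0 of load node B (both reading from the same DR
   group) is rebuilt via vect_build_slp_tree as a single load node over
   those two scalar stmts, with any required reordering expressed as a
   load permutation instead.  */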
2819 vec<stmt_vec_info> stmts;
2820 stmts.create (SLP_TREE_LANES (root));
2821 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2822 for (unsigned j = 0; j < lane_perm.length (); j++)
2824 std::pair<unsigned, unsigned> perm = lane_perm[j];
2825 node = SLP_TREE_CHILDREN (root)[perm.first];
2827 if (!vect_is_slp_load_node (node)
2828 || SLP_TREE_CHILDREN (node).exists ())
2830 stmts.release ();
2831 goto next;
2834 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2837 if (dump_enabled_p ())
2838 dump_printf_loc (MSG_NOTE, vect_location,
2839 "converting stmts on permute node %p\n", root);
2841 bool *matches = XALLOCAVEC (bool, group_size);
2842 poly_uint64 max_nunits = 1;
2843 unsigned tree_size = 0, limit = 1;
2844 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2845 matches, &limit, &tree_size, bst_map);
2846 if (!node)
2847 stmts.release ();
2849 load_map->put (root, node);
2850 return node;
2853 next:
2854 load_map->put (root, NULL);
2856 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2858 slp_tree value
2859 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2860 node);
2861 if (value)
2863 SLP_TREE_REF_COUNT (value)++;
2864 SLP_TREE_CHILDREN (root)[i] = value;
2865 /* ??? We know the original leaves of the replaced nodes will
2866 be referenced by bst_map; only the permutes created by
2867 pattern matching are not. */
2868 if (SLP_TREE_REF_COUNT (node) == 1)
2869 load_map->remove (node);
2870 vect_free_slp_tree (node);
2874 return NULL;
2877 /* Temporary workaround for loads not being CSEd during SLP build. This
2878 function will traverse the SLP tree rooted in ROOT and find
2879 VEC_PERM nodes that blend vectors from multiple nodes that all read from
2880 the same DR such that the final operation is equal to a permuted load.
2881 Such nodes are then directly converted into loads themselves. The nodes
2882 are CSEd using BST_MAP. */
2884 static void
2885 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2886 vec_info *vinfo, unsigned int group_size,
2887 hash_map<slp_tree, slp_tree> *load_map,
2888 slp_tree root)
2890 slp_tree node;
2891 unsigned i;
2893 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2895 slp_tree value
2896 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2897 node);
2898 if (value)
2900 SLP_TREE_REF_COUNT (value)++;
2901 SLP_TREE_CHILDREN (root)[i] = value;
2902 /* ??? We know the original leaves of the replaced nodes will
2903 be referenced by bst_map; only the permutes created by
2904 pattern matching are not. */
2905 if (SLP_TREE_REF_COUNT (node) == 1)
2906 load_map->remove (node);
2907 vect_free_slp_tree (node);
2912 /* Helper function of vect_match_slp_patterns.
2914 Attempts to match patterns against the SLP tree rooted in REF_NODE using
2915 VINFO. Patterns are matched in post-order traversal.
2917 If any pattern matches, the tree referenced by REF_NODE is updated in
2918 place and true is returned; otherwise false is returned. */
2920 static bool
2921 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2922 slp_tree_to_load_perm_map_t *perm_cache,
2923 slp_compat_nodes_map_t *compat_cache,
2924 hash_set<slp_tree> *visited)
2926 unsigned i;
2927 slp_tree node = *ref_node;
2928 bool found_p = false;
2929 if (!node || visited->add (node))
2930 return false;
2932 slp_tree child;
2933 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2934 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2935 vinfo, perm_cache, compat_cache,
2936 visited);
2938 for (unsigned x = 0; x < num__slp_patterns; x++)
2940 vect_pattern *pattern
2941 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
2942 if (pattern)
2944 pattern->build (vinfo);
2945 delete pattern;
2946 found_p = true;
2950 return found_p;
2953 /* Applies pattern matching to the SLP tree of the SLP instance INSTANCE
2954 using vec_info VINFO.
2956 The tree is modified in place and true is returned if any pattern
2957 matched. Patterns are tried in order and multiple patterns may match. */
2959 static bool
2960 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2961 hash_set<slp_tree> *visited,
2962 slp_tree_to_load_perm_map_t *perm_cache,
2963 slp_compat_nodes_map_t *compat_cache)
2965 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2966 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE, vect_location,
2970 "Analyzing SLP tree %p for patterns\n",
2971 SLP_INSTANCE_TREE (instance));
2973 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2974 visited);
2977 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2978 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2979 Return true if we could use IFN_STORE_LANES instead and if that appears
2980 to be the better approach. */
2982 static bool
2983 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2984 unsigned int group_size,
2985 unsigned int new_group_size)
2987 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2988 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2989 if (!vectype)
2990 return false;
2991 /* Allow the split if one of the two new groups would operate on full
2992 vectors *within* rather than across one scalar loop iteration.
2993 This is purely a heuristic, but it should work well for group
2994 sizes of 3 and 4, where the possible splits are:
2996 3->2+1: OK if the vector has exactly two elements
2997 4->2+2: Likewise
2998 4->3+1: Less clear-cut. */
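/* E.g. with a two-element vector, splitting a group of 4 into 2+2 makes
   the multiple_p checks below succeed and we return false (the split is
   allowed); with a four-element vector neither part fills a vector, so we
   instead ask whether store-lanes is supported for the whole group.  */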
2999 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3000 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3001 return false;
3002 return vect_store_lanes_supported (vectype, group_size, false);
3005 /* Analyze an SLP instance starting from a group of grouped stores. Call
3006 vect_build_slp_tree to build a tree of packed stmts if possible.
3007 Return FALSE if it's impossible to SLP any stmt in the loop. */
3009 static bool
3010 vect_analyze_slp_instance (vec_info *vinfo,
3011 scalar_stmts_to_slp_tree_map_t *bst_map,
3012 stmt_vec_info stmt_info, slp_instance_kind kind,
3013 unsigned max_tree_size, unsigned *limit);
3015 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3016 of KIND. Return true if successful. */
3018 static bool
3019 vect_build_slp_instance (vec_info *vinfo,
3020 slp_instance_kind kind,
3021 vec<stmt_vec_info> &scalar_stmts,
3022 vec<stmt_vec_info> &root_stmt_infos,
3023 unsigned max_tree_size, unsigned *limit,
3024 scalar_stmts_to_slp_tree_map_t *bst_map,
3025 /* ??? We need stmt_info for group splitting. */
3026 stmt_vec_info stmt_info_)
3028 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location,
3031 "Starting SLP discovery for\n");
3032 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3033 dump_printf_loc (MSG_NOTE, vect_location,
3034 " %G", scalar_stmts[i]->stmt);
3037 /* Build the tree for the SLP instance. */
3038 unsigned int group_size = scalar_stmts.length ();
3039 bool *matches = XALLOCAVEC (bool, group_size);
3040 poly_uint64 max_nunits = 1;
3041 unsigned tree_size = 0;
3042 unsigned i;
3043 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3044 &max_nunits, matches, limit,
3045 &tree_size, bst_map);
3046 if (node != NULL)
3048 /* Calculate the unrolling factor based on the smallest type. */
3049 poly_uint64 unrolling_factor
3050 = calculate_unrolling_factor (max_nunits, group_size);
3052 if (maybe_ne (unrolling_factor, 1U)
3053 && is_a <bb_vec_info> (vinfo))
3055 unsigned HOST_WIDE_INT const_max_nunits;
3056 if (!max_nunits.is_constant (&const_max_nunits)
3057 || const_max_nunits > group_size)
3059 if (dump_enabled_p ())
3060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3061 "Build SLP failed: store group "
3062 "size not a multiple of the vector size "
3063 "in basic block SLP\n");
3064 vect_free_slp_tree (node);
3065 return false;
3067 /* Fatal mismatch. */
3068 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_NOTE, vect_location,
3070 "SLP discovery succeeded but node needs "
3071 "splitting\n");
3072 memset (matches, true, group_size);
3073 matches[group_size / const_max_nunits * const_max_nunits] = false;
3074 vect_free_slp_tree (node);
3076 else
3078 /* Create a new SLP instance. */
3079 slp_instance new_instance = XNEW (class _slp_instance);
3080 SLP_INSTANCE_TREE (new_instance) = node;
3081 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3082 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3083 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3084 SLP_INSTANCE_KIND (new_instance) = kind;
3085 new_instance->reduc_phis = NULL;
3086 new_instance->cost_vec = vNULL;
3087 new_instance->subgraph_entries = vNULL;
3089 if (dump_enabled_p ())
3090 dump_printf_loc (MSG_NOTE, vect_location,
3091 "SLP size %u vs. limit %u.\n",
3092 tree_size, max_tree_size);
3094 /* Fixup SLP reduction chains. */
3095 if (kind == slp_inst_kind_reduc_chain)
3097 /* If this is a reduction chain with a conversion in front
3098 amend the SLP tree with a node for that. */
3099 gimple *scalar_def
3100 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3101 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3103 /* Get at the conversion stmt - we know it's the single use
3104 of the last stmt of the reduction chain. */
3105 use_operand_p use_p;
3106 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3107 &use_p, &scalar_def);
3108 gcc_assert (r);
3109 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3110 next_info = vect_stmt_to_vectorize (next_info);
3111 scalar_stmts = vNULL;
3112 scalar_stmts.create (group_size);
3113 for (unsigned i = 0; i < group_size; ++i)
3114 scalar_stmts.quick_push (next_info);
3115 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3116 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3117 SLP_TREE_CHILDREN (conv).quick_push (node);
3118 SLP_INSTANCE_TREE (new_instance) = conv;
3119 /* We also have to fake this conversion stmt as an SLP reduction
3120 group so we don't have to mess with too much code
3121 elsewhere. */
3122 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3123 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3125 /* Fill the backedge child of the PHI SLP node. The
3126 general matching code cannot find it because the
3127 scalar code does not reflect how we vectorize the
3128 reduction. */
3129 use_operand_p use_p;
3130 imm_use_iterator imm_iter;
3131 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3132 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3133 gimple_get_lhs (scalar_def))
3134 /* There are exactly two non-debug uses, the reduction
3135 PHI and the loop-closed PHI node. */
3136 if (!is_gimple_debug (USE_STMT (use_p))
3137 && gimple_bb (USE_STMT (use_p)) == loop->header)
3139 auto_vec<stmt_vec_info, 64> phis (group_size);
3140 stmt_vec_info phi_info
3141 = vinfo->lookup_stmt (USE_STMT (use_p));
3142 for (unsigned i = 0; i < group_size; ++i)
3143 phis.quick_push (phi_info);
3144 slp_tree *phi_node = bst_map->get (phis);
3145 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3146 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3147 = SLP_INSTANCE_TREE (new_instance);
3148 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3152 vinfo->slp_instances.safe_push (new_instance);
3154 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3155 the number of scalar stmts in the root in a few places.
3156 Verify that assumption holds. */
3157 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3158 .length () == group_size);
3160 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE, vect_location,
3163 "Final SLP tree for instance %p:\n", new_instance);
3164 vect_print_slp_graph (MSG_NOTE, vect_location,
3165 SLP_INSTANCE_TREE (new_instance));
3168 return true;
3171 else
3173 /* Failed to SLP. */
3174 /* Free the allocated memory. */
3175 scalar_stmts.release ();
3178 stmt_vec_info stmt_info = stmt_info_;
3179 /* Try to break the group up into pieces. */
3180 if (kind == slp_inst_kind_store)
3182 /* ??? We could delay all the actual splitting of store-groups
3183 until after SLP discovery of the original group completed.
3184 Then we can recurse to vect_build_slp_instance directly. */
3185 for (i = 0; i < group_size; i++)
3186 if (!matches[i])
3187 break;
3189 /* For basic block SLP, try to break the group up into multiples of
3190 a vector size. */
3191 if (is_a <bb_vec_info> (vinfo)
3192 && (i > 1 && i < group_size))
3194 tree scalar_type
3195 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3196 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3197 1 << floor_log2 (i));
3198 unsigned HOST_WIDE_INT const_nunits;
3199 if (vectype
3200 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3202 /* Split into two groups at the first vector boundary. */
3203 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3204 unsigned group1_size = i & ~(const_nunits - 1);
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_NOTE, vect_location,
3208 "Splitting SLP group at stmt %u\n", i);
3209 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3210 group1_size);
3211 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3212 kind, max_tree_size,
3213 limit);
3214 /* Split the rest at the failure point and possibly
3215 re-analyze the remaining matching part if it has
3216 at least two lanes. */
3217 if (group1_size < i
3218 && (i + 1 < group_size
3219 || i - group1_size > 1))
3221 stmt_vec_info rest2 = rest;
3222 rest = vect_split_slp_store_group (rest, i - group1_size);
3223 if (i - group1_size > 1)
3224 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3225 kind, max_tree_size,
3226 limit);
3228 /* Re-analyze the non-matching tail if it has at least
3229 two lanes. */
3230 if (i + 1 < group_size)
3231 res |= vect_analyze_slp_instance (vinfo, bst_map,
3232 rest, kind, max_tree_size,
3233 limit);
3234 return res;
3238 /* For loop vectorization, split into arbitrary pieces of size > 1. */
3239 if (is_a <loop_vec_info> (vinfo)
3240 && (i > 1 && i < group_size)
3241 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3243 unsigned group1_size = i;
3245 if (dump_enabled_p ())
3246 dump_printf_loc (MSG_NOTE, vect_location,
3247 "Splitting SLP group at stmt %u\n", i);
3249 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3250 group1_size);
3251 /* Loop vectorization cannot handle gaps in stores; make sure
3252 the split group appears as strided. */
3253 STMT_VINFO_STRIDED_P (rest) = 1;
3254 DR_GROUP_GAP (rest) = 0;
3255 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3256 DR_GROUP_GAP (stmt_info) = 0;
3258 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3259 kind, max_tree_size, limit);
3260 if (i + 1 < group_size)
3261 res |= vect_analyze_slp_instance (vinfo, bst_map,
3262 rest, kind, max_tree_size, limit);
3264 return res;
3267 /* Even though the first vector did not all match, we might be able to SLP
3268 (some of) the remainder. FORNOW ignore this possibility. */
3271 /* Failed to SLP. */
3272 if (dump_enabled_p ())
3273 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3274 return false;
3278 /* Analyze an SLP instance starting from a group of grouped stores. Call
3279 vect_build_slp_tree to build a tree of packed stmts if possible.
3280 Return FALSE if it's impossible to SLP any stmt in the loop. */
3282 static bool
3283 vect_analyze_slp_instance (vec_info *vinfo,
3284 scalar_stmts_to_slp_tree_map_t *bst_map,
3285 stmt_vec_info stmt_info,
3286 slp_instance_kind kind,
3287 unsigned max_tree_size, unsigned *limit)
3289 unsigned int i;
3290 vec<stmt_vec_info> scalar_stmts;
3292 if (is_a <bb_vec_info> (vinfo))
3293 vect_location = stmt_info->stmt;
3295 stmt_vec_info next_info = stmt_info;
3296 if (kind == slp_inst_kind_store)
3298 /* Collect the stores and store them in scalar_stmts. */
3299 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3300 while (next_info)
3302 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3303 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3306 else if (kind == slp_inst_kind_reduc_chain)
3308 /* Collect the reduction stmts and store them in scalar_stmts. */
3309 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3310 while (next_info)
3312 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3313 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3315 /* Mark the first element of the reduction chain as reduction to properly
3316 transform the node. In the reduction analysis phase only the last
3317 element of the chain is marked as reduction. */
3318 STMT_VINFO_DEF_TYPE (stmt_info)
3319 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3320 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3321 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3323 else if (kind == slp_inst_kind_ctor)
3325 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3326 tree val;
3327 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3328 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3330 stmt_vec_info def_info = vinfo->lookup_def (val);
3331 def_info = vect_stmt_to_vectorize (def_info);
3332 scalar_stmts.quick_push (def_info);
3334 if (dump_enabled_p ())
3335 dump_printf_loc (MSG_NOTE, vect_location,
3336 "Analyzing vectorizable constructor: %G\n",
3337 stmt_info->stmt);
3339 else if (kind == slp_inst_kind_reduc_group)
3341 /* Collect reduction statements. */
3342 const vec<stmt_vec_info> &reductions
3343 = as_a <loop_vec_info> (vinfo)->reductions;
3344 scalar_stmts.create (reductions.length ());
3345 for (i = 0; reductions.iterate (i, &next_info); i++)
3346 if ((STMT_VINFO_RELEVANT_P (next_info)
3347 || STMT_VINFO_LIVE_P (next_info))
3348 /* ??? Make sure we didn't skip a conversion around a reduction
3349 path. In that case we'd have to reverse engineer that conversion
3350 stmt following the chain using reduc_idx and from the PHI
3351 using reduc_def. */
3352 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3353 scalar_stmts.quick_push (next_info);
3354 /* If fewer than two were relevant/live, there's nothing to SLP. */
3355 if (scalar_stmts.length () < 2)
3356 return false;
3358 else
3359 gcc_unreachable ();
3361 vec<stmt_vec_info> roots = vNULL;
3362 if (kind == slp_inst_kind_ctor)
3364 roots.create (1);
3365 roots.quick_push (stmt_info);
3367 /* Build the tree for the SLP instance. */
3368 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3369 roots,
3370 max_tree_size, limit, bst_map,
3371 kind == slp_inst_kind_store
3372 ? stmt_info : NULL);
3373 if (!res)
3374 roots.release ();
3376 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3377 where we should do store group splitting. */
3379 return res;
3382 /* Check if there are stmts in the loop that can be vectorized using SLP.
3383 Build SLP trees of packed scalar stmts if SLP is possible. */
3385 opt_result
3386 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3388 unsigned int i;
3389 stmt_vec_info first_element;
3390 slp_instance instance;
3392 DUMP_VECT_SCOPE ("vect_analyze_slp");
3394 unsigned limit = max_tree_size;
3396 scalar_stmts_to_slp_tree_map_t *bst_map
3397 = new scalar_stmts_to_slp_tree_map_t ();
3399 /* Find SLP sequences starting from groups of grouped stores. */
3400 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3401 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3402 STMT_VINFO_GROUPED_ACCESS (first_element)
3403 ? slp_inst_kind_store : slp_inst_kind_ctor,
3404 max_tree_size, &limit);
3406 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3408 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3410 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3411 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3412 bb_vinfo->roots[i].stmts,
3413 bb_vinfo->roots[i].roots,
3414 max_tree_size, &limit, bst_map, NULL))
3416 bb_vinfo->roots[i].stmts = vNULL;
3417 bb_vinfo->roots[i].roots = vNULL;
3422 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3424 /* Find SLP sequences starting from reduction chains. */
3425 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3426 if (! STMT_VINFO_RELEVANT_P (first_element)
3427 && ! STMT_VINFO_LIVE_P (first_element))
3429 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3430 slp_inst_kind_reduc_chain,
3431 max_tree_size, &limit))
3433 /* Dissolve reduction chain group. */
3434 stmt_vec_info vinfo = first_element;
3435 stmt_vec_info last = NULL;
3436 while (vinfo)
3438 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3439 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3440 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3441 last = vinfo;
3442 vinfo = next;
3444 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3445 /* It can still be vectorized as part of an SLP reduction. */
3446 loop_vinfo->reductions.safe_push (last);
3449 /* Find SLP sequences starting from groups of reductions. */
3450 if (loop_vinfo->reductions.length () > 1)
3451 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3452 slp_inst_kind_reduc_group, max_tree_size,
3453 &limit);
3456 hash_set<slp_tree> visited_patterns;
3457 slp_tree_to_load_perm_map_t perm_cache;
3458 slp_compat_nodes_map_t compat_cache;
3460 /* See if any patterns can be found in the SLP tree. */
3461 bool pattern_found = false;
3462 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3463 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3464 &visited_patterns, &perm_cache,
3465 &compat_cache);
3467 /* If any were found optimize permutations of loads. */
3468 if (pattern_found)
3470 hash_map<slp_tree, slp_tree> load_map;
3471 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3473 slp_tree root = SLP_INSTANCE_TREE (instance);
3474 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3475 &load_map, root);
3481 /* The map keeps a reference on the SLP nodes built; release that. */
3482 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3483 it != bst_map->end (); ++it)
3484 if ((*it).second)
3485 vect_free_slp_tree ((*it).second);
3486 delete bst_map;
3488 if (pattern_found && dump_enabled_p ())
3490 dump_printf_loc (MSG_NOTE, vect_location,
3491 "Pattern matched SLP tree\n");
3492 hash_set<slp_tree> visited;
3493 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3494 vect_print_slp_graph (MSG_NOTE, vect_location,
3495 SLP_INSTANCE_TREE (instance), visited);
3498 return opt_result::success ();
3501 struct slpg_vertex
3503 slpg_vertex (slp_tree node_)
3504 : node (node_), perm_in (-1), perm_out (-1) {}
3506 int get_perm_materialized () const
3507 { return perm_in != perm_out ? perm_in : 0; }
3509 slp_tree node;
3510 /* The common permutation on the incoming lanes (towards SLP children). */
3511 int perm_in;
3512 /* The permutation on the outgoing lanes (towards SLP parents). When
3513 the node is a materialization point for a permute this differs
3514 from perm_in (and is then usually zero). Materialization happens
3515 on the input side. */
3516 int perm_out;
3519 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3521 static void
3522 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3523 vec<slpg_vertex> &vertices, vec<int> &leafs)
3525 unsigned i;
3526 slp_tree child;
3528 if (visited.add (node))
3529 return;
3531 node->vertex = vertices.length ();
3532 vertices.safe_push (slpg_vertex (node));
3534 bool leaf = true;
3535 bool force_leaf = false;
3536 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3537 if (child)
3539 leaf = false;
3540 vect_slp_build_vertices (visited, child, vertices, leafs);
3542 else
3543 force_leaf = true;
3544 /* Since SLP discovery works along use-def edges, all cycles have an
3545 entry - but there's the exception of cycles where we do not handle
3546 the entry explicitly (but with a NULL SLP node), like some reductions
3547 and inductions. Force those SLP PHIs to act as leafs to make them
3548 backwards reachable. */
3549 if (leaf || force_leaf)
3550 leafs.safe_push (node->vertex);
3553 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3555 static void
3556 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3557 vec<int> &leafs)
3559 hash_set<slp_tree> visited;
3560 unsigned i;
3561 slp_instance instance;
3562 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3563 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3564 leafs);
3567 /* Apply (reverse) bijective PERM to VEC. */
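/* For example, with PERM = { 2, 0, 1 } and VEC = { a, b, c } (illustrative),
   the forward application (REVERSE false) yields { c, a, b }
   (vec[i] = old vec[perm[i]]), while the reverse application yields
   { b, c, a } (vec[perm[i]] = old vec[i]).  */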
3569 template <class T>
3570 static void
3571 vect_slp_permute (vec<unsigned> perm,
3572 vec<T> &vec, bool reverse)
3574 auto_vec<T, 64> saved;
3575 saved.create (vec.length ());
3576 for (unsigned i = 0; i < vec.length (); ++i)
3577 saved.quick_push (vec[i]);
3579 if (reverse)
3581 for (unsigned i = 0; i < vec.length (); ++i)
3582 vec[perm[i]] = saved[i];
3583 for (unsigned i = 0; i < vec.length (); ++i)
3584 gcc_assert (vec[perm[i]] == saved[i]);
3586 else
3588 for (unsigned i = 0; i < vec.length (); ++i)
3589 vec[i] = saved[perm[i]];
3590 for (unsigned i = 0; i < vec.length (); ++i)
3591 gcc_assert (vec[i] == saved[perm[i]]);
3595 /* Return whether permutations PERM_A and PERM_B as recorded in the
3596 PERMS vector are equal. */
3598 static bool
3599 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3600 int perm_a, int perm_b)
3602 return (perm_a == perm_b
3603 || (perm_a != -1 && perm_b != -1
3604 && perms[perm_a].length () == perms[perm_b].length ()
3605 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3606 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3609 /* Optimize the SLP graph of VINFO. */
3611 void
3612 vect_optimize_slp (vec_info *vinfo)
3614 if (vinfo->slp_instances.is_empty ())
3615 return;
3617 slp_tree node;
3618 unsigned i;
3619 auto_vec<slpg_vertex> vertices;
3620 auto_vec<int> leafs;
3621 vect_slp_build_vertices (vinfo, vertices, leafs);
3623 struct graph *slpg = new_graph (vertices.length ());
3624 for (slpg_vertex &v : vertices)
3625 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3626 if (child)
3627 add_edge (slpg, v.node->vertex, child->vertex);
3629 /* Compute (reverse) postorder on the inverted graph. */
3630 auto_vec<int> ipo;
3631 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3633 auto_vec<vec<unsigned> > perms;
3634 perms.safe_push (vNULL); /* zero is no permute */
3636 /* Produce initial permutations. */
3637 for (i = 0; i < leafs.length (); ++i)
3639 int idx = leafs[i];
3640 slp_tree node = vertices[idx].node;
3642 /* Handle externals and constants optimistically throughout the
3643 iteration. But treat existing vectors as fixed since we
3644 do not handle permuting them below. */
3645 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3646 && !SLP_TREE_VEC_DEFS (node).exists ())
3647 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3648 continue;
3650 /* Leafs do not change across iterations. Note leafs also double
3651 as entries to the reverse graph. */
3652 if (!slpg->vertices[idx].succ)
3654 vertices[idx].perm_in = 0;
3655 vertices[idx].perm_out = 0;
3658 /* Loads are the only thing generating permutes. */
3659 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3660 continue;
3662 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3663 node unpermuted, record this permute. */
3664 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3665 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3666 continue;
3667 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3668 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3669 bool any_permute = false;
3670 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3672 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3673 imin = MIN (imin, idx);
3674 imax = MAX (imax, idx);
3675 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3676 any_permute = true;
3678 /* If there's no permute no need to split one out. */
3679 if (!any_permute)
3680 continue;
3681 /* If the span doesn't match we'd disrupt VF computation, avoid
3682 that for now. */
3683 if (imax - imin + 1 != SLP_TREE_LANES (node))
3684 continue;
3686 /* For now only handle true permutes, like
3687 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3688 when permuting constants and invariants keeping the permute
3689 bijective. */
3690 auto_sbitmap load_index (SLP_TREE_LANES (node));
3691 bitmap_clear (load_index);
3692 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3693 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3694 unsigned j;
3695 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3696 if (!bitmap_bit_p (load_index, j))
3697 break;
3698 if (j != SLP_TREE_LANES (node))
3699 continue;
3701 vec<unsigned> perm = vNULL;
3702 perm.safe_grow (SLP_TREE_LANES (node), true);
3703 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3704 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3705 perms.safe_push (perm);
3706 vertices[idx].perm_in = perms.length () - 1;
3707 vertices[idx].perm_out = perms.length () - 1;
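/* For example a load permutation { 5, 4, 7, 6 } on a group of four
   lanes spans [4, 7], is bijective on that span and is recorded as
   the permutation { 1, 0, 3, 2 } (indices rebased by imin = 4).  */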
3710 /* In addition to the above we have to mark as to-be-materialized the
3711 outgoing permutes facing non-reduction graph entries that are not
3712 represented. */
3713 for (slp_instance instance : vinfo->slp_instances)
3714 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3716 /* Just setting perm_out isn't enough for the propagation to
3717 pick this up. */
3718 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3719 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3722 /* Propagate permutes along the graph and compute materialization points. */
3723 bool changed;
3724 bool do_materialization = false;
3725 unsigned iteration = 0;
3728 changed = false;
3729 ++iteration;
3731 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_NOTE, vect_location,
3733 "SLP optimize iteration %d\n", iteration);
3735 for (i = vertices.length (); i > 0 ; --i)
3737 int idx = ipo[i-1];
3738 slp_tree node = vertices[idx].node;
3740 /* Handle externals and constants optimistically throughout the
3741 iteration. */
3742 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3743 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3744 continue;
3746 /* We still eventually have failed backedge SLP nodes in the
3747 graph, those are only cancelled when analyzing operations.
3748 Simply treat them as transparent ops, propagating permutes
3749 through them. */
3750 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3752 /* We do not handle stores with a permutation, so all
3753 incoming permutes must have been materialized. */
3754 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3755 if (STMT_VINFO_DATA_REF (rep)
3756 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3758 /* ??? We're forcing materialization in place
3759 of the child here, we'd need special handling
3760 in materialization to leave perm_in -1 here. */
3761 vertices[idx].perm_in = 0;
3762 vertices[idx].perm_out = 0;
3764 /* We cannot move a permute across an operation that is
3765 not independent of lanes. Note this is an explicit
3766 negative list since that's much shorter than the respective
3767 positive one but it's critical to keep maintaining it. */
3768 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3769 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3771 case CFN_COMPLEX_ADD_ROT90:
3772 case CFN_COMPLEX_ADD_ROT270:
3773 case CFN_COMPLEX_MUL:
3774 case CFN_COMPLEX_MUL_CONJ:
3775 case CFN_VEC_ADDSUB:
3776 case CFN_VEC_FMADDSUB:
3777 case CFN_VEC_FMSUBADD:
3778 vertices[idx].perm_in = 0;
3779 vertices[idx].perm_out = 0;
3780 default:;
3784 if (!slpg->vertices[idx].succ)
3785 /* Pick up pre-computed leaf values. */
3787 else
3789 bool any_succ_perm_out_m1 = false;
3790 int perm_in = vertices[idx].perm_in;
3791 for (graph_edge *succ = slpg->vertices[idx].succ;
3792 succ; succ = succ->succ_next)
3794 int succ_idx = succ->dest;
3795 int succ_perm = vertices[succ_idx].perm_out;
3796 /* Handle unvisited (and constant) nodes optimistically. */
3797 /* ??? But for constants once we want to handle
3798 non-bijective permutes we have to verify the permute,
3799 when unifying lanes, will not unify different constants.
3800 For example see gcc.dg/vect/bb-slp-14.c for a case
3801 that would break. */
3802 if (succ_perm == -1)
3804 /* When we handled a non-leaf optimistically, note
3805 that so we can adjust its outgoing permute below. */
3806 slp_tree succ_node = vertices[succ_idx].node;
3807 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3808 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3809 any_succ_perm_out_m1 = true;
3810 continue;
3812 if (perm_in == -1)
3813 perm_in = succ_perm;
3814 else if (succ_perm == 0
3815 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3817 perm_in = 0;
3818 break;
3822 /* Adjust any incoming permutes we treated optimistically. */
3823 if (perm_in != -1 && any_succ_perm_out_m1)
3825 for (graph_edge *succ = slpg->vertices[idx].succ;
3826 succ; succ = succ->succ_next)
3828 slp_tree succ_node = vertices[succ->dest].node;
3829 if (vertices[succ->dest].perm_out == -1
3830 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3831 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3833 vertices[succ->dest].perm_out = perm_in;
3834 /* And ensure this propagates. */
3835 if (vertices[succ->dest].perm_in == -1)
3836 vertices[succ->dest].perm_in = perm_in;
3839 changed = true;
3842 if (!vect_slp_perms_eq (perms, perm_in,
3843 vertices[idx].perm_in))
3845 /* Make sure we eventually converge. */
3846 gcc_checking_assert (vertices[idx].perm_in == -1
3847 || perm_in == 0);
3848 vertices[idx].perm_in = perm_in;
3850 /* While we can handle VEC_PERM nodes as transparent
3851 pass-through they can be a cheap materialization
3852 point as well. In addition they can act as the source
3853 of an arbitrary permutation.
3854 The following ensures that former materialization
3855 points that now have zero incoming permutes no
3856 longer appear as such and that former "any" permutes
3857 get pass-through. We keep VEC_PERM nodes optimistic
3858 as "any" outgoing permute though. */
3859 if (vertices[idx].perm_out != 0
3860 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3861 vertices[idx].perm_out = perm_in;
3862 changed = true;
3866 /* Elide pruning at materialization points in the first
3867 iteration phase. */
3868 if (!do_materialization)
3869 continue;
3871 int perm = vertices[idx].perm_out;
3872 if (perm == 0 || perm == -1)
3873 continue;
3875 /* Decide on permute materialization. Look whether there's
3876 a use (pred) edge that is permuted differently than us.
3877 In that case mark ourselves so the permutation is applied. */
3878 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3879 if (all_preds_permuted)
3880 for (graph_edge *pred = slpg->vertices[idx].pred;
3881 pred; pred = pred->pred_next)
3883 int pred_perm = vertices[pred->src].perm_in;
3884 gcc_checking_assert (pred_perm != -1);
3885 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3887 all_preds_permuted = false;
3888 break;
3891 if (!all_preds_permuted)
3893 vertices[idx].perm_out = 0;
3894 changed = true;
3898 /* If the initial propagation converged, switch on materialization
3899 and re-propagate. */
3900 if (!changed && !do_materialization)
3902 do_materialization = true;
3903 changed = true;
3906 while (changed);
3907 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3909 /* Materialize. */
3910 for (i = 0; i < vertices.length (); ++i)
3912 int perm_in = vertices[i].perm_in;
3913 slp_tree node = vertices[i].node;
3915 /* First permute invariant/external original successors, we handle
3916 those optimistically during propagation and duplicate them if
3917 they are used with different permutations. */
3918 unsigned j;
3919 slp_tree child;
3920 if (perm_in > 0)
3921 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3923 if (!child
3924 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3925 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3926 continue;
3928 /* If the vector is uniform there's nothing to do. */
3929 if (vect_slp_tree_uniform_p (child))
3930 continue;
3932 /* We can end up sharing some externals via two_operator
3933 handling. Be prepared to unshare those. */
3934 if (child->refcnt != 1)
3936 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3937 SLP_TREE_CHILDREN (node)[j] = child
3938 = vect_create_new_slp_node
3939 (SLP_TREE_SCALAR_OPS (child).copy ());
3941 vect_slp_permute (perms[perm_in],
3942 SLP_TREE_SCALAR_OPS (child), true);
3945 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3947 /* Apply the common permutes to the input vectors. */
3948 if (perm_in > 0)
3950 /* If the node is already a permute node we can apply
3951 the permutation to the lane selection, effectively
3952 materializing it on the incoming vectors. */
3953 if (dump_enabled_p ())
3954 dump_printf_loc (MSG_NOTE, vect_location,
3955 "simplifying permute node %p\n",
3956 node);
3957 for (unsigned k = 0;
3958 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3959 SLP_TREE_LANE_PERMUTATION (node)[k].second
3960 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3962 /* Apply the anticipated output permute to the permute and
3963 stmt vectors. */
3964 int perm_out = vertices[i].perm_out;
3965 if (perm_out > 0)
3967 vect_slp_permute (perms[perm_out],
3968 SLP_TREE_SCALAR_STMTS (node), true);
3969 vect_slp_permute (perms[perm_out],
3970 SLP_TREE_LANE_PERMUTATION (node), true);
3973 else if (vertices[i].get_perm_materialized () != 0)
3975 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3976 /* For loads simply drop the permutation, the load permutation
3977 already performs the desired permutation. */
3979 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3980 gcc_unreachable ();
3981 else
3983 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_NOTE, vect_location,
3985 "inserting permute node in place of %p\n",
3986 node);
3988 /* Make a copy of NODE and in-place change it to a
3989 VEC_PERM node to permute the lanes of the copy. */
3990 slp_tree copy = new _slp_tree;
3991 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3992 SLP_TREE_CHILDREN (node) = vNULL;
3993 SLP_TREE_SCALAR_STMTS (copy)
3994 = SLP_TREE_SCALAR_STMTS (node).copy ();
3995 vect_slp_permute (perms[perm_in],
3996 SLP_TREE_SCALAR_STMTS (copy), true);
3997 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3998 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3999 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4000 SLP_TREE_LANE_PERMUTATION (copy)
4001 = SLP_TREE_LANE_PERMUTATION (node);
4002 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4003 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4004 copy->refcnt = 1;
4005 copy->max_nunits = node->max_nunits;
4006 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4007 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4008 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4010 /* Now turn NODE into a VEC_PERM. */
4011 SLP_TREE_CHILDREN (node).safe_push (copy);
4012 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4013 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4014 SLP_TREE_LANE_PERMUTATION (node)
4015 .quick_push (std::make_pair (0, perms[perm_in][j]));
4016 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
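/* For example with perms[perm_in] = { 1, 0 } the copy now holds the
   scalar stmts with lanes swapped and NODE selects lane 1 resp. lane 0
   of the copy, so NODE still produces the original lane order while
   the permute is materialized on the copy.  */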
4019 else if (perm_in > 0) /* perm_in == perm_out */
4021 /* Apply the reverse permutation to our stmts. */
4022 vect_slp_permute (perms[perm_in],
4023 SLP_TREE_SCALAR_STMTS (node), true);
4024 /* And to the lane/load permutation, which we can simply
4025 make regular by design. */
4026 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4028 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4029 /* ??? When we handle non-bijective permutes the idea
4030 is that we can force the load-permutation to be
4031 { min, min + 1, min + 2, ... max }. But then the
4032 scalar defs might no longer match the lane content
4033 which means wrong-code with live lane vectorization.
4034 So we possibly have to have NULL entries for those. */
4035 vect_slp_permute (perms[perm_in],
4036 SLP_TREE_LOAD_PERMUTATION (node), true);
4038 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4039 gcc_unreachable ();
4043 /* Elide any permutations at BB reduction roots. */
4044 if (is_a <bb_vec_info> (vinfo))
4046 for (slp_instance instance : vinfo->slp_instances)
4048 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4049 continue;
4050 slp_tree old = SLP_INSTANCE_TREE (instance);
4051 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4052 && SLP_TREE_CHILDREN (old).length () == 1)
4054 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4055 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4057 /* Preserve the special VEC_PERM we use to shield existing
4058 vector defs from the rest. But make it a no-op. */
4059 unsigned i = 0;
4060 for (std::pair<unsigned, unsigned> &p
4061 : SLP_TREE_LANE_PERMUTATION (old))
4062 p.second = i++;
4064 else
4066 SLP_INSTANCE_TREE (instance) = child;
4067 SLP_TREE_REF_COUNT (child)++;
4068 vect_free_slp_tree (old);
4071 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4072 && SLP_TREE_REF_COUNT (old) == 1
4073 && vertices[old->vertex].get_perm_materialized () != 0)
4075 /* ??? For loads the situation is more complex since
4076 we can't modify the permute in place in case the
4077 node is used multiple times. In fact for loads this
4078 should be somehow handled in the propagation engine. */
4079 /* Apply the reverse permutation to our stmts. */
4080 int perm = vertices[old->vertex].get_perm_materialized ();
4081 vect_slp_permute (perms[perm],
4082 SLP_TREE_SCALAR_STMTS (old), true);
4083 vect_slp_permute (perms[perm],
4084 SLP_TREE_LOAD_PERMUTATION (old), true);
4089 /* Free the perms vector used for propagation. */
4090 while (!perms.is_empty ())
4091 perms.pop ().release ();
4092 free_graph (slpg);
4095 /* Now elide load permutations that are not necessary. */
4096 for (i = 0; i < leafs.length (); ++i)
4098 node = vertices[leafs[i]].node;
4099 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4100 continue;
4102 /* In basic block vectorization we allow any subchain of an interleaving
4103 chain.
4104 FORNOW: not in loop SLP because of realignment complications. */
4105 if (is_a <bb_vec_info> (vinfo))
4107 bool subchain_p = true;
4108 stmt_vec_info next_load_info = NULL;
4109 stmt_vec_info load_info;
4110 unsigned j;
4111 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4113 if (j != 0
4114 && (next_load_info != load_info
4115 || DR_GROUP_GAP (load_info) != 1))
4117 subchain_p = false;
4118 break;
4120 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4122 if (subchain_p)
4124 SLP_TREE_LOAD_PERMUTATION (node).release ();
4125 continue;
4128 else
4130 stmt_vec_info load_info;
4131 bool this_load_permuted = false;
4132 unsigned j;
4133 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4134 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4136 this_load_permuted = true;
4137 break;
4139 stmt_vec_info first_stmt_info
4140 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4141 if (!this_load_permuted
4142 /* The load requires permutation when unrolling exposes
4143 a gap either because the group is larger than the SLP
4144 group-size or because there is a gap between the groups. */
4145 && (known_eq (LOOP_VINFO_VECT_FACTOR
4146 (as_a <loop_vec_info> (vinfo)), 1U)
4147 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4148 && DR_GROUP_GAP (first_stmt_info) == 0)))
4150 SLP_TREE_LOAD_PERMUTATION (node).release ();
4151 continue;
4157 /* Gather loads reachable from the individual SLP graph entries. */
4159 void
4160 vect_gather_slp_loads (vec_info *vinfo)
4162 unsigned i;
4163 slp_instance instance;
4164 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4166 hash_set<slp_tree> visited;
4167 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4168 SLP_INSTANCE_TREE (instance), visited);
4173 /* For each possible SLP instance decide whether to SLP it and calculate overall
4174 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4175 least one instance. */
4177 bool
4178 vect_make_slp_decision (loop_vec_info loop_vinfo)
4180 unsigned int i;
4181 poly_uint64 unrolling_factor = 1;
4182 const vec<slp_instance> &slp_instances
4183 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4184 slp_instance instance;
4185 int decided_to_slp = 0;
4187 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4189 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4191 /* FORNOW: SLP if you can. */
4192 /* All unroll factors have the form:
4194 GET_MODE_SIZE (vinfo->vector_mode) * X
4196 for some rational X, so they must have a common multiple. */
4197 unrolling_factor
4198 = force_common_multiple (unrolling_factor,
4199 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4201 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4202 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4203 loop-based vectorization. Such stmts will be marked as HYBRID. */
4204 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4205 decided_to_slp++;
4208 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4210 if (decided_to_slp && dump_enabled_p ())
4212 dump_printf_loc (MSG_NOTE, vect_location,
4213 "Decided to SLP %d instances. Unrolling factor ",
4214 decided_to_slp);
4215 dump_dec (MSG_NOTE, unrolling_factor);
4216 dump_printf (MSG_NOTE, "\n");
4219 return (decided_to_slp > 0);
4222 /* Private data for vect_detect_hybrid_slp. */
4223 struct vdhs_data
4225 loop_vec_info loop_vinfo;
4226 vec<stmt_vec_info> *worklist;
4229 /* Walker for walk_gimple_op. */
4231 static tree
4232 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4234 walk_stmt_info *wi = (walk_stmt_info *)data;
4235 vdhs_data *dat = (vdhs_data *)wi->info;
4237 if (wi->is_lhs)
4238 return NULL_TREE;
4240 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4241 if (!def_stmt_info)
4242 return NULL_TREE;
4243 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4244 if (PURE_SLP_STMT (def_stmt_info))
4246 if (dump_enabled_p ())
4247 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4248 def_stmt_info->stmt);
4249 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4250 dat->worklist->safe_push (def_stmt_info);
4253 return NULL_TREE;
4256 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4257 if so; otherwise push it to WORKLIST. */
4259 static void
4260 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4261 vec<stmt_vec_info> &worklist,
4262 stmt_vec_info stmt_info)
4264 if (dump_enabled_p ())
4265 dump_printf_loc (MSG_NOTE, vect_location,
4266 "Processing hybrid candidate : %G", stmt_info->stmt);
4267 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4268 imm_use_iterator iter2;
4269 ssa_op_iter iter1;
4270 use_operand_p use_p;
4271 def_operand_p def_p;
4272 bool any_def = false;
4273 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4275 any_def = true;
4276 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4278 if (is_gimple_debug (USE_STMT (use_p)))
4279 continue;
4280 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4281 /* An out-of-loop use means this is a loop_vect sink. */
4282 if (!use_info)
4284 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_NOTE, vect_location,
4286 "Found loop_vect sink: %G", stmt_info->stmt);
4287 worklist.safe_push (stmt_info);
4288 return;
4290 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_NOTE, vect_location,
4294 "Found loop_vect use: %G", use_info->stmt);
4295 worklist.safe_push (stmt_info);
4296 return;
4300 /* No def means this is a loop_vect sink. */
4301 if (!any_def)
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_NOTE, vect_location,
4305 "Found loop_vect sink: %G", stmt_info->stmt);
4306 worklist.safe_push (stmt_info);
4307 return;
4309 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_NOTE, vect_location,
4311 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4312 STMT_SLP_TYPE (stmt_info) = pure_slp;
4315 /* Find stmts that must be both vectorized and SLPed. */
4317 void
4318 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4320 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4322 /* All stmts participating in SLP are marked pure_slp, all other
4323 stmts are loop_vect.
4324 First collect all loop_vect stmts into a worklist.
4325 SLP patterns cause not all original scalar stmts to appear in
4326 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4327 Rectify this here and do a backward walk over the IL only considering
4328 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4329 mark them as pure_slp. */
4330 auto_vec<stmt_vec_info> worklist;
4331 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4333 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4334 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4335 gsi_next (&gsi))
4337 gphi *phi = gsi.phi ();
4338 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4339 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4340 maybe_push_to_hybrid_worklist (loop_vinfo,
4341 worklist, stmt_info);
4343 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4344 gsi_prev (&gsi))
4346 gimple *stmt = gsi_stmt (gsi);
4347 if (is_gimple_debug (stmt))
4348 continue;
4349 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4350 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4352 for (gimple_stmt_iterator gsi2
4353 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4354 !gsi_end_p (gsi2); gsi_next (&gsi2))
4356 stmt_vec_info patt_info
4357 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4358 if (!STMT_SLP_TYPE (patt_info)
4359 && STMT_VINFO_RELEVANT (patt_info))
4360 maybe_push_to_hybrid_worklist (loop_vinfo,
4361 worklist, patt_info);
4363 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4365 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4366 maybe_push_to_hybrid_worklist (loop_vinfo,
4367 worklist, stmt_info);
4371 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4372 mark any SLP vectorized stmt as hybrid.
4373 ??? We're visiting def stmts N times (once for each non-SLP and
4374 once for each hybrid-SLP use). */
4375 walk_stmt_info wi;
4376 vdhs_data dat;
4377 dat.worklist = &worklist;
4378 dat.loop_vinfo = loop_vinfo;
4379 memset (&wi, 0, sizeof (wi));
4380 wi.info = (void *)&dat;
4381 while (!worklist.is_empty ())
4383 stmt_vec_info stmt_info = worklist.pop ();
4384 /* Since SSA operands are not set up for pattern stmts we need
4385 to use walk_gimple_op. */
4386 wi.is_lhs = 0;
4387 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4392 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4394 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4395 : vec_info (vec_info::bb, shared),
4396 bbs (_bbs),
4397 roots (vNULL)
4399 for (unsigned i = 0; i < bbs.length (); ++i)
4401 if (i != 0)
4402 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4403 gsi_next (&si))
4405 gphi *phi = si.phi ();
4406 gimple_set_uid (phi, 0);
4407 add_stmt (phi);
4409 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4410 !gsi_end_p (gsi); gsi_next (&gsi))
4412 gimple *stmt = gsi_stmt (gsi);
4413 gimple_set_uid (stmt, 0);
4414 if (is_gimple_debug (stmt))
4415 continue;
4416 add_stmt (stmt);
4422 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4423 stmts in the basic block. */
4425 _bb_vec_info::~_bb_vec_info ()
4427 /* Reset region marker. */
4428 for (unsigned i = 0; i < bbs.length (); ++i)
4430 if (i != 0)
4431 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4432 gsi_next (&si))
4434 gphi *phi = si.phi ();
4435 gimple_set_uid (phi, -1);
4437 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4438 !gsi_end_p (gsi); gsi_next (&gsi))
4440 gimple *stmt = gsi_stmt (gsi);
4441 gimple_set_uid (stmt, -1);
4445 for (unsigned i = 0; i < roots.length (); ++i)
4447 roots[i].stmts.release ();
4448 roots[i].roots.release ();
4450 roots.release ();
4453 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4454 given that child nodes have already been processed, and that
4455 their def types currently match their SLP node's def type. */
4457 static bool
4458 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4459 slp_instance node_instance,
4460 stmt_vector_for_cost *cost_vec)
4462 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4464 /* Calculate the number of vector statements to be created for the
4465 scalar stmts in this node. For SLP reductions it is equal to the
4466 number of vector statements in the children (which has already been
4467 calculated by the recursive call). Otherwise it is the number of
4468 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4469 VF divided by the number of elements in a vector. */
4470 if (!STMT_VINFO_DATA_REF (stmt_info)
4471 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4473 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4474 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4476 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4477 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4478 break;
4481 else
4483 poly_uint64 vf;
4484 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4485 vf = loop_vinfo->vectorization_factor;
4486 else
4487 vf = 1;
4488 unsigned int group_size = SLP_TREE_LANES (node);
4489 tree vectype = SLP_TREE_VECTYPE (node);
4490 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4491 = vect_get_num_vectors (vf * group_size, vectype);
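/* For example with a vectorization factor of 4, two lanes and a
   four-element vector type this yields (4 * 2) / 4 = 2 vector stmts.  */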
4494 /* Handle purely internal nodes. */
4495 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4496 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4498 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4500 bool dummy;
4501 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4502 node, node_instance, cost_vec);
4505 /* Try to build NODE from scalars, returning true on success.
4506 NODE_INSTANCE is the SLP instance that contains NODE. */
4508 static bool
4509 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4510 slp_instance node_instance)
4512 stmt_vec_info stmt_info;
4513 unsigned int i;
4515 if (!is_a <bb_vec_info> (vinfo)
4516 || node == SLP_INSTANCE_TREE (node_instance)
4517 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4518 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4519 return false;
4521 if (dump_enabled_p ())
4522 dump_printf_loc (MSG_NOTE, vect_location,
4523 "Building vector operands of %p from scalars instead\n", node);
4525 /* Don't remove and free the child nodes here, since they could be
4526 referenced by other structures. The analysis and scheduling phases
4527 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4528 unsigned int group_size = SLP_TREE_LANES (node);
4529 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4530 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4531 SLP_TREE_LOAD_PERMUTATION (node).release ();
4532 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4534 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4535 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4537 return true;
4540 /* Return true if all elements of the slice are the same. */
4541 bool
4542 vect_scalar_ops_slice::all_same_p () const
4544 for (unsigned int i = 1; i < length; ++i)
4545 if (!operand_equal_p (op (0), op (i)))
4546 return false;
4547 return true;
4550 hashval_t
4551 vect_scalar_ops_slice_hash::hash (const value_type &s)
4553 hashval_t hash = 0;
4554 for (unsigned i = 0; i < s.length; ++i)
4555 hash = iterative_hash_expr (s.op (i), hash);
4556 return hash;
4559 bool
4560 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4561 const compare_type &s2)
4563 if (s1.length != s2.length)
4564 return false;
4565 for (unsigned i = 0; i < s1.length; ++i)
4566 if (!operand_equal_p (s1.op (i), s2.op (i)))
4567 return false;
4568 return true;
4571 /* Compute the prologue cost for invariant or constant operands represented
4572 by NODE. */
4574 static void
4575 vect_prologue_cost_for_slp (slp_tree node,
4576 stmt_vector_for_cost *cost_vec)
4578 /* There's a special case of an existing vector, that costs nothing. */
4579 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4580 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4581 return;
4582 /* Without looking at the actual initializer a vector of
4583 constants can be implemented as load from the constant pool.
4584 When all elements are the same we can use a splat. */
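/* Invariant operands that are neither constant nor all the same need a
   full vec_construct from the individual scalars.  */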
4585 tree vectype = SLP_TREE_VECTYPE (node);
4586 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4587 unsigned HOST_WIDE_INT const_nunits;
4588 unsigned nelt_limit;
4589 auto ops = &SLP_TREE_SCALAR_OPS (node);
4590 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4591 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4592 && ! multiple_p (const_nunits, group_size))
4594 nelt_limit = const_nunits;
4595 hash_set<vect_scalar_ops_slice_hash> vector_ops;
4596 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4597 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4598 starts.quick_push (i * const_nunits);
4600 else
4602 /* If either the vector has variable length or the vectors
4603 are composed of repeated whole groups we only need to
4604 cost construction once. All vectors will be the same. */
4605 nelt_limit = group_size;
4606 starts.quick_push (0);
4608 /* ??? We're just tracking whether vectors in a single node are the same.
4609 Ideally we'd do something more global. */
4610 for (unsigned int start : starts)
4612 vect_cost_for_stmt kind;
4613 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4614 kind = vector_load;
4615 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4616 kind = scalar_to_vec;
4617 else
4618 kind = vec_construct;
4619 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
4623 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4624 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4626 Return true if the operations are supported. */
4628 static bool
4629 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4630 slp_instance node_instance,
4631 hash_set<slp_tree> &visited_set,
4632 vec<slp_tree> &visited_vec,
4633 stmt_vector_for_cost *cost_vec)
4635 int i, j;
4636 slp_tree child;
4638 /* Assume we can code-generate all invariants. */
4639 if (!node
4640 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4641 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4642 return true;
4644 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4646 if (dump_enabled_p ())
4647 dump_printf_loc (MSG_NOTE, vect_location,
4648 "Failed cyclic SLP reference in %p\n", node);
4649 return false;
4651 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4653 /* If we already analyzed the exact same set of scalar stmts we're done.
4654 We share the generated vector stmts for those. */
4655 if (visited_set.add (node))
4656 return true;
4657 visited_vec.safe_push (node);
4659 bool res = true;
4660 unsigned visited_rec_start = visited_vec.length ();
4661 unsigned cost_vec_rec_start = cost_vec->length ();
4662 bool seen_non_constant_child = false;
4663 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4665 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4666 visited_set, visited_vec,
4667 cost_vec);
4668 if (!res)
4669 break;
4670 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4671 seen_non_constant_child = true;
4673 /* We're having difficulties scheduling nodes with just constant
4674 operands and no scalar stmts since we then cannot compute a stmt
4675 insertion place. */
4676 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4678 if (dump_enabled_p ())
4679 dump_printf_loc (MSG_NOTE, vect_location,
4680 "Cannot vectorize all-constant op node %p\n", node);
4681 res = false;
4684 if (res)
4685 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4686 cost_vec);
4687 /* If analysis failed we have to pop all recursive visited nodes
4688 plus ourselves. */
4689 if (!res)
4691 while (visited_vec.length () >= visited_rec_start)
4692 visited_set.remove (visited_vec.pop ());
4693 cost_vec->truncate (cost_vec_rec_start);
4696 /* When the node can be vectorized cost invariant nodes it references.
4697 This is not done in DFS order to allow the referring node
4698 vectorizable_* calls to nail down the invariant nodes vector type
4699 and possibly unshare it if it needs a different vector type than
4700 other referrers. */
4701 if (res)
4702 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4703 if (child
4704 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4705 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4706 /* Perform usual caching, note code-generation still
4707 code-gens these nodes multiple times but we expect
4708 to CSE them later. */
4709 && !visited_set.add (child))
4711 visited_vec.safe_push (child);
4712 /* ??? After auditing more code paths make a "default"
4713 and push the vector type from NODE to all children
4714 if it is not already set. */
4715 /* Compute the number of vectors to be generated. */
4716 tree vector_type = SLP_TREE_VECTYPE (child);
4717 if (!vector_type)
4719 /* For shifts with a scalar argument we don't need
4720 to cost or code-generate anything.
4721 ??? Represent this more explicitly. */
4722 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4723 == shift_vec_info_type)
4724 && j == 1);
4725 continue;
4727 unsigned group_size = SLP_TREE_LANES (child);
4728 poly_uint64 vf = 1;
4729 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4730 vf = loop_vinfo->vectorization_factor;
4731 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4732 = vect_get_num_vectors (vf * group_size, vector_type);
4733 /* And cost them. */
4734 vect_prologue_cost_for_slp (child, cost_vec);
4737 /* If this node or any of its children can't be vectorized, try pruning
4738 the tree here rather than felling the whole thing. */
4739 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4741 /* We'll need to revisit this for invariant costing and number
4742 of vectorized stmt setting. */
4743 res = true;
4746 return res;
4749 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4750 region and that can be vectorized using vectorizable_live_operation
4751 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
4752 scalar code computing it to be retained. */
4754 static void
4755 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4756 slp_instance instance,
4757 stmt_vector_for_cost *cost_vec,
4758 hash_set<stmt_vec_info> &svisited,
4759 hash_set<slp_tree> &visited)
4761 if (visited.add (node))
4762 return;
4764 unsigned i;
4765 stmt_vec_info stmt_info;
4766 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4767 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4769 if (svisited.contains (stmt_info))
4770 continue;
4771 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4772 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4773 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4774 /* Only the pattern root stmt computes the original scalar value. */
4775 continue;
4776 bool mark_visited = true;
4777 gimple *orig_stmt = orig_stmt_info->stmt;
4778 ssa_op_iter op_iter;
4779 def_operand_p def_p;
4780 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4782 imm_use_iterator use_iter;
4783 gimple *use_stmt;
4784 stmt_vec_info use_stmt_info;
4785 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4786 if (!is_gimple_debug (use_stmt))
4788 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4789 if (!use_stmt_info
4790 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4792 STMT_VINFO_LIVE_P (stmt_info) = true;
4793 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4794 NULL, node, instance, i,
4795 false, cost_vec))
4796 /* ??? So we know we can vectorize the live stmt
4797 from one SLP node. If we cannot do so from all
4798 or none consistently we'd have to record which
4799 SLP node (and lane) we want to use for the live
4800 operation. So make sure we can code-generate
4801 from all nodes. */
4802 mark_visited = false;
4803 else
4804 STMT_VINFO_LIVE_P (stmt_info) = false;
4805 break;
4808 /* We have to verify whether we can insert the lane extract
4809 before all uses. The following is a conservative approximation.
4810 We cannot put this into vectorizable_live_operation because
4811 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4812 doesn't work.
4813 Note that while the fact that we emit code for loads at the
4814 first load should make this a non-problem, leafs we construct
4815 from scalars are vectorized after the last scalar def.
4816 ??? If we'd actually compute the insert location during
4817 analysis we could use sth less conservative than the last
4818 scalar stmt in the node for the dominance check. */
4819 /* ??? What remains is "live" uses in vector CTORs in the same
4820 SLP graph which is where those uses can end up code-generated
4821 right after their definition instead of close to their original
4822 use. But that would restrict us to code-generate lane-extracts
4823 from the latest stmt in a node. So we compensate for this
4824 during code-generation, simply not replacing uses for those
4825 hopefully rare cases. */
4826 if (STMT_VINFO_LIVE_P (stmt_info))
4827 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4828 if (!is_gimple_debug (use_stmt)
4829 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4830 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4831 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4833 if (dump_enabled_p ())
4834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4835 "Cannot determine insertion place for "
4836 "lane extract\n");
4837 STMT_VINFO_LIVE_P (stmt_info) = false;
4838 mark_visited = true;
4841 if (mark_visited)
4842 svisited.add (stmt_info);
4845 slp_tree child;
4846 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4847 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4848 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4849 cost_vec, svisited, visited);
4852 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4854 static bool
4855 vectorizable_bb_reduc_epilogue (slp_instance instance,
4856 stmt_vector_for_cost *cost_vec)
4858 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4859 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4860 if (reduc_code == MINUS_EXPR)
4861 reduc_code = PLUS_EXPR;
4862 internal_fn reduc_fn;
4863 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4864 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4865 || reduc_fn == IFN_LAST
4866 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4867 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4868 TREE_TYPE (vectype)))
4869 return false;
4871 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4872 cost log2 vector operations plus shuffles and one extraction. */
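/* For a reduction over eight lanes, for example, that is three vector
   stmts, three permutes and a single vec_to_scalar extraction.  */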
4873 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4874 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4875 vectype, 0, vect_body);
4876 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4877 vectype, 0, vect_body);
4878 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4879 vectype, 0, vect_body);
4880 return true;
4883 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4884 and recurse to children. */
4886 static void
4887 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4888 hash_set<slp_tree> &visited)
4890 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4891 || visited.add (node))
4892 return;
4894 stmt_vec_info stmt;
4895 unsigned i;
4896 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4897 roots.remove (vect_orig_stmt (stmt));
4899 slp_tree child;
4900 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4901 if (child)
4902 vect_slp_prune_covered_roots (child, roots, visited);
4905 /* Analyze statements in SLP instances of VINFO. Return true if the
4906 operations are supported. */
4908 bool
4909 vect_slp_analyze_operations (vec_info *vinfo)
4911 slp_instance instance;
4912 int i;
4914 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4916 hash_set<slp_tree> visited;
4917 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4919 auto_vec<slp_tree> visited_vec;
4920 stmt_vector_for_cost cost_vec;
4921 cost_vec.create (2);
4922 if (is_a <bb_vec_info> (vinfo))
4923 vect_location = instance->location ();
4924 if (!vect_slp_analyze_node_operations (vinfo,
4925 SLP_INSTANCE_TREE (instance),
4926 instance, visited, visited_vec,
4927 &cost_vec)
4928 /* CTOR instances require vectorized defs for the SLP tree root. */
4929 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4930 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4931 != vect_internal_def
4932 /* Make sure we vectorized with the expected type. */
4933 || !useless_type_conversion_p
4934 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
4935 (instance->root_stmts[0]->stmt))),
4936 TREE_TYPE (SLP_TREE_VECTYPE
4937 (SLP_INSTANCE_TREE (instance))))))
4938 /* Check we can vectorize the reduction. */
4939 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4940 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4942 slp_tree node = SLP_INSTANCE_TREE (instance);
4943 stmt_vec_info stmt_info;
4944 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4945 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4946 else
4947 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_NOTE, vect_location,
4950 "removing SLP instance operations starting from: %G",
4951 stmt_info->stmt);
4952 vect_free_slp_instance (instance);
4953 vinfo->slp_instances.ordered_remove (i);
4954 cost_vec.release ();
4955 while (!visited_vec.is_empty ())
4956 visited.remove (visited_vec.pop ());
4958 else
4960 i++;
4961 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
4963 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
4964 cost_vec.release ();
4966 else
4967 /* For BB vectorization remember the SLP graph entry
4968 cost for later. */
4969 instance->cost_vec = cost_vec;
4973 /* Now look for SLP instances with a root that are covered by other
4974 instances and remove them. */
4975 hash_set<stmt_vec_info> roots;
4976 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4977 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4978 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
4979 if (!roots.is_empty ())
4981 visited.empty ();
4982 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4983 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
4984 visited);
4985 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4986 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4987 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
4989 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4990 if (dump_enabled_p ())
4991 dump_printf_loc (MSG_NOTE, vect_location,
4992 "removing SLP instance operations starting "
4993 "from: %G", root->stmt);
4994 vect_free_slp_instance (instance);
4995 vinfo->slp_instances.ordered_remove (i);
4997 else
4998 ++i;
5001 /* Compute vectorizable live stmts. */
5002 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5004 hash_set<stmt_vec_info> svisited;
5005 hash_set<slp_tree> visited;
5006 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5008 vect_location = instance->location ();
5009 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5010 instance, &instance->cost_vec, svisited,
5011 visited);
5015 return !vinfo->slp_instances.is_empty ();
5018 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
5019 closing (path-compressing) any chain encountered. */
5021 static slp_instance
5022 get_ultimate_leader (slp_instance instance,
5023 hash_map<slp_instance, slp_instance> &instance_leader)
5025 auto_vec<slp_instance *, 8> chain;
5026 slp_instance *tem;
5027 while (*(tem = instance_leader.get (instance)) != instance)
5029 chain.safe_push (tem);
5030 instance = *tem;
5032 while (!chain.is_empty ())
5033 *chain.pop () = instance;
5034 return instance;
5037 /* Worker of vect_bb_partition_graph, recurse on NODE. */
5039 static void
5040 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5041 slp_instance instance, slp_tree node,
5042 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5043 hash_map<slp_instance, slp_instance> &instance_leader,
5044 hash_set<slp_tree> &visited)
5046 stmt_vec_info stmt_info;
5047 unsigned i;
5049 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5051 bool existed_p;
5052 slp_instance &stmt_instance
5053 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5054 if (!existed_p)
5056 else if (stmt_instance != instance)
5058 /* If we're running into a previously marked stmt make us the
5059 leader of the current ultimate leader. This keeps the
5060 leader chain acyclic and works even when the current instance
5061 connects two previously independent graph parts. */
5062 slp_instance stmt_leader
5063 = get_ultimate_leader (stmt_instance, instance_leader);
5064 if (stmt_leader != instance)
5065 instance_leader.put (stmt_leader, instance);
5067 stmt_instance = instance;
5070 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5071 return;
5073 slp_tree child;
5074 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5075 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5076 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5077 instance_leader, visited);
5080 /* Partition the SLP graph into pieces that can be costed independently. */
5082 static void
5083 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5085 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5087 /* First walk the SLP graph assigning each involved scalar stmt a
5088 corresponding SLP graph entry and upon visiting a previously
5089 marked stmt, make the stmts leader the current SLP graph entry. */
5090 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5091 hash_map<slp_instance, slp_instance> instance_leader;
5092 hash_set<slp_tree> visited;
5093 slp_instance instance;
5094 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5096 instance_leader.put (instance, instance);
5097 vect_bb_partition_graph_r (bb_vinfo,
5098 instance, SLP_INSTANCE_TREE (instance),
5099 stmt_to_instance, instance_leader,
5100 visited);
5103 /* Then collect entries to each independent subgraph. */
5104 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5106 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5107 leader->subgraph_entries.safe_push (instance);
5108 if (dump_enabled_p ()
5109 && leader != instance)
5110 dump_printf_loc (MSG_NOTE, vect_location,
5111 "instance %p is leader of %p\n",
5112 leader, instance);
5116 /* Compute the set of scalar stmts participating in internal and external
5117 nodes. */
5119 static void
5120 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5121 hash_set<slp_tree> &visited,
5122 hash_set<stmt_vec_info> &vstmts,
5123 hash_set<stmt_vec_info> &estmts)
5125 int i;
5126 stmt_vec_info stmt_info;
5127 slp_tree child;
5129 if (visited.add (node))
5130 return;
5132 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5135 vstmts.add (stmt_info);
5137 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5138 if (child)
5139 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5140 vstmts, estmts);
5142 else
5143 for (tree def : SLP_TREE_SCALAR_OPS (node))
5145 stmt_vec_info def_stmt = vinfo->lookup_def (def);
5146 if (def_stmt)
5147 estmts.add (def_stmt);
5152 /* Compute the scalar cost of the SLP node NODE and its children
5153 and return it. Do not account defs that are marked in LIFE and
5154 update LIFE according to uses of NODE. */
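/* A set LIFE[i] means the scalar stmt for lane i has a use outside the
   vectorized stmts and is kept live, so neither it nor the defs it
   requires in the SLP children are counted against the scalar cost.  */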
5156 static void
5157 vect_bb_slp_scalar_cost (vec_info *vinfo,
5158 slp_tree node, vec<bool, va_heap> *life,
5159 stmt_vector_for_cost *cost_vec,
5160 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5161 hash_set<slp_tree> &visited)
5163 unsigned i;
5164 stmt_vec_info stmt_info;
5165 slp_tree child;
5167 if (visited.add (node))
5168 return;
5170 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5172 ssa_op_iter op_iter;
5173 def_operand_p def_p;
5175 if ((*life)[i])
5176 continue;
5178 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5179 gimple *orig_stmt = orig_stmt_info->stmt;
5181 /* If there is a non-vectorized use of the defs then the scalar
5182 stmt is kept live in which case we do not account it or any
5183 required defs in the SLP children in the scalar cost. This
5184 way we make the vectorization more costly when compared to
5185 the scalar cost. */
5186 if (!STMT_VINFO_LIVE_P (stmt_info))
5188 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
5190 imm_use_iterator use_iter;
5191 gimple *use_stmt;
5192 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
5193 if (!is_gimple_debug (use_stmt))
5195 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
5196 if (!use_stmt_info
5197 || !vectorized_scalar_stmts.contains (use_stmt_info))
5199 (*life)[i] = true;
5200 break;
5204 if ((*life)[i])
5205 continue;
5208 /* Count scalar stmts only once. */
5209 if (gimple_visited_p (orig_stmt))
5210 continue;
5211 gimple_set_visited (orig_stmt, true);
5213 vect_cost_for_stmt kind;
5214 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5216 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5217 kind = scalar_load;
5218 else
5219 kind = scalar_store;
5221 else if (vect_nop_conversion_p (orig_stmt_info))
5222 continue;
5223 /* For single-argument PHIs assume coalescing which means zero cost
5224 for the scalar and the vector PHIs. This avoids artificially
5225 favoring the vector path (but may pessimize it in some cases). */
5226 else if (is_a <gphi *> (orig_stmt_info->stmt)
5227 && gimple_phi_num_args
5228 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5229 continue;
5230 else
5231 kind = scalar_stmt;
5232 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5233 SLP_TREE_VECTYPE (node), 0, vect_body);
5236 auto_vec<bool, 20> subtree_life;
5237 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5239 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5241 /* Do not directly pass LIFE to the recursive call, copy it to
5242 confine changes in the callee to the current child/subtree. */
5243 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5245 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5246 for (unsigned j = 0;
5247 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5249 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5250 if (perm.first == i)
5251 subtree_life[perm.second] = (*life)[j];
5254 else
5256 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5257 subtree_life.safe_splice (*life);
5259 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5260 vectorized_scalar_stmts, visited);
5261 subtree_life.truncate (0);
5266 /* Comparator for the loop-index sorted cost vectors. */
5268 static int
5269 li_cost_vec_cmp (const void *a_, const void *b_)
5271 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5272 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5273 if (a->first < b->first)
5274 return -1;
5275 else if (a->first == b->first)
5276 return 0;
5277 return 1;
5280 /* Check if vectorization of the basic block is profitable for the
5281 subgraph denoted by SLP_INSTANCES. */
5283 static bool
5284 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5285 vec<slp_instance> slp_instances,
5286 loop_p orig_loop)
5288 slp_instance instance;
5289 int i;
5290 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5291 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5293 if (dump_enabled_p ())
5295 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5296 hash_set<slp_tree> visited;
5297 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5298 vect_print_slp_graph (MSG_NOTE, vect_location,
5299 SLP_INSTANCE_TREE (instance), visited);
5302 /* Compute the set of scalar stmts we know will go away 'locally' when
5303 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5304 not accurate for nodes promoted extern late or for scalar stmts that
5305 are used both in extern defs and in vectorized defs. */
5306 hash_set<stmt_vec_info> vectorized_scalar_stmts;
5307 hash_set<stmt_vec_info> scalar_stmts_in_externs;
5308 hash_set<slp_tree> visited;
5309 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5311 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5312 SLP_INSTANCE_TREE (instance),
5313 visited,
5314 vectorized_scalar_stmts,
5315 scalar_stmts_in_externs);
5316 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5317 vectorized_scalar_stmts.add (rstmt);
5319 /* Scalar stmts used as defs in external nodes need to be preserved, so
5320 remove them from vectorized_scalar_stmts. */
5321 for (stmt_vec_info stmt : scalar_stmts_in_externs)
5322 vectorized_scalar_stmts.remove (stmt);
5324 /* Calculate scalar cost and sum the cost for the vector stmts
5325 previously collected. */
5326 stmt_vector_for_cost scalar_costs = vNULL;
5327 stmt_vector_for_cost vector_costs = vNULL;
5328 visited.empty ();
5329 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5331 auto_vec<bool, 20> life;
5332 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5333 true);
5334 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5335 record_stmt_cost (&scalar_costs,
5336 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5337 scalar_stmt,
5338 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5339 vect_bb_slp_scalar_cost (bb_vinfo,
5340 SLP_INSTANCE_TREE (instance),
5341 &life, &scalar_costs, vectorized_scalar_stmts,
5342 visited);
5343 vector_costs.safe_splice (instance->cost_vec);
5344 instance->cost_vec.release ();
5347 if (dump_enabled_p ())
5348 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5350 /* When costing non-loop vectorization we need to consider each covered
5351 loop independently and make sure vectorization is profitable. For
5352 now we assume a loop may not be entered or may execute an arbitrary
5353 number of iterations (??? static information can provide more
5354 precise info here) which means we can simply cost each containing
5355 loops stmts separately. */
5357 /* First produce cost vectors sorted by loop index. */
5358 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5359 li_scalar_costs (scalar_costs.length ());
5360 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5361 li_vector_costs (vector_costs.length ());
5362 stmt_info_for_cost *cost;
5363 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5365 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5366 li_scalar_costs.quick_push (std::make_pair (l, cost));
5368 /* Use an arbitrary loop from the scalar costs as the fallback in case the
5369 first vector_costs entry does not have a stmt_info associated with it. */
5370 unsigned l = li_scalar_costs[0].first;
5371 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5373 /* We inherit the loop from the previous COST; invariants, externals and
5374 extracts immediately follow the cost for the related stmt. */
5375 if (cost->stmt_info)
5376 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5377 li_vector_costs.quick_push (std::make_pair (l, cost));
5379 li_scalar_costs.qsort (li_cost_vec_cmp);
5380 li_vector_costs.qsort (li_cost_vec_cmp);
5382 /* Now cost the portions individually. */
5383 unsigned vi = 0;
5384 unsigned si = 0;
5385 bool profitable = true;
5386 while (si < li_scalar_costs.length ()
5387 && vi < li_vector_costs.length ())
5389 unsigned sl = li_scalar_costs[si].first;
5390 unsigned vl = li_vector_costs[vi].first;
5391 if (sl != vl)
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_NOTE, vect_location,
5395 "Scalar %d and vector %d loop part do not "
5396 "match up, skipping scalar part\n", sl, vl);
5397 /* Skip the scalar part, assuming zero cost on the vector side. */
5400 si++;
5402 while (si < li_scalar_costs.length ()
5403 && li_scalar_costs[si].first == sl);
5404 continue;
5407 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5410 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5411 si++;
5413 while (si < li_scalar_costs.length ()
5414 && li_scalar_costs[si].first == sl);
5415 unsigned dummy;
5416 finish_cost (scalar_target_cost_data, nullptr,
5417 &dummy, &scalar_cost, &dummy);
5419 /* Complete the target-specific vector cost calculation. */
5420 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5423 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5424 vi++;
5426 while (vi < li_vector_costs.length ()
5427 && li_vector_costs[vi].first == vl);
5428 finish_cost (vect_target_cost_data, scalar_target_cost_data,
5429 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5430 delete scalar_target_cost_data;
5431 delete vect_target_cost_data;
5433 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5435 if (dump_enabled_p ())
5437 dump_printf_loc (MSG_NOTE, vect_location,
5438 "Cost model analysis for part in loop %d:\n", sl);
5439 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5440 vec_inside_cost + vec_outside_cost);
5441 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5444 /* Vectorization is profitable if its cost is not more than the cost of
5445 the scalar version. Note that we err on the vector side for equal cost
5446 because the cost estimate is otherwise quite pessimistic (constant uses
5447 are free on the scalar side but cost a load on the vector side, for
5448 example). */
5449 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5451 profitable = false;
5452 break;
5455 if (profitable && vi < li_vector_costs.length ())
5457 if (dump_enabled_p ())
5458 dump_printf_loc (MSG_NOTE, vect_location,
5459 "Excess vector cost for part in loop %d:\n",
5460 li_vector_costs[vi].first);
5461 profitable = false;
5464 /* Unset visited flag. This is delayed when the subgraph is profitable
5465 and we process the loop for remaining unvectorized if-converted code. */
5466 if (!orig_loop || !profitable)
5467 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5468 gimple_set_visited (cost->stmt_info->stmt, false);
5470 scalar_costs.release ();
5471 vector_costs.release ();
5473 return profitable;
5476 /* qsort comparator for lane defs. */
5478 static int
5479 vld_cmp (const void *a_, const void *b_)
5481 auto *a = (const std::pair<unsigned, tree> *)a_;
5482 auto *b = (const std::pair<unsigned, tree> *)b_;
5483 return a->first - b->first;
5486 /* Return true if USE_STMT is a vector lane insert into VEC and set
5487 *THIS_LANE to the lane number that is set. */
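/* For illustration (hypothetical GIMPLE, SSA names made up):

     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>;

   with 32-bit vector elements inserts s_3 at bit position 64, so this
   function would return true and set *THIS_LANE to 2.  */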
5489 static bool
5490 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5492 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5493 if (!use_ass
5494 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5495 || (vec
5496 ? gimple_assign_rhs1 (use_ass) != vec
5497 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5498 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5499 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5500 || !constant_multiple_p
5501 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5502 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5503 this_lane))
5504 return false;
5505 return true;
5508 /* Find any vectorizable constructors and add them to the grouped_stores
5509 array. */
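/* As a hedged illustration (hypothetical GIMPLE): a statement like

     v_5 = {_1, _2, _3, _4};

   where the CONSTRUCTOR provides one scalar SSA def from the region per
   vector lane is recorded as a grouped store below, while BIT_INSERT_EXPR
   chains and associative reduction chains are recorded as SLP roots.  */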
5511 static void
5512 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5514 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5515 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5516 !gsi_end_p (gsi); gsi_next (&gsi))
5518 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5519 if (!assign)
5520 continue;
5522 tree rhs = gimple_assign_rhs1 (assign);
5523 enum tree_code code = gimple_assign_rhs_code (assign);
5524 use_operand_p use_p;
5525 gimple *use_stmt;
5526 if (code == CONSTRUCTOR)
5528 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5529 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5530 CONSTRUCTOR_NELTS (rhs))
5531 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5532 || uniform_vector_p (rhs))
5533 continue;
5535 unsigned j;
5536 tree val;
5537 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5538 if (TREE_CODE (val) != SSA_NAME
5539 || !bb_vinfo->lookup_def (val))
5540 break;
5541 if (j != CONSTRUCTOR_NELTS (rhs))
5542 continue;
5544 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5545 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5547 else if (code == BIT_INSERT_EXPR
5548 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5549 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5550 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5551 && integer_zerop (gimple_assign_rhs3 (assign))
5552 && useless_type_conversion_p
5553 (TREE_TYPE (TREE_TYPE (rhs)),
5554 TREE_TYPE (gimple_assign_rhs2 (assign)))
5555 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5557 /* We start matching on an insert to lane zero, but since the
5558 inserts need not be ordered we have to search both
5559 the def and the use chains. */
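/* A hypothetical chain for a four-lane vector with 32-bit elements
   (SSA names made up):
     v_1 = BIT_INSERT_EXPR <v_0, a_1, 0>;    <-- ASSIGN, lane 0
     v_2 = BIT_INSERT_EXPR <v_1, a_2, 32>;   <-- found via the use chain
     v_3 = BIT_INSERT_EXPR <v_2, a_3, 64>;
     v_4 = BIT_INSERT_EXPR <v_3, a_4, 96>;
   Inserts feeding v_0 would instead be found by the def-chain walk
   further below.  */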
5560 tree vectype = TREE_TYPE (rhs);
5561 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5562 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5563 auto_sbitmap lanes (nlanes);
5564 bitmap_clear (lanes);
5565 bitmap_set_bit (lanes, 0);
5566 tree def = gimple_assign_lhs (assign);
5567 lane_defs.quick_push
5568 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5569 unsigned lanes_found = 1;
5570 /* Start with the use chain; the last stmt will be the root. */
5571 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5572 vec<stmt_vec_info> roots = vNULL;
5573 roots.safe_push (last);
5576 use_operand_p use_p;
5577 gimple *use_stmt;
5578 if (!single_imm_use (def, &use_p, &use_stmt))
5579 break;
5580 unsigned this_lane;
5581 if (!bb_vinfo->lookup_stmt (use_stmt)
5582 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5583 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5584 break;
5585 if (bitmap_bit_p (lanes, this_lane))
5586 break;
5587 lanes_found++;
5588 bitmap_set_bit (lanes, this_lane);
5589 gassign *use_ass = as_a <gassign *> (use_stmt);
5590 lane_defs.quick_push (std::make_pair
5591 (this_lane, gimple_assign_rhs2 (use_ass)));
5592 last = bb_vinfo->lookup_stmt (use_ass);
5593 roots.safe_push (last);
5594 def = gimple_assign_lhs (use_ass);
5596 while (lanes_found < nlanes);
5597 if (roots.length () > 1)
5598 std::swap (roots[0], roots[roots.length () - 1]);
5599 if (lanes_found < nlanes)
5601 /* Now search the def chain. */
5602 def = gimple_assign_rhs1 (assign);
5605 if (TREE_CODE (def) != SSA_NAME
5606 || !has_single_use (def))
5607 break;
5608 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5609 unsigned this_lane;
5610 if (!bb_vinfo->lookup_stmt (def_stmt)
5611 || !vect_slp_is_lane_insert (def_stmt,
5612 NULL_TREE, &this_lane)
5613 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5614 break;
5615 if (bitmap_bit_p (lanes, this_lane))
5616 break;
5617 lanes_found++;
5618 bitmap_set_bit (lanes, this_lane);
5619 lane_defs.quick_push (std::make_pair
5620 (this_lane,
5621 gimple_assign_rhs2 (def_stmt)));
5622 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5623 def = gimple_assign_rhs1 (def_stmt);
5625 while (lanes_found < nlanes);
5627 if (lanes_found == nlanes)
5629 /* Sort lane_defs by lane index and register the root. */
5630 lane_defs.qsort (vld_cmp);
5631 vec<stmt_vec_info> stmts;
5632 stmts.create (nlanes);
5633 for (unsigned i = 0; i < nlanes; ++i)
5634 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5635 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5636 stmts, roots));
5638 else
5639 roots.release ();
5641 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5642 && (associative_tree_code (code) || code == MINUS_EXPR)
5643 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5644 checks pessimize a two-element reduction. PR54400.
5645 ??? In-order reduction could be handled if we only
5646 traverse one operand chain in vect_slp_linearize_chain. */
5647 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5648 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5649 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5650 /* Ops with constants at the tail can be stripped here. */
5651 && TREE_CODE (rhs) == SSA_NAME
5652 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5653 /* Should be the chain end. */
5654 && (!single_imm_use (gimple_assign_lhs (assign),
5655 &use_p, &use_stmt)
5656 || !is_gimple_assign (use_stmt)
5657 || (gimple_assign_rhs_code (use_stmt) != code
5658 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5659 || (gimple_assign_rhs_code (use_stmt)
5660 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5662 /* We start the match at the end of a possible association
5663 chain. */
5664 auto_vec<chain_op_t> chain;
5665 auto_vec<std::pair<tree_code, gimple *> > worklist;
5666 auto_vec<gimple *> chain_stmts;
5667 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5668 if (code == MINUS_EXPR)
5669 code = PLUS_EXPR;
5670 internal_fn reduc_fn;
5671 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5672 || reduc_fn == IFN_LAST)
5673 continue;
5674 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5675 /* ??? */
5676 code_stmt, alt_code_stmt, &chain_stmts);
5677 if (chain.length () > 1)
5679 /* Sort the chain according to def_type and operation. */
5680 chain.sort (dt_sort_cmp, bb_vinfo);
5681 /* ??? Now we'd want to strip externals and constants
5682 but record those to be handled in the epilogue. */
5683 /* ??? For now do not allow mixing ops or externs/constants. */
5684 bool invalid = false;
5685 for (unsigned i = 0; i < chain.length (); ++i)
5686 if (chain[i].dt != vect_internal_def
5687 || chain[i].code != code)
5688 invalid = true;
5689 if (!invalid)
5691 vec<stmt_vec_info> stmts;
5692 stmts.create (chain.length ());
5693 for (unsigned i = 0; i < chain.length (); ++i)
5694 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5695 vec<stmt_vec_info> roots;
5696 roots.create (chain_stmts.length ());
5697 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5698 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5699 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5700 stmts, roots));
5707 /* Walk the grouped store chains and replace entries with their
5708 pattern variant if any. */
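/* For instance (a sketch, names made up): if the group leader S1 was
   replaced by a pattern stmt P1 during pattern recognition, P1 inherits
   S1's DR_GROUP_SIZE, DR_GROUP_GAP and DR_GROUP_NEXT_ELEMENT and becomes
   the new leader; later elements of the chain are substituted the same
   way.  */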
5710 static void
5711 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5713 stmt_vec_info first_element;
5714 unsigned i;
5716 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5718 /* We also have CTORs in this array. */
5719 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5720 continue;
5721 if (STMT_VINFO_IN_PATTERN_P (first_element))
5723 stmt_vec_info orig = first_element;
5724 first_element = STMT_VINFO_RELATED_STMT (first_element);
5725 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5726 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5727 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5728 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5729 vinfo->grouped_stores[i] = first_element;
5731 stmt_vec_info prev = first_element;
5732 while (DR_GROUP_NEXT_ELEMENT (prev))
5734 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5735 if (STMT_VINFO_IN_PATTERN_P (elt))
5737 stmt_vec_info orig = elt;
5738 elt = STMT_VINFO_RELATED_STMT (elt);
5739 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5740 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5741 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5743 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5744 prev = elt;
5749 /* Check if the region described by BB_VINFO can be vectorized, returning
5750 true if so. When returning false, set FATAL to true if the same failure
5751 would prevent vectorization at other vector sizes, false if it is still
5752 worth trying other sizes. N_STMTS is the number of statements in the
5753 region. */
5755 static bool
5756 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5757 vec<int> *dataref_groups)
5759 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5761 slp_instance instance;
5762 int i;
5763 poly_uint64 min_vf = 2;
5765 /* The first group of checks is independent of the vector size. */
5766 fatal = true;
5768 /* Analyze the data references. */
5770 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "not vectorized: unhandled data-ref in basic "
5775 "block.\n");
5776 return false;
5779 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5781 if (dump_enabled_p ())
5782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5783 "not vectorized: unhandled data access in "
5784 "basic block.\n");
5785 return false;
5788 vect_slp_check_for_constructors (bb_vinfo);
5790 /* If there are no grouped stores and no constructors in the region
5791 there is no need to continue with pattern recog as vect_analyze_slp
5792 will fail anyway. */
5793 if (bb_vinfo->grouped_stores.is_empty ()
5794 && bb_vinfo->roots.is_empty ())
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 "not vectorized: no grouped stores in "
5799 "basic block.\n");
5800 return false;
5803 /* The rest of the analysis below depends on the vector size in some way,
so failures from here on are not fatal for other vector sizes. */
5804 fatal = false;
5806 vect_pattern_recog (bb_vinfo);
5808 /* Update store groups from pattern processing. */
5809 vect_fixup_store_groups_with_patterns (bb_vinfo);
5811 /* Check the SLP opportunities in the basic block, analyze and build SLP
5812 trees. */
5813 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5815 if (dump_enabled_p ())
5817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5818 "Failed to SLP the basic block.\n");
5819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5820 "not vectorized: failed to find SLP opportunities "
5821 "in basic block.\n");
5823 return false;
5826 /* Optimize permutations. */
5827 vect_optimize_slp (bb_vinfo);
5829 /* Gather the loads reachable from the SLP graph entries. */
5830 vect_gather_slp_loads (bb_vinfo);
5832 vect_record_base_alignments (bb_vinfo);
5834 /* Analyze and verify the alignment of data references and the
5835 dependence in the SLP instances. */
5836 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5838 vect_location = instance->location ();
5839 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5840 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5842 slp_tree node = SLP_INSTANCE_TREE (instance);
5843 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5844 if (dump_enabled_p ())
5845 dump_printf_loc (MSG_NOTE, vect_location,
5846 "removing SLP instance operations starting from: %G",
5847 stmt_info->stmt);
5848 vect_free_slp_instance (instance);
5849 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5850 continue;
5853 /* Mark all the statements that we want to vectorize as pure SLP and
5854 relevant. */
5855 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5856 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5857 unsigned j;
5858 stmt_vec_info root;
5859 /* Likewise consider instance root stmts as vectorized. */
5860 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5861 STMT_SLP_TYPE (root) = pure_slp;
5863 i++;
5865 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5866 return false;
5868 if (!vect_slp_analyze_operations (bb_vinfo))
5870 if (dump_enabled_p ())
5871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5872 "not vectorized: bad operation in basic block.\n");
5873 return false;
5876 vect_bb_partition_graph (bb_vinfo);
5878 return true;
5881 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5882 basic blocks in BBS, returning true on success.
5883 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5885 static bool
5886 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5887 vec<int> *dataref_groups, unsigned int n_stmts,
5888 loop_p orig_loop)
5890 bb_vec_info bb_vinfo;
5891 auto_vector_modes vector_modes;
5893 /* Autodetect first vector size we try. */
5894 machine_mode next_vector_mode = VOIDmode;
5895 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5896 unsigned int mode_i = 0;
5898 vec_info_shared shared;
5900 machine_mode autodetected_vector_mode = VOIDmode;
5901 while (1)
5903 bool vectorized = false;
5904 bool fatal = false;
5905 bb_vinfo = new _bb_vec_info (bbs, &shared);
5907 bool first_time_p = shared.datarefs.is_empty ();
5908 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5909 if (first_time_p)
5910 bb_vinfo->shared->save_datarefs ();
5911 else
5912 bb_vinfo->shared->check_datarefs ();
5913 bb_vinfo->vector_mode = next_vector_mode;
5915 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5917 if (dump_enabled_p ())
5919 dump_printf_loc (MSG_NOTE, vect_location,
5920 "***** Analysis succeeded with vector mode"
5921 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5922 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5925 bb_vinfo->shared->check_datarefs ();
5927 auto_vec<slp_instance> profitable_subgraphs;
5928 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
5930 if (instance->subgraph_entries.is_empty ())
5931 continue;
5933 vect_location = instance->location ();
5934 if (!unlimited_cost_model (NULL)
5935 && !vect_bb_vectorization_profitable_p
5936 (bb_vinfo, instance->subgraph_entries, orig_loop))
5938 if (dump_enabled_p ())
5939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5940 "not vectorized: vectorization is not "
5941 "profitable.\n");
5942 continue;
5945 if (!dbg_cnt (vect_slp))
5946 continue;
5948 profitable_subgraphs.safe_push (instance);
5951 /* When we're vectorizing an if-converted loop body make sure
5952 we vectorized all if-converted code. */
5953 if (!profitable_subgraphs.is_empty ()
5954 && orig_loop)
5956 gcc_assert (bb_vinfo->bbs.length () == 1);
5957 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
5958 !gsi_end_p (gsi); gsi_next (&gsi))
5960 /* The costing above left us with DCEable vectorized scalar
5961 stmts having the visited flag set on profitable
5962 subgraphs. Do the delayed clearing of the flag here. */
5963 if (gimple_visited_p (gsi_stmt (gsi)))
5965 gimple_set_visited (gsi_stmt (gsi), false);
5966 continue;
5968 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
5969 continue;
5971 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
5972 if (gimple_assign_rhs_code (ass) == COND_EXPR)
5974 if (!profitable_subgraphs.is_empty ()
5975 && dump_enabled_p ())
5976 dump_printf_loc (MSG_NOTE, vect_location,
5977 "not profitable because of "
5978 "unprofitable if-converted scalar "
5979 "code\n");
5980 profitable_subgraphs.truncate (0);
5985 /* Finally schedule the profitable subgraphs. */
5986 for (slp_instance instance : profitable_subgraphs)
5988 if (!vectorized && dump_enabled_p ())
5989 dump_printf_loc (MSG_NOTE, vect_location,
5990 "Basic block will be vectorized "
5991 "using SLP\n");
5992 vectorized = true;
5994 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5996 unsigned HOST_WIDE_INT bytes;
5997 if (dump_enabled_p ())
5999 if (GET_MODE_SIZE
6000 (bb_vinfo->vector_mode).is_constant (&bytes))
6001 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6002 "basic block part vectorized using %wu "
6003 "byte vectors\n", bytes);
6004 else
6005 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6006 "basic block part vectorized using "
6007 "variable length vectors\n");
6011 else
6013 if (dump_enabled_p ())
6014 dump_printf_loc (MSG_NOTE, vect_location,
6015 "***** Analysis failed with vector mode %s\n",
6016 GET_MODE_NAME (bb_vinfo->vector_mode));
6019 if (mode_i == 0)
6020 autodetected_vector_mode = bb_vinfo->vector_mode;
6022 if (!fatal)
6023 while (mode_i < vector_modes.length ()
6024 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6026 if (dump_enabled_p ())
6027 dump_printf_loc (MSG_NOTE, vect_location,
6028 "***** The result for vector mode %s would"
6029 " be the same\n",
6030 GET_MODE_NAME (vector_modes[mode_i]));
6031 mode_i += 1;
6034 delete bb_vinfo;
6036 if (mode_i < vector_modes.length ()
6037 && VECTOR_MODE_P (autodetected_vector_mode)
6038 && (related_vector_mode (vector_modes[mode_i],
6039 GET_MODE_INNER (autodetected_vector_mode))
6040 == autodetected_vector_mode)
6041 && (related_vector_mode (autodetected_vector_mode,
6042 GET_MODE_INNER (vector_modes[mode_i]))
6043 == vector_modes[mode_i]))
6045 if (dump_enabled_p ())
6046 dump_printf_loc (MSG_NOTE, vect_location,
6047 "***** Skipping vector mode %s, which would"
6048 " repeat the analysis for %s\n",
6049 GET_MODE_NAME (vector_modes[mode_i]),
6050 GET_MODE_NAME (autodetected_vector_mode));
6051 mode_i += 1;
6054 if (vectorized
6055 || mode_i == vector_modes.length ()
6056 || autodetected_vector_mode == VOIDmode
6057 /* If vect_slp_analyze_bb_1 signaled that analysis for all
6058 vector sizes will fail do not bother iterating. */
6059 || fatal)
6060 return vectorized;
6062 /* Try the next biggest vector size. */
6063 next_vector_mode = vector_modes[mode_i++];
6064 if (dump_enabled_p ())
6065 dump_printf_loc (MSG_NOTE, vect_location,
6066 "***** Re-trying analysis with vector mode %s\n",
6067 GET_MODE_NAME (next_vector_mode));
6072 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
6073 true if anything in the basic blocks was vectorized. */
6075 static bool
6076 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6078 vec<data_reference_p> datarefs = vNULL;
6079 auto_vec<int> dataref_groups;
6080 int insns = 0;
6081 int current_group = 0;
6083 for (unsigned i = 0; i < bbs.length (); i++)
6085 basic_block bb = bbs[i];
6086 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6087 gsi_next (&gsi))
6089 gimple *stmt = gsi_stmt (gsi);
6090 if (is_gimple_debug (stmt))
6091 continue;
6093 insns++;
6095 if (gimple_location (stmt) != UNKNOWN_LOCATION)
6096 vect_location = stmt;
6098 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6099 &dataref_groups, current_group))
6100 ++current_group;
6102 /* New BBs always start a new DR group. */
6103 ++current_group;
6106 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6109 /* Special entry for the BB vectorizer. Analyze and transform a single
6110 if-converted BB, with ORIG_LOOP's body being the non-if-converted
6111 representation. Returns true if anything in the basic-block was
6112 vectorized. */
6114 bool
6115 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6117 auto_vec<basic_block> bbs;
6118 bbs.safe_push (bb);
6119 return vect_slp_bbs (bbs, orig_loop);
6122 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
6123 of FUN, returning true if anything was vectorized. */
6125 bool
6126 vect_slp_function (function *fun)
6128 bool r = false;
6129 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6130 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6132 /* For the moment split the function into pieces to avoid making
6133 the iteration on the vector mode moot. Split at points we know
6134 we do not handle well, which are CFG merges (SLP discovery doesn't
6135 handle non-loop-header PHIs) and loop exits. Since pattern
6136 recog requires reverse iteration to visit uses before defs,
6137 simply chop the RPO into pieces. */
6138 auto_vec<basic_block> bbs;
6139 for (unsigned i = 0; i < n; i++)
6141 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6142 bool split = false;
6144 /* Split when a BB is not dominated by the first block. */
6145 if (!bbs.is_empty ()
6146 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "splitting region at dominance boundary bb%d\n",
6151 bb->index);
6152 split = true;
6154 /* Split when the loop determined by the first block
6155 is exited. This is because we eventually insert
6156 invariants at the beginning of the region. */
6157 else if (!bbs.is_empty ()
6158 && bbs[0]->loop_father != bb->loop_father
6159 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6161 if (dump_enabled_p ())
6162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163 "splitting region at loop %d exit at bb%d\n",
6164 bbs[0]->loop_father->num, bb->index);
6165 split = true;
6168 if (split && !bbs.is_empty ())
6170 r |= vect_slp_bbs (bbs, NULL);
6171 bbs.truncate (0);
6172 bbs.quick_push (bb);
6174 else
6175 bbs.safe_push (bb);
6177 /* When a stmt ends this block and defines a value, inserting
6178 after it (for a vector containing its definition) would require
6179 inserting on edges. Avoid this for now. */
6180 if (gimple *last = last_stmt (bb))
6181 if (gimple_get_lhs (last)
6182 && is_ctrl_altering_stmt (last))
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6186 "splitting region at control altering "
6187 "definition %G", last);
6188 r |= vect_slp_bbs (bbs, NULL);
6189 bbs.truncate (0);
6193 if (!bbs.is_empty ())
6194 r |= vect_slp_bbs (bbs, NULL);
6196 free (rpo);
6198 return r;
6201 /* Build a variable-length vector in which the elements in ELTS are repeated
6202 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6203 RESULTS and add any new instructions to SEQ.
6205 The approach we use is:
6207 (1) Find a vector mode VM with integer elements of mode IM.
6209 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6210 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6211 from small vectors to IM.
6213 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6215 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6216 correct byte contents.
6218 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6220 We try to find the largest IM for which this sequence works, in order
6221 to cut down on the number of interleaves. */
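/* A hedged illustration of the idea (values made up): to fill results
   with copies of { a, b, c, d } for a variable-length vector of 16-bit
   elements, we may pick IM as a 32-bit integer mode, view-convert
   { a, b } and { c, d } to 32-bit values AB and CD, broadcast each to
   get { AB, AB, ... } and { CD, CD, ... }, interleave those two vectors
   to obtain the byte pattern of { a, b, c, d, a, b, c, d, ... } and
   finally view-convert the result to the requested vector type.  */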
6223 void
6224 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6225 const vec<tree> &elts, unsigned int nresults,
6226 vec<tree> &results)
6228 unsigned int nelts = elts.length ();
6229 tree element_type = TREE_TYPE (vector_type);
6231 /* (1) Find a vector mode VM with integer elements of mode IM. */
6232 unsigned int nvectors = 1;
6233 tree new_vector_type;
6234 tree permutes[2];
6235 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6236 &nvectors, &new_vector_type,
6237 permutes))
6238 gcc_unreachable ();
6240 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6241 unsigned int partial_nelts = nelts / nvectors;
6242 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6244 tree_vector_builder partial_elts;
6245 auto_vec<tree, 32> pieces (nvectors * 2);
6246 pieces.quick_grow_cleared (nvectors * 2);
6247 for (unsigned int i = 0; i < nvectors; ++i)
6249 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6250 ELTS' has mode IM. */
6251 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6252 for (unsigned int j = 0; j < partial_nelts; ++j)
6253 partial_elts.quick_push (elts[i * partial_nelts + j]);
6254 tree t = gimple_build_vector (seq, &partial_elts);
6255 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6256 TREE_TYPE (new_vector_type), t);
6258 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6259 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6262 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6263 correct byte contents.
6265 Conceptually, we need to repeat the following operation log2(nvectors)
6266 times, where hi_start = nvectors / 2:
6268 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6269 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6271 However, if each input repeats every N elements and the VF is
6272 a multiple of N * 2, the HI result is the same as the LO result.
6273 This will be true for the first N1 iterations of the outer loop,
6274 followed by N2 iterations for which both the LO and HI results
6275 are needed. I.e.:
6277 N1 + N2 = log2(nvectors)
6279 Each "N1 iteration" doubles the number of redundant vectors and the
6280 effect of the process as a whole is to have a sequence of nvectors/2**N1
6281 vectors that repeats 2**N1 times. Rather than generate these redundant
6282 vectors, we halve the number of vectors for each N1 iteration. */
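/* For illustration (a sketch, not a claim about any particular target):
   with NVECTORS == 4 and a VM whose element count is a multiple of 4,
   both interleave rounds can drop their redundant HI outputs, so
   NEW_NVECTORS shrinks 4 -> 2 -> 1 and step (5) below reuses the single
   remaining vector for every requested result.  */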
6283 unsigned int in_start = 0;
6284 unsigned int out_start = nvectors;
6285 unsigned int new_nvectors = nvectors;
6286 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6288 unsigned int hi_start = new_nvectors / 2;
6289 unsigned int out_i = 0;
6290 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6292 if ((in_i & 1) != 0
6293 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6294 2 * in_repeat))
6295 continue;
6297 tree output = make_ssa_name (new_vector_type);
6298 tree input1 = pieces[in_start + (in_i / 2)];
6299 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6300 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6301 input1, input2,
6302 permutes[in_i & 1]);
6303 gimple_seq_add_stmt (seq, stmt);
6304 pieces[out_start + out_i] = output;
6305 out_i += 1;
6307 std::swap (in_start, out_start);
6308 new_nvectors = out_i;
6311 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6312 results.reserve (nresults);
6313 for (unsigned int i = 0; i < nresults; ++i)
6314 if (i < new_nvectors)
6315 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6316 pieces[in_start + i]));
6317 else
6318 results.quick_push (results[i - new_nvectors]);
6322 /* For constant and loop invariant defs in OP_NODE this function creates
6323 vector defs that will be used in the vectorized stmts and stores them
6324 to SLP_TREE_VEC_DEFS of OP_NODE. */
6326 static void
6327 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6329 unsigned HOST_WIDE_INT nunits;
6330 tree vec_cst;
6331 unsigned j, number_of_places_left_in_vector;
6332 tree vector_type;
6333 tree vop;
6334 int group_size = op_node->ops.length ();
6335 unsigned int vec_num, i;
6336 unsigned number_of_copies = 1;
6337 bool constant_p;
6338 gimple_seq ctor_seq = NULL;
6339 auto_vec<tree, 16> permute_results;
6341 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6342 vector_type = SLP_TREE_VECTYPE (op_node);
6344 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6345 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6346 auto_vec<tree> voprnds (number_of_vectors);
6348 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6349 created vectors. It is greater than 1 if unrolling is performed.
6351 For example, we have two scalar operands, s1 and s2 (e.g., group of
6352 strided accesses of size two), while NUNITS is four (i.e., four scalars
6353 of this type can be packed in a vector). The output vector will contain
6354 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6355 will be 2).
6357 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6358 containing the operands.
6360 For example, NUNITS is four as before, and the group size is 8
6361 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6362 {s5, s6, s7, s8}. */
6364 /* When using duplicate_and_interleave, we just need one element for
6365 each scalar statement. */
6366 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6367 nunits = group_size;
6369 number_of_copies = nunits * number_of_vectors / group_size;
6371 number_of_places_left_in_vector = nunits;
6372 constant_p = true;
6373 tree_vector_builder elts (vector_type, nunits, 1);
6374 elts.quick_grow (nunits);
6375 stmt_vec_info insert_after = NULL;
6376 for (j = 0; j < number_of_copies; j++)
6378 tree op;
6379 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6381 /* Create 'vect_ = {op0,op1,...,opn}'. */
6382 number_of_places_left_in_vector--;
6383 tree orig_op = op;
6384 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6386 if (CONSTANT_CLASS_P (op))
6388 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6390 /* Can't use VIEW_CONVERT_EXPR for booleans because
6391 of possibly different sizes of scalar value and
6392 vector element. */
6393 if (integer_zerop (op))
6394 op = build_int_cst (TREE_TYPE (vector_type), 0);
6395 else if (integer_onep (op))
6396 op = build_all_ones_cst (TREE_TYPE (vector_type));
6397 else
6398 gcc_unreachable ();
6400 else
6401 op = fold_unary (VIEW_CONVERT_EXPR,
6402 TREE_TYPE (vector_type), op);
6403 gcc_assert (op && CONSTANT_CLASS_P (op));
6405 else
6407 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6408 gimple *init_stmt;
6409 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6411 tree true_val
6412 = build_all_ones_cst (TREE_TYPE (vector_type));
6413 tree false_val
6414 = build_zero_cst (TREE_TYPE (vector_type));
6415 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6416 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6417 op, true_val,
6418 false_val);
6420 else
6422 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6423 op);
6424 init_stmt
6425 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6426 op);
6428 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6429 op = new_temp;
6432 elts[number_of_places_left_in_vector] = op;
6433 if (!CONSTANT_CLASS_P (op))
6434 constant_p = false;
6435 /* For BB vectorization we have to compute an insert location
6436 when a def is inside the analyzed region since we cannot
6437 simply insert at the BB start in this case. */
6438 stmt_vec_info opdef;
6439 if (TREE_CODE (orig_op) == SSA_NAME
6440 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6441 && is_a <bb_vec_info> (vinfo)
6442 && (opdef = vinfo->lookup_def (orig_op)))
6444 if (!insert_after)
6445 insert_after = opdef;
6446 else
6447 insert_after = get_later_stmt (insert_after, opdef);
6450 if (number_of_places_left_in_vector == 0)
6452 if (constant_p
6453 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6454 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6455 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6456 else
6458 if (permute_results.is_empty ())
6459 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6460 elts, number_of_vectors,
6461 permute_results);
6462 vec_cst = permute_results[number_of_vectors - j - 1];
6464 if (!gimple_seq_empty_p (ctor_seq))
6466 if (insert_after)
6468 gimple_stmt_iterator gsi;
6469 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6471 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6472 gsi_insert_seq_before (&gsi, ctor_seq,
6473 GSI_CONTINUE_LINKING);
6475 else if (!stmt_ends_bb_p (insert_after->stmt))
6477 gsi = gsi_for_stmt (insert_after->stmt);
6478 gsi_insert_seq_after (&gsi, ctor_seq,
6479 GSI_CONTINUE_LINKING);
6481 else
6483 /* When we want to insert after a def whose
6484 defining stmt throws, insert on the fallthru
6485 edge. */
6486 edge e = find_fallthru_edge
6487 (gimple_bb (insert_after->stmt)->succs);
6488 basic_block new_bb
6489 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6490 gcc_assert (!new_bb);
6493 else
6494 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6495 ctor_seq = NULL;
6497 voprnds.quick_push (vec_cst);
6498 insert_after = NULL;
6499 number_of_places_left_in_vector = nunits;
6500 constant_p = true;
6501 elts.new_vector (vector_type, nunits, 1);
6502 elts.quick_grow (nunits);
6507 /* Since the vectors are created in the reverse order, we should invert
6508 them. */
6509 vec_num = voprnds.length ();
6510 for (j = vec_num; j != 0; j--)
6512 vop = voprnds[j - 1];
6513 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6516 /* In case the VF is greater than the unrolling factor needed for the SLP
6517 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6518 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6519 to replicate the vectors. */
6520 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6521 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6522 i++)
6523 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6526 /* Get the Ith vectorized definition from SLP_NODE. */
6528 tree
6529 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6531 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6532 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6533 else
6534 return SLP_TREE_VEC_DEFS (slp_node)[i];
6537 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6539 void
6540 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6542 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6543 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6545 unsigned j;
6546 gimple *vec_def_stmt;
6547 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6548 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6550 else
6551 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6554 /* Get N vectorized definitions for SLP_NODE. */
6556 void
6557 vect_get_slp_defs (vec_info *,
6558 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6560 if (n == -1U)
6561 n = SLP_TREE_CHILDREN (slp_node).length ();
6563 for (unsigned i = 0; i < n; ++i)
6565 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6566 vec<tree> vec_defs = vNULL;
6567 vect_get_slp_defs (child, &vec_defs);
6568 vec_oprnds->quick_push (vec_defs);
6572 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6573 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6574 permute statements for the SLP node NODE. Store the number of vector
6575 permute instructions in *N_PERMS and the number of vector load
6576 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6577 that were not needed. */
6579 bool
6580 vect_transform_slp_perm_load (vec_info *vinfo,
6581 slp_tree node, const vec<tree> &dr_chain,
6582 gimple_stmt_iterator *gsi, poly_uint64 vf,
6583 bool analyze_only, unsigned *n_perms,
6584 unsigned int *n_loads, bool dce_chain)
6586 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6587 int vec_index = 0;
6588 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6589 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6590 unsigned int mask_element;
6591 machine_mode mode;
6593 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6594 return false;
6596 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6598 mode = TYPE_MODE (vectype);
6599 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6601 /* Initialize the vect stmts of NODE to properly insert the generated
6602 stmts later. */
6603 if (! analyze_only)
6604 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6605 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6606 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6608 /* Generate permutation masks for every NODE. Number of masks for each NODE
6609 is equal to GROUP_SIZE.
6610 E.g., we have a group of three nodes with three loads from the same
6611 location in each node, and the vector size is 4. I.e., we have an
6612 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6613 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6614 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6617 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6618 The last mask is illegal since we assume two operands for permute
6619 operation, and the mask element values can't be outside that range.
6620 Hence, the last mask must be converted into {2,5,5,5}.
6621 For the first two permutations we need the first and the second input
6622 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6623 we need the second and the third vectors: {b1,c1,a2,b2} and
6624 {c2,a3,b3,c3}. */
6626 int vect_stmts_counter = 0;
6627 unsigned int index = 0;
6628 int first_vec_index = -1;
6629 int second_vec_index = -1;
6630 bool noop_p = true;
6631 *n_perms = 0;
6633 vec_perm_builder mask;
6634 unsigned int nelts_to_build;
6635 unsigned int nvectors_per_build;
6636 unsigned int in_nlanes;
6637 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6638 && multiple_p (nunits, group_size));
6639 if (repeating_p)
6641 /* A single vector contains a whole number of copies of the node, so:
6642 (a) all permutes can use the same mask; and
6643 (b) the permutes only need a single vector input. */
6644 mask.new_vector (nunits, group_size, 3);
6645 nelts_to_build = mask.encoded_nelts ();
6646 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6647 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6649 else
6651 /* We need to construct a separate mask for each vector statement. */
6652 unsigned HOST_WIDE_INT const_nunits, const_vf;
6653 if (!nunits.is_constant (&const_nunits)
6654 || !vf.is_constant (&const_vf))
6655 return false;
6656 mask.new_vector (const_nunits, const_nunits, 1);
6657 nelts_to_build = const_vf * group_size;
6658 nvectors_per_build = 1;
6659 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6661 auto_sbitmap used_in_lanes (in_nlanes);
6662 bitmap_clear (used_in_lanes);
6663 auto_bitmap used_defs;
6665 unsigned int count = mask.encoded_nelts ();
6666 mask.quick_grow (count);
6667 vec_perm_indices indices;
6669 for (unsigned int j = 0; j < nelts_to_build; j++)
6671 unsigned int iter_num = j / group_size;
6672 unsigned int stmt_num = j % group_size;
6673 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6674 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6675 bitmap_set_bit (used_in_lanes, i);
6676 if (repeating_p)
6678 first_vec_index = 0;
6679 mask_element = i;
6681 else
6683 /* Enforced before the loop when !repeating_p. */
6684 unsigned int const_nunits = nunits.to_constant ();
6685 vec_index = i / const_nunits;
6686 mask_element = i % const_nunits;
6687 if (vec_index == first_vec_index
6688 || first_vec_index == -1)
6690 first_vec_index = vec_index;
6692 else if (vec_index == second_vec_index
6693 || second_vec_index == -1)
6695 second_vec_index = vec_index;
6696 mask_element += const_nunits;
6698 else
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702 "permutation requires at "
6703 "least three vectors %G",
6704 stmt_info->stmt);
6705 gcc_assert (analyze_only);
6706 return false;
6709 gcc_assert (mask_element < 2 * const_nunits);
6712 if (mask_element != index)
6713 noop_p = false;
6714 mask[index++] = mask_element;
6716 if (index == count && !noop_p)
6718 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6719 if (!can_vec_perm_const_p (mode, indices))
6721 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6724 vect_location,
6725 "unsupported vect permute { ");
6726 for (i = 0; i < count; ++i)
6728 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6729 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6731 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6733 gcc_assert (analyze_only);
6734 return false;
6737 ++*n_perms;
6740 if (index == count)
6742 if (!analyze_only)
6744 tree mask_vec = NULL_TREE;
6746 if (! noop_p)
6747 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6749 if (second_vec_index == -1)
6750 second_vec_index = first_vec_index;
6752 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6754 /* Generate the permute statement if necessary. */
6755 tree first_vec = dr_chain[first_vec_index + ri];
6756 tree second_vec = dr_chain[second_vec_index + ri];
6757 gimple *perm_stmt;
6758 if (! noop_p)
6760 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6761 tree perm_dest
6762 = vect_create_destination_var (gimple_assign_lhs (stmt),
6763 vectype);
6764 perm_dest = make_ssa_name (perm_dest);
6765 perm_stmt
6766 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6767 first_vec, second_vec,
6768 mask_vec);
6769 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6770 gsi);
6771 if (dce_chain)
6773 bitmap_set_bit (used_defs, first_vec_index + ri);
6774 bitmap_set_bit (used_defs, second_vec_index + ri);
6777 else
6779 /* If mask was NULL_TREE generate the requested
6780 identity transform. */
6781 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6782 if (dce_chain)
6783 bitmap_set_bit (used_defs, first_vec_index + ri);
6786 /* Store the vector statement in NODE. */
6787 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6791 index = 0;
6792 first_vec_index = -1;
6793 second_vec_index = -1;
6794 noop_p = true;
6798 if (n_loads)
6800 if (repeating_p)
6801 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6802 else
6804 /* Enforced above when !repeating_p. */
6805 unsigned int const_nunits = nunits.to_constant ();
6806 *n_loads = 0;
6807 bool load_seen = false;
6808 for (unsigned i = 0; i < in_nlanes; ++i)
6810 if (i % const_nunits == 0)
6812 if (load_seen)
6813 *n_loads += 1;
6814 load_seen = false;
6816 if (bitmap_bit_p (used_in_lanes, i))
6817 load_seen = true;
6819 if (load_seen)
6820 *n_loads += 1;
6824 if (dce_chain)
6825 for (unsigned i = 0; i < dr_chain.length (); ++i)
6826 if (!bitmap_bit_p (used_defs, i))
6828 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6829 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6830 gsi_remove (&rgsi, true);
6831 release_defs (stmt);
6834 return true;
6837 /* Produce the next vector result for SLP permutation NODE by adding a vector
6838 statement at GSI. If MASK_VEC is nonnull, add:
6840 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6842 otherwise add:
6844 <new SSA name> = FIRST_DEF. */
6846 static void
6847 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6848 slp_tree node, tree first_def, tree second_def,
6849 tree mask_vec)
6851 tree vectype = SLP_TREE_VECTYPE (node);
6853 /* ??? We SLP match existing vector element extracts but
6854 allow punning, which we then need to re-instantiate at uses
6855 but have no good way of representing explicitly. */
6856 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6858 gassign *conv_stmt
6859 = gimple_build_assign (make_ssa_name (vectype),
6860 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6861 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6862 first_def = gimple_assign_lhs (conv_stmt);
6864 gassign *perm_stmt;
6865 tree perm_dest = make_ssa_name (vectype);
6866 if (mask_vec)
6868 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6870 gassign *conv_stmt
6871 = gimple_build_assign (make_ssa_name (vectype),
6872 build1 (VIEW_CONVERT_EXPR,
6873 vectype, second_def));
6874 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6875 second_def = gimple_assign_lhs (conv_stmt);
6877 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6878 first_def, second_def,
6879 mask_vec);
6881 else
6882 /* We need a copy here in case the def was external. */
6883 perm_stmt = gimple_build_assign (perm_dest, first_def);
6884 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6885 /* Store the vector statement in NODE. */
6886 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6889 /* Vectorize the SLP permutations in NODE as specified
6890 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6891 child number and lane number.
6892 Interleaving of two two-lane two-child SLP subtrees (not supported):
6893 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6894 A blend of two four-lane two-child SLP subtrees:
6895 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6896 Highpart of a four-lane one-child SLP subtree (not supported):
6897 [ { 0, 2 }, { 0, 3 } ]
6898 Currently only a subset of these is supported by the code generation below. */
6900 static bool
6901 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6902 slp_tree node, stmt_vector_for_cost *cost_vec)
6904 tree vectype = SLP_TREE_VECTYPE (node);
6906 /* ??? We currently only support all same vector input and output types
6907 while the SLP IL should really do a concat + select and thus accept
6908 arbitrary mismatches. */
6909 slp_tree child;
6910 unsigned i;
6911 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6912 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6915 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6916 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6918 if (dump_enabled_p ())
6919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6920 "Unsupported lane permutation\n");
6921 return false;
6923 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6924 repeating_p = false;
6927 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6928 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6929 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_NOTE, vect_location,
6932 "vectorizing permutation");
6933 for (unsigned i = 0; i < perm.length (); ++i)
6934 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6935 if (repeating_p)
6936 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6937 dump_printf (MSG_NOTE, "\n");
6940 /* REPEATING_P is true if every output vector is guaranteed to use the
6941 same permute vector. We can handle that case for both variable-length
6942 and constant-length vectors, but we only handle other cases for
6943 constant-length vectors.
6945 Set:
6947 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6948 mask vector that we want to build.
6950 - NCOPIES to the number of copies of PERM that we need in order
6951 to build the necessary permute mask vectors.
6953 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6954 for each permute mask vector. This is only relevant when GSI is
6955 nonnull. */
6956 uint64_t npatterns;
6957 unsigned nelts_per_pattern;
6958 uint64_t ncopies;
6959 unsigned noutputs_per_mask;
6960 if (repeating_p)
6962 /* We need a single permute mask vector that has the form:
6964 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6966 In other words, the original n-element permute in PERM is
6967 "unrolled" to fill a full vector. The stepped vector encoding
6968 that we use for permutes requires 3n elements. */
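/* E.g. (an illustrative instance of the above): for a two-lane permute
   { X1, X2 } the encoding built below is
   { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 },
   i.e. NPATTERNS == 2 with three elements per pattern.  */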
6969 npatterns = SLP_TREE_LANES (node);
6970 nelts_per_pattern = ncopies = 3;
6971 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6973 else
6975 /* Calculate every element of every permute mask vector explicitly,
6976 instead of relying on the pattern described above. */
6977 if (!nunits.is_constant (&npatterns))
6978 return false;
6979 nelts_per_pattern = ncopies = 1;
6980 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6981 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6982 return false;
6983 noutputs_per_mask = 1;
6985 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6986 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6988 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6989 from the { SLP operand, scalar lane } permutation as recorded in the
6990 SLP node as intermediate step. This part should already work
6991 with SLP children with arbitrary number of lanes. */
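/* As an illustrative sketch: the blend from the function comment,
   [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ], applied to two-element
   vectors in BB vectorization (NCOPIES == 1, not repeating) yields
   { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 1 }, 0 } { { 1, 1 }, 1 },
   i.e. lane 0 of child 0's first vector, lane 1 of child 1's first
   vector, then the same lanes of the respective second vectors.  */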
6992 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6993 auto_vec<unsigned> active_lane;
6994 vperm.create (olanes);
6995 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6996 for (unsigned i = 0; i < ncopies; ++i)
6998 for (unsigned pi = 0; pi < perm.length (); ++pi)
7000 std::pair<unsigned, unsigned> p = perm[pi];
7001 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
7002 if (repeating_p)
7003 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
7004 else
7006 /* We checked above that the vectors are constant-length. */
7007 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
7008 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
7009 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
7010 vperm.quick_push ({{p.first, vi}, vl});
7013 /* Advance to the next group. */
7014 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
7015 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
7018 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_NOTE, vect_location, "as");
7021 for (unsigned i = 0; i < vperm.length (); ++i)
7023 if (i != 0
7024 && (repeating_p
7025 ? multiple_p (i, npatterns)
7026 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
7027 dump_printf (MSG_NOTE, ",");
7028 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
7029 vperm[i].first.first, vperm[i].first.second,
7030 vperm[i].second);
7032 dump_printf (MSG_NOTE, "\n");
7035 /* We can only handle two-vector permutes; everything else should
7036 be lowered on the SLP level. The following is closely inspired
7037 by vect_transform_slp_perm_load and is supposed to eventually
7038 replace it.
7039 ??? As intermediate step do code-gen in the SLP tree representation
7040 somehow? */
7041 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
7042 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
7043 unsigned int index = 0;
7044 poly_uint64 mask_element;
7045 vec_perm_builder mask;
7046 mask.new_vector (nunits, npatterns, nelts_per_pattern);
7047 unsigned int count = mask.encoded_nelts ();
7048 mask.quick_grow (count);
7049 vec_perm_indices indices;
7050 unsigned nperms = 0;
7051 for (unsigned i = 0; i < vperm.length (); ++i)
7053 mask_element = vperm[i].second;
7054 if (first_vec.first == -1U
7055 || first_vec == vperm[i].first)
7056 first_vec = vperm[i].first;
7057 else if (second_vec.first == -1U
7058 || second_vec == vperm[i].first)
7060 second_vec = vperm[i].first;
7061 mask_element += nunits;
7063 else
7065 if (dump_enabled_p ())
7066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7067 "permutation requires at "
7068 "least three vectors\n");
7069 gcc_assert (!gsi);
7070 return false;
7073 mask[index++] = mask_element;
7075 if (index == count)
7077 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
7078 bool identity_p = indices.series_p (0, 1, 0, 1);
7079 if (!identity_p
7080 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7082 if (dump_enabled_p ())
7084 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7085 vect_location,
7086 "unsupported vect permute { ");
7087 for (i = 0; i < count; ++i)
7089 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7090 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7092 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7094 gcc_assert (!gsi);
7095 return false;
7098 if (!identity_p)
7099 nperms++;
7100 if (gsi)
7102 if (second_vec.first == -1U)
7103 second_vec = first_vec;
7105 slp_tree
7106 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7107 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7109 tree mask_vec = NULL_TREE;
7110 if (!identity_p)
7111 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7113 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7115 tree first_def
7116 = vect_get_slp_vect_def (first_node,
7117 first_vec.second + vi);
7118 tree second_def
7119 = vect_get_slp_vect_def (second_node,
7120 second_vec.second + vi);
7121 vect_add_slp_permutation (vinfo, gsi, node, first_def,
7122 second_def, mask_vec);
7126 index = 0;
7127 first_vec = std::make_pair (-1U, -1U);
7128 second_vec = std::make_pair (-1U, -1U);
7132 if (!gsi)
7133 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
7135 return true;
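/* A minimal standalone sketch of the two-input mask encoding implemented
   above, using plain std:: containers instead of vec_perm_builder /
   vec_perm_indices; lane_ref and build_two_input_mask are illustrative
   names, not GCC API.  Lanes taken from the second input vector get
   NUNITS added to their element index, and needing a third input makes
   the permute unhandleable at this level.  */

#include <vector>

struct lane_ref { unsigned operand; unsigned lane; };

static bool
build_two_input_mask (const std::vector<lane_ref> &vperm,
                      unsigned nunits, std::vector<unsigned> &mask)
{
  unsigned first_vec = -1U, second_vec = -1U;
  for (lane_ref r : vperm)
    {
      unsigned elt = r.lane;
      if (first_vec == -1U || first_vec == r.operand)
        first_vec = r.operand;
      else if (second_vec == -1U || second_vec == r.operand)
        {
          second_vec = r.operand;
          elt += nunits;        /* Select from the second input.  */
        }
      else
        return false;           /* Would need three or more inputs.  */
      mask.push_back (elt);
    }
  return true;
}

/* E.g. interleaving lanes 0..3 of operands 0 and 1 with nunits == 4
   yields the mask { 0, 4, 1, 5, 2, 6, 3, 7 }; a mask that comes out as
   { 0, 1, 2, ... } is an identity and needs no permute at all.  */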
7138 /* Vectorize SLP NODE. */
7140 static void
7141 vect_schedule_slp_node (vec_info *vinfo,
7142 slp_tree node, slp_instance instance)
7144 gimple_stmt_iterator si;
7145 int i;
7146 slp_tree child;
7148 /* For existing vectors there's nothing to do. */
7149 if (SLP_TREE_VEC_DEFS (node).exists ())
7150 return;
7152 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
7154 /* Vectorize externals and constants. */
7155 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7156 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7158 /* ??? vectorizable_shift can end up using a scalar operand which is
7159 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7160 node in this case. */
7161 if (!SLP_TREE_VECTYPE (node))
7162 return;
7164 vect_create_constant_vectors (vinfo, node);
7165 return;
7168 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7170 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7171 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7173 if (dump_enabled_p ())
7174 dump_printf_loc (MSG_NOTE, vect_location,
7175 "------>vectorizing SLP node starting from: %G",
7176 stmt_info->stmt);
7178 if (STMT_VINFO_DATA_REF (stmt_info)
7179 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7181 /* Vectorized loads go before the first scalar load to make it
7182 ready early; vectorized stores go before the last scalar
7183 stmt, which is where all their uses are ready. */
7184 stmt_vec_info last_stmt_info = NULL;
7185 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7186 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7187 else /* DR_IS_WRITE */
7188 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7189 si = gsi_for_stmt (last_stmt_info->stmt);
7191 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7192 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7193 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7194 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7196 /* For PHI node vectorization we do not use the insertion iterator. */
7197 si = gsi_none ();
7199 else
7201 /* Emit other stmts after the children vectorized defs which is
7202 earliest possible. */
7203 gimple *last_stmt = NULL;
7204 bool seen_vector_def = false;
7205 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7206 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7208 /* For fold-left reductions we are retaining the scalar
7209 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
7210 set so the representation isn't perfect. Resort to the
7211 last scalar def here. */
7212 if (SLP_TREE_VEC_STMTS (child).is_empty ())
7214 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7215 == cycle_phi_info_type);
7216 gphi *phi = as_a <gphi *>
7217 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7218 if (!last_stmt
7219 || vect_stmt_dominates_stmt_p (last_stmt, phi))
7220 last_stmt = phi;
7222 /* We are emitting all vectorized stmts in the same place, so
7223 the last one emitted is also the latest def.
7224 ??? Unless we have a load permutation applied and that
7225 happens to re-use an earlier generated load. */
7226 unsigned j;
7227 gimple *vstmt;
7228 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7229 if (!last_stmt
7230 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7231 last_stmt = vstmt;
7233 else if (!SLP_TREE_VECTYPE (child))
7235 /* For externals without a vectype we look at all their scalar defs. */
7236 unsigned j;
7237 tree def;
7238 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7239 if (TREE_CODE (def) == SSA_NAME
7240 && !SSA_NAME_IS_DEFAULT_DEF (def))
7242 gimple *stmt = SSA_NAME_DEF_STMT (def);
7243 if (!last_stmt
7244 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7245 last_stmt = stmt;
7248 else
7250 /* For externals with a vectype we have to look at all vector
7251 defs since their insertion place is decided per vector. But
7252 beware of pre-existing vectors where we need to make sure
7253 we do not insert before the region boundary. */
7254 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7255 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7256 seen_vector_def = true;
7257 else
7259 unsigned j;
7260 tree vdef;
7261 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7262 if (TREE_CODE (vdef) == SSA_NAME
7263 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7265 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7266 if (!last_stmt
7267 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7268 last_stmt = vstmt;
7272 /* This can happen when all children are pre-existing vectors or
7273 constants. */
7274 if (!last_stmt)
7275 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7276 if (!last_stmt)
7278 gcc_assert (seen_vector_def);
7279 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7281 else if (is_a <bb_vec_info> (vinfo)
7282 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7283 && gimple_could_trap_p (stmt_info->stmt))
7285 /* We've constrained possibly trapping operations to all come
7286 from the same basic-block; even if vectorized defs would allow
7287 earlier scheduling, still force vectorized stmts to the original
7288 block. This is only necessary for BB vectorization since for
7289 loop vect all operations are in a single BB and scalar stmt based
7290 placement doesn't play well with epilogue vectorization. */
7291 gcc_assert (dominated_by_p (CDI_DOMINATORS,
7292 gimple_bb (stmt_info->stmt),
7293 gimple_bb (last_stmt)));
7294 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7296 else if (is_a <gphi *> (last_stmt))
7297 si = gsi_after_labels (gimple_bb (last_stmt));
7298 else
7300 si = gsi_for_stmt (last_stmt);
7301 gsi_next (&si);
7305 bool done_p = false;
7307 /* Handle purely internal nodes. */
7308 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7310 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
7311 be shared with different SLP nodes (but usually it's the same
7312 operation, apart from the case where the stmt is only there to
7313 denote the actual scalar lane defs ...). So do not call
7314 vect_transform_stmt but open-code it here (partly). */
7315 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7316 gcc_assert (done);
7317 done_p = true;
7319 if (!done_p)
7320 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
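/* A minimal sketch of the insertion-point rule used above for "other"
   stmts: among all statements defining the (vectorized or scalar)
   operands, pick the one dominated by every other def and insert right
   after it.  The dominance test is abstracted behind a caller-provided
   predicate; latest_def is an illustrative name, not the GCC API.  */

#include <vector>

template <typename Stmt, typename Dominates>
static Stmt *
latest_def (const std::vector<Stmt *> &defs, Dominates dominates)
{
  Stmt *last = nullptr;
  for (Stmt *d : defs)
    /* Replace the candidate whenever it dominates D, i.e. D comes
       later; the result is the def that all other defs dominate.  */
    if (!last || dominates (last, d))
      last = d;
  return last;
}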
7323 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
7324 For loop vectorization this is done in vectorizable_call, but for SLP
7325 it needs to be deferred until end of vect_schedule_slp, because multiple
7326 SLP instances may refer to the same scalar stmt. */
7328 static void
7329 vect_remove_slp_scalar_calls (vec_info *vinfo,
7330 slp_tree node, hash_set<slp_tree> &visited)
7332 gimple *new_stmt;
7333 gimple_stmt_iterator gsi;
7334 int i;
7335 slp_tree child;
7336 tree lhs;
7337 stmt_vec_info stmt_info;
7339 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7340 return;
7342 if (visited.add (node))
7343 return;
7345 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7346 vect_remove_slp_scalar_calls (vinfo, child, visited);
7348 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7350 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7351 if (!stmt || gimple_bb (stmt) == NULL)
7352 continue;
7353 if (is_pattern_stmt_p (stmt_info)
7354 || !PURE_SLP_STMT (stmt_info))
7355 continue;
7356 lhs = gimple_call_lhs (stmt);
7357 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7358 gsi = gsi_for_stmt (stmt);
7359 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7360 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7364 static void
7365 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7367 hash_set<slp_tree> visited;
7368 vect_remove_slp_scalar_calls (vinfo, node, visited);
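/* A minimal sketch of the traversal pattern used above: SLP nodes can be
   shared between instances (the graph is a DAG rather than a tree), so
   every recursive walk guards itself with a visited set and processes
   each node exactly once.  toy_node and walk_once are illustrative
   placeholders, not GCC types.  */

#include <unordered_set>
#include <vector>

struct toy_node { std::vector<toy_node *> children; };

static void
walk_once (toy_node *node, std::unordered_set<toy_node *> &visited,
           void (*process_node) (toy_node *))
{
  if (!node || !visited.insert (node).second)
    return;                     /* Already handled via another path.  */
  for (toy_node *child : node->children)
    walk_once (child, visited, process_node);
  process_node (node);          /* Children first, as in the code above.  */
}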
7371 /* Vectorize the instance root. */
7373 void
7374 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7376 gassign *rstmt = NULL;
7378 if (instance->kind == slp_inst_kind_ctor)
7380 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7382 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7383 tree vect_lhs = gimple_get_lhs (child_stmt);
7384 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7385 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7386 TREE_TYPE (vect_lhs)))
7387 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7388 vect_lhs);
7389 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7391 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7393 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7394 gimple *child_stmt;
7395 int j;
7396 vec<constructor_elt, va_gc> *v;
7397 vec_alloc (v, nelts);
7399 /* A CTOR can handle V16HI composition from VNx8HI so we
7400 do not need to convert vector elements if the types
7401 do not match. */
7402 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7403 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7404 gimple_get_lhs (child_stmt));
7405 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7406 tree rtype
7407 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7408 tree r_constructor = build_constructor (rtype, v);
7409 rstmt = gimple_build_assign (lhs, r_constructor);
7412 else if (instance->kind == slp_inst_kind_bb_reduc)
7414 /* Largely inspired by reduction chain epilogue handling in
7415 vect_create_epilog_for_reduction. */
7416 vec<tree> vec_defs = vNULL;
7417 vect_get_slp_defs (node, &vec_defs);
7418 enum tree_code reduc_code
7419 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7420 /* ??? We actually have to reflect signs somewhere. */
7421 if (reduc_code == MINUS_EXPR)
7422 reduc_code = PLUS_EXPR;
7423 gimple_seq epilogue = NULL;
7424 /* We may end up with more than one vector result, reduce them
7425 to one vector. */
7426 tree vec_def = vec_defs[0];
7427 for (unsigned i = 1; i < vec_defs.length (); ++i)
7428 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7429 vec_def, vec_defs[i]);
7430 vec_defs.release ();
7431 /* ??? Support other schemes than direct internal fn. */
7432 internal_fn reduc_fn;
7433 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7434 || reduc_fn == IFN_LAST)
7435 gcc_unreachable ();
7436 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7437 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7439 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7440 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7441 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7442 update_stmt (gsi_stmt (rgsi));
7443 return;
7445 else
7446 gcc_unreachable ();
7448 gcc_assert (rstmt);
7450 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7451 gsi_replace (&rgsi, rstmt, true);
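/* A minimal scalar-code sketch of the bb_reduc epilogue scheme above:
   first combine the N vector results element-wise with the reduction
   operation, then reduce the single remaining vector to a scalar (the
   real code emits a reduction internal fn chosen via
   reduction_fn_for_scalar_code).  Plain arrays instead of vector types,
   PLUS only, at least one vector def assumed; bb_reduc_epilogue is an
   illustrative name.  */

#include <array>
#include <vector>

static int
bb_reduc_epilogue (const std::vector<std::array<int, 4>> &vec_defs)
{
  /* Step 1: element-wise combine all vector defs into one.  */
  std::array<int, 4> acc = vec_defs[0];
  for (size_t i = 1; i < vec_defs.size (); ++i)
    for (size_t l = 0; l < 4; ++l)
      acc[l] += vec_defs[i][l];
  /* Step 2: horizontal reduction of the remaining vector.  */
  int scalar = 0;
  for (int lane : acc)
    scalar += lane;
  return scalar;
}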
7454 struct slp_scc_info
7456 bool on_stack;
7457 int dfs;
7458 int lowlink;
7461 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
7463 static void
7464 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7465 hash_map<slp_tree, slp_scc_info> &scc_info,
7466 int &maxdfs, vec<slp_tree> &stack)
7468 bool existed_p;
7469 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7470 gcc_assert (!existed_p);
7471 info->dfs = maxdfs;
7472 info->lowlink = maxdfs;
7473 maxdfs++;
7475 /* Leaf. */
7476 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7478 info->on_stack = false;
7479 vect_schedule_slp_node (vinfo, node, instance);
7480 return;
7483 info->on_stack = true;
7484 stack.safe_push (node);
7486 unsigned i;
7487 slp_tree child;
7488 /* DFS recurse. */
7489 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7491 if (!child)
7492 continue;
7493 slp_scc_info *child_info = scc_info.get (child);
7494 if (!child_info)
7496 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7497 /* Recursion might have re-allocated the node. */
7498 info = scc_info.get (node);
7499 child_info = scc_info.get (child);
7500 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7502 else if (child_info->on_stack)
7503 info->lowlink = MIN (info->lowlink, child_info->dfs);
7505 if (info->lowlink != info->dfs)
7506 return;
7508 auto_vec<slp_tree, 4> phis_to_fixup;
7510 /* Singleton. */
7511 if (stack.last () == node)
7513 stack.pop ();
7514 info->on_stack = false;
7515 vect_schedule_slp_node (vinfo, node, instance);
7516 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7517 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7518 phis_to_fixup.quick_push (node);
7520 else
7522 /* SCC. */
7523 int last_idx = stack.length () - 1;
7524 while (stack[last_idx] != node)
7525 last_idx--;
7526 /* We can break the cycle at PHIs which have at least one child
7527 code-generated. Then we could re-start the DFS walk until
7528 all nodes in the SCC are covered (we might have new entries
7529 for only back-reachable nodes). But it's simpler to just
7530 iterate and schedule those that are ready. */
7531 unsigned todo = stack.length () - last_idx;
7534 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7536 slp_tree entry = stack[idx];
7537 if (!entry)
7538 continue;
7539 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7540 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7541 bool ready = !phi;
7542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7543 if (!child)
7545 gcc_assert (phi);
7546 ready = true;
7547 break;
7549 else if (scc_info.get (child)->on_stack)
7551 if (!phi)
7553 ready = false;
7554 break;
7557 else
7559 if (phi)
7561 ready = true;
7562 break;
7565 if (ready)
7567 vect_schedule_slp_node (vinfo, entry, instance);
7568 scc_info.get (entry)->on_stack = false;
7569 stack[idx] = NULL;
7570 todo--;
7571 if (phi)
7572 phis_to_fixup.safe_push (entry);
7576 while (todo != 0);
7578 /* Pop the SCC. */
7579 stack.truncate (last_idx);
7582 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7583 slp_tree phi_node;
7584 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7586 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7587 edge_iterator ei;
7588 edge e;
7589 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7591 unsigned dest_idx = e->dest_idx;
7592 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7593 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7594 continue;
7595 /* Simply fill all args. */
7596 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7597 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7598 vect_get_slp_vect_def (child, i),
7599 e, gimple_phi_arg_location (phi, dest_idx));
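/* A minimal standalone sketch of the Tarjan-style SCC detection the
   scheduler above relies on: each node records a DFS number and a
   lowlink, nodes stay on an explicit stack while their SCC is open, and
   when lowlink == dfs the node roots an SCC consisting of everything
   above it on the stack.  Generic graph types, not slp_tree; scc_node
   and tarjan_scc are illustrative names.  */

#include <algorithm>
#include <vector>

struct scc_node
{
  std::vector<int> succs;       /* Indices of successor nodes.  */
  int dfs = -1, lowlink = -1;
  bool on_stack = false;
};

static void
tarjan_scc (std::vector<scc_node> &g, int v, int &maxdfs,
            std::vector<int> &stack, std::vector<std::vector<int>> &sccs)
{
  g[v].dfs = g[v].lowlink = maxdfs++;
  g[v].on_stack = true;
  stack.push_back (v);
  for (int w : g[v].succs)
    {
      if (g[w].dfs == -1)
        {
          tarjan_scc (g, w, maxdfs, stack, sccs);
          g[v].lowlink = std::min (g[v].lowlink, g[w].lowlink);
        }
      else if (g[w].on_stack)
        g[v].lowlink = std::min (g[v].lowlink, g[w].dfs);
    }
  if (g[v].lowlink != g[v].dfs)
    return;                     /* Not the root of an SCC.  */
  sccs.emplace_back ();
  int w;
  do
    {
      w = stack.back ();
      stack.pop_back ();
      g[w].on_stack = false;
      sccs.back ().push_back (w);
    }
  while (w != v);
}

/* Calling tarjan_scc for every node with dfs == -1 enumerates all SCCs
   in reverse topological order of the condensed graph.  */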
7604 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7606 void
7607 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7609 slp_instance instance;
7610 unsigned int i;
7612 hash_map<slp_tree, slp_scc_info> scc_info;
7613 int maxdfs = 0;
7614 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7616 slp_tree node = SLP_INSTANCE_TREE (instance);
7617 if (dump_enabled_p ())
7619 dump_printf_loc (MSG_NOTE, vect_location,
7620 "Vectorizing SLP tree:\n");
7621 /* ??? Dump all? */
7622 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7623 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7624 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7625 vect_print_slp_graph (MSG_NOTE, vect_location,
7626 SLP_INSTANCE_TREE (instance));
7628 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
7629 have a PHI be the node breaking the cycle. */
7630 auto_vec<slp_tree> stack;
7631 if (!scc_info.get (node))
7632 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7634 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7635 vectorize_slp_instance_root_stmt (node, instance);
7637 if (dump_enabled_p ())
7638 dump_printf_loc (MSG_NOTE, vect_location,
7639 "vectorizing stmts using SLP.\n");
7642 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7644 slp_tree root = SLP_INSTANCE_TREE (instance);
7645 stmt_vec_info store_info;
7646 unsigned int j;
7648 /* Remove scalar call stmts. Do not do this for basic-block
7649 vectorization as not all uses may be vectorized.
7650 ??? Why should this be necessary? DCE should be able to
7651 remove the stmts itself.
7652 ??? For BB vectorization we can as well remove scalar
7653 stmts starting from the SLP tree root if they have no
7654 uses. */
7655 if (is_a <loop_vec_info> (vinfo))
7656 vect_remove_slp_scalar_calls (vinfo, root);
7658 /* Remove the original scalar stmts of vectorized stores. */
7659 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7661 if (!STMT_VINFO_DATA_REF (store_info)
7662 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7663 break;
7665 store_info = vect_orig_stmt (store_info);
7666 /* Free the attached stmt_vec_info and remove the stmt. */
7667 vinfo->remove_stmt (store_info);
7669 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
7670 so we do not crash in vect_free_slp_tree later. */
7671 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7672 SLP_TREE_REPRESENTATIVE (root) = NULL;