gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
59 void
60 vect_slp_init (void)
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
65 void
66 vect_slp_fini (void)
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
74 void *
75 _slp_tree::operator new (size_t n)
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
81 void
82 _slp_tree::operator delete (void *node, size_t n)
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
89 /* Initialize a SLP node. */
91 _slp_tree::_slp_tree ()
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
115 /* Tear down a SLP node. */
117 _slp_tree::~_slp_tree ()
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
136 void
137 vect_free_slp_tree (slp_tree node)
139 int i;
140 slp_tree child;
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
149 delete node;
 152 /* Return a location suitable for dumps related to the SLP instance.  */
154 dump_user_location_t
155 _slp_instance::location () const
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
164 /* Free the memory allocated for the SLP instance. */
166 void
167 vect_free_slp_instance (slp_instance instance)
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
177 /* Create an SLP node for SCALAR_STMTS. */
179 slp_tree
180 vect_create_new_slp_node (unsigned nops, tree_code code)
182 slp_tree node = new _slp_tree;
183 SLP_TREE_SCALAR_STMTS (node) = vNULL;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_CODE (node) = code;
187 return node;
189 /* Create an SLP node for SCALAR_STMTS. */
191 static slp_tree
192 vect_create_new_slp_node (slp_tree node,
193 vec<stmt_vec_info> scalar_stmts, unsigned nops)
195 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
196 SLP_TREE_CHILDREN (node).create (nops);
197 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
198 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
199 SLP_TREE_LANES (node) = scalar_stmts.length ();
200 return node;
203 /* Create an SLP node for SCALAR_STMTS. */
205 static slp_tree
206 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
208 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
211 /* Create an SLP node for OPS. */
213 static slp_tree
214 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
216 SLP_TREE_SCALAR_OPS (node) = ops;
217 SLP_TREE_DEF_TYPE (node) = vect_external_def;
218 SLP_TREE_LANES (node) = ops.length ();
219 return node;
222 /* Create an SLP node for OPS. */
224 static slp_tree
225 vect_create_new_slp_node (vec<tree> ops)
227 return vect_create_new_slp_node (new _slp_tree, ops);
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
233 node. */
234 typedef struct _slp_oprnd_info
236 /* Def-stmts for the operands. */
237 vec<stmt_vec_info> def_stmts;
238 /* Operands. */
239 vec<tree> ops;
240 /* Information about the first statement, its vector def-type, type, the
241 operand itself in case it's constant, and an indication if it's a pattern
242 stmt. */
243 tree first_op_type;
244 enum vect_def_type first_dt;
245 bool any_pattern;
246 } *slp_oprnd_info;
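/* For instance, for the two-lane group
     a0 = b0 + c0;
     a1 = b1 + c1;
   two _slp_oprnd_info instances are created: the first collects the
   def-stmts/ops of { b0, b1 }, the second those of { c0, c1 }.  */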
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
250 operand. */
251 static vec<slp_oprnd_info>
252 vect_create_oprnd_info (int nops, int group_size)
254 int i;
255 slp_oprnd_info oprnd_info;
256 vec<slp_oprnd_info> oprnds_info;
258 oprnds_info.create (nops);
259 for (i = 0; i < nops; i++)
261 oprnd_info = XNEW (struct _slp_oprnd_info);
262 oprnd_info->def_stmts.create (group_size);
263 oprnd_info->ops.create (group_size);
264 oprnd_info->first_dt = vect_uninitialized_def;
265 oprnd_info->first_op_type = NULL_TREE;
266 oprnd_info->any_pattern = false;
267 oprnds_info.quick_push (oprnd_info);
270 return oprnds_info;
274 /* Free operands info. */
276 static void
277 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
279 int i;
280 slp_oprnd_info oprnd_info;
282 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
284 oprnd_info->def_stmts.release ();
285 oprnd_info->ops.release ();
286 XDELETE (oprnd_info);
289 oprnds_info.release ();
293 /* Return true if STMTS contains a pattern statement. */
295 static bool
296 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
298 stmt_vec_info stmt_info;
299 unsigned int i;
300 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
301 if (is_pattern_stmt_p (stmt_info))
302 return true;
303 return false;
306 /* Return true when all lanes in the external or constant NODE have
307 the same value. */
309 static bool
310 vect_slp_tree_uniform_p (slp_tree node)
312 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 315   /* Pre-existing vectors.  */
316 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
317 return false;
319 unsigned i;
320 tree op, first = NULL_TREE;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
322 if (!first)
323 first = op;
324 else if (!operand_equal_p (first, op, 0))
325 return false;
327 return true;
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
332 of the chain. */
 334 int
 335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
336 stmt_vec_info first_stmt_info)
338 stmt_vec_info next_stmt_info = first_stmt_info;
339 int result = 0;
341 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
342 return -1;
346 if (next_stmt_info == stmt_info)
347 return result;
348 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
349 if (next_stmt_info)
350 result += DR_GROUP_GAP (next_stmt_info);
352 while (next_stmt_info);
354 return -1;
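/* For example, for a load group accessing a[0], a[1] and a[3] the places
   returned for the three statements are 0, 1 and 3 respectively, the gap
   left by the unused a[2] being accounted for via DR_GROUP_GAP.  */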
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
361 (if nonnull). */
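/* As an illustration, assuming the target provides V4SI and the required
   permutes, two V4SI inputs A = { a0, a1, a2, a3 } and B = { b0, b1, b2, b3 }
   are combined by the two masks built below into the interleaved halves
   { a0, b0, a1, b1 } and { a2, b2, a3, b3 }.  */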
363 bool
364 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
365 tree elt_type, unsigned int *nvectors_out,
366 tree *vector_type_out,
367 tree *permutes)
369 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
370 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
371 return false;
373 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
374 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
375 unsigned int nvectors = 1;
376 for (;;)
378 scalar_int_mode int_mode;
379 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
380 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type = build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode), 1);
385 tree vector_type
386 = get_vectype_for_scalar_type (vinfo, int_type, count);
387 if (vector_type
388 && VECTOR_MODE_P (TYPE_MODE (vector_type))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
390 GET_MODE_SIZE (base_vector_mode)))
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
396 vec_perm_builder sel1 (nelts, 2, 3);
397 vec_perm_builder sel2 (nelts, 2, 3);
398 poly_int64 half_nelts = exact_div (nelts, 2);
399 for (unsigned int i = 0; i < 3; ++i)
401 sel1.quick_push (i);
402 sel1.quick_push (i + nelts);
403 sel2.quick_push (half_nelts + i);
404 sel2.quick_push (half_nelts + i + nelts);
406 vec_perm_indices indices1 (sel1, 2, nelts);
407 vec_perm_indices indices2 (sel2, 2, nelts);
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
411 if (nvectors_out)
412 *nvectors_out = nvectors;
413 if (vector_type_out)
414 *vector_type_out = vector_type;
415 if (permutes)
417 permutes[0] = vect_gen_perm_mask_checked (vector_type,
418 indices1);
419 permutes[1] = vect_gen_perm_mask_checked (vector_type,
420 indices2);
422 return true;
426 if (!multiple_p (elt_bytes, 2, &elt_bytes))
427 return false;
428 nvectors *= 2;
432 /* Return true if DTA and DTB match. */
434 static bool
435 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
437 return (dta == dtb
438 || ((dta == vect_external_def || dta == vect_constant_def)
439 && (dtb == vect_external_def || dtb == vect_constant_def)));
442 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
443 they are of a valid type and that they match the defs of the first stmt of
444 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
445 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
446 indicates swap is required for cond_expr stmts. Specifically, *SWAP
447 is 1 if STMT is cond and operands of comparison need to be swapped;
448 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
449 If there is any operand swap in this function, *SWAP is set to non-zero
450 value.
451 If there was a fatal error return -1; if the error could be corrected by
 452    swapping operands of the parent node of this one, return 1; if everything is
453 ok return 0. */
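/* For instance, when matching the group
     a0 = b0 + 5;
     a1 = 6 + b1;
   the operands of the second stmt are swapped so that operand zero
   collects the internal defs { b0, b1 } and operand one the constants
   { 5, 6 } instead of failing with mismatched def types.  */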
454 static int
455 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
456 bool *skip_args,
457 vec<stmt_vec_info> stmts, unsigned stmt_num,
458 vec<slp_oprnd_info> *oprnds_info)
460 stmt_vec_info stmt_info = stmts[stmt_num];
461 tree oprnd;
462 unsigned int i, number_of_oprnds;
463 enum vect_def_type dt = vect_uninitialized_def;
464 slp_oprnd_info oprnd_info;
465 int first_op_idx = 1;
466 unsigned int commutative_op = -1U;
467 bool first_op_cond = false;
468 bool first = stmt_num == 0;
470 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
472 number_of_oprnds = gimple_call_num_args (stmt);
473 first_op_idx = 3;
474 if (gimple_call_internal_p (stmt))
476 internal_fn ifn = gimple_call_internal_fn (stmt);
477 commutative_op = first_commutative_argument (ifn);
479 /* Masked load, only look at mask. */
480 if (ifn == IFN_MASK_LOAD)
482 number_of_oprnds = 1;
483 /* Mask operand index. */
484 first_op_idx = 5;
488 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
490 enum tree_code code = gimple_assign_rhs_code (stmt);
491 number_of_oprnds = gimple_num_ops (stmt) - 1;
492 /* Swap can only be done for cond_expr if asked to, otherwise we
 493    could end up with a different comparison code from the first stmt.  */
494 if (code == COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
497 first_op_cond = true;
498 number_of_oprnds++;
500 else
501 commutative_op = commutative_tree_code (code) ? 0U : -1U;
503 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
504 number_of_oprnds = gimple_phi_num_args (stmt);
505 else
506 return -1;
508 bool swapped = (swap != 0);
509 bool backedge = false;
510 gcc_assert (!swapped || first_op_cond);
511 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
512 for (i = 0; i < number_of_oprnds; i++)
514 if (first_op_cond)
516 /* Map indicating how operands of cond_expr should be swapped. */
517 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
518 int *map = maps[swap];
520 if (i < 2)
521 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
522 first_op_idx), map[i]);
523 else
524 oprnd = gimple_op (stmt_info->stmt, map[i]);
526 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
528 oprnd = gimple_phi_arg_def (stmt, i);
529 backedge = dominated_by_p (CDI_DOMINATORS,
530 gimple_phi_arg_edge (stmt, i)->src,
531 gimple_bb (stmt_info->stmt));
533 else
534 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
535 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
536 oprnd = TREE_OPERAND (oprnd, 0);
538 oprnd_info = (*oprnds_info)[i];
540 stmt_vec_info def_stmt_info;
541 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "Build SLP failed: can't analyze def for %T\n",
546 oprnd);
548 return -1;
551 if (skip_args[i])
553 oprnd_info->def_stmts.quick_push (NULL);
554 oprnd_info->ops.quick_push (NULL_TREE);
555 oprnd_info->first_dt = vect_uninitialized_def;
556 continue;
559 oprnd_info->def_stmts.quick_push (def_stmt_info);
560 oprnd_info->ops.quick_push (oprnd);
562 if (def_stmt_info
563 && is_pattern_stmt_p (def_stmt_info))
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
566 != def_stmt_info)
567 oprnd_info->any_pattern = true;
568 else
569 /* If we promote this to external use the original stmt def. */
570 oprnd_info->ops.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 574       /* If there's an extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function but at the moment we'd have conflicting
578 goals there. */
579 if (backedge
580 && dts[i] == vect_external_def
581 && is_a <bb_vec_info> (vinfo)
582 && TREE_CODE (oprnd) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
584 && !dominated_by_p (CDI_DOMINATORS,
585 as_a <bb_vec_info> (vinfo)->bbs[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd);
592 return -1;
595 if (first)
597 tree type = TREE_TYPE (oprnd);
598 dt = dts[i];
599 if ((dt == vect_constant_def
600 || dt == vect_external_def)
601 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
602 && (TREE_CODE (type) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
604 type)))
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd);
610 return -1;
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
617 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
618 && def_stmt_info)
619 dts[i] = dt = vect_reduction_def;
621 /* Check the types of the definition. */
622 switch (dt)
624 case vect_external_def:
625 case vect_constant_def:
626 case vect_internal_def:
627 case vect_reduction_def:
628 case vect_induction_def:
629 case vect_nested_cycle:
630 break;
632 default:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
636 "Build SLP failed: illegal type of def %T\n",
637 oprnd);
638 return -1;
641 oprnd_info->first_dt = dt;
642 oprnd_info->first_op_type = type;
645 if (first)
646 return 0;
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i = 0; i < number_of_oprnds;)
651 if (skip_args[i])
653 ++i;
654 continue;
657 oprnd_info = (*oprnds_info)[i];
658 dt = dts[i];
659 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
660 oprnd = oprnd_info->ops[stmt_num];
661 tree type = TREE_TYPE (oprnd);
663 if (!types_compatible_p (oprnd_info->first_op_type, type))
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: different operand types\n");
668 return 1;
671 /* Not first stmt of the group, check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest
675 end in the reduction chain. */
676 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
677 && !(oprnd_info->first_dt == vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
680 && def_stmt_info
681 && !STMT_VINFO_DATA_REF (def_stmt_info)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
684 || (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && ((!def_stmt_info
687 || STMT_VINFO_DATA_REF (def_stmt_info)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
690 != (oprnd_info->first_dt != vect_reduction_def))))
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization only in case it will clearly improve things. */
694 if (i == commutative_op && !swapped
695 && (!is_a <bb_vec_info> (vinfo)
696 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
697 dts[i+1])
698 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
699 || vect_def_types_match
700 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE, vect_location,
704 "trying swapped operands\n");
705 std::swap (dts[i], dts[i+1]);
706 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
707 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
708 std::swap ((*oprnds_info)[i]->ops[stmt_num],
709 (*oprnds_info)[i+1]->ops[stmt_num]);
710 swapped = true;
711 continue;
714 if (is_a <bb_vec_info> (vinfo)
715 && !oprnd_info->any_pattern)
717 /* Now for commutative ops we should see whether we can
 718          make the other operand match.  */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "treating operand as external\n");
722 oprnd_info->first_dt = dt = vect_external_def;
724 else
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "Build SLP failed: different types\n");
729 return 1;
733 /* Make sure to demote the overall operand to external. */
734 if (dt == vect_external_def)
735 oprnd_info->first_dt = vect_external_def;
736 /* For a SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (still
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt == vect_internal_def
740 || dt == vect_reduction_def)
741 && oprnd_info->first_dt == vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
744 && !STMT_VINFO_DATA_REF (def_stmt_info)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
748 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
749 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
752 ++i;
755 /* Swap operands. */
756 if (swapped)
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE, vect_location,
760 "swapped operands to match def types in %G",
761 stmt_info->stmt);
764 return 0;
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
771 bool
772 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
774 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
775 if (old_vectype)
776 return useless_type_conversion_p (vectype, old_vectype);
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
783 stmt_vec_info member_info = first_info;
784 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
785 if (is_pattern_stmt_p (member_info)
786 && !useless_type_conversion_p (vectype,
787 STMT_VINFO_VECTYPE (member_info)))
788 break;
790 if (!member_info)
792 for (member_info = first_info; member_info;
793 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
794 STMT_VINFO_VECTYPE (member_info) = vectype;
795 return true;
798 else if (!is_pattern_stmt_p (stmt_info))
800 STMT_VINFO_VECTYPE (stmt_info) = vectype;
801 return true;
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info->stmt);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " old vector type: %T\n", old_vectype);
811 dump_printf_loc (MSG_NOTE, vect_location,
812 " new vector type: %T\n", vectype);
814 return false;
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
820 static bool
821 compatible_calls_p (gcall *call1, gcall *call2)
823 unsigned int nargs = gimple_call_num_args (call1);
824 if (nargs != gimple_call_num_args (call2))
825 return false;
827 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
828 return false;
830 if (gimple_call_internal_p (call1))
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
833 TREE_TYPE (gimple_call_lhs (call2))))
834 return false;
835 for (unsigned int i = 0; i < nargs; ++i)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
837 TREE_TYPE (gimple_call_arg (call2, i))))
838 return false;
840 else
842 if (!operand_equal_p (gimple_call_fn (call1),
843 gimple_call_fn (call2), 0))
844 return false;
846 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
847 return false;
849 return true;
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
859 static bool
860 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
861 unsigned int group_size,
862 tree vectype, poly_uint64 *max_nunits)
864 if (!vectype)
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
868 "Build SLP failed: unsupported data-type in %G\n",
869 stmt_info->stmt);
870 /* Fatal mismatch. */
871 return false;
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
876 if (is_a <bb_vec_info> (vinfo)
877 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
881 "Build SLP failed: unrolling required "
882 "in basic block SLP\n");
883 /* Fatal mismatch. */
884 return false;
887 /* In case of multiple types we need to detect the smallest type. */
888 vect_update_max_nunits (max_nunits, vectype);
889 return true;
 892 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
 893    permutation and are not of unsupported types of operation.  Return
 894    true if so, otherwise return false and indicate in *MATCHES
895 which stmts are not isomorphic to the first one. If MATCHES[0]
896 is false then this indicates the comparison could not be
897 carried out or the stmts will never be vectorized by SLP.
899 Note COND_EXPR is possibly isomorphic to another one after swapping its
900 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
901 the first stmt by swapping the two operands of comparison; set SWAP[i]
 902    to 2 if stmt I is isomorphic to the first stmt by inverting the code
 903    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
904 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
906 static bool
907 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
908 vec<stmt_vec_info> stmts, unsigned int group_size,
909 poly_uint64 *max_nunits, bool *matches,
910 bool *two_operators, tree *node_vectype)
912 unsigned int i;
913 stmt_vec_info first_stmt_info = stmts[0];
914 enum tree_code first_stmt_code = ERROR_MARK;
915 enum tree_code alt_stmt_code = ERROR_MARK;
916 enum tree_code rhs_code = ERROR_MARK;
917 enum tree_code first_cond_code = ERROR_MARK;
918 tree lhs;
919 bool need_same_oprnds = false;
920 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
921 optab optab;
922 int icode;
923 machine_mode optab_op2_mode;
924 machine_mode vec_mode;
925 stmt_vec_info first_load = NULL, prev_first_load = NULL;
926 bool first_stmt_load_p = false, load_p = false;
927 bool first_stmt_phi_p = false, phi_p = false;
928 bool maybe_soft_fail = false;
929 tree soft_fail_nunits_vectype = NULL_TREE;
931 /* For every stmt in NODE find its def stmt/s. */
932 stmt_vec_info stmt_info;
933 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
935 gimple *stmt = stmt_info->stmt;
936 swap[i] = 0;
937 matches[i] = false;
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 942       /* Fail to vectorize statements marked as unvectorizable, that can
 943          throw or that have volatile operands.  */
944 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
945 || stmt_can_throw_internal (cfun, stmt)
946 || gimple_has_volatile_ops (stmt))
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
950 "Build SLP failed: unvectorizable statement %G",
951 stmt);
 952           /* ??? For BB vectorization we want to commute operands in a way
 953              that shuffles all unvectorizable defs into one operand and keeps
 954              the other still vectorizable.  The following doesn't reliably
 955              achieve this, but it's the easiest we can do here.  */
956 if (is_a <bb_vec_info> (vinfo) && i != 0)
957 continue;
958 /* Fatal mismatch. */
959 matches[0] = false;
960 return false;
963 lhs = gimple_get_lhs (stmt);
964 if (lhs == NULL_TREE)
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Build SLP failed: not GIMPLE_ASSIGN nor "
969 "GIMPLE_CALL %G", stmt);
970 if (is_a <bb_vec_info> (vinfo) && i != 0)
971 continue;
972 /* Fatal mismatch. */
973 matches[0] = false;
974 return false;
977 tree nunits_vectype;
978 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
979 &nunits_vectype, group_size))
981 if (is_a <bb_vec_info> (vinfo) && i != 0)
982 continue;
983 /* Fatal mismatch. */
984 matches[0] = false;
985 return false;
987 /* Record nunits required but continue analysis, producing matches[]
988 as if nunits was not an issue. This allows splitting of groups
989 to happen. */
990 if (nunits_vectype
991 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
992 nunits_vectype, max_nunits))
994 gcc_assert (is_a <bb_vec_info> (vinfo));
995 maybe_soft_fail = true;
996 soft_fail_nunits_vectype = nunits_vectype;
999 gcc_assert (vectype);
1001 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1002 if (call_stmt)
1004 rhs_code = CALL_EXPR;
1006 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1007 load_p = true;
1008 else if ((gimple_call_internal_p (call_stmt)
1009 && (!vectorizable_internal_fn_p
1010 (gimple_call_internal_fn (call_stmt))))
1011 || gimple_call_tail_p (call_stmt)
1012 || gimple_call_noreturn_p (call_stmt)
1013 || !gimple_call_nothrow_p (call_stmt)
1014 || gimple_call_chain (call_stmt))
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1018 "Build SLP failed: unsupported call type %G",
1019 call_stmt);
1020 if (is_a <bb_vec_info> (vinfo) && i != 0)
1021 continue;
1022 /* Fatal mismatch. */
1023 matches[0] = false;
1024 return false;
1027 else if (gimple_code (stmt) == GIMPLE_PHI)
1029 rhs_code = ERROR_MARK;
1030 phi_p = true;
1032 else
1034 rhs_code = gimple_assign_rhs_code (stmt);
1035 load_p = gimple_vuse (stmt);
1038 /* Check the operation. */
1039 if (i == 0)
1041 *node_vectype = vectype;
1042 first_stmt_code = rhs_code;
1043 first_stmt_load_p = load_p;
1044 first_stmt_phi_p = phi_p;
1046 /* Shift arguments should be equal in all the packed stmts for a
1047 vector shift with scalar shift operand. */
1048 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1049 || rhs_code == LROTATE_EXPR
1050 || rhs_code == RROTATE_EXPR)
1052 vec_mode = TYPE_MODE (vectype);
1054 /* First see if we have a vector/vector shift. */
1055 optab = optab_for_tree_code (rhs_code, vectype,
1056 optab_vector);
1058 if (!optab
1059 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1061 /* No vector/vector shift, try for a vector/scalar shift. */
1062 optab = optab_for_tree_code (rhs_code, vectype,
1063 optab_scalar);
1065 if (!optab)
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "Build SLP failed: no optab.\n");
1070 if (is_a <bb_vec_info> (vinfo) && i != 0)
1071 continue;
1072 /* Fatal mismatch. */
1073 matches[0] = false;
1074 return false;
1076 icode = (int) optab_handler (optab, vec_mode);
1077 if (icode == CODE_FOR_nothing)
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1081 "Build SLP failed: "
1082 "op not supported by target.\n");
1083 if (is_a <bb_vec_info> (vinfo) && i != 0)
1084 continue;
1085 /* Fatal mismatch. */
1086 matches[0] = false;
1087 return false;
1089 optab_op2_mode = insn_data[icode].operand[2].mode;
1090 if (!VECTOR_MODE_P (optab_op2_mode))
1092 need_same_oprnds = true;
1093 first_op1 = gimple_assign_rhs2 (stmt);
1097 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1099 need_same_oprnds = true;
1100 first_op1 = gimple_assign_rhs2 (stmt);
1102 else if (!load_p
1103 && rhs_code == BIT_FIELD_REF)
1105 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1106 if (!is_a <bb_vec_info> (vinfo)
1107 || TREE_CODE (vec) != SSA_NAME
1108 || !operand_equal_p (TYPE_SIZE (vectype),
1109 TYPE_SIZE (TREE_TYPE (vec))))
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "Build SLP failed: "
1114 "BIT_FIELD_REF not supported\n");
1115 /* Fatal mismatch. */
1116 matches[0] = false;
1117 return false;
1120 else if (call_stmt
1121 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1123 need_same_oprnds = true;
1124 first_op1 = gimple_call_arg (call_stmt, 1);
1127 else
1129 if (first_stmt_code != rhs_code
1130 && alt_stmt_code == ERROR_MARK)
1131 alt_stmt_code = rhs_code;
1132 if ((first_stmt_code != rhs_code
1133 && (first_stmt_code != IMAGPART_EXPR
1134 || rhs_code != REALPART_EXPR)
1135 && (first_stmt_code != REALPART_EXPR
1136 || rhs_code != IMAGPART_EXPR)
1137 /* Handle mismatches in plus/minus by computing both
1138 and merging the results. */
1139 && !((first_stmt_code == PLUS_EXPR
1140 || first_stmt_code == MINUS_EXPR)
1141 && (alt_stmt_code == PLUS_EXPR
1142 || alt_stmt_code == MINUS_EXPR)
1143 && rhs_code == alt_stmt_code)
1144 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145 && (first_stmt_code == ARRAY_REF
1146 || first_stmt_code == BIT_FIELD_REF
1147 || first_stmt_code == INDIRECT_REF
1148 || first_stmt_code == COMPONENT_REF
1149 || first_stmt_code == MEM_REF)))
1150 || first_stmt_load_p != load_p
1151 || first_stmt_phi_p != phi_p)
1153 if (dump_enabled_p ())
1155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1156 "Build SLP failed: different operation "
1157 "in stmt %G", stmt);
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "original stmt %G", first_stmt_info->stmt);
1161 /* Mismatch. */
1162 continue;
1165 if (need_same_oprnds)
1167 tree other_op1 = (call_stmt
1168 ? gimple_call_arg (call_stmt, 1)
1169 : gimple_assign_rhs2 (stmt));
1170 if (!operand_equal_p (first_op1, other_op1, 0))
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "Build SLP failed: different shift "
1175 "arguments in %G", stmt);
1176 /* Mismatch. */
1177 continue;
1180 if (!load_p
1181 && first_stmt_code == BIT_FIELD_REF
1182 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1183 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "Build SLP failed: different BIT_FIELD_REF "
1188 "arguments in %G", stmt);
1189 /* Mismatch. */
1190 continue;
1193 if (!load_p && rhs_code == CALL_EXPR)
1195 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1196 as_a <gcall *> (stmt)))
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "Build SLP failed: different calls in %G",
1201 stmt);
1202 /* Mismatch. */
1203 continue;
1207 if (phi_p
1208 && (gimple_bb (first_stmt_info->stmt)
1209 != gimple_bb (stmt_info->stmt)))
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: different BB for PHI "
1214 "in %G", stmt);
1215 /* Mismatch. */
1216 continue;
1219 if (!types_compatible_p (vectype, *node_vectype))
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different vector type "
1224 "in %G", stmt);
1225 /* Mismatch. */
1226 continue;
1230 /* Grouped store or load. */
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1233 if (REFERENCE_CLASS_P (lhs))
1235 /* Store. */
1238 else
1240 /* Load. */
1241 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1242 if (prev_first_load)
1244 /* Check that there are no loads from different interleaving
1245 chains in the same node. */
1246 if (prev_first_load != first_load)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1250 vect_location,
1251 "Build SLP failed: different "
1252 "interleaving chains in one node %G",
1253 stmt);
1254 /* Mismatch. */
1255 continue;
1258 else
1259 prev_first_load = first_load;
1261 } /* Grouped access. */
1262 else
1264 if (load_p)
1266 /* Not grouped load. */
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Build SLP failed: not grouped load %G", stmt);
 1271           /* FORNOW: Non-grouped loads are not supported.  */
1272 if (is_a <bb_vec_info> (vinfo) && i != 0)
1273 continue;
1274 /* Fatal mismatch. */
1275 matches[0] = false;
1276 return false;
1279 /* Not memory operation. */
1280 if (!phi_p
1281 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1282 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1283 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1284 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1285 && rhs_code != VIEW_CONVERT_EXPR
1286 && rhs_code != CALL_EXPR
1287 && rhs_code != BIT_FIELD_REF)
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "Build SLP failed: operation unsupported %G",
1292 stmt);
1293 if (is_a <bb_vec_info> (vinfo) && i != 0)
1294 continue;
1295 /* Fatal mismatch. */
1296 matches[0] = false;
1297 return false;
1300 if (rhs_code == COND_EXPR)
1302 tree cond_expr = gimple_assign_rhs1 (stmt);
1303 enum tree_code cond_code = TREE_CODE (cond_expr);
1304 enum tree_code swap_code = ERROR_MARK;
1305 enum tree_code invert_code = ERROR_MARK;
1307 if (i == 0)
1308 first_cond_code = TREE_CODE (cond_expr);
1309 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1311 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1312 swap_code = swap_tree_comparison (cond_code);
1313 invert_code = invert_tree_comparison (cond_code, honor_nans);
1316 if (first_cond_code == cond_code)
 1318           /* Isomorphism can be achieved by swapping.  */
1319 else if (first_cond_code == swap_code)
1320 swap[i] = 1;
 1321           /* Isomorphism can be achieved by inverting.  */
1322 else if (first_cond_code == invert_code)
1323 swap[i] = 2;
1324 else
1326 if (dump_enabled_p ())
1327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1328 "Build SLP failed: different"
1329 " operation %G", stmt);
1330 /* Mismatch. */
1331 continue;
1336 matches[i] = true;
1339 for (i = 0; i < group_size; ++i)
1340 if (!matches[i])
1341 return false;
1343 /* If we allowed a two-operation SLP node verify the target can cope
1344 with the permute we are going to use. */
1345 if (alt_stmt_code != ERROR_MARK
1346 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1348 *two_operators = true;
1351 if (maybe_soft_fail)
1353 unsigned HOST_WIDE_INT const_nunits;
1354 if (!TYPE_VECTOR_SUBPARTS
1355 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1356 || const_nunits > group_size)
1357 matches[0] = false;
1358 else
1360 /* With constant vector elements simulate a mismatch at the
1361 point we need to split. */
1362 unsigned tail = group_size & (const_nunits - 1);
1363 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1365 return false;
1368 return true;
 1371 /* Traits for the hash_map used to record SLP builds (including failed ones)
      for a stmt set.
1372 Note we never remove apart from at destruction time so we do not
1373 need a special value for deleted that differs from empty. */
1374 struct bst_traits
1376 typedef vec <stmt_vec_info> value_type;
1377 typedef vec <stmt_vec_info> compare_type;
1378 static inline hashval_t hash (value_type);
1379 static inline bool equal (value_type existing, value_type candidate);
1380 static inline bool is_empty (value_type x) { return !x.exists (); }
1381 static inline bool is_deleted (value_type x) { return !x.exists (); }
1382 static const bool empty_zero_p = true;
1383 static inline void mark_empty (value_type &x) { x.release (); }
1384 static inline void mark_deleted (value_type &x) { x.release (); }
1385 static inline void remove (value_type &x) { x.release (); }
1387 inline hashval_t
1388 bst_traits::hash (value_type x)
1390 inchash::hash h;
1391 for (unsigned i = 0; i < x.length (); ++i)
1392 h.add_int (gimple_uid (x[i]->stmt));
1393 return h.end ();
1395 inline bool
1396 bst_traits::equal (value_type existing, value_type candidate)
1398 if (existing.length () != candidate.length ())
1399 return false;
1400 for (unsigned i = 0; i < existing.length (); ++i)
1401 if (existing[i] != candidate[i])
1402 return false;
1403 return true;
1406 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1407 simple_hashmap_traits <bst_traits, slp_tree> >
1408 scalar_stmts_to_slp_tree_map_t;
1410 static slp_tree
1411 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1412 vec<stmt_vec_info> stmts, unsigned int group_size,
1413 poly_uint64 *max_nunits,
1414 bool *matches, unsigned *limit, unsigned *tree_size,
1415 scalar_stmts_to_slp_tree_map_t *bst_map);
1417 static slp_tree
1418 vect_build_slp_tree (vec_info *vinfo,
1419 vec<stmt_vec_info> stmts, unsigned int group_size,
1420 poly_uint64 *max_nunits,
1421 bool *matches, unsigned *limit, unsigned *tree_size,
1422 scalar_stmts_to_slp_tree_map_t *bst_map)
1424 if (slp_tree *leader = bst_map->get (stmts))
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1428 *leader ? "" : "failed ", *leader);
1429 if (*leader)
1431 SLP_TREE_REF_COUNT (*leader)++;
1432 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1433 stmts.release ();
1435 return *leader;
1438 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1439 so we can pick up backedge destinations during discovery. */
1440 slp_tree res = new _slp_tree;
1441 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1442 SLP_TREE_SCALAR_STMTS (res) = stmts;
1443 bst_map->put (stmts.copy (), res);
1445 if (*limit == 0)
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "SLP discovery limit exceeded\n");
1450 bool existed_p = bst_map->put (stmts, NULL);
1451 gcc_assert (existed_p);
1452 /* Mark the node invalid so we can detect those when still in use
1453 as backedge destinations. */
1454 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1455 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1456 vect_free_slp_tree (res);
1457 memset (matches, 0, sizeof (bool) * group_size);
1458 return NULL;
1460 --*limit;
1462 poly_uint64 this_max_nunits = 1;
1463 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1464 &this_max_nunits,
1465 matches, limit, tree_size, bst_map);
1466 if (!res_)
1468 bool existed_p = bst_map->put (stmts, NULL);
1469 gcc_assert (existed_p);
1470 /* Mark the node invalid so we can detect those when still in use
1471 as backedge destinations. */
1472 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1473 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1474 vect_free_slp_tree (res);
1476 else
1478 gcc_assert (res_ == res);
1479 res->max_nunits = this_max_nunits;
1480 vect_update_max_nunits (max_nunits, this_max_nunits);
1481 /* Keep a reference for the bst_map use. */
1482 SLP_TREE_REF_COUNT (res)++;
1484 return res_;
1487 /* Recursively build an SLP tree starting from NODE.
 1488    Fail (and return NULL) if def-stmts are not isomorphic, require data
 1489    permutation or are of unsupported types of operation.  Otherwise
 1490    return the built SLP node.  MATCHES indicates for each scalar stmt
 1491    whether it matched the first stmt of the group.  */
1494 static slp_tree
1495 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1496 vec<stmt_vec_info> stmts, unsigned int group_size,
1497 poly_uint64 *max_nunits,
1498 bool *matches, unsigned *limit, unsigned *tree_size,
1499 scalar_stmts_to_slp_tree_map_t *bst_map)
1501 unsigned nops, i, this_tree_size = 0;
1502 poly_uint64 this_max_nunits = *max_nunits;
1504 matches[0] = false;
1506 stmt_vec_info stmt_info = stmts[0];
1507 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1508 nops = gimple_call_num_args (stmt);
1509 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1511 nops = gimple_num_ops (stmt) - 1;
1512 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1513 nops++;
1515 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1516 nops = gimple_phi_num_args (phi);
1517 else
1518 return NULL;
1520 /* If the SLP node is a PHI (induction or reduction), terminate
1521 the recursion. */
1522 bool *skip_args = XALLOCAVEC (bool, nops);
1523 memset (skip_args, 0, sizeof (bool) * nops);
1524 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1525 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1527 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1528 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1529 group_size);
1530 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1531 max_nunits))
1532 return NULL;
1534 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1535 if (def_type == vect_induction_def)
1537 /* Induction PHIs are not cycles but walk the initial
 1538              value.  Only for inner loops though, for outer loops
1539 we need to pick up the value from the actual PHIs
1540 to more easily support peeling and epilogue vectorization. */
1541 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1542 if (!nested_in_vect_loop_p (loop, stmt_info))
1543 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1544 else
1545 loop = loop->inner;
1546 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1548 else if (def_type == vect_reduction_def
1549 || def_type == vect_double_reduction_def
1550 || def_type == vect_nested_cycle)
1552 /* Else def types have to match. */
1553 stmt_vec_info other_info;
1554 bool all_same = true;
1555 FOR_EACH_VEC_ELT (stmts, i, other_info)
1557 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1558 return NULL;
1559 if (other_info != stmt_info)
1560 all_same = false;
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1563           /* Reduction initial values are not explicitly represented.  */
1564 if (!nested_in_vect_loop_p (loop, stmt_info))
1565 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1566 /* Reduction chain backedge defs are filled manually.
1567 ??? Need a better way to identify a SLP reduction chain PHI.
1568 Or a better overall way to SLP match those. */
1569 if (all_same && def_type == vect_reduction_def)
1570 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1572 else if (def_type != vect_internal_def)
1573 return NULL;
1577 bool two_operators = false;
1578 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1579 tree vectype = NULL_TREE;
1580 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1581 &this_max_nunits, matches, &two_operators,
1582 &vectype))
1583 return NULL;
1585 /* If the SLP node is a load, terminate the recursion unless masked. */
1586 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1587 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1589 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1591 /* Masked load. */
1592 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1593 nops = 1;
1595 else
1597 *max_nunits = this_max_nunits;
1598 (*tree_size)++;
1599 node = vect_create_new_slp_node (node, stmts, 0);
1600 SLP_TREE_VECTYPE (node) = vectype;
1601 /* And compute the load permutation. Whether it is actually
1602 a permutation depends on the unrolling factor which is
1603 decided later. */
1604 vec<unsigned> load_permutation;
1605 int j;
1606 stmt_vec_info load_info;
1607 load_permutation.create (group_size);
1608 stmt_vec_info first_stmt_info
1609 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1610 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1612 int load_place = vect_get_place_in_interleaving_chain
1613 (load_info, first_stmt_info);
1614 gcc_assert (load_place != -1);
1615 load_permutation.safe_push (load_place);
1617 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1618 return node;
1621 else if (gimple_assign_single_p (stmt_info->stmt)
1622 && !gimple_vuse (stmt_info->stmt)
1623 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
 1625       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
 1626          the same SSA name vector of a type compatible with vectype.  */
1627 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1628 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1629 stmt_vec_info estmt_info;
1630 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1632 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1633 tree bfref = gimple_assign_rhs1 (estmt);
1634 HOST_WIDE_INT lane;
1635 if (!known_eq (bit_field_size (bfref),
1636 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1637 || !constant_multiple_p (bit_field_offset (bfref),
1638 bit_field_size (bfref), &lane))
1640 lperm.release ();
1641 return NULL;
1643 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1645 slp_tree vnode = vect_create_new_slp_node (vNULL);
1646 /* ??? We record vectype here but we hide eventually necessary
1647 punning and instead rely on code generation to materialize
1648 VIEW_CONVERT_EXPRs as necessary. We instead should make
1649 this explicit somehow. */
1650 SLP_TREE_VECTYPE (vnode) = vectype;
1651 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1652 /* We are always building a permutation node even if it is an identity
1653 permute to shield the rest of the vectorizer from the odd node
1654 representing an actual vector without any scalar ops.
1655 ??? We could hide it completely with making the permute node
1656 external? */
1657 node = vect_create_new_slp_node (node, stmts, 1);
1658 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1659 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1660 SLP_TREE_VECTYPE (node) = vectype;
1661 SLP_TREE_CHILDREN (node).quick_push (vnode);
1662 return node;
1665 /* Get at the operands, verifying they are compatible. */
1666 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1667 slp_oprnd_info oprnd_info;
1668 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1670 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1671 stmts, i, &oprnds_info);
1672 if (res != 0)
1673 matches[(res == -1) ? 0 : i] = false;
1674 if (!matches[0])
1675 break;
1677 for (i = 0; i < group_size; ++i)
1678 if (!matches[i])
1680 vect_free_oprnd_info (oprnds_info);
1681 return NULL;
1683 swap = NULL;
1685 auto_vec<slp_tree, 4> children;
1687 stmt_info = stmts[0];
1689 /* Create SLP_TREE nodes for the definition node/s. */
1690 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1692 slp_tree child;
1693 unsigned int j;
1695 /* We're skipping certain operands from processing, for example
1696 outer loop reduction initial defs. */
1697 if (skip_args[i])
1699 children.safe_push (NULL);
1700 continue;
1703 if (oprnd_info->first_dt == vect_uninitialized_def)
 1705           /* COND_EXPRs may have one operand too many if the condition
 1706              is an SSA name.  */
1707 gcc_assert (i == 3 && nops == 4);
1708 continue;
1711 if (is_a <bb_vec_info> (vinfo)
1712 && oprnd_info->first_dt == vect_internal_def
1713 && !oprnd_info->any_pattern)
1715 /* For BB vectorization, if all defs are the same do not
1716 bother to continue the build along the single-lane
1717 graph but use a splat of the scalar value. */
1718 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1719 for (j = 1; j < group_size; ++j)
1720 if (oprnd_info->def_stmts[j] != first_def)
1721 break;
1722 if (j == group_size
1723 /* But avoid doing this for loads where we may be
1724 able to CSE things, unless the stmt is not
1725 vectorizable. */
1726 && (!STMT_VINFO_VECTORIZABLE (first_def)
1727 || !gimple_vuse (first_def->stmt)))
1729 if (dump_enabled_p ())
1730 dump_printf_loc (MSG_NOTE, vect_location,
1731 "Using a splat of the uniform operand\n");
1732 oprnd_info->first_dt = vect_external_def;
1736 if (oprnd_info->first_dt == vect_external_def
1737 || oprnd_info->first_dt == vect_constant_def)
1739 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1740 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1741 oprnd_info->ops = vNULL;
1742 children.safe_push (invnode);
1743 continue;
1746 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1747 group_size, &this_max_nunits,
1748 matches, limit,
1749 &this_tree_size, bst_map)) != NULL)
1751 oprnd_info->def_stmts = vNULL;
1752 children.safe_push (child);
1753 continue;
1756 /* If the SLP build for operand zero failed and operand zero
 1757          and one can be commuted try that for the scalar stmts
1758 that failed the match. */
1759 if (i == 0
1760 /* A first scalar stmt mismatch signals a fatal mismatch. */
1761 && matches[0]
1762 /* ??? For COND_EXPRs we can swap the comparison operands
1763 as well as the arms under some constraints. */
1764 && nops == 2
1765 && oprnds_info[1]->first_dt == vect_internal_def
1766 && is_gimple_assign (stmt_info->stmt)
1767 /* Swapping operands for reductions breaks assumptions later on. */
1768 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1769 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1771 /* See whether we can swap the matching or the non-matching
1772 stmt operands. */
1773 bool swap_not_matching = true;
1776 for (j = 0; j < group_size; ++j)
1778 if (matches[j] != !swap_not_matching)
1779 continue;
1780 stmt_vec_info stmt_info = stmts[j];
1781 /* Verify if we can swap operands of this stmt. */
1782 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1783 if (!stmt
1784 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1786 if (!swap_not_matching)
1787 goto fail;
1788 swap_not_matching = false;
1789 break;
1793 while (j != group_size);
1795 /* Swap mismatched definition stmts. */
1796 if (dump_enabled_p ())
1797 dump_printf_loc (MSG_NOTE, vect_location,
1798 "Re-trying with swapped operands of stmts ");
1799 for (j = 0; j < group_size; ++j)
1800 if (matches[j] == !swap_not_matching)
1802 std::swap (oprnds_info[0]->def_stmts[j],
1803 oprnds_info[1]->def_stmts[j]);
1804 std::swap (oprnds_info[0]->ops[j],
1805 oprnds_info[1]->ops[j]);
1806 if (dump_enabled_p ())
1807 dump_printf (MSG_NOTE, "%d ", j);
1809 if (dump_enabled_p ())
1810 dump_printf (MSG_NOTE, "\n");
1811 /* And try again with scratch 'matches' ... */
1812 bool *tem = XALLOCAVEC (bool, group_size);
1813 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1814 group_size, &this_max_nunits,
1815 tem, limit,
1816 &this_tree_size, bst_map)) != NULL)
1818 oprnd_info->def_stmts = vNULL;
1819 children.safe_push (child);
1820 continue;
1823 fail:
1825 /* If the SLP build failed and we analyze a basic-block
1826 simply treat nodes we fail to build as externally defined
1827 (and thus build vectors from the scalar defs).
1828 The cost model will reject outright expensive cases.
 1829          ??? This doesn't treat cases where permutation ultimately
1830 fails (or we don't try permutation below). Ideally we'd
1831 even compute a permutation that will end up with the maximum
1832 SLP tree size... */
1833 if (is_a <bb_vec_info> (vinfo)
1834 /* ??? Rejecting patterns this way doesn't work. We'd have to
1835 do extra work to cancel the pattern so the uses see the
1836 scalar version. */
1837 && !is_pattern_stmt_p (stmt_info)
1838 && !oprnd_info->any_pattern)
1840 /* But if there's a leading vector sized set of matching stmts
1841 fail here so we can split the group. This matches the condition
1842 vect_analyze_slp_instance uses. */
1843 /* ??? We might want to split here and combine the results to support
1844 multiple vector sizes better. */
1845 for (j = 0; j < group_size; ++j)
1846 if (!matches[j])
1847 break;
1848 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "Building vector operands from scalars\n");
1853 this_tree_size++;
1854 child = vect_create_new_slp_node (oprnd_info->ops);
1855 children.safe_push (child);
1856 oprnd_info->ops = vNULL;
1857 continue;
1861 gcc_assert (child == NULL);
1862 FOR_EACH_VEC_ELT (children, j, child)
1863 if (child)
1864 vect_free_slp_tree (child);
1865 vect_free_oprnd_info (oprnds_info);
1866 return NULL;
1869 vect_free_oprnd_info (oprnds_info);
 1871   /* If all children of this node are built up from uniform scalars,
 1872      or if building it requires more than one possibly expensive vector
 1873      construction, throw the node away so that it is built up from
 1874      scalars instead.  The exception is the SLP node for the vector store.  */
1875 if (is_a <bb_vec_info> (vinfo)
1876 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1877 /* ??? Rejecting patterns this way doesn't work. We'd have to
1878 do extra work to cancel the pattern so the uses see the
1879 scalar version. */
1880 && !is_pattern_stmt_p (stmt_info))
1882 slp_tree child;
1883 unsigned j;
1884 bool all_uniform_p = true;
1885 unsigned n_vector_builds = 0;
1886 FOR_EACH_VEC_ELT (children, j, child)
1888 if (!child)
1890 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1891 all_uniform_p = false;
1892 else if (!vect_slp_tree_uniform_p (child))
1894 all_uniform_p = false;
1895 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1896 n_vector_builds++;
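	  /* Reject the node if every child is a uniform constant/external,
	     if more than one vector construction would be needed, or if a
	     PHI would need all of its arguments built from external
	     vectors.  */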
1899 if (all_uniform_p
1900 || n_vector_builds > 1
1901 || (n_vector_builds == children.length ()
1902 && is_a <gphi *> (stmt_info->stmt)))
1904 /* Roll back. */
1905 matches[0] = false;
1906 FOR_EACH_VEC_ELT (children, j, child)
1907 if (child)
1908 vect_free_slp_tree (child);
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "Building parent vector operands from "
1913 "scalars instead\n");
1914 return NULL;
1918 *tree_size += this_tree_size + 1;
1919 *max_nunits = this_max_nunits;
1921 if (two_operators)
1923 /* ??? We'd likely want to either cache in bst_map something like
1924 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1925 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1926 explicit stmts to put in so the keying on 'stmts' doesn't
1927 work (but we have the same issue with nodes that use 'ops'). */
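      /* The two nodes ONE and TWO built below each compute all lanes with a
	 single opcode; the parent VEC_PERM_EXPR node then selects, per lane,
	 the result from the node whose opcode matches that lane.  For example,
	 for the lanes { a0+b0, a1-b1, a2+b2, a3-b3 } ONE holds the additions,
	 TWO the subtractions, and the lane permutation becomes
	 { (0,0), (1,1), (0,2), (1,3) }.  */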
1928 slp_tree one = new _slp_tree;
1929 slp_tree two = new _slp_tree;
1930 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1931 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1932 SLP_TREE_VECTYPE (one) = vectype;
1933 SLP_TREE_VECTYPE (two) = vectype;
1934 SLP_TREE_CHILDREN (one).safe_splice (children);
1935 SLP_TREE_CHILDREN (two).safe_splice (children);
1936 slp_tree child;
1937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1938 SLP_TREE_REF_COUNT (child)++;
1940 /* Here we record the original defs since this
1941 node represents the final lane configuration. */
1942 node = vect_create_new_slp_node (node, stmts, 2);
1943 SLP_TREE_VECTYPE (node) = vectype;
1944 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1945 SLP_TREE_CHILDREN (node).quick_push (one);
1946 SLP_TREE_CHILDREN (node).quick_push (two);
1947 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1948 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1949 enum tree_code ocode = ERROR_MARK;
1950 stmt_vec_info ostmt_info;
1951 unsigned j = 0;
1952 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1954 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1955 if (gimple_assign_rhs_code (ostmt) != code0)
1957 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1958 ocode = gimple_assign_rhs_code (ostmt);
1959 j = i;
1961 else
1962 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1964 SLP_TREE_CODE (one) = code0;
1965 SLP_TREE_CODE (two) = ocode;
1966 SLP_TREE_LANES (one) = stmts.length ();
1967 SLP_TREE_LANES (two) = stmts.length ();
1968 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1969 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1970 return node;
1973 node = vect_create_new_slp_node (node, stmts, nops);
1974 SLP_TREE_VECTYPE (node) = vectype;
1975 SLP_TREE_CHILDREN (node).splice (children);
1976 return node;
1979 /* Dump a single SLP tree NODE. */
1981 static void
1982 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1983 slp_tree node)
1985 unsigned i, j;
1986 slp_tree child;
1987 stmt_vec_info stmt_info;
1988 tree op;
1990 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1991 dump_user_location_t user_loc = loc.get_user_location ();
1992 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1993 SLP_TREE_DEF_TYPE (node) == vect_external_def
1994 ? " (external)"
1995 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1996 ? " (constant)"
1997 : ""), node,
1998 estimated_poly_value (node->max_nunits),
1999 SLP_TREE_REF_COUNT (node));
2000 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2002 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2003 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2004 else
2005 dump_printf_loc (metadata, user_loc, "op template: %G",
2006 SLP_TREE_REPRESENTATIVE (node)->stmt);
2008 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2009 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2010 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2011 else
2013 dump_printf_loc (metadata, user_loc, "\t{ ");
2014 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2015 dump_printf (metadata, "%T%s ", op,
2016 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2017 dump_printf (metadata, "}\n");
2019 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2021 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2022 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2023 dump_printf (dump_kind, " %u", j);
2024 dump_printf (dump_kind, " }\n");
2026 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2028 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2029 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2030 dump_printf (dump_kind, " %u[%u]",
2031 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2032 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2033 dump_printf (dump_kind, " }\n");
2035 if (SLP_TREE_CHILDREN (node).is_empty ())
2036 return;
2037 dump_printf_loc (metadata, user_loc, "\tchildren");
2038 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2039 dump_printf (dump_kind, " %p", (void *)child);
2040 dump_printf (dump_kind, "\n");
2043 DEBUG_FUNCTION void
2044 debug (slp_tree node)
2046 debug_dump_context ctx;
2047 vect_print_slp_tree (MSG_NOTE,
2048 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2049 node);
2052 /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */
2054 static void
2055 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2056 slp_tree node, hash_set<slp_tree> &visited)
2058 unsigned i;
2059 slp_tree child;
2061 if (visited.add (node))
2062 return;
2064 vect_print_slp_tree (dump_kind, loc, node);
2066 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2067 if (child)
2068 vect_print_slp_graph (dump_kind, loc, child, visited);
2071 static void
2072 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2073 slp_tree entry)
2075 hash_set<slp_tree> visited;
2076 vect_print_slp_graph (dump_kind, loc, entry, visited);
2079 /* Mark the tree rooted at NODE with PURE_SLP. */
2081 static void
2082 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2084 int i;
2085 stmt_vec_info stmt_info;
2086 slp_tree child;
2088 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2089 return;
2091 if (visited.add (node))
2092 return;
2094 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2095 STMT_SLP_TYPE (stmt_info) = pure_slp;
2097 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2098 if (child)
2099 vect_mark_slp_stmts (child, visited);
2102 static void
2103 vect_mark_slp_stmts (slp_tree node)
2105 hash_set<slp_tree> visited;
2106 vect_mark_slp_stmts (node, visited);
2109 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2111 static void
2112 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2114 int i;
2115 stmt_vec_info stmt_info;
2116 slp_tree child;
2118 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2119 return;
2121 if (visited.add (node))
2122 return;
2124 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2126 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2127 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2128 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2131 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2132 if (child)
2133 vect_mark_slp_stmts_relevant (child, visited);
2136 static void
2137 vect_mark_slp_stmts_relevant (slp_tree node)
2139 hash_set<slp_tree> visited;
2140 vect_mark_slp_stmts_relevant (node, visited);
2144 /* Gather the load nodes in the SLP graph rooted at NODE into the LOADS array. */
2146 static void
2147 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2148 hash_set<slp_tree> &visited)
2150 if (!node || visited.add (node))
2151 return;
2153 if (SLP_TREE_CHILDREN (node).length () == 0)
2155 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2156 return;
2157 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2158 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2159 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2160 loads.safe_push (node);
2162 else
2164 unsigned i;
2165 slp_tree child;
2166 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2167 vect_gather_slp_loads (loads, child, visited);
2172 /* Find the last scalar stmt in NODE. */
2174 stmt_vec_info
2175 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2177 stmt_vec_info last = NULL;
2178 stmt_vec_info stmt_vinfo;
2180 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2182 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2183 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2186 return last;
2189 /* Find the first stmt in NODE. */
2191 stmt_vec_info
2192 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2194 stmt_vec_info first = NULL;
2195 stmt_vec_info stmt_vinfo;
2197 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2199 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2200 if (!first
2201 || get_later_stmt (stmt_vinfo, first) == first)
2202 first = stmt_vinfo;
2205 return first;
2208 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2209 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2210 (also containing the first GROUP1_SIZE stmts, since stores are
2211 consecutive), the second containing the remainder.
2212 Return the first stmt in the second group. */
2214 static stmt_vec_info
2215 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2217 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2218 gcc_assert (group1_size > 0);
2219 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2220 gcc_assert (group2_size > 0);
2221 DR_GROUP_SIZE (first_vinfo) = group1_size;
2223 stmt_vec_info stmt_info = first_vinfo;
2224 for (unsigned i = group1_size; i > 1; i--)
2226 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2227 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2229 /* STMT is now the last element of the first group. */
2230 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2231 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2233 DR_GROUP_SIZE (group2) = group2_size;
2234 for (stmt_info = group2; stmt_info;
2235 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2237 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2238 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2241 /* For the second group, the DR_GROUP_GAP is that before the original group,
2242 plus skipping over the GROUP1_SIZE elements now covered by the first group. */
2243 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2245 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2246 DR_GROUP_GAP (first_vinfo) += group2_size;
2248 if (dump_enabled_p ())
2249 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2250 group1_size, group2_size);
2252 return group2;
2255 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2256 statements and a vector of NUNITS elements. */
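/* For example, with NUNITS 4 and GROUP_SIZE 6 the common multiple is 12,
   giving an unrolling factor of 2; with GROUP_SIZE 8 the factor is 1.  */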
2258 static poly_uint64
2259 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2261 return exact_div (common_multiple (nunits, group_size), group_size);
2264 /* Helper that checks to see if a node is a load node. */
2266 static inline bool
2267 vect_is_slp_load_node (slp_tree root)
2269 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2270 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2271 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2275 /* Helper function of optimize_load_redistribution that performs the operation
2276 recursively. */
2278 static slp_tree
2279 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2280 vec_info *vinfo, unsigned int group_size,
2281 hash_map<slp_tree, slp_tree> *load_map,
2282 slp_tree root)
2284 if (slp_tree *leader = load_map->get (root))
2285 return *leader;
2287 load_map->put (root, NULL);
2289 slp_tree node;
2290 unsigned i;
2292 /* For now, we don't know anything about externals so do not do anything. */
2293 if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2294 return NULL;
2295 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2297 /* First convert this node into a load node and add it to the leaves
2298 list and flatten the permute from a lane to a load one. If it's
2299 unneeded it will be elided later. */
2300 vec<stmt_vec_info> stmts;
2301 stmts.create (SLP_TREE_LANES (root));
2302 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2303 for (unsigned j = 0; j < lane_perm.length (); j++)
2305 std::pair<unsigned, unsigned> perm = lane_perm[j];
2306 node = SLP_TREE_CHILDREN (root)[perm.first];
2308 if (!vect_is_slp_load_node (node)
2309 || SLP_TREE_CHILDREN (node).exists ())
2311 stmts.release ();
2312 goto next;
2315 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "converting stmts on permute node %p\n", root);
2322 bool *matches = XALLOCAVEC (bool, group_size);
2323 poly_uint64 max_nunits = 1;
2324 unsigned tree_size = 0, limit = 1;
2325 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2326 matches, &limit, &tree_size, bst_map);
2327 if (!node)
2328 stmts.release ();
2330 load_map->put (root, node);
2331 return node;
2334 next:
2335 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2337 slp_tree value
2338 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2339 node);
2340 if (value)
2342 SLP_TREE_REF_COUNT (value)++;
2343 SLP_TREE_CHILDREN (root)[i] = value;
2344 vect_free_slp_tree (node);
2348 return NULL;
2351 /* Temporary workaround for loads not being CSEd during SLP build. This
2352 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2353 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2354 same DR such that the final operation is equal to a permuted load. Such
2355 NODES are then directly converted into LOADS themselves. The nodes are
2356 CSEd using BST_MAP. */
2358 static void
2359 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2360 vec_info *vinfo, unsigned int group_size,
2361 hash_map<slp_tree, slp_tree> *load_map,
2362 slp_tree root)
2364 slp_tree node;
2365 unsigned i;
2367 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2369 slp_tree value
2370 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2371 node);
2372 if (value)
2374 SLP_TREE_REF_COUNT (value)++;
2375 SLP_TREE_CHILDREN (root)[i] = value;
2376 vect_free_slp_tree (node);
2381 /* Helper function of vect_match_slp_patterns.
2383 Attempts to match patterns against the slp tree rooted in REF_NODE using
2384 VINFO. Patterns are matched in post-order traversal.
2386 Returns true if any pattern matched; on a match the tree referenced by
2387 REF_NODE is updated in place, otherwise it is left unchanged. */
2389 static bool
2390 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2391 slp_tree_to_load_perm_map_t *perm_cache,
2392 hash_set<slp_tree> *visited)
2394 unsigned i;
2395 slp_tree node = *ref_node;
2396 bool found_p = false;
2397 if (!node || visited->add (node))
2398 return false;
2400 slp_tree child;
2401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2402 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2403 vinfo, perm_cache, visited);
2405 for (unsigned x = 0; x < num__slp_patterns; x++)
2407 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2408 if (pattern)
2410 pattern->build (vinfo);
2411 delete pattern;
2412 found_p = true;
2416 return found_p;
2419 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2420 vec_info VINFO.
2422 Returns true if any pattern matched; the tree is modified in place.
2423 Patterns are tried in order and multiple patterns may match. */
2425 static bool
2426 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2427 hash_set<slp_tree> *visited,
2428 slp_tree_to_load_perm_map_t *perm_cache)
2430 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2431 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE, vect_location,
2435 "Analyzing SLP tree %p for patterns\n",
2436 SLP_INSTANCE_TREE (instance));
2438 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2441 /* Analyze an SLP instance starting from a group of grouped stores. Call
2442 vect_build_slp_tree to build a tree of packed stmts if possible.
2443 Return FALSE if it's impossible to SLP any stmt in the loop. */
2445 static bool
2446 vect_analyze_slp_instance (vec_info *vinfo,
2447 scalar_stmts_to_slp_tree_map_t *bst_map,
2448 stmt_vec_info stmt_info, slp_instance_kind kind,
2449 unsigned max_tree_size, unsigned *limit);
2451 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2452 of KIND. Return true if successful. */
2454 static bool
2455 vect_build_slp_instance (vec_info *vinfo,
2456 slp_instance_kind kind,
2457 vec<stmt_vec_info> &scalar_stmts,
2458 stmt_vec_info root_stmt_info,
2459 unsigned max_tree_size, unsigned *limit,
2460 scalar_stmts_to_slp_tree_map_t *bst_map,
2461 /* ??? We need stmt_info for group splitting. */
2462 stmt_vec_info stmt_info_)
2464 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_NOTE, vect_location,
2467 "Starting SLP discovery for\n");
2468 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2469 dump_printf_loc (MSG_NOTE, vect_location,
2470 " %G", scalar_stmts[i]->stmt);
2473 /* Build the tree for the SLP instance. */
2474 unsigned int group_size = scalar_stmts.length ();
2475 bool *matches = XALLOCAVEC (bool, group_size);
2476 poly_uint64 max_nunits = 1;
2477 unsigned tree_size = 0;
2478 unsigned i;
2479 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2480 &max_nunits, matches, limit,
2481 &tree_size, bst_map);
2482 if (node != NULL)
2484 /* Calculate the unrolling factor based on the smallest type. */
2485 poly_uint64 unrolling_factor
2486 = calculate_unrolling_factor (max_nunits, group_size);
2488 if (maybe_ne (unrolling_factor, 1U)
2489 && is_a <bb_vec_info> (vinfo))
2491 unsigned HOST_WIDE_INT const_max_nunits;
2492 if (!max_nunits.is_constant (&const_max_nunits)
2493 || const_max_nunits > group_size)
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2497 "Build SLP failed: store group "
2498 "size not a multiple of the vector size "
2499 "in basic block SLP\n");
2500 vect_free_slp_tree (node);
2501 return false;
2503 /* Fatal mismatch. */
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_NOTE, vect_location,
2506 "SLP discovery succeeded but node needs "
2507 "splitting\n");
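	      /* Pretend all lanes matched but mark the first lane after the
		 last whole vector as mismatched so the caller splits the
		 group at a vector boundary.  */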
2508 memset (matches, true, group_size);
2509 matches[group_size / const_max_nunits * const_max_nunits] = false;
2510 vect_free_slp_tree (node);
2512 else
2514 /* Create a new SLP instance. */
2515 slp_instance new_instance = XNEW (class _slp_instance);
2516 SLP_INSTANCE_TREE (new_instance) = node;
2517 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2518 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2519 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2520 SLP_INSTANCE_KIND (new_instance) = kind;
2521 new_instance->reduc_phis = NULL;
2522 new_instance->cost_vec = vNULL;
2523 new_instance->subgraph_entries = vNULL;
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location,
2527 "SLP size %u vs. limit %u.\n",
2528 tree_size, max_tree_size);
2530 /* Fixup SLP reduction chains. */
2531 if (kind == slp_inst_kind_reduc_chain)
2533 /* If this is a reduction chain with a conversion in front
2534 amend the SLP tree with a node for that. */
2535 gimple *scalar_def
2536 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2537 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2539 /* Get at the conversion stmt - we know it's the single use
2540 of the last stmt of the reduction chain. */
2541 use_operand_p use_p;
2542 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2543 &use_p, &scalar_def);
2544 gcc_assert (r);
2545 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2546 next_info = vect_stmt_to_vectorize (next_info);
2547 scalar_stmts = vNULL;
2548 scalar_stmts.create (group_size);
2549 for (unsigned i = 0; i < group_size; ++i)
2550 scalar_stmts.quick_push (next_info);
2551 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2552 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2553 SLP_TREE_CHILDREN (conv).quick_push (node);
2554 SLP_INSTANCE_TREE (new_instance) = conv;
2555 /* We also have to fake this conversion stmt as SLP reduction
2556 group so we don't have to mess with too much code
2557 elsewhere. */
2558 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2559 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2561 /* Fill the backedge child of the PHI SLP node. The
2562 general matching code cannot find it because the
2563 scalar code does not reflect how we vectorize the
2564 reduction. */
2565 use_operand_p use_p;
2566 imm_use_iterator imm_iter;
2567 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2568 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2569 gimple_get_lhs (scalar_def))
2570 /* There are exactly two non-debug uses, the reduction
2571 PHI and the loop-closed PHI node. */
2572 if (!is_gimple_debug (USE_STMT (use_p))
2573 && gimple_bb (USE_STMT (use_p)) == loop->header)
2575 auto_vec<stmt_vec_info, 64> phis (group_size);
2576 stmt_vec_info phi_info
2577 = vinfo->lookup_stmt (USE_STMT (use_p));
2578 for (unsigned i = 0; i < group_size; ++i)
2579 phis.quick_push (phi_info);
2580 slp_tree *phi_node = bst_map->get (phis);
2581 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2582 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2583 = SLP_INSTANCE_TREE (new_instance);
2584 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2588 vinfo->slp_instances.safe_push (new_instance);
2590 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2591 the number of scalar stmts in the root in a few places.
2592 Verify that assumption holds. */
2593 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2594 .length () == group_size);
2596 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "Final SLP tree for instance %p:\n", new_instance);
2600 vect_print_slp_graph (MSG_NOTE, vect_location,
2601 SLP_INSTANCE_TREE (new_instance));
2604 return true;
2607 else
2609 /* Failed to SLP. */
2610 /* Free the allocated memory. */
2611 scalar_stmts.release ();
2614 stmt_vec_info stmt_info = stmt_info_;
2615 /* Try to break the group up into pieces. */
2616 if (kind == slp_inst_kind_store)
2618 /* ??? We could delay all the actual splitting of store-groups
2619 until after SLP discovery of the original group has completed.
2620 Then we can recurse to vect_build_slp_instance directly. */
2621 for (i = 0; i < group_size; i++)
2622 if (!matches[i])
2623 break;
2625 /* For basic block SLP, try to break the group up into multiples of
2626 a vector size. */
2627 if (is_a <bb_vec_info> (vinfo)
2628 && (i > 1 && i < group_size))
2630 tree scalar_type
2631 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2632 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2633 1 << floor_log2 (i));
2634 unsigned HOST_WIDE_INT const_nunits;
2635 if (vectype
2636 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2638 /* Split into two groups at the first vector boundary. */
2639 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
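	      /* Round the matching prefix down to a whole number of vectors;
		 e.g. with CONST_NUNITS 4 and the first mismatch at i == 6,
		 GROUP1_SIZE becomes 4.  */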
2640 unsigned group1_size = i & ~(const_nunits - 1);
2642 if (dump_enabled_p ())
2643 dump_printf_loc (MSG_NOTE, vect_location,
2644 "Splitting SLP group at stmt %u\n", i);
2645 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2646 group1_size);
2647 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2648 kind, max_tree_size,
2649 limit);
2650 /* Split the rest at the failure point and possibly
2651 re-analyze the remaining matching part if it has
2652 at least two lanes. */
2653 if (group1_size < i
2654 && (i + 1 < group_size
2655 || i - group1_size > 1))
2657 stmt_vec_info rest2 = rest;
2658 rest = vect_split_slp_store_group (rest, i - group1_size);
2659 if (i - group1_size > 1)
2660 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2661 kind, max_tree_size,
2662 limit);
2664 /* Re-analyze the non-matching tail if it has at least
2665 two lanes. */
2666 if (i + 1 < group_size)
2667 res |= vect_analyze_slp_instance (vinfo, bst_map,
2668 rest, kind, max_tree_size,
2669 limit);
2670 return res;
2674 /* For loop vectorization split into arbitrary pieces of size > 1. */
2675 if (is_a <loop_vec_info> (vinfo)
2676 && (i > 1 && i < group_size))
2678 unsigned group1_size = i;
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "Splitting SLP group at stmt %u\n", i);
2684 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2685 group1_size);
2686 /* Loop vectorization cannot handle gaps in stores, make sure
2687 the split group appears as strided. */
2688 STMT_VINFO_STRIDED_P (rest) = 1;
2689 DR_GROUP_GAP (rest) = 0;
2690 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2691 DR_GROUP_GAP (stmt_info) = 0;
2693 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2694 kind, max_tree_size, limit);
2695 if (i + 1 < group_size)
2696 res |= vect_analyze_slp_instance (vinfo, bst_map,
2697 rest, kind, max_tree_size, limit);
2699 return res;
2702 /* Even though the first vector did not all match, we might be able to SLP
2703 (some) of the remainder. FORNOW ignore this possibility. */
2706 /* Failed to SLP. */
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2709 return false;
2713 /* Analyze an SLP instance starting from a group of grouped stores. Call
2714 vect_build_slp_tree to build a tree of packed stmts if possible.
2715 Return FALSE if it's impossible to SLP any stmt in the loop. */
2717 static bool
2718 vect_analyze_slp_instance (vec_info *vinfo,
2719 scalar_stmts_to_slp_tree_map_t *bst_map,
2720 stmt_vec_info stmt_info,
2721 slp_instance_kind kind,
2722 unsigned max_tree_size, unsigned *limit)
2724 unsigned int i;
2725 vec<stmt_vec_info> scalar_stmts;
2727 if (is_a <bb_vec_info> (vinfo))
2728 vect_location = stmt_info->stmt;
2730 stmt_vec_info next_info = stmt_info;
2731 if (kind == slp_inst_kind_store)
2733 /* Collect the stores and store them in scalar_stmts. */
2734 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2735 while (next_info)
2737 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2738 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2741 else if (kind == slp_inst_kind_reduc_chain)
2743 /* Collect the reduction stmts and store them in scalar_stmts. */
2744 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2745 while (next_info)
2747 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2748 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2750 /* Mark the first element of the reduction chain as reduction to properly
2751 transform the node. In the reduction analysis phase only the last
2752 element of the chain is marked as reduction. */
2753 STMT_VINFO_DEF_TYPE (stmt_info)
2754 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2755 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2756 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2758 else if (kind == slp_inst_kind_ctor)
2760 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2761 tree val;
2762 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2763 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2765 stmt_vec_info def_info = vinfo->lookup_def (val);
2766 def_info = vect_stmt_to_vectorize (def_info);
2767 scalar_stmts.quick_push (def_info);
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_NOTE, vect_location,
2771 "Analyzing vectorizable constructor: %G\n",
2772 stmt_info->stmt);
2774 else if (kind == slp_inst_kind_reduc_group)
2776 /* Collect reduction statements. */
2777 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2778 scalar_stmts.create (reductions.length ());
2779 for (i = 0; reductions.iterate (i, &next_info); i++)
2780 if (STMT_VINFO_RELEVANT_P (next_info)
2781 || STMT_VINFO_LIVE_P (next_info))
2782 scalar_stmts.quick_push (next_info);
2783 /* If less than two were relevant/live there's nothing to SLP. */
2784 if (scalar_stmts.length () < 2)
2785 return false;
2787 else
2788 gcc_unreachable ();
2790 /* Build the tree for the SLP instance. */
2791 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2792 kind == slp_inst_kind_ctor
2793 ? stmt_info : NULL,
2794 max_tree_size, limit, bst_map,
2795 kind == slp_inst_kind_store
2796 ? stmt_info : NULL);
2798 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2799 where we should do store group splitting. */
2801 return res;
2804 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2805 trees of packed scalar stmts if SLP is possible. */
2807 opt_result
2808 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2810 unsigned int i;
2811 stmt_vec_info first_element;
2812 slp_instance instance;
2814 DUMP_VECT_SCOPE ("vect_analyze_slp");
2816 unsigned limit = max_tree_size;
2818 scalar_stmts_to_slp_tree_map_t *bst_map
2819 = new scalar_stmts_to_slp_tree_map_t ();
2821 /* Find SLP sequences starting from groups of grouped stores. */
2822 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2823 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2824 STMT_VINFO_GROUPED_ACCESS (first_element)
2825 ? slp_inst_kind_store : slp_inst_kind_ctor,
2826 max_tree_size, &limit);
2828 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2830 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2832 vect_location = bb_vinfo->roots[i].root->stmt;
2833 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2834 bb_vinfo->roots[i].stmts,
2835 bb_vinfo->roots[i].root,
2836 max_tree_size, &limit, bst_map, NULL))
2837 bb_vinfo->roots[i].stmts = vNULL;
2841 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2843 /* Find SLP sequences starting from reduction chains. */
2844 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2845 if (! STMT_VINFO_RELEVANT_P (first_element)
2846 && ! STMT_VINFO_LIVE_P (first_element))
2848 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2849 slp_inst_kind_reduc_chain,
2850 max_tree_size, &limit))
2852 /* Dissolve reduction chain group. */
2853 stmt_vec_info vinfo = first_element;
2854 stmt_vec_info last = NULL;
2855 while (vinfo)
2857 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2858 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2859 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2860 last = vinfo;
2861 vinfo = next;
2863 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2864 /* It can still be vectorized as part of an SLP reduction. */
2865 loop_vinfo->reductions.safe_push (last);
2868 /* Find SLP sequences starting from groups of reductions. */
2869 if (loop_vinfo->reductions.length () > 1)
2870 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2871 slp_inst_kind_reduc_group, max_tree_size,
2872 &limit);
2875 hash_set<slp_tree> visited_patterns;
2876 slp_tree_to_load_perm_map_t perm_cache;
2877 hash_map<slp_tree, slp_tree> load_map;
2879 /* See if any patterns can be found in the SLP tree. */
2880 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2881 if (vect_match_slp_patterns (instance, vinfo, &visited_patterns,
2882 &perm_cache))
2884 slp_tree root = SLP_INSTANCE_TREE (instance);
2885 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2886 &load_map, root);
2887 if (dump_enabled_p ())
2889 dump_printf_loc (MSG_NOTE, vect_location,
2890 "Pattern matched SLP tree\n");
2891 vect_print_slp_graph (MSG_NOTE, vect_location, root);
2897 /* The map keeps a reference on SLP nodes built, release that. */
2898 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2899 it != bst_map->end (); ++it)
2900 if ((*it).second)
2901 vect_free_slp_tree ((*it).second);
2902 delete bst_map;
2904 return opt_result::success ();
2907 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2909 static void
2910 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2911 vec<slp_tree> &vertices, vec<int> &leafs)
2913 unsigned i;
2914 slp_tree child;
2916 if (visited.add (node))
2917 return;
2919 node->vertex = vertices.length ();
2920 vertices.safe_push (node);
2922 bool leaf = true;
2923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2924 if (child)
2926 leaf = false;
2927 vect_slp_build_vertices (visited, child, vertices, leafs);
2929 if (leaf)
2930 leafs.safe_push (node->vertex);
2933 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2935 static void
2936 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2937 vec<int> &leafs)
2939 hash_set<slp_tree> visited;
2940 unsigned i;
2941 slp_instance instance;
2942 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2944 unsigned n_v = vertices.length ();
2945 unsigned n_l = leafs.length ();
2946 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2947 leafs);
2948 /* If we added vertices but no entries to the reverse graph we've
2949 added a cycle that is not backwards-reachable. Push the instance
2950 entry so that it acts as a leaf then. */
2951 if (vertices.length () > n_v
2952 && leafs.length () == n_l)
2953 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2957 /* Apply (reverse) bijective PERM to VEC. */
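/* For example, applying PERM = { 2, 0, 1 } forward sets VEC[i] to the old
   VEC[PERM[i]], turning { v0, v1, v2 } into { v2, v0, v1 }; the reverse
   application stores the old VEC[i] into VEC[PERM[i]] and undoes exactly
   that.  */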
2959 template <class T>
2960 static void
2961 vect_slp_permute (vec<unsigned> perm,
2962 vec<T> &vec, bool reverse)
2964 auto_vec<T, 64> saved;
2965 saved.create (vec.length ());
2966 for (unsigned i = 0; i < vec.length (); ++i)
2967 saved.quick_push (vec[i]);
2969 if (reverse)
2971 for (unsigned i = 0; i < vec.length (); ++i)
2972 vec[perm[i]] = saved[i];
2973 for (unsigned i = 0; i < vec.length (); ++i)
2974 gcc_assert (vec[perm[i]] == saved[i]);
2976 else
2978 for (unsigned i = 0; i < vec.length (); ++i)
2979 vec[i] = saved[perm[i]];
2980 for (unsigned i = 0; i < vec.length (); ++i)
2981 gcc_assert (vec[i] == saved[perm[i]]);
2985 /* Return whether permutations PERM_A and PERM_B as recorded in the
2986 PERMS vector are equal. */
2988 static bool
2989 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2990 int perm_a, int perm_b)
2992 return (perm_a == perm_b
2993 || (perms[perm_a].length () == perms[perm_b].length ()
2994 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2995 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2998 /* Optimize the SLP graph of VINFO. */
3000 void
3001 vect_optimize_slp (vec_info *vinfo)
3003 if (vinfo->slp_instances.is_empty ())
3004 return;
3006 slp_tree node;
3007 unsigned i;
3008 auto_vec<slp_tree> vertices;
3009 auto_vec<int> leafs;
3010 vect_slp_build_vertices (vinfo, vertices, leafs);
3012 struct graph *slpg = new_graph (vertices.length ());
3013 FOR_EACH_VEC_ELT (vertices, i, node)
3015 unsigned j;
3016 slp_tree child;
3017 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3018 if (child)
3019 add_edge (slpg, i, child->vertex);
3022 /* Compute (reverse) postorder on the inverted graph. */
3023 auto_vec<int> ipo;
3024 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3026 auto_sbitmap n_visited (vertices.length ());
3027 auto_sbitmap n_materialize (vertices.length ());
3028 auto_vec<int> n_perm (vertices.length ());
3029 auto_vec<vec<unsigned> > perms;
3031 bitmap_clear (n_visited);
3032 bitmap_clear (n_materialize);
3033 n_perm.quick_grow_cleared (vertices.length ());
3034 perms.safe_push (vNULL); /* zero is no permute */
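  /* n_perm[i] is an index into PERMS giving the lane permutation carried by
     node i: for load leaves the normalized load permutation seeded below,
     for other nodes the value propagated up from their children.  Index 0
     (the vNULL just pushed) means no permutation.  n_materialize marks the
     nodes where a permutation must actually be emitted rather than
     propagated further.  */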
3036 /* Produce initial permutations. */
3037 for (i = 0; i < leafs.length (); ++i)
3039 int idx = leafs[i];
3040 slp_tree node = vertices[idx];
3042 /* Handle externals and constants optimistically throughout the
3043 iteration. */
3044 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3045 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3046 continue;
3048 /* Leafs do not change across iterations. Note leafs also double
3049 as entries to the reverse graph. */
3050 if (!slpg->vertices[idx].succ)
3051 bitmap_set_bit (n_visited, idx);
3052 /* Loads are the only thing generating permutes. */
3053 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3054 continue;
3056 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3057 node unpermuted, record this permute. */
3058 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3059 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3060 continue;
3061 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3062 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3063 bool any_permute = false;
3064 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3066 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3067 imin = MIN (imin, idx);
3068 imax = MAX (imax, idx);
3069 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3070 any_permute = true;
3072 /* If there's no permute no need to split one out. */
3073 if (!any_permute)
3074 continue;
3075 /* If the span doesn't match we'd disrupt VF computation, avoid
3076 that for now. */
3077 if (imax - imin + 1 != SLP_TREE_LANES (node))
3078 continue;
3080 /* For now only handle true permutes, like
3081 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3082 when permuting constants and invariants keeping the permute
3083 bijective. */
3084 auto_sbitmap load_index (SLP_TREE_LANES (node));
3085 bitmap_clear (load_index);
3086 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3087 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3088 unsigned j;
3089 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3090 if (!bitmap_bit_p (load_index, j))
3091 break;
3092 if (j != SLP_TREE_LANES (node))
3093 continue;
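	  /* Record the load permutation rebased so that the smallest loaded
	     element becomes lane 0.  */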
3095 vec<unsigned> perm = vNULL;
3096 perm.safe_grow (SLP_TREE_LANES (node), true);
3097 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3098 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3099 perms.safe_push (perm);
3100 n_perm[idx] = perms.length () - 1;
3103 /* Propagate permutes along the graph and compute materialization points. */
3104 bool changed;
3105 unsigned iteration = 0;
3108 changed = false;
3109 ++iteration;
3111 for (i = vertices.length (); i > 0 ; --i)
3113 int idx = ipo[i-1];
3114 slp_tree node = vertices[idx];
3115 /* For leafs there's nothing to do - we've seeded permutes
3116 on those above. */
3117 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3118 continue;
3120 bitmap_set_bit (n_visited, idx);
3122 /* We cannot move a permute across a store. */
3123 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3124 && DR_IS_WRITE
3125 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3126 continue;
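	  /* Compute the meet of the permutes of all already-visited children:
	     agreeing permutes propagate upwards, while any disagreement or an
	     already-materialized child forces "no permute" (index 0).  */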
3128 int perm = -1;
3129 for (graph_edge *succ = slpg->vertices[idx].succ;
3130 succ; succ = succ->succ_next)
3132 int succ_idx = succ->dest;
3133 /* Handle unvisited nodes optimistically. */
3134 /* ??? But for constants once we want to handle non-bijective
3135 permutes we have to verify the permute, when unifying lanes,
3136 will not unify different constants. For example see
3137 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3138 if (!bitmap_bit_p (n_visited, succ_idx))
3139 continue;
3140 int succ_perm = n_perm[succ_idx];
3141 /* Once we materialize the successor's permutation its output lanes
3142 appear unpermuted to us. */
3143 if (bitmap_bit_p (n_materialize, succ_idx))
3144 succ_perm = 0;
3145 if (perm == -1)
3146 perm = succ_perm;
3147 else if (succ_perm == 0)
3149 perm = 0;
3150 break;
3152 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3154 perm = 0;
3155 break;
3159 if (perm == -1)
3160 /* Pick up pre-computed leaf values. */
3161 perm = n_perm[idx];
3162 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3164 if (iteration > 1)
3165 /* Make sure we eventually converge. */
3166 gcc_checking_assert (perm == 0);
3167 n_perm[idx] = perm;
3168 if (perm == 0)
3169 bitmap_clear_bit (n_materialize, idx);
3170 changed = true;
3173 if (perm == 0)
3174 continue;
3176 /* Elide pruning at materialization points in the first
3177 iteration so every node is visited at least once. */
3178 if (iteration == 1)
3179 continue;
3181 /* Decide on permute materialization. Look whether there's
3182 a use (pred) edge that is permuted differently than us.
3183 In that case mark ourselves so the permutation is applied.
3184 For VEC_PERM_EXPRs the permutation doesn't carry along
3185 from children to parents so force materialization at the
3186 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3187 are a source of an arbitrary permutation again, similar
3188 to constants/externals - that's something we do not yet
3189 optimally handle. */
3190 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3191 && slpg->vertices[idx].pred != NULL);
3192 if (all_preds_permuted)
3193 for (graph_edge *pred = slpg->vertices[idx].pred;
3194 pred; pred = pred->pred_next)
3196 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3197 int pred_perm = n_perm[pred->src];
3198 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3200 all_preds_permuted = false;
3201 break;
3204 if (!all_preds_permuted)
3206 if (!bitmap_bit_p (n_materialize, idx))
3207 changed = true;
3208 bitmap_set_bit (n_materialize, idx);
3212 while (changed || iteration == 1);
3214 /* Materialize. */
3215 for (i = 0; i < vertices.length (); ++i)
3217 int perm = n_perm[i];
3218 if (perm <= 0)
3219 continue;
3221 slp_tree node = vertices[i];
3223 /* First permute invariant/external original successors. */
3224 unsigned j;
3225 slp_tree child;
3226 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3228 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3229 continue;
3231 /* If the vector is uniform there's nothing to do. */
3232 if (vect_slp_tree_uniform_p (child))
3233 continue;
3235 /* We can end up sharing some externals via two_operator
3236 handling. Be prepared to unshare those. */
3237 if (child->refcnt != 1)
3239 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3240 SLP_TREE_CHILDREN (node)[j] = child
3241 = vect_create_new_slp_node
3242 (SLP_TREE_SCALAR_OPS (child).copy ());
3244 vect_slp_permute (perms[perm],
3245 SLP_TREE_SCALAR_OPS (child), true);
3248 if (bitmap_bit_p (n_materialize, i))
3250 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3251 /* For loads simply drop the permutation, the load permutation
3252 already performs the desired permutation. */
3254 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3256 /* If the node is already a permute node we can apply
3257 the permutation to the lane selection, effectively
3258 materializing it on the incoming vectors. */
3259 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_NOTE, vect_location,
3261 "simplifying permute node %p\n",
3262 node);
3264 for (unsigned k = 0;
3265 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3266 SLP_TREE_LANE_PERMUTATION (node)[k].second
3267 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3269 else
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location,
3273 "inserting permute node in place of %p\n",
3274 node);
3276 /* Make a copy of NODE and in-place change it to a
3277 VEC_PERM node to permute the lanes of the copy. */
3278 slp_tree copy = new _slp_tree;
3279 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3280 SLP_TREE_CHILDREN (node) = vNULL;
3281 SLP_TREE_SCALAR_STMTS (copy)
3282 = SLP_TREE_SCALAR_STMTS (node).copy ();
3283 vect_slp_permute (perms[perm],
3284 SLP_TREE_SCALAR_STMTS (copy), true);
3285 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3286 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3287 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3288 SLP_TREE_LANE_PERMUTATION (copy)
3289 = SLP_TREE_LANE_PERMUTATION (node);
3290 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3291 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3292 copy->refcnt = 1;
3293 copy->max_nunits = node->max_nunits;
3294 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3295 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3296 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3298 /* Now turn NODE into a VEC_PERM. */
3299 SLP_TREE_CHILDREN (node).safe_push (copy);
3300 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3301 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3302 SLP_TREE_LANE_PERMUTATION (node)
3303 .quick_push (std::make_pair (0, perms[perm][j]));
3304 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3307 else
3309 /* Apply the reverse permutation to our stmts. */
3310 vect_slp_permute (perms[perm],
3311 SLP_TREE_SCALAR_STMTS (node), true);
3312 /* And to the load permutation, which we can simply
3313 make regular by design. */
3314 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3316 /* ??? When we handle non-bijective permutes the idea
3317 is that we can force the load-permutation to be
3318 { min, min + 1, min + 2, ... max }. But then the
3319 scalar defs might no longer match the lane content
3320 which means wrong-code with live lane vectorization.
3321 So we possibly have to have NULL entries for those. */
3322 vect_slp_permute (perms[perm],
3323 SLP_TREE_LOAD_PERMUTATION (node), true);
3328 /* Free the perms vector used for propagation. */
3329 while (!perms.is_empty ())
3330 perms.pop ().release ();
3331 free_graph (slpg);
3334 /* Now elide load permutations that are not necessary. */
3335 for (i = 0; i < leafs.length (); ++i)
3337 node = vertices[leafs[i]];
3338 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3339 continue;
3341 /* In basic block vectorization we allow any subchain of an interleaving
3342 chain.
3343 FORNOW: not in loop SLP because of realignment complications. */
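      /* A subchain means the scalar loads of the node follow each other
	 directly in the interleaving chain, each being the
	 DR_GROUP_NEXT_ELEMENT of the previous one with no extra gap; the
	 vectorized load then already produces the lanes in this order and
	 the explicit permutation can be dropped.  */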
3344 if (is_a <bb_vec_info> (vinfo))
3346 bool subchain_p = true;
3347 stmt_vec_info next_load_info = NULL;
3348 stmt_vec_info load_info;
3349 unsigned j;
3350 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3352 if (j != 0
3353 && (next_load_info != load_info
3354 || DR_GROUP_GAP (load_info) != 1))
3356 subchain_p = false;
3357 break;
3359 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3361 if (subchain_p)
3363 SLP_TREE_LOAD_PERMUTATION (node).release ();
3364 continue;
3367 else
3369 stmt_vec_info load_info;
3370 bool this_load_permuted = false;
3371 unsigned j;
3372 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3373 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3375 this_load_permuted = true;
3376 break;
3378 stmt_vec_info first_stmt_info
3379 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3380 if (!this_load_permuted
3381 /* The load requires permutation when unrolling exposes
3382 a gap either because the group is larger than the SLP
3383 group-size or because there is a gap between the groups. */
3384 && (known_eq (LOOP_VINFO_VECT_FACTOR
3385 (as_a <loop_vec_info> (vinfo)), 1U)
3386 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3387 && DR_GROUP_GAP (first_stmt_info) == 0)))
3389 SLP_TREE_LOAD_PERMUTATION (node).release ();
3390 continue;
3396 /* Gather loads reachable from the individual SLP graph entries. */
3398 void
3399 vect_gather_slp_loads (vec_info *vinfo)
3401 unsigned i;
3402 slp_instance instance;
3403 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3405 hash_set<slp_tree> visited;
3406 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3407 SLP_INSTANCE_TREE (instance), visited);
3412 /* For each possible SLP instance decide whether to SLP it and calculate overall
3413 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3414 least one instance. */
3416 bool
3417 vect_make_slp_decision (loop_vec_info loop_vinfo)
3419 unsigned int i;
3420 poly_uint64 unrolling_factor = 1;
3421 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3422 slp_instance instance;
3423 int decided_to_slp = 0;
3425 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3427 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3429 /* FORNOW: SLP if you can. */
3430 /* All unroll factors have the form:
3432 GET_MODE_SIZE (vinfo->vector_mode) * X
3434 for some rational X, so they must have a common multiple. */
3435 unrolling_factor
3436 = force_common_multiple (unrolling_factor,
3437 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3439 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3440 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3441 loop-based vectorization. Such stmts will be marked as HYBRID. */
3442 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3443 decided_to_slp++;
3446 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3448 if (decided_to_slp && dump_enabled_p ())
3450 dump_printf_loc (MSG_NOTE, vect_location,
3451 "Decided to SLP %d instances. Unrolling factor ",
3452 decided_to_slp);
3453 dump_dec (MSG_NOTE, unrolling_factor);
3454 dump_printf (MSG_NOTE, "\n");
3457 return (decided_to_slp > 0);
3460 /* Private data for vect_detect_hybrid_slp. */
3461 struct vdhs_data
3463 loop_vec_info loop_vinfo;
3464 vec<stmt_vec_info> *worklist;
3467 /* Walker for walk_gimple_op. */
3469 static tree
3470 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3472 walk_stmt_info *wi = (walk_stmt_info *)data;
3473 vdhs_data *dat = (vdhs_data *)wi->info;
3475 if (wi->is_lhs)
3476 return NULL_TREE;
3478 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3479 if (!def_stmt_info)
3480 return NULL_TREE;
3481 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3482 if (PURE_SLP_STMT (def_stmt_info))
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3486 def_stmt_info->stmt);
3487 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3488 dat->worklist->safe_push (def_stmt_info);
3491 return NULL_TREE;
3494 /* Check whether STMT_INFO is indirectly consumed by SLP and if so mark it
3495 pure_slp; otherwise push it to WORKLIST. */
3497 static void
3498 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3499 vec<stmt_vec_info> &worklist,
3500 stmt_vec_info stmt_info)
3502 if (dump_enabled_p ())
3503 dump_printf_loc (MSG_NOTE, vect_location,
3504 "Processing hybrid candidate : %G", stmt_info->stmt);
3505 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3506 imm_use_iterator iter2;
3507 ssa_op_iter iter1;
3508 use_operand_p use_p;
3509 def_operand_p def_p;
3510 bool any_def = false;
3511 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3513 any_def = true;
3514 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3516 if (is_gimple_debug (USE_STMT (use_p)))
3517 continue;
3518 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3519 /* An out-of-loop use means this is a loop_vect sink. */
3520 if (!use_info)
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE, vect_location,
3524 "Found loop_vect sink: %G", stmt_info->stmt);
3525 worklist.safe_push (stmt_info);
3526 return;
3528 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3530 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_NOTE, vect_location,
3532 "Found loop_vect use: %G", use_info->stmt);
3533 worklist.safe_push (stmt_info);
3534 return;
3538 /* No def means this is a loop_vect sink. */
3539 if (!any_def)
3541 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "Found loop_vect sink: %G", stmt_info->stmt);
3544 worklist.safe_push (stmt_info);
3545 return;
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3550 STMT_SLP_TYPE (stmt_info) = pure_slp;
3553 /* Find stmts that must be both vectorized and SLPed. */
3555 void
3556 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3558 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3560 /* All stmts participating in SLP are marked pure_slp, all other
3561 stmts are loop_vect.
3562 First collect all loop_vect stmts into a worklist.
3563 SLP patterns cause not all original scalar stmts to appear in
3564 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3565 Rectify this here and do a backward walk over the IL only considering
3566 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3567 mark them as pure_slp. */
3568 auto_vec<stmt_vec_info> worklist;
3569 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3571 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3572 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3573 gsi_next (&gsi))
3575 gphi *phi = gsi.phi ();
3576 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3577 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3578 maybe_push_to_hybrid_worklist (loop_vinfo,
3579 worklist, stmt_info);
3581 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3582 gsi_prev (&gsi))
3584 gimple *stmt = gsi_stmt (gsi);
3585 if (is_gimple_debug (stmt))
3586 continue;
3587 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3588 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3590 for (gimple_stmt_iterator gsi2
3591 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3592 !gsi_end_p (gsi2); gsi_next (&gsi2))
3594 stmt_vec_info patt_info
3595 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3596 if (!STMT_SLP_TYPE (patt_info)
3597 && STMT_VINFO_RELEVANT (patt_info))
3598 maybe_push_to_hybrid_worklist (loop_vinfo,
3599 worklist, patt_info);
3601 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3603 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3604 maybe_push_to_hybrid_worklist (loop_vinfo,
3605 worklist, stmt_info);
3609 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3610 mark any SLP vectorized stmt as hybrid.
3611 ??? We're visiting def stmts N times (once for each non-SLP and
3612 once for each hybrid-SLP use). */
3613 walk_stmt_info wi;
3614 vdhs_data dat;
3615 dat.worklist = &worklist;
3616 dat.loop_vinfo = loop_vinfo;
3617 memset (&wi, 0, sizeof (wi));
3618 wi.info = (void *)&dat;
3619 while (!worklist.is_empty ())
3621 stmt_vec_info stmt_info = worklist.pop ();
3622 /* Since SSA operands are not set up for pattern stmts we need
3623 to use walk_gimple_op. */
3624 wi.is_lhs = 0;
3625 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3630 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3632 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3633 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3635 for (unsigned i = 0; i < bbs.length (); ++i)
3637 if (i != 0)
3638 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3639 gsi_next (&si))
3641 gphi *phi = si.phi ();
3642 gimple_set_uid (phi, 0);
3643 add_stmt (phi);
3645 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3646 !gsi_end_p (gsi); gsi_next (&gsi))
3648 gimple *stmt = gsi_stmt (gsi);
3649 gimple_set_uid (stmt, 0);
3650 if (is_gimple_debug (stmt))
3651 continue;
3652 add_stmt (stmt);
3658 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3659 stmts in the basic block. */
3661 _bb_vec_info::~_bb_vec_info ()
3663 /* Reset region marker. */
3664 for (unsigned i = 0; i < bbs.length (); ++i)
3666 if (i != 0)
3667 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3668 gsi_next (&si))
3670 gphi *phi = si.phi ();
3671 gimple_set_uid (phi, -1);
3673 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3674 !gsi_end_p (gsi); gsi_next (&gsi))
3676 gimple *stmt = gsi_stmt (gsi);
3677 gimple_set_uid (stmt, -1);
3681 for (unsigned i = 0; i < roots.length (); ++i)
3682 roots[i].stmts.release ();
3683 roots.release ();
3686 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3687 given that child nodes have already been processed, and that
3688 their def types currently match their SLP node's def type. */
3690 static bool
3691 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3692 slp_instance node_instance,
3693 stmt_vector_for_cost *cost_vec)
3695 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3696 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3698 /* Calculate the number of vector statements to be created for the
3699 scalar stmts in this node. For SLP reductions it is equal to the
3700 number of vector statements in the children (which has already been
3701 calculated by the recursive call). Otherwise it is the number of
3702 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3703 VF divided by the number of elements in a vector. */
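/* As an illustration with invented numbers: for a node with two lanes,
 a vectorization factor of 8 and V4SI vectors this computes
 2 * 8 / 4 = 4 vector statements. */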
3704 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3705 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3707 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3708 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3710 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3711 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3712 break;
3715 else
3717 poly_uint64 vf;
3718 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3719 vf = loop_vinfo->vectorization_factor;
3720 else
3721 vf = 1;
3722 unsigned int group_size = SLP_TREE_LANES (node);
3723 tree vectype = SLP_TREE_VECTYPE (node);
3724 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3725 = vect_get_num_vectors (vf * group_size, vectype);
3728 /* Handle purely internal nodes. */
3729 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3730 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3732 if (is_a <bb_vec_info> (vinfo)
3733 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "desired vector type conflicts with earlier one "
3738 "for %G", stmt_info->stmt);
3739 return false;
3742 bool dummy;
3743 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3744 node, node_instance, cost_vec);
3747 /* Try to build NODE from scalars, returning true on success.
3748 NODE_INSTANCE is the SLP instance that contains NODE. */
3750 static bool
3751 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3752 slp_instance node_instance)
3754 stmt_vec_info stmt_info;
3755 unsigned int i;
3757 if (!is_a <bb_vec_info> (vinfo)
3758 || node == SLP_INSTANCE_TREE (node_instance)
3759 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3760 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3761 return false;
3763 if (dump_enabled_p ())
3764 dump_printf_loc (MSG_NOTE, vect_location,
3765 "Building vector operands of %p from scalars instead\n", node);
3767 /* Don't remove and free the child nodes here, since they could be
3768 referenced by other structures. The analysis and scheduling phases
3769 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3770 unsigned int group_size = SLP_TREE_LANES (node);
3771 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3772 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3773 SLP_TREE_LOAD_PERMUTATION (node).release ();
3774 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3776 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3777 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3779 return true;
3782 /* Compute the prologue cost for invariant or constant operands represented
3783 by NODE. */
3785 static void
3786 vect_prologue_cost_for_slp (slp_tree node,
3787 stmt_vector_for_cost *cost_vec)
3789 /* There's a special case of an existing vector, which costs nothing. */
3790 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3791 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3792 return;
3793 /* Without looking at the actual initializer a vector of
3794 constants can be implemented as load from the constant pool.
3795 When all elements are the same we can use a splat. */
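/* Illustrative examples (names invented): a constant node {1, 2, 3, 4}
 is costed below as a vector_load from the constant pool, an external
 node {x_1, x_1, x_1, x_1} as a scalar_to_vec splat and an external
 node {x_1, y_2, x_1, y_2} as a vec_construct. */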
3796 tree vectype = SLP_TREE_VECTYPE (node);
3797 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3798 unsigned num_vects_to_check;
3799 unsigned HOST_WIDE_INT const_nunits;
3800 unsigned nelt_limit;
3801 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3802 && ! multiple_p (const_nunits, group_size))
3804 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3805 nelt_limit = const_nunits;
3807 else
3809 /* If either the vector has variable length or the vectors
3810 are composed of repeated whole groups we only need to
3811 cost construction once. All vectors will be the same. */
3812 num_vects_to_check = 1;
3813 nelt_limit = group_size;
3815 tree elt = NULL_TREE;
3816 unsigned nelt = 0;
3817 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3819 unsigned si = j % group_size;
3820 if (nelt == 0)
3821 elt = SLP_TREE_SCALAR_OPS (node)[si];
3822 /* ??? We're just tracking whether all operands of a single
3823 vector initializer are the same; ideally we'd check if
3824 we emitted the same one already. */
3825 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3826 elt = NULL_TREE;
3827 nelt++;
3828 if (nelt == nelt_limit)
3830 record_stmt_cost (cost_vec, 1,
3831 SLP_TREE_DEF_TYPE (node) == vect_external_def
3832 ? (elt ? scalar_to_vec : vec_construct)
3833 : vector_load,
3834 NULL, vectype, 0, vect_prologue);
3835 nelt = 0;
3840 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3841 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3843 Return true if the operations are supported. */
3845 static bool
3846 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3847 slp_instance node_instance,
3848 hash_set<slp_tree> &visited_set,
3849 vec<slp_tree> &visited_vec,
3850 stmt_vector_for_cost *cost_vec)
3852 int i, j;
3853 slp_tree child;
3855 /* Assume we can code-generate all invariants. */
3856 if (!node
3857 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3858 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3859 return true;
3861 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 "Failed cyclic SLP reference in %p", node);
3866 return false;
3868 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3870 /* If we already analyzed the exact same set of scalar stmts we're done.
3871 We share the generated vector stmts for those. */
3872 if (visited_set.add (node))
3873 return true;
3874 visited_vec.safe_push (node);
3876 bool res = true;
3877 unsigned visited_rec_start = visited_vec.length ();
3878 unsigned cost_vec_rec_start = cost_vec->length ();
3879 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3881 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3882 visited_set, visited_vec,
3883 cost_vec);
3884 if (!res)
3885 break;
3888 if (res)
3889 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3890 cost_vec);
3891 /* If analysis failed we have to pop all recursive visited nodes
3892 plus ourselves. */
3893 if (!res)
3895 while (visited_vec.length () >= visited_rec_start)
3896 visited_set.remove (visited_vec.pop ());
3897 cost_vec->truncate (cost_vec_rec_start);
3900 /* When the node can be vectorized, cost the invariant nodes it references.
3901 This is not done in DFS order to allow the referring node's
3902 vectorizable_* calls to nail down the invariant node's vector type
3903 and possibly unshare it if it needs a different vector type than
3904 other referrers. */
3905 if (res)
3906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3907 if (child
3908 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3909 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3910 /* Perform usual caching; note that code-generation still
3911 code-gens these nodes multiple times but we expect
3912 to CSE them later. */
3913 && !visited_set.add (child))
3915 visited_vec.safe_push (child);
3916 /* ??? After auditing more code paths make a "default"
3917 and push the vector type from NODE to all children
3918 if it is not already set. */
3919 /* Compute the number of vectors to be generated. */
3920 tree vector_type = SLP_TREE_VECTYPE (child);
3921 if (!vector_type)
3923 /* For shifts with a scalar argument we don't need
3924 to cost or code-generate anything.
3925 ??? Represent this more explicitly. */
3926 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3927 == shift_vec_info_type)
3928 && j == 1);
3929 continue;
3931 unsigned group_size = SLP_TREE_LANES (child);
3932 poly_uint64 vf = 1;
3933 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3934 vf = loop_vinfo->vectorization_factor;
3935 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3936 = vect_get_num_vectors (vf * group_size, vector_type);
3937 /* And cost them. */
3938 vect_prologue_cost_for_slp (child, cost_vec);
3941 /* If this node or any of its children can't be vectorized, try pruning
3942 the tree here rather than felling the whole thing. */
3943 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3945 /* We'll need to revisit this for invariant costing and number
3946 of vectorized stmt setting. */
3947 res = true;
3950 return res;
3954 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3955 region and that can be vectorized using vectorizable_live_operation
3956 with STMT_VINFO_LIVE_P. Live operations that are not handled cause
3957 the scalar code computing them to be retained. */
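/* For example (purely illustrative), if lane 2 of a vectorized node is
 also used by a scalar stmt outside the SLP region, that lane is marked
 live and a lane extract is emitted for the external use instead of
 keeping the whole scalar computation. */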
3959 static void
3960 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3961 slp_instance instance,
3962 stmt_vector_for_cost *cost_vec,
3963 hash_set<stmt_vec_info> &svisited,
3964 hash_set<slp_tree> &visited)
3966 if (visited.add (node))
3967 return;
3969 unsigned i;
3970 stmt_vec_info stmt_info;
3971 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3972 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3974 if (svisited.contains (stmt_info))
3975 continue;
3976 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3977 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3978 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3979 /* Only the pattern root stmt computes the original scalar value. */
3980 continue;
3981 bool mark_visited = true;
3982 gimple *orig_stmt = orig_stmt_info->stmt;
3983 ssa_op_iter op_iter;
3984 def_operand_p def_p;
3985 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3987 imm_use_iterator use_iter;
3988 gimple *use_stmt;
3989 stmt_vec_info use_stmt_info;
3990 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3991 if (!is_gimple_debug (use_stmt))
3993 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3994 if (!use_stmt_info
3995 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3997 STMT_VINFO_LIVE_P (stmt_info) = true;
3998 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3999 NULL, node, instance, i,
4000 false, cost_vec))
4001 /* ??? So we know we can vectorize the live stmt
4002 from one SLP node. If we cannot do so from all
4003 or none consistently we'd have to record which
4004 SLP node (and lane) we want to use for the live
4005 operation. So make sure we can code-generate
4006 from all nodes. */
4007 mark_visited = false;
4008 else
4009 STMT_VINFO_LIVE_P (stmt_info) = false;
4010 break;
4013 /* We have to verify whether we can insert the lane extract
4014 before all uses. The following is a conservative approximation.
4015 We cannot put this into vectorizable_live_operation because
4016 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4017 doesn't work.
4018 Note that while the fact that we emit code for loads at the
4019 first load should make this a non-problem, leaves we construct
4020 from scalars are vectorized after the last scalar def.
4021 ??? If we'd actually compute the insert location during
4022 analysis we could use sth less conservative than the last
4023 scalar stmt in the node for the dominance check. */
4024 /* ??? What remains is "live" uses in vector CTORs in the same
4025 SLP graph which is where those uses can end up code-generated
4026 right after their definition instead of close to their original
4027 use. But that would restrict us to code-generate lane-extracts
4028 from the latest stmt in a node. So we compensate for this
4029 during code-generation, simply not replacing uses for those
4030 hopefully rare cases. */
4031 if (STMT_VINFO_LIVE_P (stmt_info))
4032 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4033 if (!is_gimple_debug (use_stmt)
4034 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4035 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4036 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4038 if (dump_enabled_p ())
4039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4040 "Cannot determine insertion place for "
4041 "lane extract\n");
4042 STMT_VINFO_LIVE_P (stmt_info) = false;
4043 mark_visited = true;
4046 if (mark_visited)
4047 svisited.add (stmt_info);
4050 slp_tree child;
4051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4052 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4053 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4054 cost_vec, svisited, visited);
4057 /* Analyze statements in SLP instances of VINFO. Return true if the
4058 operations are supported. */
4060 bool
4061 vect_slp_analyze_operations (vec_info *vinfo)
4063 slp_instance instance;
4064 int i;
4066 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4068 hash_set<slp_tree> visited;
4069 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4071 auto_vec<slp_tree> visited_vec;
4072 stmt_vector_for_cost cost_vec;
4073 cost_vec.create (2);
4074 if (is_a <bb_vec_info> (vinfo))
4075 vect_location = instance->location ();
4076 if (!vect_slp_analyze_node_operations (vinfo,
4077 SLP_INSTANCE_TREE (instance),
4078 instance, visited, visited_vec,
4079 &cost_vec)
4080 /* Instances with a root stmt require vectorized defs for the
4081 SLP tree root. */
4082 || (SLP_INSTANCE_ROOT_STMT (instance)
4083 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4084 != vect_internal_def)))
4086 slp_tree node = SLP_INSTANCE_TREE (instance);
4087 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4088 if (dump_enabled_p ())
4089 dump_printf_loc (MSG_NOTE, vect_location,
4090 "removing SLP instance operations starting from: %G",
4091 stmt_info->stmt);
4092 vect_free_slp_instance (instance);
4093 vinfo->slp_instances.ordered_remove (i);
4094 cost_vec.release ();
4095 while (!visited_vec.is_empty ())
4096 visited.remove (visited_vec.pop ());
4098 else
4100 i++;
4102 /* For BB vectorization remember the SLP graph entry
4103 cost for later. */
4104 if (is_a <bb_vec_info> (vinfo))
4105 instance->cost_vec = cost_vec;
4106 else
4108 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4109 cost_vec.release ();
4114 /* Compute vectorizable live stmts. */
4115 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4117 hash_set<stmt_vec_info> svisited;
4118 hash_set<slp_tree> visited;
4119 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4121 vect_location = instance->location ();
4122 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4123 instance, &instance->cost_vec, svisited,
4124 visited);
4128 return !vinfo->slp_instances.is_empty ();
4131 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
4132 collapsing any chain of leaders on the way. */
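/* For example, given a recorded leader chain C -> B -> A with A mapping
 to itself, looking up C returns A and rewrites the map so that both C
 and B point directly at A, similar to path compression in union-find. */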
4134 static slp_instance
4135 get_ultimate_leader (slp_instance instance,
4136 hash_map<slp_instance, slp_instance> &instance_leader)
4138 auto_vec<slp_instance *, 8> chain;
4139 slp_instance *tem;
4140 while (*(tem = instance_leader.get (instance)) != instance)
4142 chain.safe_push (tem);
4143 instance = *tem;
4145 while (!chain.is_empty ())
4146 *chain.pop () = instance;
4147 return instance;
4150 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4152 static void
4153 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4154 slp_instance instance, slp_tree node,
4155 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4156 hash_map<slp_instance, slp_instance> &instance_leader,
4157 hash_set<slp_tree> &visited)
4159 stmt_vec_info stmt_info;
4160 unsigned i;
4162 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4164 bool existed_p;
4165 slp_instance &stmt_instance
4166 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4167 if (!existed_p)
4169 else if (stmt_instance != instance)
4171 /* If we're running into a previously marked stmt make us the
4172 leader of the current ultimate leader. This keeps the
4173 leader chain acyclic and works even when the current instance
4174 connects two previously independent graph parts. */
4175 slp_instance stmt_leader
4176 = get_ultimate_leader (stmt_instance, instance_leader);
4177 if (stmt_leader != instance)
4178 instance_leader.put (stmt_leader, instance);
4180 stmt_instance = instance;
4183 if (visited.add (node))
4184 return;
4186 slp_tree child;
4187 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4188 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4189 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4190 instance_leader, visited);
4193 /* Partition the SLP graph into pieces that can be costed independently. */
4195 static void
4196 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4198 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4200 /* First walk the SLP graph assigning each involved scalar stmt a
4201 corresponding SLP graph entry and upon visiting a previously
4202 marked stmt, make the stmt's leader the current SLP graph entry. */
4203 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4204 hash_map<slp_instance, slp_instance> instance_leader;
4205 hash_set<slp_tree> visited;
4206 slp_instance instance;
4207 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4209 instance_leader.put (instance, instance);
4210 vect_bb_partition_graph_r (bb_vinfo,
4211 instance, SLP_INSTANCE_TREE (instance),
4212 stmt_to_instance, instance_leader,
4213 visited);
4216 /* Then collect entries to each independent subgraph. */
4217 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4219 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4220 leader->subgraph_entries.safe_push (instance);
4221 if (dump_enabled_p ()
4222 && leader != instance)
4223 dump_printf_loc (MSG_NOTE, vect_location,
4224 "instance %p is leader of %p\n",
4225 leader, instance);
4229 /* Compute the scalar cost of the SLP node NODE and its children
4230 and return it. Do not account defs that are marked in LIFE and
4231 update LIFE according to uses of NODE. */
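/* For example (illustrative), if lane 1 of NODE has a use outside of the
 SLP graph then LIFE[1] is set and neither that scalar stmt nor the defs
 it requires from the children are counted in the scalar cost, making
 vectorization look relatively more expensive since that scalar code has
 to be retained anyway. */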
4233 static void
4234 vect_bb_slp_scalar_cost (vec_info *vinfo,
4235 slp_tree node, vec<bool, va_heap> *life,
4236 stmt_vector_for_cost *cost_vec,
4237 hash_set<slp_tree> &visited)
4239 unsigned i;
4240 stmt_vec_info stmt_info;
4241 slp_tree child;
4243 if (visited.add (node))
4244 return;
4246 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4248 ssa_op_iter op_iter;
4249 def_operand_p def_p;
4251 if ((*life)[i])
4252 continue;
4254 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4255 gimple *orig_stmt = orig_stmt_info->stmt;
4257 /* If there is a non-vectorized use of the defs then the scalar
4258 stmt is kept live in which case we do not account it or any
4259 required defs in the SLP children in the scalar cost. This
4260 way we make the vectorization more costly when compared to
4261 the scalar cost. */
4262 if (!STMT_VINFO_LIVE_P (stmt_info))
4264 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4266 imm_use_iterator use_iter;
4267 gimple *use_stmt;
4268 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4269 if (!is_gimple_debug (use_stmt))
4271 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4272 if (!use_stmt_info
4273 || !PURE_SLP_STMT
4274 (vect_stmt_to_vectorize (use_stmt_info)))
4276 (*life)[i] = true;
4277 break;
4281 if ((*life)[i])
4282 continue;
4285 /* Count scalar stmts only once. */
4286 if (gimple_visited_p (orig_stmt))
4287 continue;
4288 gimple_set_visited (orig_stmt, true);
4290 vect_cost_for_stmt kind;
4291 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4293 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4294 kind = scalar_load;
4295 else
4296 kind = scalar_store;
4298 else if (vect_nop_conversion_p (orig_stmt_info))
4299 continue;
4300 /* For single-argument PHIs assume coalescing which means zero cost
4301 for the scalar and the vector PHIs. This avoids artificially
4302 favoring the vector path (but may pessimize it in some cases). */
4303 else if (is_a <gphi *> (orig_stmt_info->stmt)
4304 && gimple_phi_num_args
4305 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
4306 continue;
4307 else
4308 kind = scalar_stmt;
4309 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4310 SLP_TREE_VECTYPE (node), 0, vect_body);
4313 auto_vec<bool, 20> subtree_life;
4314 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4316 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4318 /* Do not directly pass LIFE to the recursive call, copy it to
4319 confine changes in the callee to the current child/subtree. */
4320 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4322 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4323 for (unsigned j = 0;
4324 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4326 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4327 if (perm.first == i)
4328 subtree_life[perm.second] = (*life)[j];
4331 else
4333 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4334 subtree_life.safe_splice (*life);
4336 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4337 visited);
4338 subtree_life.truncate (0);
4343 /* Comparator for the loop-index sorted cost vectors. */
4345 static int
4346 li_cost_vec_cmp (const void *a_, const void *b_)
4348 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
4349 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
4350 if (a->first < b->first)
4351 return -1;
4352 else if (a->first == b->first)
4353 return 0;
4354 return 1;
4357 /* Check if vectorization of the basic block is profitable for the
4358 subgraph denoted by SLP_INSTANCES. */
4360 static bool
4361 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4362 vec<slp_instance> slp_instances)
4364 slp_instance instance;
4365 int i;
4366 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4367 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4369 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
4372 hash_set<slp_tree> visited;
4373 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4374 vect_print_slp_graph (MSG_NOTE, vect_location,
4375 SLP_INSTANCE_TREE (instance), visited);
4378 /* Calculate scalar cost and sum the cost for the vector stmts
4379 previously collected. */
4380 stmt_vector_for_cost scalar_costs = vNULL;
4381 stmt_vector_for_cost vector_costs = vNULL;
4382 hash_set<slp_tree> visited;
4383 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4385 auto_vec<bool, 20> life;
4386 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4387 true);
4388 if (SLP_INSTANCE_ROOT_STMT (instance))
4389 record_stmt_cost (&scalar_costs, 1, scalar_stmt,
4390 SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
4391 vect_bb_slp_scalar_cost (bb_vinfo,
4392 SLP_INSTANCE_TREE (instance),
4393 &life, &scalar_costs, visited);
4394 vector_costs.safe_splice (instance->cost_vec);
4395 instance->cost_vec.release ();
4397 /* Unset visited flag. */
4398 stmt_info_for_cost *cost;
4399 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4400 gimple_set_visited (cost->stmt_info->stmt, false);
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4405 /* When costing non-loop vectorization we need to consider each covered
4406 loop independently and make sure vectorization is profitable. For
4407 now we assume a loop may not be entered or may execute an arbitrary
4408 number of iterations (??? static information can provide more
4409 precise info here), which means we can simply cost each containing
4410 loop's stmts separately. */
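/* Sketch with invented numbers: if the region spans loop 1 and loop 2,
 the scalar and vector costs collected above are summed and compared
 separately for the stmts of loop 1 and for the stmts of loop 2, and the
 whole subgraph is rejected as soon as one part is not profitable. */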
4412 /* First produce cost vectors sorted by loop index. */
4413 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4414 li_scalar_costs (scalar_costs.length ());
4415 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4416 li_vector_costs (vector_costs.length ());
4417 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4419 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4420 li_scalar_costs.quick_push (std::make_pair (l, cost));
4422 /* Use an arbitrary used loop as fallback in case the first vector_costs
4423 entry does not have a stmt_info associated with it. */
4424 unsigned l = li_scalar_costs[0].first;
4425 FOR_EACH_VEC_ELT (vector_costs, i, cost)
4427 /* We inherit the loop index from the previous COST; invariants, externals
4428 and extracts immediately follow the cost for the related stmt. */
4429 if (cost->stmt_info)
4430 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4431 li_vector_costs.quick_push (std::make_pair (l, cost));
4433 li_scalar_costs.qsort (li_cost_vec_cmp);
4434 li_vector_costs.qsort (li_cost_vec_cmp);
4436 /* Now cost the portions individually. */
4437 unsigned vi = 0;
4438 unsigned si = 0;
4439 while (si < li_scalar_costs.length ()
4440 && vi < li_vector_costs.length ())
4442 unsigned sl = li_scalar_costs[si].first;
4443 unsigned vl = li_vector_costs[vi].first;
4444 if (sl != vl)
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "Scalar %d and vector %d loop part do not "
4449 "match up, skipping scalar part\n", sl, vl);
4450 /* Skip the scalar part, assuming zero cost on the vector side. */
4453 si++;
4455 while (si < li_scalar_costs.length ()
4456 && li_scalar_costs[si].first == sl);
4457 continue;
4460 void *scalar_target_cost_data = init_cost (NULL);
4463 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
4464 li_scalar_costs[si].second);
4465 si++;
4467 while (si < li_scalar_costs.length ()
4468 && li_scalar_costs[si].first == sl);
4469 unsigned dummy;
4470 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4471 destroy_cost_data (scalar_target_cost_data);
4473 /* Complete the target-specific vector cost calculation. */
4474 void *vect_target_cost_data = init_cost (NULL);
4477 add_stmt_cost (bb_vinfo, vect_target_cost_data,
4478 li_vector_costs[vi].second);
4479 vi++;
4481 while (vi < li_vector_costs.length ()
4482 && li_vector_costs[vi].first == vl);
4483 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4484 &vec_inside_cost, &vec_epilogue_cost);
4485 destroy_cost_data (vect_target_cost_data);
4487 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4489 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE, vect_location,
4492 "Cost model analysis for part in loop %d:\n", sl);
4493 dump_printf (MSG_NOTE, " Vector cost: %d\n",
4494 vec_inside_cost + vec_outside_cost);
4495 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
4498 /* Vectorization is profitable if its cost is no more than the cost of the
4499 scalar version. Note that we err on the vector side for equal cost
4500 because the cost estimate is otherwise quite pessimistic (constant uses
4501 are free on the scalar side but cost a load on the vector side for
4502 example). */
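/* Worked example with invented numbers: vec_inside_cost 6,
 vec_prologue_cost 1 and vec_epilogue_cost 1 give a vector cost of 8;
 a scalar_cost of 8 lets this part pass while a scalar_cost of 7 makes
 us give up on the whole subgraph. */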
4503 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4505 scalar_costs.release ();
4506 vector_costs.release ();
4507 return false;
4510 if (vi < li_vector_costs.length ())
4512 if (dump_enabled_p ())
4513 dump_printf_loc (MSG_NOTE, vect_location,
4514 "Excess vector cost for part in loop %d:\n",
4515 li_vector_costs[vi].first);
4516 scalar_costs.release ();
4517 vector_costs.release ();
4518 return false;
4521 scalar_costs.release ();
4522 vector_costs.release ();
4523 return true;
4526 /* qsort comparator for lane defs. */
4528 static int
4529 vld_cmp (const void *a_, const void *b_)
4531 auto *a = (const std::pair<unsigned, tree> *)a_;
4532 auto *b = (const std::pair<unsigned, tree> *)b_;
4533 return a->first - b->first;
4536 /* Return true if USE_STMT is a vector lane insert into VEC and set
4537 *THIS_LANE to the lane number that is set. */
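/* A matching insert looks like (SSA names invented for illustration)
 vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 32>;
 where for a vector of 32-bit elements the bit position 32 denotes
 lane 1. */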
4539 static bool
4540 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4542 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4543 if (!use_ass
4544 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4545 || (vec
4546 ? gimple_assign_rhs1 (use_ass) != vec
4547 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4548 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4549 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4550 || !constant_multiple_p
4551 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4552 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4553 this_lane))
4554 return false;
4555 return true;
4558 /* Find any vectorizable constructors and add them to the grouped_stores
4559 array. */
4561 static void
4562 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4564 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4565 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4566 !gsi_end_p (gsi); gsi_next (&gsi))
4568 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4569 if (!assign)
4570 continue;
4572 tree rhs = gimple_assign_rhs1 (assign);
4573 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4575 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4576 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4577 CONSTRUCTOR_NELTS (rhs))
4578 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4579 || uniform_vector_p (rhs))
4580 continue;
4582 unsigned j;
4583 tree val;
4584 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4585 if (TREE_CODE (val) != SSA_NAME
4586 || !bb_vinfo->lookup_def (val))
4587 break;
4588 if (j != CONSTRUCTOR_NELTS (rhs))
4589 continue;
4591 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4592 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4594 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4595 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4596 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4597 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4598 && integer_zerop (gimple_assign_rhs3 (assign))
4599 && useless_type_conversion_p
4600 (TREE_TYPE (TREE_TYPE (rhs)),
4601 TREE_TYPE (gimple_assign_rhs2 (assign)))
4602 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4604 /* We start to match on insert to lane zero but since the
4605 inserts need not be ordered we'd have to search both
4606 the def and the use chains. */
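/* The use chain walked below looks like (illustrative SSA names, 32-bit
 elements assumed)
 v_1 = BIT_INSERT_EXPR <v_0, s_0, 0>;
 v_2 = BIT_INSERT_EXPR <v_1, s_1, 32>;
 v_3 = BIT_INSERT_EXPR <v_2, s_2, 64>;
 v_4 = BIT_INSERT_EXPR <v_3, s_3, 96>;
 with the last insert becoming the root stmt of the recorded group. */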
4607 tree vectype = TREE_TYPE (rhs);
4608 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4609 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4610 auto_sbitmap lanes (nlanes);
4611 bitmap_clear (lanes);
4612 bitmap_set_bit (lanes, 0);
4613 tree def = gimple_assign_lhs (assign);
4614 lane_defs.quick_push
4615 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4616 unsigned lanes_found = 1;
4617 /* Start with the use chains, the last stmt will be the root. */
4618 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4621 use_operand_p use_p;
4622 gimple *use_stmt;
4623 if (!single_imm_use (def, &use_p, &use_stmt))
4624 break;
4625 unsigned this_lane;
4626 if (!bb_vinfo->lookup_stmt (use_stmt)
4627 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4628 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4629 break;
4630 if (bitmap_bit_p (lanes, this_lane))
4631 break;
4632 lanes_found++;
4633 bitmap_set_bit (lanes, this_lane);
4634 gassign *use_ass = as_a <gassign *> (use_stmt);
4635 lane_defs.quick_push (std::make_pair
4636 (this_lane, gimple_assign_rhs2 (use_ass)));
4637 last = bb_vinfo->lookup_stmt (use_ass);
4638 def = gimple_assign_lhs (use_ass);
4640 while (lanes_found < nlanes);
4641 if (lanes_found < nlanes)
4643 /* Now search the def chain. */
4644 def = gimple_assign_rhs1 (assign);
4647 if (TREE_CODE (def) != SSA_NAME
4648 || !has_single_use (def))
4649 break;
4650 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4651 unsigned this_lane;
4652 if (!bb_vinfo->lookup_stmt (def_stmt)
4653 || !vect_slp_is_lane_insert (def_stmt,
4654 NULL_TREE, &this_lane)
4655 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4656 break;
4657 if (bitmap_bit_p (lanes, this_lane))
4658 break;
4659 lanes_found++;
4660 bitmap_set_bit (lanes, this_lane);
4661 lane_defs.quick_push (std::make_pair
4662 (this_lane,
4663 gimple_assign_rhs2 (def_stmt)));
4664 def = gimple_assign_rhs1 (def_stmt);
4666 while (lanes_found < nlanes);
4668 if (lanes_found == nlanes)
4670 /* Sort lane_defs by lane index and register the root. */
4671 lane_defs.qsort (vld_cmp);
4672 vec<stmt_vec_info> stmts;
4673 stmts.create (nlanes);
4674 for (unsigned i = 0; i < nlanes; ++i)
4675 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4676 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4677 stmts, last));
4683 /* Walk the grouped store chains and replace entries with their
4684 pattern variant if any. */
4686 static void
4687 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4689 stmt_vec_info first_element;
4690 unsigned i;
4692 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4694 /* We also have CTORs in this array. */
4695 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4696 continue;
4697 if (STMT_VINFO_IN_PATTERN_P (first_element))
4699 stmt_vec_info orig = first_element;
4700 first_element = STMT_VINFO_RELATED_STMT (first_element);
4701 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4702 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4703 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4704 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4705 vinfo->grouped_stores[i] = first_element;
4707 stmt_vec_info prev = first_element;
4708 while (DR_GROUP_NEXT_ELEMENT (prev))
4710 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4711 if (STMT_VINFO_IN_PATTERN_P (elt))
4713 stmt_vec_info orig = elt;
4714 elt = STMT_VINFO_RELATED_STMT (elt);
4715 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4716 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4717 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4719 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4720 prev = elt;
4725 /* Check if the region described by BB_VINFO can be vectorized, returning
4726 true if so. When returning false, set FATAL to true if the same failure
4727 would prevent vectorization at other vector sizes, false if it is still
4728 worth trying other sizes. N_STMTS is the number of statements in the
4729 region. */
4731 static bool
4732 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4733 vec<int> *dataref_groups)
4735 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4737 slp_instance instance;
4738 int i;
4739 poly_uint64 min_vf = 2;
4741 /* The first group of checks is independent of the vector size. */
4742 fatal = true;
4744 /* Analyze the data references. */
4746 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4748 if (dump_enabled_p ())
4749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4750 "not vectorized: unhandled data-ref in basic "
4751 "block.\n");
4752 return false;
4755 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4757 if (dump_enabled_p ())
4758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4759 "not vectorized: unhandled data access in "
4760 "basic block.\n");
4761 return false;
4764 vect_slp_check_for_constructors (bb_vinfo);
4766 /* If there are no grouped stores and no constructors in the region
4767 there is no need to continue with pattern recog as vect_analyze_slp
4768 will fail anyway. */
4769 if (bb_vinfo->grouped_stores.is_empty ()
4770 && bb_vinfo->roots.is_empty ())
4772 if (dump_enabled_p ())
4773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4774 "not vectorized: no grouped stores in "
4775 "basic block.\n");
4776 return false;
4779 /* The rest of the analysis below depends on the vector size in some way. */
4780 fatal = false;
4782 vect_pattern_recog (bb_vinfo);
4784 /* Update store groups from pattern processing. */
4785 vect_fixup_store_groups_with_patterns (bb_vinfo);
4787 /* Check the SLP opportunities in the basic block, analyze and build SLP
4788 trees. */
4789 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4791 if (dump_enabled_p ())
4793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4794 "Failed to SLP the basic block.\n");
4795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4796 "not vectorized: failed to find SLP opportunities "
4797 "in basic block.\n");
4799 return false;
4802 /* Optimize permutations. */
4803 vect_optimize_slp (bb_vinfo);
4805 /* Gather the loads reachable from the SLP graph entries. */
4806 vect_gather_slp_loads (bb_vinfo);
4808 vect_record_base_alignments (bb_vinfo);
4810 /* Analyze and verify the alignment of data references and the
4811 dependence in the SLP instances. */
4812 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4814 vect_location = instance->location ();
4815 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4816 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4818 slp_tree node = SLP_INSTANCE_TREE (instance);
4819 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_NOTE, vect_location,
4822 "removing SLP instance operations starting from: %G",
4823 stmt_info->stmt);
4824 vect_free_slp_instance (instance);
4825 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4826 continue;
4829 /* Mark all the statements that we want to vectorize as pure SLP and
4830 relevant. */
4831 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4832 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4833 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4835 STMT_SLP_TYPE (root) = pure_slp;
4836 if (is_gimple_assign (root->stmt)
4837 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4839 /* ??? We should probably record the whole vector of
4840 root stmts so we do not have to back-track here... */
4841 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4842 n != 1; --n)
4844 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4845 STMT_SLP_TYPE (root) = pure_slp;
4850 i++;
4852 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4853 return false;
4855 if (!vect_slp_analyze_operations (bb_vinfo))
4857 if (dump_enabled_p ())
4858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4859 "not vectorized: bad operation in basic block.\n");
4860 return false;
4863 vect_bb_partition_graph (bb_vinfo);
4865 return true;
4868 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4869 basic blocks in BBS, returning true on success.
4870 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4872 static bool
4873 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4874 vec<int> *dataref_groups, unsigned int n_stmts)
4876 bb_vec_info bb_vinfo;
4877 auto_vector_modes vector_modes;
4879 /* Autodetect first vector size we try. */
4880 machine_mode next_vector_mode = VOIDmode;
4881 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4882 unsigned int mode_i = 0;
4884 vec_info_shared shared;
4886 machine_mode autodetected_vector_mode = VOIDmode;
4887 while (1)
4889 bool vectorized = false;
4890 bool fatal = false;
4891 bb_vinfo = new _bb_vec_info (bbs, &shared);
4893 bool first_time_p = shared.datarefs.is_empty ();
4894 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4895 if (first_time_p)
4896 bb_vinfo->shared->save_datarefs ();
4897 else
4898 bb_vinfo->shared->check_datarefs ();
4899 bb_vinfo->vector_mode = next_vector_mode;
4901 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4903 if (dump_enabled_p ())
4905 dump_printf_loc (MSG_NOTE, vect_location,
4906 "***** Analysis succeeded with vector mode"
4907 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4908 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4911 bb_vinfo->shared->check_datarefs ();
4913 unsigned i;
4914 slp_instance instance;
4915 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4917 if (instance->subgraph_entries.is_empty ())
4918 continue;
4920 vect_location = instance->location ();
4921 if (!unlimited_cost_model (NULL)
4922 && !vect_bb_vectorization_profitable_p
4923 (bb_vinfo, instance->subgraph_entries))
4925 if (dump_enabled_p ())
4926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4927 "not vectorized: vectorization is not "
4928 "profitable.\n");
4929 continue;
4932 if (!dbg_cnt (vect_slp))
4933 continue;
4935 if (!vectorized && dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE, vect_location,
4937 "Basic block will be vectorized "
4938 "using SLP\n");
4939 vectorized = true;
4941 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4943 unsigned HOST_WIDE_INT bytes;
4944 if (dump_enabled_p ())
4946 if (GET_MODE_SIZE
4947 (bb_vinfo->vector_mode).is_constant (&bytes))
4948 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4949 "basic block part vectorized using %wu "
4950 "byte vectors\n", bytes);
4951 else
4952 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4953 "basic block part vectorized using "
4954 "variable length vectors\n");
4958 else
4960 if (dump_enabled_p ())
4961 dump_printf_loc (MSG_NOTE, vect_location,
4962 "***** Analysis failed with vector mode %s\n",
4963 GET_MODE_NAME (bb_vinfo->vector_mode));
4966 if (mode_i == 0)
4967 autodetected_vector_mode = bb_vinfo->vector_mode;
4969 if (!fatal)
4970 while (mode_i < vector_modes.length ()
4971 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_NOTE, vect_location,
4975 "***** The result for vector mode %s would"
4976 " be the same\n",
4977 GET_MODE_NAME (vector_modes[mode_i]));
4978 mode_i += 1;
4981 delete bb_vinfo;
4983 if (mode_i < vector_modes.length ()
4984 && VECTOR_MODE_P (autodetected_vector_mode)
4985 && (related_vector_mode (vector_modes[mode_i],
4986 GET_MODE_INNER (autodetected_vector_mode))
4987 == autodetected_vector_mode)
4988 && (related_vector_mode (autodetected_vector_mode,
4989 GET_MODE_INNER (vector_modes[mode_i]))
4990 == vector_modes[mode_i]))
4992 if (dump_enabled_p ())
4993 dump_printf_loc (MSG_NOTE, vect_location,
4994 "***** Skipping vector mode %s, which would"
4995 " repeat the analysis for %s\n",
4996 GET_MODE_NAME (vector_modes[mode_i]),
4997 GET_MODE_NAME (autodetected_vector_mode));
4998 mode_i += 1;
5001 if (vectorized
5002 || mode_i == vector_modes.length ()
5003 || autodetected_vector_mode == VOIDmode
5004 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5005 vector sizes will fail do not bother iterating. */
5006 || fatal)
5007 return vectorized;
5009 /* Try the next biggest vector size. */
5010 next_vector_mode = vector_modes[mode_i++];
5011 if (dump_enabled_p ())
5012 dump_printf_loc (MSG_NOTE, vect_location,
5013 "***** Re-trying analysis with vector mode %s\n",
5014 GET_MODE_NAME (next_vector_mode));
5019 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5020 true if anything in the basic-block was vectorized. */
5022 static bool
5023 vect_slp_bbs (vec<basic_block> bbs)
5025 vec<data_reference_p> datarefs = vNULL;
5026 auto_vec<int> dataref_groups;
5027 int insns = 0;
5028 int current_group = 0;
5030 for (unsigned i = 0; i < bbs.length (); i++)
5032 basic_block bb = bbs[i];
5033 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5034 gsi_next (&gsi))
5036 gimple *stmt = gsi_stmt (gsi);
5037 if (is_gimple_debug (stmt))
5038 continue;
5040 insns++;
5042 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5043 vect_location = stmt;
5045 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5046 &dataref_groups, current_group))
5047 ++current_group;
5051 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5054 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5055 true if anything in the basic-block was vectorized. */
5057 bool
5058 vect_slp_bb (basic_block bb)
5060 auto_vec<basic_block> bbs;
5061 bbs.safe_push (bb);
5062 return vect_slp_bbs (bbs);
5065 /* Main entry for the BB vectorizer. Analyze and transform the function FUN,
5066 returning true if anything in it was vectorized. */
5068 bool
5069 vect_slp_function (function *fun)
5071 bool r = false;
5072 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5073 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5075 /* For the moment split the function into pieces to avoid making
5076 the iteration on the vector mode moot. Split at points we know
5077 we do not handle well, which are CFG merges (SLP discovery doesn't
5078 handle non-loop-header PHIs) and loop exits. Since pattern
5079 recog requires reverse iteration to visit uses before defs,
5080 simply chop the RPO into pieces. */
5081 auto_vec<basic_block> bbs;
5082 for (unsigned i = 0; i < n; i++)
5084 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5085 bool split = false;
5087 /* Split when a BB is not dominated by the first block. */
5088 if (!bbs.is_empty ()
5089 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5091 if (dump_enabled_p ())
5092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5093 "splitting region at dominance boundary bb%d\n",
5094 bb->index);
5095 split = true;
5097 /* Split when the loop determined by the first block
5098 is exited. This is because we eventually insert
5099 invariants at region begin. */
5100 else if (!bbs.is_empty ()
5101 && bbs[0]->loop_father != bb->loop_father
5102 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5106 "splitting region at loop %d exit at bb%d\n",
5107 bbs[0]->loop_father->num, bb->index);
5108 split = true;
5111 if (split && !bbs.is_empty ())
5113 r |= vect_slp_bbs (bbs);
5114 bbs.truncate (0);
5115 bbs.quick_push (bb);
5117 else
5118 bbs.safe_push (bb);
5120 /* When we have a stmt that ends this block and defines a
5121 value, inserting after it for a vector containing its definition
5122 would require inserting on edges. Avoid this for now. */
5123 if (gimple *last = last_stmt (bb))
5124 if (gimple_get_lhs (last)
5125 && is_ctrl_altering_stmt (last))
5127 if (dump_enabled_p ())
5128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5129 "splitting region at control altering "
5130 "definition %G", last);
5131 r |= vect_slp_bbs (bbs);
5132 bbs.truncate (0);
5136 if (!bbs.is_empty ())
5137 r |= vect_slp_bbs (bbs);
5139 free (rpo);
5141 return r;
5144 /* Build a variable-length vector in which the elements in ELTS are repeated
5145 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5146 RESULTS and add any new instructions to SEQ.
5148 The approach we use is:
5150 (1) Find a vector mode VM with integer elements of mode IM.
5152 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5153 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5154 from small vectors to IM.
5156 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5158 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5159 correct byte contents.
5161 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5163 We try to find the largest IM for which this sequence works, in order
5164 to cut down on the number of interleaves. */
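/* Rough walk-through with invented modes: for ELTS = {a, b, c, d} of a
 32-bit type the code may pick IM = DImode, so NELTS' = 2 and ELTS' =
 { VIEW_CONVERT <{a, b}>, VIEW_CONVERT <{c, d}> }; each ELTS'[I] is
 duplicated into a vector of mode VM and one interleaving VEC_PERM_EXPR
 restores the a, b, c, d element order, which is finally
 VIEW_CONVERT_EXPRed to VECTOR_TYPE. */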
5166 void
5167 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5168 vec<tree> elts, unsigned int nresults,
5169 vec<tree> &results)
5171 unsigned int nelts = elts.length ();
5172 tree element_type = TREE_TYPE (vector_type);
5174 /* (1) Find a vector mode VM with integer elements of mode IM. */
5175 unsigned int nvectors = 1;
5176 tree new_vector_type;
5177 tree permutes[2];
5178 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5179 &nvectors, &new_vector_type,
5180 permutes))
5181 gcc_unreachable ();
5183 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5184 unsigned int partial_nelts = nelts / nvectors;
5185 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5187 tree_vector_builder partial_elts;
5188 auto_vec<tree, 32> pieces (nvectors * 2);
5189 pieces.quick_grow_cleared (nvectors * 2);
5190 for (unsigned int i = 0; i < nvectors; ++i)
5192 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5193 ELTS' has mode IM. */
5194 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5195 for (unsigned int j = 0; j < partial_nelts; ++j)
5196 partial_elts.quick_push (elts[i * partial_nelts + j]);
5197 tree t = gimple_build_vector (seq, &partial_elts);
5198 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5199 TREE_TYPE (new_vector_type), t);
5201 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5202 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5205 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5206 correct byte contents.
5208 Conceptually, we need to repeat the following operation log2(nvectors)
5209 times, where hi_start = nvectors / 2:
5211 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5212 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5214 However, if each input repeats every N elements and the VF is
5215 a multiple of N * 2, the HI result is the same as the LO result.
5216 This will be true for the first N1 iterations of the outer loop,
5217 followed by N2 iterations for which both the LO and HI results
5218 are needed. I.e.:
5220 N1 + N2 = log2(nvectors)
5222 Each "N1 iteration" doubles the number of redundant vectors and the
5223 effect of the process as a whole is to have a sequence of nvectors/2**N1
5224 vectors that repeats 2**N1 times. Rather than generate these redundant
5225 vectors, we halve the number of vectors for each N1 iteration. */
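/* Hypothetical instance: with nvectors = 4 and inputs that repeat often
 enough, the first outer iteration keeps only the LO results and halves
 new_nvectors to 2, while the second iteration produces both LO and HI
 results, i.e. N1 = 1, N2 = 1 and N1 + N2 = log2 (4). */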
5226 unsigned int in_start = 0;
5227 unsigned int out_start = nvectors;
5228 unsigned int new_nvectors = nvectors;
5229 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5231 unsigned int hi_start = new_nvectors / 2;
5232 unsigned int out_i = 0;
5233 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5235 if ((in_i & 1) != 0
5236 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5237 2 * in_repeat))
5238 continue;
5240 tree output = make_ssa_name (new_vector_type);
5241 tree input1 = pieces[in_start + (in_i / 2)];
5242 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5243 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5244 input1, input2,
5245 permutes[in_i & 1]);
5246 gimple_seq_add_stmt (seq, stmt);
5247 pieces[out_start + out_i] = output;
5248 out_i += 1;
5250 std::swap (in_start, out_start);
5251 new_nvectors = out_i;
5254 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5255 results.reserve (nresults);
5256 for (unsigned int i = 0; i < nresults; ++i)
5257 if (i < new_nvectors)
5258 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5259 pieces[in_start + i]));
5260 else
5261 results.quick_push (results[i - new_nvectors]);
5265 /* For constant and loop invariant defs in OP_NODE this function creates
5266 vector defs that will be used in the vectorized stmts and stores them
5267 to SLP_TREE_VEC_DEFS of OP_NODE. */
5269 static void
5270 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5272 unsigned HOST_WIDE_INT nunits;
5273 tree vec_cst;
5274 unsigned j, number_of_places_left_in_vector;
5275 tree vector_type;
5276 tree vop;
5277 int group_size = op_node->ops.length ();
5278 unsigned int vec_num, i;
5279 unsigned number_of_copies = 1;
5280 bool constant_p;
5281 gimple_seq ctor_seq = NULL;
5282 auto_vec<tree, 16> permute_results;
5284 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5285 vector_type = SLP_TREE_VECTYPE (op_node);
5287 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5288 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5289 auto_vec<tree> voprnds (number_of_vectors);
5291 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5292 created vectors. It is greater than 1 if unrolling is performed.
5294 For example, we have two scalar operands, s1 and s2 (e.g., group of
5295 strided accesses of size two), while NUNITS is four (i.e., four scalars
5296 of this type can be packed in a vector). The output vector will contain
5297 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5298 will be 2).
5300 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5301 containing the operands.
5303 For example, NUNITS is four as before, and the group size is 8
5304 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5305 {s5, s6, s7, s8}. */
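/* Continuing the first example above (group size 2, NUNITS 4 and a single
 vector to create), the computation below yields
 number_of_copies = 4 * 1 / 2 = 2. */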
5307 /* When using duplicate_and_interleave, we just need one element for
5308 each scalar statement. */
5309 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5310 nunits = group_size;
5312 number_of_copies = nunits * number_of_vectors / group_size;
5314 number_of_places_left_in_vector = nunits;
5315 constant_p = true;
5316 tree_vector_builder elts (vector_type, nunits, 1);
5317 elts.quick_grow (nunits);
5318 stmt_vec_info insert_after = NULL;
5319 for (j = 0; j < number_of_copies; j++)
5321 tree op;
5322 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5324 /* Create 'vect_ = {op0,op1,...,opn}'. */
5325 number_of_places_left_in_vector--;
5326 tree orig_op = op;
5327 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5329 if (CONSTANT_CLASS_P (op))
5331 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5333 /* Can't use VIEW_CONVERT_EXPR for booleans because
5334 of possibly different sizes of scalar value and
5335 vector element. */
5336 if (integer_zerop (op))
5337 op = build_int_cst (TREE_TYPE (vector_type), 0);
5338 else if (integer_onep (op))
5339 op = build_all_ones_cst (TREE_TYPE (vector_type));
5340 else
5341 gcc_unreachable ();
5343 else
5344 op = fold_unary (VIEW_CONVERT_EXPR,
5345 TREE_TYPE (vector_type), op);
5346 gcc_assert (op && CONSTANT_CLASS_P (op));
5348 else
5350 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5351 gimple *init_stmt;
5352 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5354 tree true_val
5355 = build_all_ones_cst (TREE_TYPE (vector_type));
5356 tree false_val
5357 = build_zero_cst (TREE_TYPE (vector_type));
5358 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5359 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5360 op, true_val,
5361 false_val);
5363 else
5365 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5366 op);
5367 init_stmt
5368 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5369 op);
5371 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5372 op = new_temp;
5375 elts[number_of_places_left_in_vector] = op;
5376 if (!CONSTANT_CLASS_P (op))
5377 constant_p = false;
5378 /* For BB vectorization we have to compute an insert location
5379 when a def is inside the analyzed region since we cannot
5380 simply insert at the BB start in this case. */
5381 stmt_vec_info opdef;
5382 if (TREE_CODE (orig_op) == SSA_NAME
5383 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5384 && is_a <bb_vec_info> (vinfo)
5385 && (opdef = vinfo->lookup_def (orig_op)))
5387 if (!insert_after)
5388 insert_after = opdef;
5389 else
5390 insert_after = get_later_stmt (insert_after, opdef);
5393 if (number_of_places_left_in_vector == 0)
5395 if (constant_p
5396 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5397 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5398 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5399 else
5401 if (permute_results.is_empty ())
5402 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5403 elts, number_of_vectors,
5404 permute_results);
5405 vec_cst = permute_results[number_of_vectors - j - 1];
5407 if (!gimple_seq_empty_p (ctor_seq))
5409 if (insert_after)
5411 gimple_stmt_iterator gsi;
5412 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5414 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5415 gsi_insert_seq_before (&gsi, ctor_seq,
5416 GSI_CONTINUE_LINKING);
5418 else if (!stmt_ends_bb_p (insert_after->stmt))
5420 gsi = gsi_for_stmt (insert_after->stmt);
5421 gsi_insert_seq_after (&gsi, ctor_seq,
5422 GSI_CONTINUE_LINKING);
5424 else
5426 /* When we want to insert after a def where the
5427 defining stmt throws then insert on the fallthru
5428 edge. */
5429 edge e = find_fallthru_edge
5430 (gimple_bb (insert_after->stmt)->succs);
5431 basic_block new_bb
5432 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5433 gcc_assert (!new_bb);
5436 else
5437 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5438 ctor_seq = NULL;
5440 voprnds.quick_push (vec_cst);
5441 insert_after = NULL;
5442 number_of_places_left_in_vector = nunits;
5443 constant_p = true;
5444 elts.new_vector (vector_type, nunits, 1);
5445 elts.quick_grow (nunits);
5450 /* Since the vectors are created in reverse order, reverse them here
5451 to restore the original order. */
5452 vec_num = voprnds.length ();
5453 for (j = vec_num; j != 0; j--)
5455 vop = voprnds[j - 1];
5456 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5459 /* In case the VF is greater than the unrolling factor needed for the SLP
5460 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
5461 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5462 to replicate the vectors. */
5463 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5464 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5465 i++)
5466 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
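/* A minimal standalone sketch of the replication above, using plain arrays
   and hypothetical demo_* names instead of SLP_TREE_VEC_DEFS: when more
   vectors are requested than defs were built, the defs already created are
   simply reused in round-robin order.  */

static unsigned
demo_replicate_defs (int *defs, unsigned vec_num, unsigned number_of_vectors)
{
  if (vec_num == 0)
    return 0;
  unsigned len = vec_num;
  /* Keep cycling over the first VEC_NUM defs until enough exist.  */
  while (len < number_of_vectors)
    for (unsigned i = 0; i < vec_num && len < number_of_vectors; ++i)
      {
        defs[len] = defs[i];
        ++len;
      }
  return len;
}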
5469 /* Get the Ith vectorized definition from SLP_NODE. */
5471 tree
5472 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5474 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5475 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5476 else
5477 return SLP_TREE_VEC_DEFS (slp_node)[i];
5480 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5482 void
5483 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5485 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5486 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5488 unsigned j;
5489 gimple *vec_def_stmt;
5490 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5491 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5493 else
5494 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5497 /* Get N vectorized definitions for SLP_NODE. */
5499 void
5500 vect_get_slp_defs (vec_info *,
5501 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5503 if (n == -1U)
5504 n = SLP_TREE_CHILDREN (slp_node).length ();
5506 for (unsigned i = 0; i < n; ++i)
5508 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5509 vec<tree> vec_defs = vNULL;
5510 vect_get_slp_defs (child, &vec_defs);
5511 vec_oprnds->quick_push (vec_defs);
5515 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5516 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5517 permute statements for the SLP node NODE. Store the number of vector
5518 permute instructions in *N_PERMS and the number of vector load
5519 instructions in *N_LOADS. */
5521 bool
5522 vect_transform_slp_perm_load (vec_info *vinfo,
5523 slp_tree node, vec<tree> dr_chain,
5524 gimple_stmt_iterator *gsi, poly_uint64 vf,
5525 bool analyze_only, unsigned *n_perms,
5526 unsigned int *n_loads)
5528 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5529 int vec_index = 0;
5530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5531 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5532 unsigned int mask_element;
5533 machine_mode mode;
5535 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5536 return false;
5538 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5540 mode = TYPE_MODE (vectype);
5541 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5543 /* Initialize the vect stmts of NODE to properly insert the generated
5544 stmts later. */
5545 if (! analyze_only)
5546 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5547 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5548 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5550 /* Generate permutation masks for every NODE. Number of masks for each NODE
5551 is equal to GROUP_SIZE.
5552 E.g., we have a group of three nodes with three loads from the same
5553 location in each node, and the vector size is 4. I.e., we have a
5554 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5555 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5556 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5559 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5560 The last mask is illegal since we assume two operands for the permute
5561 operation, and the mask element values can't be outside that range.
5562 Hence, the last mask must be converted into {2,5,5,5}.
5563 For the first two permutations we need the first and the second input
5564 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5565 we need the second and the third vectors: {b1,c1,a2,b2} and
5566 {c2,a3,b3,c3}. */
5568 int vect_stmts_counter = 0;
5569 unsigned int index = 0;
5570 int first_vec_index = -1;
5571 int second_vec_index = -1;
5572 bool noop_p = true;
5573 *n_perms = 0;
5575 vec_perm_builder mask;
5576 unsigned int nelts_to_build;
5577 unsigned int nvectors_per_build;
5578 unsigned int in_nlanes;
5579 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5580 && multiple_p (nunits, group_size));
5581 if (repeating_p)
5583 /* A single vector contains a whole number of copies of the node, so:
5584 (a) all permutes can use the same mask; and
5585 (b) the permutes only need a single vector input. */
5586 mask.new_vector (nunits, group_size, 3);
5587 nelts_to_build = mask.encoded_nelts ();
5588 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5589 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5591 else
5593 /* We need to construct a separate mask for each vector statement. */
5594 unsigned HOST_WIDE_INT const_nunits, const_vf;
5595 if (!nunits.is_constant (&const_nunits)
5596 || !vf.is_constant (&const_vf))
5597 return false;
5598 mask.new_vector (const_nunits, const_nunits, 1);
5599 nelts_to_build = const_vf * group_size;
5600 nvectors_per_build = 1;
5601 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5603 auto_sbitmap used_in_lanes (in_nlanes);
5604 bitmap_clear (used_in_lanes);
5606 unsigned int count = mask.encoded_nelts ();
5607 mask.quick_grow (count);
5608 vec_perm_indices indices;
5610 for (unsigned int j = 0; j < nelts_to_build; j++)
5612 unsigned int iter_num = j / group_size;
5613 unsigned int stmt_num = j % group_size;
5614 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5615 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5616 bitmap_set_bit (used_in_lanes, i);
5617 if (repeating_p)
5619 first_vec_index = 0;
5620 mask_element = i;
5622 else
5624 /* Enforced before the loop when !repeating_p. */
5625 unsigned int const_nunits = nunits.to_constant ();
5626 vec_index = i / const_nunits;
5627 mask_element = i % const_nunits;
5628 if (vec_index == first_vec_index
5629 || first_vec_index == -1)
5631 first_vec_index = vec_index;
5633 else if (vec_index == second_vec_index
5634 || second_vec_index == -1)
5636 second_vec_index = vec_index;
5637 mask_element += const_nunits;
5639 else
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 "permutation requires at "
5644 "least three vectors %G",
5645 stmt_info->stmt);
5646 gcc_assert (analyze_only);
5647 return false;
5650 gcc_assert (mask_element < 2 * const_nunits);
5653 if (mask_element != index)
5654 noop_p = false;
5655 mask[index++] = mask_element;
5657 if (index == count && !noop_p)
5659 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5660 if (!can_vec_perm_const_p (mode, indices))
5662 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5665 vect_location,
5666 "unsupported vect permute { ");
5667 for (i = 0; i < count; ++i)
5669 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5670 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5672 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5674 gcc_assert (analyze_only);
5675 return false;
5678 ++*n_perms;
5681 if (index == count)
5683 if (!analyze_only)
5685 tree mask_vec = NULL_TREE;
5687 if (! noop_p)
5688 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5690 if (second_vec_index == -1)
5691 second_vec_index = first_vec_index;
5693 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5695 /* Generate the permute statement if necessary. */
5696 tree first_vec = dr_chain[first_vec_index + ri];
5697 tree second_vec = dr_chain[second_vec_index + ri];
5698 gimple *perm_stmt;
5699 if (! noop_p)
5701 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5702 tree perm_dest
5703 = vect_create_destination_var (gimple_assign_lhs (stmt),
5704 vectype);
5705 perm_dest = make_ssa_name (perm_dest);
5706 perm_stmt
5707 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5708 first_vec, second_vec,
5709 mask_vec);
5710 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5711 gsi);
5713 else
5714 /* If the mask was NULL_TREE, generate the requested
5715 identity transform. */
5716 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5718 /* Store the vector statement in NODE. */
5719 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5723 index = 0;
5724 first_vec_index = -1;
5725 second_vec_index = -1;
5726 noop_p = true;
5730 if (n_loads)
5732 if (repeating_p)
5733 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5734 else
5736 /* Enforced above when !repeating_p. */
5737 unsigned int const_nunits = nunits.to_constant ();
5738 *n_loads = 0;
5739 bool load_seen = false;
5740 for (unsigned i = 0; i < in_nlanes; ++i)
5742 if (i % const_nunits == 0)
5744 if (load_seen)
5745 *n_loads += 1;
5746 load_seen = false;
5748 if (bitmap_bit_p (used_in_lanes, i))
5749 load_seen = true;
5751 if (load_seen)
5752 *n_loads += 1;
5756 return true;
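/* A self-contained sketch, with hypothetical demo_* names, of the mask
   computation in the loop above for the constant-nunits case: scalar lane I
   of the unrolled access sequence lives in vector I / NUNITS at element
   I % NUNITS, and the element is biased by NUNITS whenever it has to come
   from the second operand of the two-operand VEC_PERM.  The caller resets
   *FIRST_VEC_INDEX and *SECOND_VEC_INDEX to -1 for every output vector.
   For the a0b0c0... example in the comment above with nunits == 4 this
   leaves {0,0,0,3} and {3,3,6,6} unchanged and turns {6,9,9,9} into
   {2,5,5,5}.  */

static int
demo_perm_mask_element (unsigned i, unsigned const_nunits,
                        int *first_vec_index, int *second_vec_index)
{
  int vec_index = (int) (i / const_nunits);
  unsigned mask_element = i % const_nunits;
  if (*first_vec_index == -1 || vec_index == *first_vec_index)
    *first_vec_index = vec_index;
  else if (*second_vec_index == -1 || vec_index == *second_vec_index)
    {
      *second_vec_index = vec_index;
      mask_element += const_nunits;
    }
  else
    return -1;  /* Would need at least three input vectors.  */
  return (int) mask_element;
}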
5760 /* Vectorize the SLP permutations in NODE as specified
5761 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5762 child number and lane number.
5763 Interleaving of two two-lane two-child SLP subtrees (not supported):
5764 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5765 A blend of two four-lane two-child SLP subtrees:
5766 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5767 Highpart of a four-lane one-child SLP subtree (not supported):
5768 [ { 0, 2 }, { 0, 3 } ]
5769 where currently only a subset is supported by the code generation below. */
5771 static bool
5772 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5773 slp_tree node, stmt_vector_for_cost *cost_vec)
5775 tree vectype = SLP_TREE_VECTYPE (node);
5777 /* ??? We currently only support all same vector input and output types
5778 while the SLP IL should really do a concat + select and thus accept
5779 arbitrary mismatches. */
5780 slp_tree child;
5781 unsigned i;
5782 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5783 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5784 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "Unsupported lane permutation\n");
5789 return false;
5792 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5793 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5794 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_NOTE, vect_location,
5797 "vectorizing permutation");
5798 for (unsigned i = 0; i < perm.length (); ++i)
5799 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5800 dump_printf (MSG_NOTE, "\n");
5803 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5804 if (!nunits.is_constant ())
5805 return false;
5806 unsigned HOST_WIDE_INT vf = 1;
5807 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5808 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5809 return false;
5810 unsigned olanes = vf * SLP_TREE_LANES (node);
5811 gcc_assert (multiple_p (olanes, nunits));
5813 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5814 from the { SLP operand, scalar lane } permutation as recorded in the
5815 SLP node as an intermediate step. This part should already work
5816 with SLP children with an arbitrary number of lanes. */
5817 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5818 auto_vec<unsigned> active_lane;
5819 vperm.create (olanes);
5820 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5821 for (unsigned i = 0; i < vf; ++i)
5823 for (unsigned pi = 0; pi < perm.length (); ++pi)
5825 std::pair<unsigned, unsigned> p = perm[pi];
5826 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5827 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5828 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5829 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5830 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5832 /* Advance to the next group. */
5833 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5834 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5837 if (dump_enabled_p ())
5839 dump_printf_loc (MSG_NOTE, vect_location, "as");
5840 for (unsigned i = 0; i < vperm.length (); ++i)
5842 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5843 dump_printf (MSG_NOTE, ",");
5844 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5845 vperm[i].first.first, vperm[i].first.second,
5846 vperm[i].second);
5848 dump_printf (MSG_NOTE, "\n");
5851 /* We can only handle two-vector permutes, everything else should
5852 be lowered on the SLP level. The following is closely inspired
5853 by vect_transform_slp_perm_load and is supposed to eventually
5854 replace it.
5855 ??? As intermediate step do code-gen in the SLP tree representation
5856 somehow? */
5857 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5858 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5859 unsigned int const_nunits = nunits.to_constant ();
5860 unsigned int index = 0;
5861 unsigned int mask_element;
5862 vec_perm_builder mask;
5863 mask.new_vector (const_nunits, const_nunits, 1);
5864 unsigned int count = mask.encoded_nelts ();
5865 mask.quick_grow (count);
5866 vec_perm_indices indices;
5867 unsigned nperms = 0;
5868 for (unsigned i = 0; i < vperm.length (); ++i)
5870 mask_element = vperm[i].second;
5871 if (first_vec.first == -1U
5872 || first_vec == vperm[i].first)
5873 first_vec = vperm[i].first;
5874 else if (second_vec.first == -1U
5875 || second_vec == vperm[i].first)
5877 second_vec = vperm[i].first;
5878 mask_element += const_nunits;
5880 else
5882 if (dump_enabled_p ())
5883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5884 "permutation requires at "
5885 "least three vectors\n");
5886 gcc_assert (!gsi);
5887 return false;
5890 mask[index++] = mask_element;
5892 if (index == count)
5894 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5895 const_nunits);
5896 bool identity_p = indices.series_p (0, 1, 0, 1);
5897 if (!identity_p
5898 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5900 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5903 vect_location,
5904 "unsupported vect permute { ");
5905 for (i = 0; i < count; ++i)
5907 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5908 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5910 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5912 gcc_assert (!gsi);
5913 return false;
5916 if (!identity_p)
5917 nperms++;
5918 if (gsi)
5920 if (second_vec.first == -1U)
5921 second_vec = first_vec;
5923 /* Generate the permute statement if necessary. */
5924 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5925 tree first_def
5926 = vect_get_slp_vect_def (first_node, first_vec.second);
5927 /* ??? We SLP match existing vector element extracts but
5928 allow punning which we need to re-instantiate at uses
5929 but have no good way of explicitly representing. */
5930 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5932 gassign *conv_stmt;
5933 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5934 build1 (VIEW_CONVERT_EXPR,
5935 vectype, first_def));
5936 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5937 first_def = gimple_assign_lhs (conv_stmt);
5939 gassign *perm_stmt;
5940 tree perm_dest = make_ssa_name (vectype);
5941 if (!identity_p)
5943 slp_tree second_node
5944 = SLP_TREE_CHILDREN (node)[second_vec.first];
5945 tree second_def
5946 = vect_get_slp_vect_def (second_node, second_vec.second);
5947 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5949 gassign *conv_stmt;
5950 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5951 build1
5952 (VIEW_CONVERT_EXPR,
5953 vectype, second_def));
5954 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5955 second_def = gimple_assign_lhs (conv_stmt);
5957 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5958 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5959 first_def, second_def,
5960 mask_vec);
5962 else
5963 /* We need a copy here in case the def was external. */
5964 perm_stmt = gimple_build_assign (perm_dest, first_def);
5965 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5966 /* Store the vector statement in NODE. */
5967 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5970 index = 0;
5971 first_vec = std::make_pair (-1U, -1U);
5972 second_vec = std::make_pair (-1U, -1U);
5976 if (!gsi)
5977 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5979 return true;
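/* A standalone sketch with hypothetical names of how the loop above turns a
   { child, lane } permutation into a two-operand VEC_PERM mask, under the
   simplifying assumption that every child contributes exactly one vector and
   the permutation covers one output vector of NUNITS lanes.  The blend
   example from the function comment, [ {0,0} {1,1} {0,2} {1,3} ] with
   nunits == 4, yields the mask {0,5,2,7}.  Returns -1 when more than two
   distinct inputs would be needed, mirroring the bail-out above.  */

static int
demo_lane_perm_to_mask (const unsigned (*perm)[2], unsigned nunits,
                        unsigned *mask)
{
  int first = -1, second = -1;
  for (unsigned i = 0; i < nunits; ++i)
    {
      int child = (int) perm[i][0];
      unsigned elt = perm[i][1];
      if (first == -1 || child == first)
        first = child;
      else if (second == -1 || child == second)
        {
          second = child;
          elt += nunits;
        }
      else
        return -1;  /* More than two distinct inputs: unsupported.  */
      mask[i] = elt;
    }
  return 0;
}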
5982 /* Vectorize SLP NODE. */
5984 static void
5985 vect_schedule_slp_node (vec_info *vinfo,
5986 slp_tree node, slp_instance instance)
5988 gimple_stmt_iterator si;
5989 int i;
5990 slp_tree child;
5992 /* For existing vectors there's nothing to do. */
5993 if (SLP_TREE_VEC_DEFS (node).exists ())
5994 return;
5996 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5998 /* Vectorize externals and constants. */
5999 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6000 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6002 /* ??? vectorizable_shift can end up using a scalar operand which is
6003 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6004 node in this case. */
6005 if (!SLP_TREE_VECTYPE (node))
6006 return;
6008 vect_create_constant_vectors (vinfo, node);
6009 return;
6012 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6014 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6015 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_NOTE, vect_location,
6019 "------>vectorizing SLP node starting from: %G",
6020 stmt_info->stmt);
6022 if (STMT_VINFO_DATA_REF (stmt_info)
6023 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6025 /* Vectorized loads go before the first scalar load to make it
6026 ready early; vectorized stores go before the last scalar
6027 stmt, which is where all uses are ready. */
6028 stmt_vec_info last_stmt_info = NULL;
6029 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6030 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6031 else /* DR_IS_WRITE */
6032 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6033 si = gsi_for_stmt (last_stmt_info->stmt);
6035 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6036 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6037 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6038 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6040 /* For PHI node vectorization we do not use the insertion iterator. */
6041 si = gsi_none ();
6043 else
6045 /* Emit other stmts after the children's vectorized defs, which is
6046 the earliest possible place. */
6047 gimple *last_stmt = NULL;
6048 bool seen_vector_def = false;
6049 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6050 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6052 /* For fold-left reductions we are retaining the scalar
6053 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6054 set so the representation isn't perfect. Resort to the
6055 last scalar def here. */
6056 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6058 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6059 == cycle_phi_info_type);
6060 gphi *phi = as_a <gphi *>
6061 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6062 if (!last_stmt
6063 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6064 last_stmt = phi;
6066 /* We are emitting all vectorized stmts in the same place, so
6067 the last one emitted is also the latest def.
6068 ??? Unless we have a load permutation applied and that
6069 ends up re-using an earlier generated load. */
6070 unsigned j;
6071 gimple *vstmt;
6072 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6073 if (!last_stmt
6074 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6075 last_stmt = vstmt;
6077 else if (!SLP_TREE_VECTYPE (child))
6079 /* For externals that remain unvectorized, look at all scalar defs. */
6080 unsigned j;
6081 tree def;
6082 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6083 if (TREE_CODE (def) == SSA_NAME
6084 && !SSA_NAME_IS_DEFAULT_DEF (def))
6086 gimple *stmt = SSA_NAME_DEF_STMT (def);
6087 if (!last_stmt
6088 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6089 last_stmt = stmt;
6092 else
6094 /* For externals we have to look at all defs since their
6095 insertion place is decided per vector. But beware
6096 of pre-existing vectors where we need to make sure
6097 we do not insert before the region boundary. */
6098 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6099 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6100 seen_vector_def = true;
6101 else
6103 unsigned j;
6104 tree vdef;
6105 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6106 if (TREE_CODE (vdef) == SSA_NAME
6107 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6109 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6110 if (!last_stmt
6111 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6112 last_stmt = vstmt;
6116 /* This can happen when all children are pre-existing vectors or
6117 constants. */
6118 if (!last_stmt)
6119 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6120 if (!last_stmt)
6122 gcc_assert (seen_vector_def);
6123 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6125 else if (is_a <gphi *> (last_stmt))
6126 si = gsi_after_labels (gimple_bb (last_stmt));
6127 else
6129 si = gsi_for_stmt (last_stmt);
6130 gsi_next (&si);
6134 bool done_p = false;
6136 /* Handle purely internal nodes. */
6137 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6139 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
6140 be shared with different SLP nodes (but usually it's the same
6141 operation apart from the case the stmt is only there for denoting
6142 the actual scalar lane defs ...). So do not call vect_transform_stmt
6143 but open-code it here (partly). */
6144 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6145 gcc_assert (done);
6146 done_p = true;
6148 if (!done_p)
6149 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
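/* A simplified sketch, with hypothetical names and integer "positions"
   standing in for statements of a single straight-line block, of the
   insertion-point rule used above for nodes that are neither loads, stores
   nor PHIs: vector stmts are emitted right after the latest vectorized def
   among all children, the earliest point at which every operand is
   available; with no such def they go to the start of the region.  */

static int
demo_insert_position (const int *child_def_pos, unsigned nchildren,
                      int region_start_pos)
{
  int last = -1;
  for (unsigned i = 0; i < nchildren; ++i)
    if (child_def_pos[i] > last)
      last = child_def_pos[i];
  return last == -1 ? region_start_pos : last + 1;
}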
6152 /* Replace scalar calls from SLP node NODE with assignments setting their lhs
6153 to zero. For loop vectorization this is done in vectorizable_call, but for
6154 SLP it needs to be deferred until the end of vect_schedule_slp, because multiple
6155 SLP instances may refer to the same scalar stmt. */
6157 static void
6158 vect_remove_slp_scalar_calls (vec_info *vinfo,
6159 slp_tree node, hash_set<slp_tree> &visited)
6161 gimple *new_stmt;
6162 gimple_stmt_iterator gsi;
6163 int i;
6164 slp_tree child;
6165 tree lhs;
6166 stmt_vec_info stmt_info;
6168 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6169 return;
6171 if (visited.add (node))
6172 return;
6174 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6175 vect_remove_slp_scalar_calls (vinfo, child, visited);
6177 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6179 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6180 if (!stmt || gimple_bb (stmt) == NULL)
6181 continue;
6182 if (is_pattern_stmt_p (stmt_info)
6183 || !PURE_SLP_STMT (stmt_info))
6184 continue;
6185 lhs = gimple_call_lhs (stmt);
6186 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6187 gsi = gsi_for_stmt (stmt);
6188 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6189 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6193 static void
6194 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6196 hash_set<slp_tree> visited;
6197 vect_remove_slp_scalar_calls (vinfo, node, visited);
6200 /* Vectorize the instance root. */
6202 void
6203 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6205 gassign *rstmt = NULL;
6207 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6209 gimple *child_stmt;
6210 int j;
6212 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6214 tree vect_lhs = gimple_get_lhs (child_stmt);
6215 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
6216 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6217 TREE_TYPE (vect_lhs)))
6218 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6219 vect_lhs);
6220 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6221 break;
6224 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6226 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6227 gimple *child_stmt;
6228 int j;
6229 vec<constructor_elt, va_gc> *v;
6230 vec_alloc (v, nelts);
6232 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6234 CONSTRUCTOR_APPEND_ELT (v,
6235 NULL_TREE,
6236 gimple_get_lhs (child_stmt));
6238 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
6239 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
6240 tree r_constructor = build_constructor (rtype, v);
6241 rstmt = gimple_build_assign (lhs, r_constructor);
6244 gcc_assert (rstmt);
6246 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
6247 gsi_replace (&rgsi, rstmt, true);
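/* A plain-C analogue, with hypothetical names, of what the root statement
   replacement above expresses in GIMPLE: with a single vector statement the
   root lhs just forwards (and possibly re-types) that def, with several it
   becomes a CONSTRUCTOR that concatenates the defs back into the wide
   value the original root stmt produced.  */

static void
demo_build_root_value (const int *vec_defs, unsigned nvectors,
                       unsigned nunits, int *root_lhs)
{
  for (unsigned i = 0; i < nvectors; ++i)
    for (unsigned j = 0; j < nunits; ++j)
      root_lhs[i * nunits + j] = vec_defs[i * nunits + j];
}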
6250 struct slp_scc_info
6252 bool on_stack;
6253 int dfs;
6254 int lowlink;
6257 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
6259 static void
6260 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6261 hash_map<slp_tree, slp_scc_info> &scc_info,
6262 int &maxdfs, vec<slp_tree> &stack)
6264 bool existed_p;
6265 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6266 gcc_assert (!existed_p);
6267 info->dfs = maxdfs;
6268 info->lowlink = maxdfs;
6269 maxdfs++;
6271 /* Leaf. */
6272 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6274 info->on_stack = false;
6275 vect_schedule_slp_node (vinfo, node, instance);
6276 return;
6279 info->on_stack = true;
6280 stack.safe_push (node);
6282 unsigned i;
6283 slp_tree child;
6284 /* DFS recurse. */
6285 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6287 if (!child)
6288 continue;
6289 slp_scc_info *child_info = scc_info.get (child);
6290 if (!child_info)
6292 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6293 /* Recursion might have re-allocated the node. */
6294 info = scc_info.get (node);
6295 child_info = scc_info.get (child);
6296 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6298 else if (child_info->on_stack)
6299 info->lowlink = MIN (info->lowlink, child_info->dfs);
6301 if (info->lowlink != info->dfs)
6302 return;
6304 auto_vec<slp_tree, 4> phis_to_fixup;
6306 /* Singleton. */
6307 if (stack.last () == node)
6309 stack.pop ();
6310 info->on_stack = false;
6311 vect_schedule_slp_node (vinfo, node, instance);
6312 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6313 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6314 phis_to_fixup.quick_push (node);
6316 else
6318 /* SCC. */
6319 int last_idx = stack.length () - 1;
6320 while (stack[last_idx] != node)
6321 last_idx--;
6322 /* We can break the cycle at PHIs which have at least one child
6323 already code generated. Then we could re-start the DFS walk until
6324 all nodes in the SCC are covered (we might have new entries
6325 for only back-reachable nodes). But it's simpler to just
6326 iterate and schedule those that are ready. */
6327 unsigned todo = stack.length () - last_idx;
6330 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6332 slp_tree entry = stack[idx];
6333 if (!entry)
6334 continue;
6335 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6336 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6337 bool ready = !phi;
6338 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6339 if (!child)
6341 gcc_assert (phi);
6342 ready = true;
6343 break;
6345 else if (scc_info.get (child)->on_stack)
6347 if (!phi)
6349 ready = false;
6350 break;
6353 else
6355 if (phi)
6357 ready = true;
6358 break;
6361 if (ready)
6363 vect_schedule_slp_node (vinfo, entry, instance);
6364 scc_info.get (entry)->on_stack = false;
6365 stack[idx] = NULL;
6366 todo--;
6367 if (phi)
6368 phis_to_fixup.safe_push (entry);
6372 while (todo != 0);
6374 /* Pop the SCC. */
6375 stack.truncate (last_idx);
6378 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6379 slp_tree phi_node;
6380 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6382 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6383 edge_iterator ei;
6384 edge e;
6385 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6387 unsigned dest_idx = e->dest_idx;
6388 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6389 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6390 continue;
6391 /* Simply fill all args. */
6392 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6393 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6394 vect_get_slp_vect_def (child, i),
6395 e, gimple_phi_arg_location (phi, dest_idx));
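/* A compact standalone sketch of the Tarjan-style walk above, on a small
   adjacency-list graph with hypothetical demo_* names: a node that can reach
   back to an earlier node still on the stack keeps a lowlink smaller than
   its own DFS number, and the node whose lowlink equals its DFS number is
   the root of its strongly-connected component, which is exactly when the
   scheduler above pops and processes the stack.  The caller initializes DFS
   to -1, ON_STACK to 0 and *MAXDFS and *STACK_LEN to 0.  */

#define DEMO_MAX_NODES 16

struct demo_graph
{
  unsigned nnodes;
  unsigned nsucc[DEMO_MAX_NODES];
  unsigned succ[DEMO_MAX_NODES][DEMO_MAX_NODES];
};

static void
demo_scc_dfs (const struct demo_graph *g, unsigned node, int *dfs,
              int *lowlink, unsigned char *on_stack, unsigned *stack,
              unsigned *stack_len, int *maxdfs)
{
  dfs[node] = lowlink[node] = (*maxdfs)++;
  on_stack[node] = 1;
  stack[(*stack_len)++] = node;
  for (unsigned i = 0; i < g->nsucc[node]; ++i)
    {
      unsigned child = g->succ[node][i];
      if (dfs[child] == -1)
        {
          /* DFS recurse, then merge the child's lowlink.  */
          demo_scc_dfs (g, child, dfs, lowlink, on_stack, stack,
                        stack_len, maxdfs);
          if (lowlink[child] < lowlink[node])
            lowlink[node] = lowlink[child];
        }
      else if (on_stack[child] && dfs[child] < lowlink[node])
        lowlink[node] = dfs[child];
    }
  if (lowlink[node] != dfs[node])
    return;
  /* NODE is the root of its SCC: pop and "schedule" the members.  */
  unsigned member;
  do
    {
      member = stack[--*stack_len];
      on_stack[member] = 0;
    }
  while (member != node);
}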
6400 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6402 void
6403 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6405 slp_instance instance;
6406 unsigned int i;
6408 hash_map<slp_tree, slp_scc_info> scc_info;
6409 int maxdfs = 0;
6410 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6412 slp_tree node = SLP_INSTANCE_TREE (instance);
6413 if (dump_enabled_p ())
6415 dump_printf_loc (MSG_NOTE, vect_location,
6416 "Vectorizing SLP tree:\n");
6417 if (SLP_INSTANCE_ROOT_STMT (instance))
6418 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6419 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6420 vect_print_slp_graph (MSG_NOTE, vect_location,
6421 SLP_INSTANCE_TREE (instance));
6423 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6424 have a PHI be the node breaking the cycle. */
6425 auto_vec<slp_tree> stack;
6426 if (!scc_info.get (node))
6427 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6429 if (SLP_INSTANCE_ROOT_STMT (instance))
6430 vectorize_slp_instance_root_stmt (node, instance);
6432 if (dump_enabled_p ())
6433 dump_printf_loc (MSG_NOTE, vect_location,
6434 "vectorizing stmts using SLP.\n");
6437 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6439 slp_tree root = SLP_INSTANCE_TREE (instance);
6440 stmt_vec_info store_info;
6441 unsigned int j;
6443 /* Remove scalar call stmts. Do not do this for basic-block
6444 vectorization as not all uses may be vectorized.
6445 ??? Why should this be necessary? DCE should be able to
6446 remove the stmts itself.
6447 ??? For BB vectorization we can as well remove scalar
6448 stmts starting from the SLP tree root if they have no
6449 uses. */
6450 if (is_a <loop_vec_info> (vinfo))
6451 vect_remove_slp_scalar_calls (vinfo, root);
6453 /* Remove the original scalar stmts of vectorized stores. */
6454 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6456 if (!STMT_VINFO_DATA_REF (store_info)
6457 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6458 break;
6460 store_info = vect_orig_stmt (store_info);
6461 /* Free the attached stmt_vec_info and remove the stmt. */
6462 vinfo->remove_stmt (store_info);