/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"

static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);

object_allocator<_slp_tree> *slp_tree_pool;

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
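
/* An illustration (not part of this excerpt): slp_tree_pool itself is
   created and torn down by the vectorizer entry points, roughly along
   the lines of

     slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
     ...
     delete slp_tree_pool;
     slp_tree_pool = NULL;

   so the class-local operator new/delete above route every SLP node
   (de)allocation through that pool.  */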

/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (root_stmt)
    return root_stmt->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
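
/* For example, a constant node with lanes { 3, 3, 3, 3 } is uniform and
   can later be code-generated as a single splat, while { 3, 3, 3, 4 } is
   not; nodes for pre-existing vectors carry no scalar ops and are
   conservatively treated as non-uniform above.  (Worked example only.)  */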

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
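
/* Worked example: for the interleaving chain a[0], a[1], a[3] the
   recorded gaps are DR_GROUP_GAP (a[1]) == 1 and DR_GROUP_GAP (a[3]) == 2,
   so the places returned for a[0], a[1] and a[3] are 0, 1 and 3
   respectively (a sketch only; the gap values are set by the data-ref
   analysis, not here).  */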

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
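
/* A worked expansion of the two selectors built above: for a vector mode
   with nelts == 8 (so half_nelts == 4) the encodings expand to

     sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 }   (interleave low halves)
     sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 } (interleave high halves)

   i.e. the classic lo/hi interleave of two input vectors, which
   duplicate_and_interleave uses to spread COUNT scalar values across
   NVECTORS vectors.  */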

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero SWAP
   indicates a swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; SWAP is 2 if STMT is a cond and the comparison code needs to be
   inverted.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    number_of_oprnds = gimple_phi_num_args (stmt);
  else
    return -1;

  bool swapped = (swap != 0);
  bool backedge = false;
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	{
	  oprnd = gimple_phi_arg_def (stmt, i);
	  backedge = dominated_by_p (CDI_DOMINATORS,
				     gimple_phi_arg_edge (stmt, i)->src,
				     gimple_bb (stmt_info->stmt));
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand match.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
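
/* For example, given the group { x0 = a0 + 1; x1 = 1 + a1; } the second
   stmt initially presents { constant, internal } defs against the first
   stmt's { internal, constant }; since PLUS_EXPR is commutative, the
   swapping logic above exchanges the second stmt's operands so that both
   operand slots see matching def types.  (Illustrative sketch only.)  */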

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
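
/* For example, two calls copysignf (a0, b0) and copysignf (a1, b1) are
   compatible: same combined_fn, same fntype, type-compatible arguments.
   Mixing copysignf with copysign (or with a different fntype) fails the
   checks above.  (Illustrative example only.)  */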

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
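
/* Worked example: for basic-block SLP with GROUP_SIZE == 2 and a V4SI
   VECTYPE, const_nunits is 4 > 2, so filling a vector would require
   unrolling, which has no meaning in a basic block; the check above
   therefore rejects it.  */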

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that can
	 throw, or that have volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (TREE_CODE (vec) != SSA_NAME
		  || !types_compatible_p (vectype, TREE_TYPE (vec)))
		{
		  if (is_a <bb_vec_info> (vinfo) && i != 0)
		    continue;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (phi_p
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Non-grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Non-grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not a memory operation.  */
	  if (!phi_p
	      && TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}
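
/* As an example of the two-operator case above, the group
     { c0 = a0 + b0;  c1 = a1 - b1;  c2 = a2 + b2;  c3 = a3 - b3; }
   records first_stmt_code == PLUS_EXPR and alt_stmt_code == MINUS_EXPR,
   matches all four lanes and sets *TWO_OPERATORS; the caller then emits
   both a vector add and a vector subtract and blends the lanes with a
   VEC_PERM_EXPR (see vect_build_slp_tree_2 below).  */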

/* Traits for the hash_map used to record SLP builds (successful or failed)
   for a stmt set.  Note we never remove apart from at destruction time so
   we do not need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, npermutes, tree_size, bst_map);
  if (!res_)
    {
      bool existed_p = bst_map->put (stmts, NULL);
      gcc_assert (existed_p);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      vect_free_slp_tree (res);
    }
  else
    {
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP tree node, indicating in MATCHES which lanes
   matched.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
    nops = gimple_phi_num_args (phi);
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  bool skip_args[2] = { false, false };
  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
      {
	tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
	tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						    group_size);
	if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				     max_nunits))
	  return NULL;

	vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
	/* Induction from different IVs is not supported.  */
	if (def_type == vect_induction_def)
	  {
	    stmt_vec_info other_info;
	    FOR_EACH_VEC_ELT (stmts, i, other_info)
	      if (stmt_info != other_info)
		return NULL;

	    /* Induction PHIs are leaves.  */
	    (*tree_size)++;
	    node = vect_create_new_slp_node (node, stmts, nops);
	    SLP_TREE_VECTYPE (node) = vectype;
	    SLP_TREE_CHILDREN (node).quick_grow_cleared (nops);
	    return node;
	  }
	else if (def_type == vect_reduction_def
		 || def_type == vect_double_reduction_def
		 || def_type == vect_nested_cycle)
	  {
	    /* Else def types have to match.  */
	    stmt_vec_info other_info;
	    bool all_same = true;
	    FOR_EACH_VEC_ELT (stmts, i, other_info)
	      {
		if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		  return NULL;
		if (other_info != stmt_info)
		  all_same = false;
	      }
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    /* Reduction initial values are not explicitly represented.  */
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    /* Reduction chain backedge defs are filled manually.
	       ??? Need a better way to identify a SLP reduction chain PHI.
	       Or a better overall way to SLP match those.  */
	    if (all_same && def_type == vect_reduction_def)
	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type != vect_internal_def)
	  return NULL;
      }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (node, stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (node, stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }
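
  /* As a worked example of the BIT_FIELD_REF case above: with a V4SF-typed
     vector V, the group
       x0 = BIT_FIELD_REF <V, 32, 32>;  x1 = BIT_FIELD_REF <V, 32, 0>;
     yields lanes 32/32 == 1 and 0/32 == 0, so LPERM becomes
     { (0,1), (0,0) }, selecting lanes of the single vector child VNODE.  */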

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }
  swap = NULL;

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR has one operand too many when the condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      /* We're skipping certain operands from processing, for example
	 outer loop reduction initial defs.  */
      if (i <= 1 && skip_args[i])
	{
	  children.safe_push (NULL);
	  continue;
	}

      if (is_a <bb_vec_info> (vinfo)
	  && oprnd_info->first_dt == vect_internal_def)
	{
	  /* For BB vectorization, if all defs are the same do not
	     bother to continue the build along the single-lane
	     graph but use a splat of the scalar value.  */
	  stmt_vec_info first_def = oprnd_info->def_stmts[0];
	  for (j = 1; j < group_size; ++j)
	    if (oprnd_info->def_stmts[j] != first_def)
	      break;
	  if (j == group_size
	      /* But avoid doing this for loads where we may be
		 able to CSE things.  */
	      && !gimple_vuse (first_def->stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Using a splat of the uniform operand\n");
	      oprnd_info->first_dt = vect_external_def;
	    }
	}

      if (oprnd_info->first_dt == vect_external_def
	  || oprnd_info->first_dt == vect_constant_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }
	  /* We do not undo the swapping here since it might still be
	     the better order for the second operand in case we build
	     the first one from scalars below.  */
	  ++*npermutes;
	}
fail:

      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  /* But if there's a leading vector-sized set of matching stmts
	     fail here so we can split the group.  This matches the condition
	     vect_analyze_slp_instance uses.  */
	  /* ??? We might want to split here and combine the results to support
	     multiple vector sizes better.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      break;
	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Building vector operands from scalars\n");
	      this_tree_size++;
	      child = vect_create_new_slp_node (oprnd_info->ops);
	      children.safe_push (child);
	      oprnd_info->ops = vNULL;
	      continue;
	    }
	}

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	if (child)
	  vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of this node are built up from uniform scalars, or if
     building it would require more than one possibly expensive vector
     construction, just throw the node away, causing it to be built up from
     scalars instead.  The exception is the SLP node for the vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
	{
	  if (!child)
	    ;
	  else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    all_uniform_p = false;
	  else if (!vect_slp_tree_uniform_p (child))
	    {
	      all_uniform_p = false;
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		n_vector_builds++;
	    }
	}
      if (all_uniform_p || n_vector_builds > 1)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    if (child)
	      vect_free_slp_tree (child);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map something like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (node, stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }
1866 node = vect_create_new_slp_node (node, stmts, nops);
1867 SLP_TREE_VECTYPE (node) = vectype;
1868 SLP_TREE_CHILDREN (node).splice (children);
1869 return node;
1872 /* Dump a single SLP tree NODE. */
1874 static void
1875 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1876 slp_tree node)
1878 unsigned i, j;
1879 slp_tree child;
1880 stmt_vec_info stmt_info;
1881 tree op;
1883 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1884 dump_user_location_t user_loc = loc.get_user_location ();
1885 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1886 SLP_TREE_DEF_TYPE (node) == vect_external_def
1887 ? " (external)"
1888 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1889 ? " (constant)"
1890 : ""), node,
1891 estimated_poly_value (node->max_nunits),
1892 SLP_TREE_REF_COUNT (node));
1893 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1894 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1895 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1896 else
1898 dump_printf_loc (metadata, user_loc, "\t{ ");
1899 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1900 dump_printf (metadata, "%T%s ", op,
1901 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1902 dump_printf (metadata, "}\n");
1904 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1906 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1907 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1908 dump_printf (dump_kind, " %u", j);
1909 dump_printf (dump_kind, " }\n");
1911 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1913 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1914 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1915 dump_printf (dump_kind, " %u[%u]",
1916 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1917 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1918 dump_printf (dump_kind, " }\n");
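/* Note (added for clarity): each lane permutation entry above prints as
   FIRST[SECOND], the operand (child) index followed by the lane selected
   from that operand, matching the pairs stored in
   SLP_TREE_LANE_PERMUTATION. */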
1920 if (SLP_TREE_CHILDREN (node).is_empty ())
1921 return;
1922 dump_printf_loc (metadata, user_loc, "\tchildren");
1923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1924 dump_printf (dump_kind, " %p", (void *)child);
1925 dump_printf (dump_kind, "\n");
1928 DEBUG_FUNCTION void
1929 debug (slp_tree node)
1931 debug_dump_context ctx;
1932 vect_print_slp_tree (MSG_NOTE,
1933 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1934 node);
1937 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1939 static void
1940 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1941 slp_tree node, hash_set<slp_tree> &visited)
1943 unsigned i;
1944 slp_tree child;
1946 if (visited.add (node))
1947 return;
1949 vect_print_slp_tree (dump_kind, loc, node);
1951 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1952 if (child)
1953 vect_print_slp_graph (dump_kind, loc, child, visited);
1956 static void
1957 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1958 slp_tree entry)
1960 hash_set<slp_tree> visited;
1961 vect_print_slp_graph (dump_kind, loc, entry, visited);
1964 /* Mark the tree rooted at NODE with PURE_SLP. */
1966 static void
1967 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1969 int i;
1970 stmt_vec_info stmt_info;
1971 slp_tree child;
1973 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1974 return;
1976 if (visited.add (node))
1977 return;
1979 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1980 STMT_SLP_TYPE (stmt_info) = pure_slp;
1982 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1983 if (child)
1984 vect_mark_slp_stmts (child, visited);
1987 static void
1988 vect_mark_slp_stmts (slp_tree node)
1990 hash_set<slp_tree> visited;
1991 vect_mark_slp_stmts (node, visited);
1994 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1996 static void
1997 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1999 int i;
2000 stmt_vec_info stmt_info;
2001 slp_tree child;
2003 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2004 return;
2006 if (visited.add (node))
2007 return;
2009 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2011 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2012 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2013 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2016 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2017 if (child)
2018 vect_mark_slp_stmts_relevant (child, visited);
2021 static void
2022 vect_mark_slp_stmts_relevant (slp_tree node)
2024 hash_set<slp_tree> visited;
2025 vect_mark_slp_stmts_relevant (node, visited);
2029 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2031 static void
2032 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2033 hash_set<slp_tree> &visited)
2035 if (!node || visited.add (node))
2036 return;
2038 if (SLP_TREE_CHILDREN (node).length () == 0)
2040 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2041 return;
2042 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2043 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2044 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2045 loads.safe_push (node);
2047 else
2049 unsigned i;
2050 slp_tree child;
2051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2052 vect_gather_slp_loads (loads, child, visited);
2056 static void
2057 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2059 hash_set<slp_tree> visited;
2060 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2064 /* Find the last scalar stmt in NODE. */
2066 stmt_vec_info
2067 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2069 stmt_vec_info last = NULL;
2070 stmt_vec_info stmt_vinfo;
2072 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2074 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2075 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2078 return last;
2081 /* Find the first stmt in NODE. */
2083 stmt_vec_info
2084 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2086 stmt_vec_info first = NULL;
2087 stmt_vec_info stmt_vinfo;
2089 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2091 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2092 if (!first
2093 || get_later_stmt (stmt_vinfo, first) == first)
2094 first = stmt_vinfo;
2097 return first;
2100 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2101 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2102 (also containing the first GROUP1_SIZE stmts, since stores are
2103 consecutive), the second containing the remainder.
2104 Return the first stmt in the second group. */
2106 static stmt_vec_info
2107 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2109 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2110 gcc_assert (group1_size > 0);
2111 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2112 gcc_assert (group2_size > 0);
2113 DR_GROUP_SIZE (first_vinfo) = group1_size;
2115 stmt_vec_info stmt_info = first_vinfo;
2116 for (unsigned i = group1_size; i > 1; i--)
2118 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2119 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2121 /* STMT is now the last element of the first group. */
2122 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2123 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2125 DR_GROUP_SIZE (group2) = group2_size;
2126 for (stmt_info = group2; stmt_info;
2127 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2129 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2130 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2133 /* For the second group, the DR_GROUP_GAP is that before the original group,
2134 plus skipping over the first vector. */
2135 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2137 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2138 DR_GROUP_GAP (first_vinfo) += group2_size;
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2142 group1_size, group2_size);
2144 return group2;
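/* Worked example (added for illustration): splitting a store group of
   DR_GROUP_SIZE 8 with GROUP1_SIZE 4 leaves the first group with size 4
   and its gap increased by 4 to skip over the second group, while the
   second group of size 4 gets DR_GROUP_GAP equal to the original
   group's gap plus 4, skipping over the first group's elements. */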
2147 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2148 statements and a vector of NUNITS elements. */
2150 static poly_uint64
2151 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2153 return exact_div (common_multiple (nunits, group_size), group_size);
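/* For example (illustration only): with NUNITS 4 and GROUP_SIZE 6 the
   common multiple is 12, giving an unrolling factor of 12 / 6 == 2, so
   two copies of the group fill three vectors exactly. */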
2156 enum slp_instance_kind {
2157 slp_inst_kind_store,
2158 slp_inst_kind_reduc_group,
2159 slp_inst_kind_reduc_chain,
2160 slp_inst_kind_ctor
2163 static bool
2164 vect_analyze_slp_instance (vec_info *vinfo,
2165 scalar_stmts_to_slp_tree_map_t *bst_map,
2166 stmt_vec_info stmt_info, unsigned max_tree_size);
2168 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a
2169 group of kind KIND. Return true if successful. */
2171 static bool
2172 vect_build_slp_instance (vec_info *vinfo,
2173 slp_instance_kind kind,
2174 vec<stmt_vec_info> scalar_stmts,
2175 stmt_vec_info root_stmt_info,
2176 unsigned max_tree_size,
2177 scalar_stmts_to_slp_tree_map_t *bst_map,
2178 /* ??? We need stmt_info for group splitting. */
2179 stmt_vec_info stmt_info_)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Starting SLP discovery for\n");
2185 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 " %G", scalar_stmts[i]->stmt);
2190 /* Build the tree for the SLP instance. */
2191 unsigned int group_size = scalar_stmts.length ();
2192 bool *matches = XALLOCAVEC (bool, group_size);
2193 unsigned npermutes = 0;
2194 poly_uint64 max_nunits = 1;
2195 unsigned tree_size = 0;
2196 unsigned i;
2197 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2198 &max_nunits, matches, &npermutes,
2199 &tree_size, bst_map);
2200 if (node != NULL)
2202 /* Calculate the unrolling factor based on the smallest type. */
2203 poly_uint64 unrolling_factor
2204 = calculate_unrolling_factor (max_nunits, group_size);
2206 if (maybe_ne (unrolling_factor, 1U)
2207 && is_a <bb_vec_info> (vinfo))
2209 unsigned HOST_WIDE_INT const_max_nunits;
2210 if (!max_nunits.is_constant (&const_max_nunits)
2211 || const_max_nunits > group_size)
2213 if (dump_enabled_p ())
2214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2215 "Build SLP failed: store group "
2216 "size not a multiple of the vector size "
2217 "in basic block SLP\n");
2218 vect_free_slp_tree (node);
2219 return false;
2221 /* Fatal mismatch. */
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "SLP discovery succeeded but node needs "
2225 "splitting\n");
2226 memset (matches, true, group_size);
2227 matches[group_size / const_max_nunits * const_max_nunits] = false;
2228 vect_free_slp_tree (node);
2230 else
2232 /* Create a new SLP instance. */
2233 slp_instance new_instance = XNEW (class _slp_instance);
2234 SLP_INSTANCE_TREE (new_instance) = node;
2235 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2236 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2237 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2238 new_instance->reduc_phis = NULL;
2239 new_instance->cost_vec = vNULL;
2240 new_instance->subgraph_entries = vNULL;
2242 vect_gather_slp_loads (new_instance, node);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE, vect_location,
2245 "SLP size %u vs. limit %u.\n",
2246 tree_size, max_tree_size);
2248 /* Check whether any load is possibly permuted. */
2249 slp_tree load_node;
2250 bool loads_permuted = false;
2251 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2253 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2254 continue;
2255 unsigned j;
2256 stmt_vec_info load_info;
2257 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2258 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2260 loads_permuted = true;
2261 break;
2265 /* If the loads and stores can be handled with load/store-lane
2266 instructions do not generate this SLP instance. */
2267 if (is_a <loop_vec_info> (vinfo)
2268 && loads_permuted
2269 && kind == slp_inst_kind_store
2270 && vect_store_lanes_supported
2271 (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false))
2273 slp_tree load_node;
2274 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2276 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2277 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2278 /* Use SLP for strided accesses (or if we can't load-lanes). */
2279 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2280 || ! vect_load_lanes_supported
2281 (STMT_VINFO_VECTYPE (stmt_vinfo),
2282 DR_GROUP_SIZE (stmt_vinfo), false))
2283 break;
2285 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "Built SLP cancelled: can use "
2290 "load/store-lanes\n");
2291 vect_free_slp_instance (new_instance);
2292 return false;
2296 /* Fixup SLP reduction chains. */
2297 if (kind == slp_inst_kind_reduc_chain)
2299 /* If this is a reduction chain with a conversion in front
2300 amend the SLP tree with a node for that. */
2301 gimple *scalar_def
2302 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2303 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2305 /* Get at the conversion stmt - we know it's the single use
2306 of the last stmt of the reduction chain. */
2307 use_operand_p use_p;
2308 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2309 &use_p, &scalar_def);
2310 gcc_assert (r);
2311 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2312 next_info = vect_stmt_to_vectorize (next_info);
2313 scalar_stmts = vNULL;
2314 scalar_stmts.create (group_size);
2315 for (unsigned i = 0; i < group_size; ++i)
2316 scalar_stmts.quick_push (next_info);
2317 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2318 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2319 SLP_TREE_CHILDREN (conv).quick_push (node);
2320 SLP_INSTANCE_TREE (new_instance) = conv;
2321 /* We also have to fake this conversion stmt as SLP reduction
2322 group so we don't have to mess with too much code
2323 elsewhere. */
2324 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2325 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2327 /* Fill the backedge child of the PHI SLP node. The
2328 general matching code cannot find it because the
2329 scalar code does not reflect how we vectorize the
2330 reduction. */
2331 use_operand_p use_p;
2332 imm_use_iterator imm_iter;
2333 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2335 gimple_get_lhs (scalar_def))
2336 /* There are exactly two non-debug uses, the reduction
2337 PHI and the loop-closed PHI node. */
2338 if (!is_gimple_debug (USE_STMT (use_p))
2339 && gimple_bb (USE_STMT (use_p)) == loop->header)
2341 auto_vec<stmt_vec_info, 64> phis (group_size);
2342 stmt_vec_info phi_info
2343 = vinfo->lookup_stmt (USE_STMT (use_p));
2344 for (unsigned i = 0; i < group_size; ++i)
2345 phis.quick_push (phi_info);
2346 slp_tree *phi_node = bst_map->get (phis);
2347 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2348 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2349 = SLP_INSTANCE_TREE (new_instance);
2350 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2354 vinfo->slp_instances.safe_push (new_instance);
2356 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2357 the number of scalar stmts in the root in a few places.
2358 Verify that assumption holds. */
2359 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2360 .length () == group_size);
2362 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Final SLP tree for instance:\n");
2366 vect_print_slp_graph (MSG_NOTE, vect_location,
2367 SLP_INSTANCE_TREE (new_instance));
2370 return true;
2373 else
2375 /* Failed to SLP. */
2376 /* Free the allocated memory. */
2377 scalar_stmts.release ();
2380 stmt_vec_info stmt_info = stmt_info_;
2381 /* Try to break the group up into pieces. */
2382 if (kind == slp_inst_kind_store)
2384 /* ??? We could delay all the actual splitting of store-groups
2385 until after SLP discovery of the original group completed.
2386 Then we can recurse to vect_build_slp_instance directly. */
2387 for (i = 0; i < group_size; i++)
2388 if (!matches[i])
2389 break;
2391 /* For basic block SLP, try to break the group up into multiples of
2392 a vector size. */
2393 if (is_a <bb_vec_info> (vinfo)
2394 && (i > 1 && i < group_size))
2396 tree scalar_type
2397 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2398 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2399 1 << floor_log2 (i));
2400 unsigned HOST_WIDE_INT const_nunits;
2401 if (vectype
2402 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2404 /* Split into two groups at the first vector boundary. */
2405 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2406 unsigned group1_size = i & ~(const_nunits - 1);
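/* Sketch of the arithmetic (added comment): const_nunits is a power of
   two, so i & ~(const_nunits - 1) rounds the first mismatch down to a
   vector boundary, e.g. i == 5 with const_nunits == 4 gives
   group1_size == 4. */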
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE, vect_location,
2410 "Splitting SLP group at stmt %u\n", i);
2411 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2412 group1_size);
2413 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2414 max_tree_size);
2415 /* If the first non-match was in the middle of a vector,
2416 skip the rest of that vector. Do not bother to re-analyze
2417 single stmt groups. */
2418 if (group1_size < i)
2420 i = group1_size + const_nunits;
2421 if (i + 1 < group_size)
2422 rest = vect_split_slp_store_group (rest, const_nunits);
2424 if (i + 1 < group_size)
2425 res |= vect_analyze_slp_instance (vinfo, bst_map,
2426 rest, max_tree_size);
2427 return res;
2431 /* For loop vectorization split into arbitrary pieces of size > 1. */
2432 if (is_a <loop_vec_info> (vinfo)
2433 && (i > 1 && i < group_size))
2435 unsigned group1_size = i;
2437 if (dump_enabled_p ())
2438 dump_printf_loc (MSG_NOTE, vect_location,
2439 "Splitting SLP group at stmt %u\n", i);
2441 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2442 group1_size);
2443 /* Loop vectorization cannot handle gaps in stores; make sure
2444 the split group appears as strided. */
2445 STMT_VINFO_STRIDED_P (rest) = 1;
2446 DR_GROUP_GAP (rest) = 0;
2447 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2448 DR_GROUP_GAP (stmt_info) = 0;
2450 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2451 max_tree_size);
2452 if (i + 1 < group_size)
2453 res |= vect_analyze_slp_instance (vinfo, bst_map,
2454 rest, max_tree_size);
2456 return res;
2459 /* Even though the first vector did not all match, we might be able to SLP
2460 (some) of the remainder. FORNOW ignore this possibility. */
2463 /* Failed to SLP. */
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2466 return false;
2470 /* Analyze an SLP instance starting from a group of grouped stores. Call
2471 vect_build_slp_tree to build a tree of packed stmts if possible.
2472 Return FALSE if it's impossible to SLP any stmt in the loop. */
2474 static bool
2475 vect_analyze_slp_instance (vec_info *vinfo,
2476 scalar_stmts_to_slp_tree_map_t *bst_map,
2477 stmt_vec_info stmt_info, unsigned max_tree_size)
2479 unsigned int group_size;
2480 unsigned int i;
2481 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2482 vec<stmt_vec_info> scalar_stmts;
2483 slp_instance_kind kind;
2485 if (is_a <bb_vec_info> (vinfo))
2486 vect_location = stmt_info->stmt;
2487 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2489 kind = slp_inst_kind_store;
2490 group_size = DR_GROUP_SIZE (stmt_info);
2492 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2494 kind = slp_inst_kind_reduc_chain;
2495 gcc_assert (is_a <loop_vec_info> (vinfo));
2496 group_size = REDUC_GROUP_SIZE (stmt_info);
2498 else if (is_gimple_assign (stmt_info->stmt)
2499 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2501 kind = slp_inst_kind_ctor;
2502 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2504 else
2506 kind = slp_inst_kind_reduc_group;
2507 gcc_assert (is_a <loop_vec_info> (vinfo));
2508 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2511 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2512 scalar_stmts.create (group_size);
2513 stmt_vec_info next_info = stmt_info;
2514 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2516 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2517 while (next_info)
2519 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2520 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2523 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2525 /* Collect the reduction stmts and store them in
2526 SLP_TREE_SCALAR_STMTS. */
2527 while (next_info)
2529 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2530 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2532 /* Mark the first element of the reduction chain as reduction to properly
2533 transform the node. In the reduction analysis phase only the last
2534 element of the chain is marked as reduction. */
2535 STMT_VINFO_DEF_TYPE (stmt_info)
2536 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2537 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2538 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2540 else if (kind == slp_inst_kind_ctor)
2542 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2543 tree val;
2544 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2546 if (TREE_CODE (val) == SSA_NAME)
2548 gimple* def = SSA_NAME_DEF_STMT (val);
2549 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2550 /* Value is defined in another basic block. */
2551 if (!def_info)
2552 return false;
2553 def_info = vect_stmt_to_vectorize (def_info);
2554 scalar_stmts.safe_push (def_info);
2556 else
2557 return false;
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "Analyzing vectorizable constructor: %G\n",
2562 stmt_info->stmt);
2564 else
2566 /* Collect reduction statements. */
2567 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2568 for (i = 0; reductions.iterate (i, &next_info); i++)
2569 scalar_stmts.safe_push (next_info);
2572 /* Build the tree for the SLP instance. */
2573 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2574 kind == slp_inst_kind_ctor
2575 ? stmt_info : NULL,
2576 max_tree_size,
2577 bst_map, stmt_info);
2579 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2580 where we should do store group splitting. */
2582 return res;
2585 /* Check if there are stmts in the loop that can be vectorized using SLP.
2586 Build SLP trees of packed scalar stmts if SLP is possible. */
2588 opt_result
2589 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2591 unsigned int i;
2592 stmt_vec_info first_element;
2594 DUMP_VECT_SCOPE ("vect_analyze_slp");
2596 scalar_stmts_to_slp_tree_map_t *bst_map
2597 = new scalar_stmts_to_slp_tree_map_t ();
2599 /* Find SLP sequences starting from groups of grouped stores. */
2600 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2601 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2603 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2605 if (loop_vinfo->reduction_chains.length () > 0)
2607 /* Find SLP sequences starting from reduction chains. */
2608 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2609 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2610 max_tree_size))
2612 /* Dissolve reduction chain group. */
2613 stmt_vec_info vinfo = first_element;
2614 stmt_vec_info last = NULL;
2615 while (vinfo)
2617 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2618 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2619 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2620 last = vinfo;
2621 vinfo = next;
2623 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2624 /* It can still be vectorized as part of an SLP reduction. */
2625 loop_vinfo->reductions.safe_push (last);
2629 /* Find SLP sequences starting from groups of reductions. */
2630 if (loop_vinfo->reductions.length () > 1)
2631 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2632 max_tree_size);
2635 /* The map keeps a reference on SLP nodes built, release that. */
2636 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2637 it != bst_map->end (); ++it)
2638 if ((*it).second)
2639 vect_free_slp_tree ((*it).second);
2640 delete bst_map;
2642 return opt_result::success ();
2645 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2647 static void
2648 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2649 vec<slp_tree> &vertices, vec<int> &leafs)
2651 unsigned i;
2652 slp_tree child;
2654 if (visited.add (node))
2655 return;
2657 node->vertex = vertices.length ();
2658 vertices.safe_push (node);
2660 bool leaf = true;
2661 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2662 if (child)
2664 leaf = false;
2665 vect_slp_build_vertices (visited, child, vertices, leafs);
2667 if (leaf)
2668 leafs.safe_push (node->vertex);
2671 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2673 static void
2674 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2675 vec<int> &leafs)
2677 hash_set<slp_tree> visited;
2678 unsigned i;
2679 slp_instance instance;
2680 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2681 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2682 leafs);
2685 /* Apply (reverse) bijective PERM to VEC. */
2687 template <class T>
2688 static void
2689 vect_slp_permute (vec<unsigned> perm,
2690 vec<T> &vec, bool reverse)
2692 auto_vec<T, 64> saved;
2693 saved.create (vec.length ());
2694 for (unsigned i = 0; i < vec.length (); ++i)
2695 saved.quick_push (vec[i]);
2697 if (reverse)
2699 for (unsigned i = 0; i < vec.length (); ++i)
2700 vec[perm[i]] = saved[i];
2701 for (unsigned i = 0; i < vec.length (); ++i)
2702 gcc_assert (vec[perm[i]] == saved[i]);
2704 else
2706 for (unsigned i = 0; i < vec.length (); ++i)
2707 vec[i] = saved[perm[i]];
2708 for (unsigned i = 0; i < vec.length (); ++i)
2709 gcc_assert (vec[i] == saved[perm[i]]);
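/* Illustration (not in the original source): with perm = { 2, 0, 1 }
   and vec = { a, b, c }, the forward direction computes
   vec[i] = saved[perm[i]], yielding { c, a, b }, while the reverse
   direction computes vec[perm[i]] = saved[i], yielding { b, c, a }. */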
2713 /* Return whether permutations PERM_A and PERM_B as recorded in the
2714 PERMS vector are equal. */
2716 static bool
2717 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2718 int perm_a, int perm_b)
2720 return (perm_a == perm_b
2721 || (perms[perm_a].length () == perms[perm_b].length ()
2722 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2723 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2726 /* Optimize the SLP graph of VINFO. */
2728 void
2729 vect_optimize_slp (vec_info *vinfo)
2731 if (vinfo->slp_instances.is_empty ())
2732 return;
2734 slp_tree node;
2735 unsigned i;
2736 auto_vec<slp_tree> vertices;
2737 auto_vec<int> leafs;
2738 vect_slp_build_vertices (vinfo, vertices, leafs);
2740 struct graph *slpg = new_graph (vertices.length ());
2741 FOR_EACH_VEC_ELT (vertices, i, node)
2743 unsigned j;
2744 slp_tree child;
2745 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2746 if (child)
2747 add_edge (slpg, i, child->vertex);
2750 /* Compute (reverse) postorder on the inverted graph. */
2751 auto_vec<int> ipo;
2752 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2754 auto_sbitmap n_visited (vertices.length ());
2755 auto_sbitmap n_materialize (vertices.length ());
2756 auto_vec<int> n_perm (vertices.length ());
2757 auto_vec<vec<unsigned> > perms;
2759 bitmap_clear (n_visited);
2760 bitmap_clear (n_materialize);
2761 n_perm.quick_grow_cleared (vertices.length ());
2762 perms.safe_push (vNULL); /* zero is no permute */
2764 /* Produce initial permutations. */
2765 for (i = 0; i < leafs.length (); ++i)
2767 int idx = leafs[i];
2768 slp_tree node = vertices[idx];
2770 /* Handle externals and constants optimistically throughout the
2771 iteration. */
2772 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2773 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2774 continue;
2776 /* Loads are the only thing generating permutes and leafs do not
2777 change across iterations. */
2778 bitmap_set_bit (n_visited, idx);
2779 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2780 continue;
2782 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2783 node unpermuted, record this permute. */
2784 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2785 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2786 continue;
2787 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2788 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2789 bool any_permute = false;
2790 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2792 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2793 imin = MIN (imin, idx);
2794 imax = MAX (imax, idx);
2795 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2796 any_permute = true;
2798 /* If there's no permute no need to split one out. */
2799 if (!any_permute)
2800 continue;
2801 /* If the span doesn't match we'd disrupt VF computation, avoid
2802 that for now. */
2803 if (imax - imin + 1 != SLP_TREE_LANES (node))
2804 continue;
2806 /* For now only handle true permutes, like
2807 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2808 when permuting constants and invariants, keeping the permute
2809 bijective. */
2810 auto_sbitmap load_index (SLP_TREE_LANES (node));
2811 bitmap_clear (load_index);
2812 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2813 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2814 unsigned j;
2815 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2816 if (!bitmap_bit_p (load_index, j))
2817 break;
2818 if (j != SLP_TREE_LANES (node))
2819 continue;
2821 vec<unsigned> perm = vNULL;
2822 perm.safe_grow (SLP_TREE_LANES (node), true);
2823 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2824 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2825 perms.safe_push (perm);
2826 n_perm[idx] = perms.length () - 1;
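/* Example of the seeding above (illustrative): a load permutation
   { 5, 4, 7, 6 } has imin == 4 and imax == 7, spans exactly
   SLP_TREE_LANES == 4 lanes and is bijective, so the recorded permute
   is { 1, 0, 3, 2 }. */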
2829 /* Propagate permutes along the graph and compute materialization points. */
2830 bool changed;
2831 unsigned iteration = 0;
2834 changed = false;
2835 ++iteration;
2837 for (i = vertices.length (); i > 0 ; --i)
2839 int idx = ipo[i-1];
2840 slp_tree node = vertices[idx];
2841 /* For leafs there's nothing to do - we've seeded permutes
2842 on those above. */
2843 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2844 continue;
2846 bitmap_set_bit (n_visited, idx);
2848 /* We cannot move a permute across a store. */
2849 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2850 && DR_IS_WRITE
2851 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2852 continue;
2854 int perm = -1;
2855 for (graph_edge *succ = slpg->vertices[idx].succ;
2856 succ; succ = succ->succ_next)
2858 int succ_idx = succ->dest;
2859 /* Handle unvisited nodes optimistically. */
2860 /* ??? But for constants once we want to handle non-bijective
2861 permutes we have to verify the permute, when unifying lanes,
2862 will not unify different constants. For example see
2863 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2864 if (!bitmap_bit_p (n_visited, succ_idx))
2865 continue;
2866 int succ_perm = n_perm[succ_idx];
2867 /* Once we materialize the successor's permutation its output lanes
2868 appear unpermuted to us. */
2869 if (bitmap_bit_p (n_materialize, succ_idx))
2870 succ_perm = 0;
2871 if (perm == -1)
2872 perm = succ_perm;
2873 else if (succ_perm == 0)
2875 perm = 0;
2876 break;
2878 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2880 perm = 0;
2881 break;
2885 if (perm == -1)
2886 /* Pick up pre-computed leaf values. */
2887 perm = n_perm[idx];
2888 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2890 if (iteration > 1)
2891 /* Make sure we eventually converge. */
2892 gcc_checking_assert (perm == 0);
2893 n_perm[idx] = perm;
2894 if (perm == 0)
2895 bitmap_clear_bit (n_materialize, idx);
2896 changed = true;
2899 if (perm == 0)
2900 continue;
2902 /* Elide pruning at materialization points in the first
2903 iteration so that every node is visited at least once. */
2904 if (iteration == 1)
2905 continue;
2907 /* Decide on permute materialization. Look whether there's
2908 a use (pred) edge that is permuted differently than us.
2909 In that case mark ourselves so the permutation is applied. */
2910 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2911 for (graph_edge *pred = slpg->vertices[idx].pred;
2912 pred; pred = pred->pred_next)
2914 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2915 int pred_perm = n_perm[pred->src];
2916 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2918 all_preds_permuted = false;
2919 break;
2922 if (!all_preds_permuted)
2924 if (!bitmap_bit_p (n_materialize, idx))
2925 changed = true;
2926 bitmap_set_bit (n_materialize, idx);
2930 while (changed || iteration == 1);
2932 /* Materialize. */
2933 for (i = 0; i < vertices.length (); ++i)
2935 int perm = n_perm[i];
2936 if (perm <= 0)
2937 continue;
2939 slp_tree node = vertices[i];
2941 /* First permute invariant/external original successors. */
2942 unsigned j;
2943 slp_tree child;
2944 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2946 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2947 continue;
2949 /* If the vector is uniform there's nothing to do. */
2950 if (vect_slp_tree_uniform_p (child))
2951 continue;
2953 /* We can end up sharing some externals via two_operator
2954 handling. Be prepared to unshare those. */
2955 if (child->refcnt != 1)
2957 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2958 SLP_TREE_CHILDREN (node)[j] = child
2959 = vect_create_new_slp_node
2960 (SLP_TREE_SCALAR_OPS (child).copy ());
2962 vect_slp_permute (perms[perm],
2963 SLP_TREE_SCALAR_OPS (child), true);
2966 if (bitmap_bit_p (n_materialize, i))
2968 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2969 /* For loads, simply drop the permutation; the load permutation
2970 already performs the desired permutation. */
2972 else
2974 if (dump_enabled_p ())
2975 dump_printf_loc (MSG_NOTE, vect_location,
2976 "inserting permute node in place of %p\n",
2977 node);
2979 /* Make a copy of NODE and in-place change it to a
2980 VEC_PERM node to permute the lanes of the copy. */
2981 slp_tree copy = new _slp_tree;
2982 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2983 SLP_TREE_CHILDREN (node) = vNULL;
2984 SLP_TREE_SCALAR_STMTS (copy)
2985 = SLP_TREE_SCALAR_STMTS (node).copy ();
2986 vect_slp_permute (perms[perm],
2987 SLP_TREE_SCALAR_STMTS (copy), true);
2988 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2989 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2990 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2991 SLP_TREE_LANE_PERMUTATION (copy)
2992 = SLP_TREE_LANE_PERMUTATION (node);
2993 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2994 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2995 copy->refcnt = 1;
2996 copy->max_nunits = node->max_nunits;
2997 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2998 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2999 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3001 /* Now turn NODE into a VEC_PERM. */
3002 SLP_TREE_CHILDREN (node).safe_push (copy);
3003 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3004 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3005 SLP_TREE_LANE_PERMUTATION (node)
3006 .quick_push (std::make_pair (0, perms[perm][j]));
3007 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3010 else
3012 /* Apply the reverse permutation to our stmts. */
3013 vect_slp_permute (perms[perm],
3014 SLP_TREE_SCALAR_STMTS (node), true);
3015 /* And to the load permutation, which we can simply
3016 make regular by design. */
3017 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3019 /* ??? When we handle non-bijective permutes the idea
3020 is that we can force the load-permutation to be
3021 { min, min + 1, min + 2, ... max }. But then the
3022 scalar defs might no longer match the lane content
3023 which means wrong-code with live lane vectorization.
3024 So we possibly have to have NULL entries for those. */
3025 vect_slp_permute (perms[perm],
3026 SLP_TREE_LOAD_PERMUTATION (node), true);
3031 /* Free the perms vector used for propagation. */
3032 while (!perms.is_empty ())
3033 perms.pop ().release ();
3034 free_graph (slpg);
3037 /* Now elide load permutations that are not necessary. */
3038 for (i = 0; i < leafs.length (); ++i)
3040 node = vertices[i];
3041 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3042 continue;
3044 /* In basic block vectorization we allow any subchain of an interleaving
3045 chain.
3046 FORNOW: not in loop SLP because of realignment complications. */
3047 if (is_a <bb_vec_info> (vinfo))
3049 bool subchain_p = true;
3050 stmt_vec_info next_load_info = NULL;
3051 stmt_vec_info load_info;
3052 unsigned j;
3053 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3055 if (j != 0
3056 && (next_load_info != load_info
3057 || DR_GROUP_GAP (load_info) != 1))
3059 subchain_p = false;
3060 break;
3062 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3064 if (subchain_p)
3066 SLP_TREE_LOAD_PERMUTATION (node).release ();
3067 continue;
3070 else
3072 stmt_vec_info load_info;
3073 bool this_load_permuted = false;
3074 unsigned j;
3075 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3076 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3078 this_load_permuted = true;
3079 break;
3081 stmt_vec_info first_stmt_info
3082 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3083 if (!this_load_permuted
3084 /* The load requires permutation when unrolling exposes
3085 a gap either because the group is larger than the SLP
3086 group-size or because there is a gap between the groups. */
3087 && (known_eq (LOOP_VINFO_VECT_FACTOR
3088 (as_a <loop_vec_info> (vinfo)), 1U)
3089 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3090 && DR_GROUP_GAP (first_stmt_info) == 0)))
3092 SLP_TREE_LOAD_PERMUTATION (node).release ();
3093 continue;
3100 /* For each possible SLP instance decide whether to SLP it and calculate overall
3101 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3102 least one instance. */
3104 bool
3105 vect_make_slp_decision (loop_vec_info loop_vinfo)
3107 unsigned int i;
3108 poly_uint64 unrolling_factor = 1;
3109 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3110 slp_instance instance;
3111 int decided_to_slp = 0;
3113 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3115 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3117 /* FORNOW: SLP if you can. */
3118 /* All unroll factors have the form:
3120 GET_MODE_SIZE (vinfo->vector_mode) * X
3122 for some rational X, so they must have a common multiple. */
3123 unrolling_factor
3124 = force_common_multiple (unrolling_factor,
3125 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3127 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3128 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3129 loop-based vectorization. Such stmts will be marked as HYBRID. */
3130 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3131 decided_to_slp++;
3134 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3136 if (decided_to_slp && dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "Decided to SLP %d instances. Unrolling factor ",
3140 decided_to_slp);
3141 dump_dec (MSG_NOTE, unrolling_factor);
3142 dump_printf (MSG_NOTE, "\n");
3145 return (decided_to_slp > 0);
3148 /* Private data for vect_detect_hybrid_slp. */
3149 struct vdhs_data
3151 loop_vec_info loop_vinfo;
3152 vec<stmt_vec_info> *worklist;
3155 /* Walker for walk_gimple_op. */
3157 static tree
3158 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3160 walk_stmt_info *wi = (walk_stmt_info *)data;
3161 vdhs_data *dat = (vdhs_data *)wi->info;
3163 if (wi->is_lhs)
3164 return NULL_TREE;
3166 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3167 if (!def_stmt_info)
3168 return NULL_TREE;
3169 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3170 if (PURE_SLP_STMT (def_stmt_info))
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3174 def_stmt_info->stmt);
3175 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3176 dat->worklist->safe_push (def_stmt_info);
3179 return NULL_TREE;
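/* For instance (illustration): if a loop_vect stmt on the worklist uses
   the def of a pure_slp stmt, that def is marked hybrid here and queued
   so its own operands get visited as well. */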
3182 /* Find stmts that must be both vectorized and SLPed. */
3184 void
3185 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3187 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3189 /* All stmts participating in SLP are marked pure_slp, all other
3190 stmts are loop_vect.
3191 First collect all loop_vect stmts into a worklist. */
3192 auto_vec<stmt_vec_info> worklist;
3193 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3195 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3196 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3197 gsi_next (&gsi))
3199 gphi *phi = gsi.phi ();
3200 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3201 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3202 worklist.safe_push (stmt_info);
3204 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3205 gsi_next (&gsi))
3207 gimple *stmt = gsi_stmt (gsi);
3208 if (is_gimple_debug (stmt))
3209 continue;
3210 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3211 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3213 for (gimple_stmt_iterator gsi2
3214 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3215 !gsi_end_p (gsi2); gsi_next (&gsi2))
3217 stmt_vec_info patt_info
3218 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3219 if (!STMT_SLP_TYPE (patt_info)
3220 && STMT_VINFO_RELEVANT (patt_info))
3221 worklist.safe_push (patt_info);
3223 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3225 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3226 worklist.safe_push (stmt_info);
3230 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3231 mark any SLP vectorized stmt as hybrid.
3232 ??? We're visiting def stmts N times (once for each non-SLP and
3233 once for each hybrid-SLP use). */
3234 walk_stmt_info wi;
3235 vdhs_data dat;
3236 dat.worklist = &worklist;
3237 dat.loop_vinfo = loop_vinfo;
3238 memset (&wi, 0, sizeof (wi));
3239 wi.info = (void *)&dat;
3240 while (!worklist.is_empty ())
3242 stmt_vec_info stmt_info = worklist.pop ();
3243 /* Since SSA operands are not set up for pattern stmts we need
3244 to use walk_gimple_op. */
3245 wi.is_lhs = 0;
3246 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3251 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3253 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3254 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3256 for (unsigned i = 0; i < bbs.length (); ++i)
3258 if (i != 0)
3259 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3260 gsi_next (&si))
3262 gphi *phi = si.phi ();
3263 gimple_set_uid (phi, 0);
3264 add_stmt (phi);
3266 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3267 !gsi_end_p (gsi); gsi_next (&gsi))
3269 gimple *stmt = gsi_stmt (gsi);
3270 gimple_set_uid (stmt, 0);
3271 if (is_gimple_debug (stmt))
3272 continue;
3273 add_stmt (stmt);
3279 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3280 stmts in the basic block. */
3282 _bb_vec_info::~_bb_vec_info ()
3284 /* Reset region marker. */
3285 for (unsigned i = 0; i < bbs.length (); ++i)
3287 if (i != 0)
3288 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3289 gsi_next (&si))
3291 gphi *phi = si.phi ();
3292 gimple_set_uid (phi, -1);
3294 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3295 !gsi_end_p (gsi); gsi_next (&gsi))
3297 gimple *stmt = gsi_stmt (gsi);
3298 gimple_set_uid (stmt, -1);
3303 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3304 given that child nodes have already been processed, and that
3305 their def types currently match their SLP node's def type. */
3307 static bool
3308 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3309 slp_instance node_instance,
3310 stmt_vector_for_cost *cost_vec)
3312 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3313 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3315 /* Calculate the number of vector statements to be created for the
3316 scalar stmts in this node. For SLP reductions it is equal to the
3317 number of vector statements in the children (which has already been
3318 calculated by the recursive call). Otherwise it is the number of
3319 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3320 VF divided by the number of elements in a vector. */
3321 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3322 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3324 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3325 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3327 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3328 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3329 break;
3332 else
3334 poly_uint64 vf;
3335 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3336 vf = loop_vinfo->vectorization_factor;
3337 else
3338 vf = 1;
3339 unsigned int group_size = SLP_TREE_LANES (node);
3340 tree vectype = SLP_TREE_VECTYPE (node);
3341 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3342 = vect_get_num_vectors (vf * group_size, vectype);
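/* E.g. (illustrative): with VF 4, SLP_TREE_LANES 2 and a 4-element
   vector type this computes 4 * 2 / 4 == 2 vector stmts for the
   node. */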
3345 /* Handle purely internal nodes. */
3346 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3347 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3349 if (is_a <bb_vec_info> (vinfo)
3350 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3351 return false;
3353 bool dummy;
3354 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3355 node, node_instance, cost_vec);
3358 /* Try to build NODE from scalars, returning true on success.
3359 NODE_INSTANCE is the SLP instance that contains NODE. */
3361 static bool
3362 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3363 slp_instance node_instance)
3365 stmt_vec_info stmt_info;
3366 unsigned int i;
3368 if (!is_a <bb_vec_info> (vinfo)
3369 || node == SLP_INSTANCE_TREE (node_instance)
3370 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3371 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3372 return false;
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_NOTE, vect_location,
3376 "Building vector operands from scalars instead\n");
3378 /* Don't remove and free the child nodes here, since they could be
3379 referenced by other structures. The analysis and scheduling phases
3380 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3381 unsigned int group_size = SLP_TREE_LANES (node);
3382 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3383 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3384 SLP_TREE_LOAD_PERMUTATION (node).release ();
3385 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3387 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3388 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3390 return true;
3393 /* Compute the prologue cost for invariant or constant operands represented
3394 by NODE. */
3396 static void
3397 vect_prologue_cost_for_slp (slp_tree node,
3398 stmt_vector_for_cost *cost_vec)
3400 /* There's a special case of an existing vector, which costs nothing. */
3401 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3402 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3403 return;
3404 /* Without looking at the actual initializer a vector of
3405 constants can be implemented as a load from the constant pool.
3406 When all elements are the same we can use a splat. */
3407 tree vectype = SLP_TREE_VECTYPE (node);
3408 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3409 unsigned num_vects_to_check;
3410 unsigned HOST_WIDE_INT const_nunits;
3411 unsigned nelt_limit;
3412 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3413 && ! multiple_p (const_nunits, group_size))
3415 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3416 nelt_limit = const_nunits;
3418 else
3420 /* If either the vector has variable length or the vectors
3421 are composed of repeated whole groups we only need to
3422 cost construction once. All vectors will be the same. */
3423 num_vects_to_check = 1;
3424 nelt_limit = group_size;
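/* Illustration (added comment): a group of 2 scalar ops in a 4-element
   vector means the group repeats twice per vector; every generated
   vector is the same, so a single construction (or splat) cost
   suffices. */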
3426 tree elt = NULL_TREE;
3427 unsigned nelt = 0;
3428 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3430 unsigned si = j % group_size;
3431 if (nelt == 0)
3432 elt = SLP_TREE_SCALAR_OPS (node)[si];
3433 /* ??? We're just tracking whether all operands of a single
3434 vector initializer are the same, ideally we'd check if
3435 we emitted the same one already. */
3436 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3437 elt = NULL_TREE;
3438 nelt++;
3439 if (nelt == nelt_limit)
3441 record_stmt_cost (cost_vec, 1,
3442 SLP_TREE_DEF_TYPE (node) == vect_external_def
3443 ? (elt ? scalar_to_vec : vec_construct)
3444 : vector_load,
3445 NULL, vectype, 0, vect_prologue);
3446 nelt = 0;
3451 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3452 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3454 Return true if the operations are supported. */
3456 static bool
3457 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3458 slp_instance node_instance,
3459 hash_set<slp_tree> &visited,
3460 hash_set<slp_tree> &lvisited,
3461 stmt_vector_for_cost *cost_vec)
3463 int i, j;
3464 slp_tree child;
3466 /* Assume we can code-generate all invariants. */
3467 if (!node
3468 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3469 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3470 return true;
3472 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3474 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_NOTE, vect_location,
3476 "Failed cyclic SLP reference in %p", node);
3477 return false;
3479 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3481 /* If we already analyzed the exact same set of scalar stmts we're done.
3482 We share the generated vector stmts for those. */
3483 if (visited.contains (node)
3484 || lvisited.add (node))
3485 return true;
3487 bool res = true;
3488 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3490 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3491 visited, lvisited, cost_vec);
3492 if (!res)
3493 break;
3496 if (res)
3497 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3498 cost_vec);
3499 if (!res)
3500 lvisited.remove (node);
3502 /* When the node can be vectorized, cost invariant nodes it references.
3503 This is not done in DFS order to allow the referring node's
3504 vectorizable_* calls to nail down the invariant node's vector type
3505 and possibly unshare it if it needs a different vector type than
3506 other referrers. */
3507 if (res)
3508 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3509 if (child
3510 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3511 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3512 /* Perform the usual caching; note that code generation still
3513 code-gens these nodes multiple times, but we expect
3514 to CSE them later. */
3515 && !visited.contains (child)
3516 && !lvisited.add (child))
3518 /* ??? After auditing more code paths make a "default"
3519 and push the vector type from NODE to all children
3520 if it is not already set. */
3521 /* Compute the number of vectors to be generated. */
3522 tree vector_type = SLP_TREE_VECTYPE (child);
3523 if (!vector_type)
3525 /* For shifts with a scalar argument we don't need
3526 to cost or code-generate anything.
3527 ??? Represent this more explicitly. */
3528 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3529 == shift_vec_info_type)
3530 && j == 1);
3531 continue;
3533 unsigned group_size = SLP_TREE_LANES (child);
3534 poly_uint64 vf = 1;
3535 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3536 vf = loop_vinfo->vectorization_factor;
3537 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3538 = vect_get_num_vectors (vf * group_size, vector_type);
3539 /* And cost them. */
3540 vect_prologue_cost_for_slp (child, cost_vec);
3543 /* If this node or any of its children can't be vectorized, try pruning
3544 the tree here rather than felling the whole thing. */
3545 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3547 /* We'll need to revisit this for invariant costing and number
3548 of vectorized stmt setting. */
3549 res = true;
3552 return res;
3556 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3557 region and that can be vectorized using vectorizable_live_operation
3558 with STMT_VINFO_LIVE_P. Live operations that are not handled will
3559 cause the scalar code computing them to be retained. */
3561 static void
3562 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3563 slp_instance instance,
3564 stmt_vector_for_cost *cost_vec,
3565 hash_set<stmt_vec_info> &svisited,
3566 hash_set<slp_tree> &visited)
3568 if (visited.add (node))
3569 return;
3571 unsigned i;
3572 stmt_vec_info stmt_info;
3573 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3574 bool all_visited = true;
3575 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3577 if (svisited.contains (stmt_info))
3578 continue;
3579 all_visited = false;
3580 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3581 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3582 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3583 /* Only the pattern root stmt computes the original scalar value. */
3584 continue;
3585 bool mark_visited = true;
3586 gimple *orig_stmt = orig_stmt_info->stmt;
3587 ssa_op_iter op_iter;
3588 def_operand_p def_p;
3589 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3591 imm_use_iterator use_iter;
3592 gimple *use_stmt;
3593 stmt_vec_info use_stmt_info;
3594 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3595 if (!is_gimple_debug (use_stmt))
3597 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3598 if (!use_stmt_info
3599 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3601 STMT_VINFO_LIVE_P (stmt_info) = true;
3602 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3603 NULL, node, instance, i,
3604 false, cost_vec))
3605 /* ??? So we know we can vectorize the live stmt
3606 from one SLP node. If we cannot do so from all
3607 or none consistently we'd have to record which
3608 SLP node (and lane) we want to use for the live
3609 operation. So make sure we can code-generate
3610 from all nodes. */
3611 mark_visited = false;
3612 else
3613 STMT_VINFO_LIVE_P (stmt_info) = false;
3614 BREAK_FROM_IMM_USE_STMT (use_iter);
3617 /* We have to verify whether we can insert the lane extract
3618 before all uses. The following is a conservative approximation.
3619 We cannot put this into vectorizable_live_operation because
3620 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3621 doesn't work.
3622 Note that while the fact that we emit code for loads at the
3623 first load should make this a non-problem, leafs we construct
3624 from scalars are vectorized after the last scalar def.
3625 ??? If we'd actually compute the insert location during
3626 analysis we could use sth less conservative than the last
3627 scalar stmt in the node for the dominance check. */
3628 /* ??? What remains is "live" uses in vector CTORs in the same
3629 SLP graph which is where those uses can end up code-generated
3630 right after their definition instead of close to their original
3631 use. But that would restrict us to code-generate lane-extracts
3632 from the latest stmt in a node. So we compensate for this
3633 during code-generation, simply not replacing uses for those
3634 hopefully rare cases. */
3635 if (STMT_VINFO_LIVE_P (stmt_info))
3636 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3637 if (!is_gimple_debug (use_stmt)
3638 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3639 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3640 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3642 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644 "Cannot determine insertion place for "
3645 "lane extract\n");
3646 STMT_VINFO_LIVE_P (stmt_info) = false;
3647 mark_visited = true;
3650 if (mark_visited)
3651 svisited.add (stmt_info);
3653 if (all_visited)
3654 return;
3656 slp_tree child;
3657 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3658 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3659 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3660 cost_vec, svisited, visited);
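/* An illustration (hypothetical GIMPLE, not from the original sources) of
   what the marking above catches.  Assume one SLP instance covering the
   two lanes below:

     x_1 = a[0] + b[0];   // lane 0
     x_2 = a[1] + b[1];   // lane 1
     c[0] = x_1;
     c[1] = x_2;
     foo (x_1);           // scalar use outside the SLP graph

   The use of x_1 in foo () is not a PURE_SLP_STMT use, so x_1 becomes
   STMT_VINFO_LIVE_P and vectorizable_live_operation is asked whether a
   lane extract (e.g. a BIT_FIELD_REF from the vectorized def) can be
   costed and inserted before all such uses.  */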
3663 /* Analyze statements in SLP instances of VINFO. Return true if the
3664 operations are supported. */
3666 bool
3667 vect_slp_analyze_operations (vec_info *vinfo)
3669 slp_instance instance;
3670 int i;
3672 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3674 hash_set<slp_tree> visited;
3675 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3677 hash_set<slp_tree> lvisited;
3678 stmt_vector_for_cost cost_vec;
3679 cost_vec.create (2);
3680 if (is_a <bb_vec_info> (vinfo))
3681 vect_location = instance->location ();
3682 if (!vect_slp_analyze_node_operations (vinfo,
3683 SLP_INSTANCE_TREE (instance),
3684 instance, visited, lvisited,
3685 &cost_vec)
3686 /* Instances with a root stmt require vectorized defs for the
3687 SLP tree root. */
3688 || (SLP_INSTANCE_ROOT_STMT (instance)
3689 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3690 != vect_internal_def)))
3692 slp_tree node = SLP_INSTANCE_TREE (instance);
3693 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3694 if (dump_enabled_p ())
3695 dump_printf_loc (MSG_NOTE, vect_location,
3696 "removing SLP instance operations starting from: %G",
3697 stmt_info->stmt);
3698 vect_free_slp_instance (instance);
3699 vinfo->slp_instances.ordered_remove (i);
3700 cost_vec.release ();
3702 else
3704 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3705 x != lvisited.end(); ++x)
3706 visited.add (*x);
3707 i++;
3709 /* For BB vectorization remember the SLP graph entry
3710 cost for later. */
3711 if (is_a <bb_vec_info> (vinfo))
3712 instance->cost_vec = cost_vec;
3713 else
3715 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3716 cost_vec.release ();
3721 /* Compute vectorizable live stmts. */
3722 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3724 hash_set<stmt_vec_info> svisited;
3725 hash_set<slp_tree> visited;
3726 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3728 vect_location = instance->location ();
3729 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3730 instance, &instance->cost_vec, svisited,
3731 visited);
3735 return !vinfo->slp_instances.is_empty ();
3738 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
3739 closing the eventual leader chain. */
3741 static slp_instance
3742 get_ultimate_leader (slp_instance instance,
3743 hash_map<slp_instance, slp_instance> &instance_leader)
3745 auto_vec<slp_instance *, 8> chain;
3746 slp_instance *tem;
3747 while (*(tem = instance_leader.get (instance)) != instance)
3749 chain.safe_push (tem);
3750 instance = *tem;
3752 while (!chain.is_empty ())
3753 *chain.pop () = instance;
3754 return instance;
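/* A worked example (hypothetical instances, not from the original sources):
   with instance_leader = { A->B, B->C, C->C }, get_ultimate_leader (A, ...)
   walks A->B->C until a leader maps to itself, then pops the recorded
   chain so that instance_leader becomes { A->C, B->C, C->C } and returns
   C.  This is plain union-find path compression, keeping repeated
   queries cheap.  */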
3757 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3759 static void
3760 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3761 slp_instance instance, slp_tree node,
3762 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3763 hash_map<slp_instance, slp_instance> &instance_leader,
3764 hash_set<slp_tree> &visited)
3766 stmt_vec_info stmt_info;
3767 unsigned i;
3769 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3771 bool existed_p;
3772 slp_instance &stmt_instance
3773 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3774 if (!existed_p)
3776 else if (stmt_instance != instance)
3778 /* If we're running into a previously marked stmt make the current
3779 instance the leader of that stmt's ultimate leader. This keeps the
3780 leader chain acyclic and works even when the current instance
3781 connects two previously independent graph parts. */
3782 slp_instance stmt_leader
3783 = get_ultimate_leader (stmt_instance, instance_leader);
3784 if (stmt_leader != instance)
3785 instance_leader.put (stmt_leader, instance);
3787 stmt_instance = instance;
3790 if (visited.add (node))
3791 return;
3793 slp_tree child;
3794 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3795 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3796 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3797 instance_leader, visited);
3800 /* Partition the SLP graph into pieces that can be costed independently. */
3802 static void
3803 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3805 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3807 /* First walk the SLP graph assigning each involved scalar stmt a
3808 corresponding SLP graph entry and upon visiting a previously
3809 marked stmt, make the stmt's leader the current SLP graph entry. */
3810 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3811 hash_map<slp_instance, slp_instance> instance_leader;
3812 hash_set<slp_tree> visited;
3813 slp_instance instance;
3814 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3816 instance_leader.put (instance, instance);
3817 vect_bb_partition_graph_r (bb_vinfo,
3818 instance, SLP_INSTANCE_TREE (instance),
3819 stmt_to_instance, instance_leader,
3820 visited);
3823 /* Then collect entries to each independent subgraph. */
3824 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3826 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3827 leader->subgraph_entries.safe_push (instance);
3828 if (dump_enabled_p ()
3829 && leader != instance)
3830 dump_printf_loc (MSG_NOTE, vect_location,
3831 "instance %p is leader of %p\n",
3832 leader, instance);
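/* An illustration (hypothetical instances, not from the original sources):
   assume instances I1, I2 and I3 where I1 and I2 share a scalar stmt and
   I3 is independent.  The walk above first maps the shared stmt to I1;
   when I2 revisits it, I1's ultimate leader is made to point to I2,
   giving instance_leader = { I1->I2, I2->I2, I3->I3 }.  The collection
   loop then pushes I1 and I2 onto I2->subgraph_entries and I3 onto its
   own, yielding two independently costed subgraphs.  */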
3836 /* Compute the scalar cost of the SLP node NODE and its children
3837 and record it in COST_VEC. Do not account defs that are marked in LIFE;
3838 update LIFE according to uses of NODE. */
3840 static void
3841 vect_bb_slp_scalar_cost (vec_info *vinfo,
3842 slp_tree node, vec<bool, va_heap> *life,
3843 stmt_vector_for_cost *cost_vec,
3844 hash_set<slp_tree> &visited)
3846 unsigned i;
3847 stmt_vec_info stmt_info;
3848 slp_tree child;
3850 if (visited.add (node))
3851 return;
3853 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3855 ssa_op_iter op_iter;
3856 def_operand_p def_p;
3858 if ((*life)[i])
3859 continue;
3861 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3862 gimple *orig_stmt = orig_stmt_info->stmt;
3864 /* If there is a non-vectorized use of the defs then the scalar
3865 stmt is kept live, in which case we do not account it or any
3866 required defs in the SLP children in the scalar cost. This
3867 way we make the vectorization look more costly when compared to
3868 the scalar cost. */
3869 if (!STMT_VINFO_LIVE_P (stmt_info))
3871 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3873 imm_use_iterator use_iter;
3874 gimple *use_stmt;
3875 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3876 if (!is_gimple_debug (use_stmt))
3878 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3879 if (!use_stmt_info
3880 || !PURE_SLP_STMT
3881 (vect_stmt_to_vectorize (use_stmt_info)))
3883 (*life)[i] = true;
3884 BREAK_FROM_IMM_USE_STMT (use_iter);
3888 if ((*life)[i])
3889 continue;
3892 /* Count scalar stmts only once. */
3893 if (gimple_visited_p (orig_stmt))
3894 continue;
3895 gimple_set_visited (orig_stmt, true);
3897 vect_cost_for_stmt kind;
3898 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3900 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3901 kind = scalar_load;
3902 else
3903 kind = scalar_store;
3905 else if (vect_nop_conversion_p (orig_stmt_info))
3906 continue;
3907 else
3908 kind = scalar_stmt;
3909 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3912 auto_vec<bool, 20> subtree_life;
3913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3915 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3917 /* Do not directly pass LIFE to the recursive call, copy it to
3918 confine changes in the callee to the current child/subtree. */
3919 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3921 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3922 for (unsigned j = 0;
3923 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3925 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3926 if (perm.first == i)
3927 subtree_life[perm.second] = (*life)[j];
3930 else
3932 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3933 subtree_life.safe_splice (*life);
3935 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3936 visited);
3937 subtree_life.truncate (0);
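/* An illustration (hypothetical lanes, not from the original sources) of
   the life propagation above: for a VEC_PERM_EXPR node with lane
   permutation [ { 0, 1 }, { 1, 0 } ] and life = { true, false }, child 0
   receives subtree_life[1] = true (its lane 1 feeds the live parent lane
   0) and child 1 receives subtree_life[0] = false.  Non-permute nodes
   simply copy LIFE since parent and child lanes correspond 1:1.  */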
3942 /* Check if vectorization of the basic block is profitable for the
3943 subgraph denoted by SLP_INSTANCES. */
3945 static bool
3946 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3947 vec<slp_instance> slp_instances)
3949 slp_instance instance;
3950 int i;
3951 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3952 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3954 void *vect_target_cost_data = init_cost (NULL);
3956 /* Calculate scalar cost and sum the cost for the vector stmts
3957 previously collected. */
3958 stmt_vector_for_cost scalar_costs;
3959 scalar_costs.create (0);
3960 hash_set<slp_tree> visited;
3961 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3963 auto_vec<bool, 20> life;
3964 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3965 true);
3966 vect_bb_slp_scalar_cost (bb_vinfo,
3967 SLP_INSTANCE_TREE (instance),
3968 &life, &scalar_costs, visited);
3969 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3970 instance->cost_vec.release ();
3972 /* Unset visited flag. */
3973 stmt_info_for_cost *si;
3974 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3975 gimple_set_visited (si->stmt_info->stmt, false);
3977 void *scalar_target_cost_data = init_cost (NULL);
3978 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3979 scalar_costs.release ();
3980 unsigned dummy;
3981 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3982 destroy_cost_data (scalar_target_cost_data);
3984 /* Complete the target-specific vector cost calculation. */
3985 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3986 &vec_inside_cost, &vec_epilogue_cost);
3987 destroy_cost_data (vect_target_cost_data);
3989 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3991 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis:\n");
3994 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3995 vec_inside_cost);
3996 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3997 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3998 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4001 /* Vectorization is profitable if its cost is no more than the cost of the
4002 scalar version. Note that we err on the vector side for equal cost
4003 because the cost estimate is otherwise quite pessimistic (constant uses
4004 are free on the scalar side but cost a load on the vector side, for
4005 example). */
4006 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4007 return false;
4009 return true;
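/* A numeric example (made-up costs, not from the original sources): with
   vec_inside_cost = 8, vec_prologue_cost = 2, vec_epilogue_cost = 0 and
   scalar_cost = 12 the test is 2 + 0 + 8 = 10 > 12, which is false, so
   the subgraph is vectorized.  With scalar_cost = 10 the equal-cost case
   still vectorizes, matching the comment above about erring on the
   vector side.  */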
4012 /* Find any vectorizable constructors and add them to the grouped_stores
4013 array. */
4015 static void
4016 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4018 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4019 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4020 !gsi_end_p (gsi); gsi_next (&gsi))
4022 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4023 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4024 continue;
4026 tree rhs = gimple_assign_rhs1 (assign);
4027 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4028 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4029 CONSTRUCTOR_NELTS (rhs))
4030 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4031 || uniform_vector_p (rhs))
4032 continue;
4034 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4035 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
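/* An illustration (hypothetical GIMPLE, not from the original sources):
   a CONSTRUCTOR like

     v_3 = {a_1, b_2, c_4, d_5};

   with a four-element vector type is queued as a grouped-store seed,
   while uniform CONSTRUCTORs such as v_3 = {a_1, a_1, a_1, a_1},
   CONSTRUCTORs of vector elements and those whose element count does
   not match TYPE_VECTOR_SUBPARTS are skipped by the checks above.  */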
4039 /* Check if the region described by BB_VINFO can be vectorized, returning
4040 true if so. When returning false, set FATAL to true if the same failure
4041 would prevent vectorization at other vector sizes, false if it is still
4042 worth trying other sizes. N_STMTS is the number of statements in the
4043 region. */
4045 static bool
4046 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4047 vec<int> *dataref_groups)
4049 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4051 slp_instance instance;
4052 int i;
4053 poly_uint64 min_vf = 2;
4055 /* The first group of checks is independent of the vector size. */
4056 fatal = true;
4058 /* Analyze the data references. */
4060 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4062 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4064 "not vectorized: unhandled data-ref in basic "
4065 "block.\n");
4066 return false;
4069 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4071 if (dump_enabled_p ())
4072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4073 "not vectorized: unhandled data access in "
4074 "basic block.\n");
4075 return false;
4078 vect_slp_check_for_constructors (bb_vinfo);
4080 /* If there are no grouped stores and no constructors in the region
4081 there is no need to continue with pattern recog as vect_analyze_slp
4082 will fail anyway. */
4083 if (bb_vinfo->grouped_stores.is_empty ())
4085 if (dump_enabled_p ())
4086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4087 "not vectorized: no grouped stores in "
4088 "basic block.\n");
4089 return false;
4093 /* The rest of the analysis below depends on the vector size in some way,
so failures are no longer necessarily fatal. */
4093 fatal = false;
4095 vect_pattern_recog (bb_vinfo);
4097 /* Check the SLP opportunities in the basic block, analyze and build SLP
4098 trees. */
4099 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4101 if (dump_enabled_p ())
4103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4104 "Failed to SLP the basic block.\n");
4105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4106 "not vectorized: failed to find SLP opportunities "
4107 "in basic block.\n");
4109 return false;
4112 /* Optimize permutations. */
4113 vect_optimize_slp (bb_vinfo);
4115 vect_record_base_alignments (bb_vinfo);
4117 /* Analyze and verify the alignment of data references and the
4118 dependence in the SLP instances. */
4119 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4121 vect_location = instance->location ();
4122 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4123 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4125 slp_tree node = SLP_INSTANCE_TREE (instance);
4126 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4127 if (dump_enabled_p ())
4128 dump_printf_loc (MSG_NOTE, vect_location,
4129 "removing SLP instance operations starting from: %G",
4130 stmt_info->stmt);
4131 vect_free_slp_instance (instance);
4132 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4133 continue;
4136 /* Mark all the statements that we want to vectorize as pure SLP and
4137 relevant. */
4138 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4139 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4140 if (SLP_INSTANCE_ROOT_STMT (instance))
4141 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4143 i++;
4145 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4146 return false;
4148 if (!vect_slp_analyze_operations (bb_vinfo))
4150 if (dump_enabled_p ())
4151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4152 "not vectorized: bad operation in basic block.\n");
4153 return false;
4156 vect_bb_partition_graph (bb_vinfo);
4158 return true;
4161 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
4162 basic blocks in BBS, returning true on success.
4163 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4165 static bool
4166 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4167 vec<int> *dataref_groups, unsigned int n_stmts)
4169 bb_vec_info bb_vinfo;
4170 auto_vector_modes vector_modes;
4172 /* Autodetect the first vector size we try. */
4173 machine_mode next_vector_mode = VOIDmode;
4174 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4175 unsigned int mode_i = 0;
4177 vec_info_shared shared;
4179 machine_mode autodetected_vector_mode = VOIDmode;
4180 while (1)
4182 bool vectorized = false;
4183 bool fatal = false;
4184 bb_vinfo = new _bb_vec_info (bbs, &shared);
4186 bool first_time_p = shared.datarefs.is_empty ();
4187 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4188 if (first_time_p)
4189 bb_vinfo->shared->save_datarefs ();
4190 else
4191 bb_vinfo->shared->check_datarefs ();
4192 bb_vinfo->vector_mode = next_vector_mode;
4194 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4195 && dbg_cnt (vect_slp))
4197 if (dump_enabled_p ())
4199 dump_printf_loc (MSG_NOTE, vect_location,
4200 "***** Analysis succeeded with vector mode"
4201 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4202 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4205 bb_vinfo->shared->check_datarefs ();
4207 unsigned i;
4208 slp_instance instance;
4209 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4211 if (instance->subgraph_entries.is_empty ())
4212 continue;
4214 vect_location = instance->location ();
4215 if (!unlimited_cost_model (NULL)
4216 && !vect_bb_vectorization_profitable_p
4217 (bb_vinfo, instance->subgraph_entries))
4219 if (dump_enabled_p ())
4220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4221 "not vectorized: vectorization is not "
4222 "profitable.\n");
4223 continue;
4226 if (!vectorized && dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "Basic block will be vectorized "
4229 "using SLP\n");
4230 vectorized = true;
4232 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4234 unsigned HOST_WIDE_INT bytes;
4235 if (dump_enabled_p ())
4237 if (GET_MODE_SIZE
4238 (bb_vinfo->vector_mode).is_constant (&bytes))
4239 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4240 "basic block part vectorized using %wu "
4241 "byte vectors\n", bytes);
4242 else
4243 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4244 "basic block part vectorized using "
4245 "variable length vectors\n");
4249 else
4251 if (dump_enabled_p ())
4252 dump_printf_loc (MSG_NOTE, vect_location,
4253 "***** Analysis failed with vector mode %s\n",
4254 GET_MODE_NAME (bb_vinfo->vector_mode));
4257 if (mode_i == 0)
4258 autodetected_vector_mode = bb_vinfo->vector_mode;
4260 if (!fatal)
4261 while (mode_i < vector_modes.length ()
4262 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4264 if (dump_enabled_p ())
4265 dump_printf_loc (MSG_NOTE, vect_location,
4266 "***** The result for vector mode %s would"
4267 " be the same\n",
4268 GET_MODE_NAME (vector_modes[mode_i]));
4269 mode_i += 1;
4272 delete bb_vinfo;
4274 if (mode_i < vector_modes.length ()
4275 && VECTOR_MODE_P (autodetected_vector_mode)
4276 && (related_vector_mode (vector_modes[mode_i],
4277 GET_MODE_INNER (autodetected_vector_mode))
4278 == autodetected_vector_mode)
4279 && (related_vector_mode (autodetected_vector_mode,
4280 GET_MODE_INNER (vector_modes[mode_i]))
4281 == vector_modes[mode_i]))
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_NOTE, vect_location,
4285 "***** Skipping vector mode %s, which would"
4286 " repeat the analysis for %s\n",
4287 GET_MODE_NAME (vector_modes[mode_i]),
4288 GET_MODE_NAME (autodetected_vector_mode));
4289 mode_i += 1;
4292 if (vectorized
4293 || mode_i == vector_modes.length ()
4294 || autodetected_vector_mode == VOIDmode
4295 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4296 vector sizes will fail do not bother iterating. */
4297 || fatal)
4298 return vectorized;
4300 /* Try the next biggest vector size. */
4301 next_vector_mode = vector_modes[mode_i++];
4302 if (dump_enabled_p ())
4303 dump_printf_loc (MSG_NOTE, vect_location,
4304 "***** Re-trying analysis with vector mode %s\n",
4305 GET_MODE_NAME (next_vector_mode));
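/* An illustration (hypothetical target, not from the original sources):
   on a target advertising { V16QI, V8QI, V4QI } the loop above first
   analyzes with VOIDmode (autodetect); on non-fatal failure it retries
   with V8QI and then V4QI, skipping any mode for which
   vect_chooses_same_modes_p or the related_vector_mode round-trip shows
   the analysis would merely repeat, and stops at the first success or
   after the last mode.  */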
4310 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
4311 true if anything in any of the basic blocks was vectorized. */
4313 static bool
4314 vect_slp_bbs (vec<basic_block> bbs)
4316 vec<data_reference_p> datarefs = vNULL;
4317 auto_vec<int> dataref_groups;
4318 int insns = 0;
4319 int current_group = 0;
4321 for (unsigned i = 0; i < bbs.length (); i++)
4323 basic_block bb = bbs[i];
4324 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4325 gsi_next (&gsi))
4327 gimple *stmt = gsi_stmt (gsi);
4328 if (is_gimple_debug (stmt))
4329 continue;
4331 insns++;
4333 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4334 vect_location = stmt;
4336 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4337 &dataref_groups, current_group))
4338 ++current_group;
4342 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4345 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4346 true if anything in the basic-block was vectorized. */
4348 bool
4349 vect_slp_bb (basic_block bb)
4351 auto_vec<basic_block> bbs;
4352 bbs.safe_push (bb);
4353 return vect_slp_bbs (bbs);
4356 /* Main entry for the BB vectorizer. Analyze and transform the whole
4357 function FUN, returning true if anything in it was vectorized. */
4359 bool
4360 vect_slp_function (function *fun)
4362 bool r = false;
4363 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4364 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4366 /* For the moment split the function into pieces to avoid making
4367 the iteration on the vector mode moot. Split at points we know
4368 not to handle well, which are CFG merges (SLP discovery doesn't
4369 handle non-loop-header PHIs) and loop exits. Since pattern
4370 recog requires reverse iteration to visit uses before defs,
4371 simply chop the RPO into pieces. */
4372 auto_vec<basic_block> bbs;
4373 for (unsigned i = 0; i < n; i++)
4375 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4376 bool split = false;
4378 /* Split when a BB is not dominated by the first block. */
4379 if (!bbs.is_empty ()
4380 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4382 if (dump_enabled_p ())
4383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4384 "splitting region at dominance boundary bb%d\n",
4385 bb->index);
4386 split = true;
4388 /* Split when the loop determined by the first block
4389 is exited. This is because we eventually insert
4390 invariants at region begin. */
4391 else if (!bbs.is_empty ()
4392 && bbs[0]->loop_father != bb->loop_father
4393 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4395 if (dump_enabled_p ())
4396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4397 "splitting region at loop %d exit at bb%d\n",
4398 bbs[0]->loop_father->num, bb->index);
4399 split = true;
4402 if (split && !bbs.is_empty ())
4404 r |= vect_slp_bbs (bbs);
4405 bbs.truncate (0);
4406 bbs.quick_push (bb);
4408 else
4409 bbs.safe_push (bb);
4411 /* When a stmt ending this block defines a value, inserting
4412 a vector containing its definition after it would require
4413 inserting on edges. Avoid this for now. */
4414 if (gimple *last = last_stmt (bb))
4415 if (gimple_get_lhs (last)
4416 && is_ctrl_altering_stmt (last))
4418 if (dump_enabled_p ())
4419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4420 "splitting region at control altering "
4421 "definition %G", last);
4422 r |= vect_slp_bbs (bbs);
4423 bbs.truncate (0);
4427 if (!bbs.is_empty ())
4428 r |= vect_slp_bbs (bbs);
4430 free (rpo);
4432 return r;
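/* An illustration (hypothetical CFG, not from the original sources):
   given the RPO bb2, bb3, bb4 where bb2 and bb3 are the two arms of a
   diamond joining in bb4, a region grown from bb2 is flushed when the
   walk reaches bb3 since bb3 is not dominated by bb2.  Likewise a region
   is flushed at a loop exit and after a block whose last stmt both
   alters control flow and defines a value (e.g. a throwing call with an
   lhs).  */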
4435 /* Build a variable-length vector in which the elements in ELTS are repeated
4436 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4437 RESULTS and add any new instructions to SEQ.
4439 The approach we use is:
4441 (1) Find a vector mode VM with integer elements of mode IM.
4443 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4444 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4445 from small vectors to IM.
4447 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4449 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4450 correct byte contents.
4452 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4454 We try to find the largest IM for which this sequence works, in order
4455 to cut down on the number of interleaves. */
4457 void
4458 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4459 vec<tree> elts, unsigned int nresults,
4460 vec<tree> &results)
4462 unsigned int nelts = elts.length ();
4463 tree element_type = TREE_TYPE (vector_type);
4465 /* (1) Find a vector mode VM with integer elements of mode IM. */
4466 unsigned int nvectors = 1;
4467 tree new_vector_type;
4468 tree permutes[2];
4469 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4470 &nvectors, &new_vector_type,
4471 permutes))
4472 gcc_unreachable ();
4474 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4475 unsigned int partial_nelts = nelts / nvectors;
4476 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4478 tree_vector_builder partial_elts;
4479 auto_vec<tree, 32> pieces (nvectors * 2);
4480 pieces.quick_grow (nvectors * 2);
4481 for (unsigned int i = 0; i < nvectors; ++i)
4483 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4484 ELTS' has mode IM. */
4485 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4486 for (unsigned int j = 0; j < partial_nelts; ++j)
4487 partial_elts.quick_push (elts[i * partial_nelts + j]);
4488 tree t = gimple_build_vector (seq, &partial_elts);
4489 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4490 TREE_TYPE (new_vector_type), t);
4492 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4493 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4496 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4497 correct byte contents.
4499 We need to repeat the following operation log2(nvectors) times:
4501 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4502 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4504 However, if each input repeats every N elements and the VF is
4505 a multiple of N * 2, the HI result is the same as the LO. */
4506 unsigned int in_start = 0;
4507 unsigned int out_start = nvectors;
4508 unsigned int hi_start = nvectors / 2;
4509 /* A bound on the number of outputs needed to produce NRESULTS results
4510 in the final iteration. */
4511 unsigned int noutputs_bound = nvectors * nresults;
4512 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4514 noutputs_bound /= 2;
4515 unsigned int limit = MIN (noutputs_bound, nvectors);
4516 for (unsigned int i = 0; i < limit; ++i)
4518 if ((i & 1) != 0
4519 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4520 2 * in_repeat))
4522 pieces[out_start + i] = pieces[out_start + i - 1];
4523 continue;
4526 tree output = make_ssa_name (new_vector_type);
4527 tree input1 = pieces[in_start + (i / 2)];
4528 tree input2 = pieces[in_start + (i / 2) + hi_start];
4529 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4530 input1, input2,
4531 permutes[i & 1]);
4532 gimple_seq_add_stmt (seq, stmt);
4533 pieces[out_start + i] = output;
4535 std::swap (in_start, out_start);
4538 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4539 results.reserve (nresults);
4540 for (unsigned int i = 0; i < nresults; ++i)
4541 if (i < nvectors)
4542 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4543 pieces[in_start + i]));
4544 else
4545 results.quick_push (results[i - nvectors]);
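/* A worked example (made-up modes, not from the original sources): take
   ELTS = { a, b, c, d } with 32-bit elements and suppose
   can_duplicate_and_interleave_p picks IM = DImode, so NVECTORS is 2 and
   the partial vectors { a, b } and { c, d } are each view-converted to a
   DImode scalar.  Step (3) splats them to AB = { ab, ab, ... } and
   CD = { cd, cd, ... }, and the single interleave round of step (4)
   emits

     ABCD = VEC_PERM_EXPR <AB, CD, lo_permute>;

   giving { ab, cd, ab, cd, ... }, i.e. { a, b, c, d, a, b, c, d, ... }
   in the original element size, which step (5) view-converts back to
   VECTOR_TYPE.  */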
4549 /* For constant and loop invariant defs in OP_NODE this function creates
4550 vector defs that will be used in the vectorized stmts and stores them
4551 to SLP_TREE_VEC_DEFS of OP_NODE. */
4553 static void
4554 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4556 unsigned HOST_WIDE_INT nunits;
4557 tree vec_cst;
4558 unsigned j, number_of_places_left_in_vector;
4559 tree vector_type;
4560 tree vop;
4561 int group_size = op_node->ops.length ();
4562 unsigned int vec_num, i;
4563 unsigned number_of_copies = 1;
4564 bool constant_p;
4565 gimple_seq ctor_seq = NULL;
4566 auto_vec<tree, 16> permute_results;
4568 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4569 vector_type = SLP_TREE_VECTYPE (op_node);
4571 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4572 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4573 auto_vec<tree> voprnds (number_of_vectors);
4575 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4576 created vectors. It is greater than 1 if unrolling is performed.
4578 For example, we have two scalar operands, s1 and s2 (e.g., group of
4579 strided accesses of size two), while NUNITS is four (i.e., four scalars
4580 of this type can be packed in a vector). The output vector will contain
4581 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4582 will be 2).
4584 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4585 containing the operands.
4587 For example, NUNITS is four as before, and the group size is 8
4588 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4589 {s5, s6, s7, s8}. */
4591 /* When using duplicate_and_interleave, we just need one element for
4592 each scalar statement. */
4593 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4594 nunits = group_size;
4596 number_of_copies = nunits * number_of_vectors / group_size;
4598 number_of_places_left_in_vector = nunits;
4599 constant_p = true;
4600 tree_vector_builder elts (vector_type, nunits, 1);
4601 elts.quick_grow (nunits);
4602 stmt_vec_info insert_after = NULL;
4603 for (j = 0; j < number_of_copies; j++)
4605 tree op;
4606 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4608 /* Create 'vect_ = {op0,op1,...,opn}'. */
4609 number_of_places_left_in_vector--;
4610 tree orig_op = op;
4611 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4613 if (CONSTANT_CLASS_P (op))
4615 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4617 /* Can't use VIEW_CONVERT_EXPR for booleans because
4618 of possibly different sizes of scalar value and
4619 vector element. */
4620 if (integer_zerop (op))
4621 op = build_int_cst (TREE_TYPE (vector_type), 0);
4622 else if (integer_onep (op))
4623 op = build_all_ones_cst (TREE_TYPE (vector_type));
4624 else
4625 gcc_unreachable ();
4627 else
4628 op = fold_unary (VIEW_CONVERT_EXPR,
4629 TREE_TYPE (vector_type), op);
4630 gcc_assert (op && CONSTANT_CLASS_P (op));
4632 else
4634 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4635 gimple *init_stmt;
4636 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4638 tree true_val
4639 = build_all_ones_cst (TREE_TYPE (vector_type));
4640 tree false_val
4641 = build_zero_cst (TREE_TYPE (vector_type));
4642 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4643 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4644 op, true_val,
4645 false_val);
4647 else
4649 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4650 op);
4651 init_stmt
4652 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4653 op);
4655 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4656 op = new_temp;
4659 elts[number_of_places_left_in_vector] = op;
4660 if (!CONSTANT_CLASS_P (op))
4661 constant_p = false;
4662 /* For BB vectorization we have to compute an insert location
4663 when a def is inside the analyzed region since we cannot
4664 simply insert at the BB start in this case. */
4665 stmt_vec_info opdef;
4666 if (TREE_CODE (orig_op) == SSA_NAME
4667 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4668 && is_a <bb_vec_info> (vinfo)
4669 && (opdef = vinfo->lookup_def (orig_op)))
4671 if (!insert_after)
4672 insert_after = opdef;
4673 else
4674 insert_after = get_later_stmt (insert_after, opdef);
4677 if (number_of_places_left_in_vector == 0)
4679 if (constant_p
4680 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4681 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4682 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4683 else
4685 if (permute_results.is_empty ())
4686 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4687 elts, number_of_vectors,
4688 permute_results);
4689 vec_cst = permute_results[number_of_vectors - j - 1];
4691 if (!gimple_seq_empty_p (ctor_seq))
4693 if (insert_after)
4695 gimple_stmt_iterator gsi;
4696 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4698 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4699 gsi_insert_seq_before (&gsi, ctor_seq,
4700 GSI_CONTINUE_LINKING);
4702 else if (!stmt_ends_bb_p (insert_after->stmt))
4704 gsi = gsi_for_stmt (insert_after->stmt);
4705 gsi_insert_seq_after (&gsi, ctor_seq,
4706 GSI_CONTINUE_LINKING);
4708 else
4710 /* When we want to insert after a def whose
4711 defining stmt throws, insert on the fallthru
4712 edge instead. */
4713 edge e = find_fallthru_edge
4714 (gimple_bb (insert_after->stmt)->succs);
4715 basic_block new_bb
4716 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4717 gcc_assert (!new_bb);
4720 else
4721 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4722 ctor_seq = NULL;
4724 voprnds.quick_push (vec_cst);
4725 insert_after = NULL;
4726 number_of_places_left_in_vector = nunits;
4727 constant_p = true;
4728 elts.new_vector (vector_type, nunits, 1);
4729 elts.quick_grow (nunits);
4734 /* Since the vectors are created in reverse order, we have to reverse
4735 their order here. */
4736 vec_num = voprnds.length ();
4737 for (j = vec_num; j != 0; j--)
4739 vop = voprnds[j - 1];
4740 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4743 /* If VF is greater than the unrolling factor needed for the SLP
4744 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4745 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4746 to replicate the vectors. */
4747 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4748 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4749 i++)
4750 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
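/* A numeric example (made-up shapes, not from the original sources): for
   GROUP_SIZE = 2 scalars { s1, s2 }, NUNITS = 4 and NUMBER_OF_VECTORS
   = 1, the loop above runs NUMBER_OF_COPIES = 4 * 1 / 2 = 2 times and
   fills the element vector back to front, producing { s1, s2, s1, s2 }.
   For GROUP_SIZE = 8 and NUMBER_OF_VECTORS = 2 it emits { s1, ..., s4 }
   and { s5, ..., s8 } instead, and the trailing loop replicates defs
   whenever more vectors are required than distinct ones exist.  */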
4753 /* Get the Ith vectorized definition from SLP_NODE. */
4755 tree
4756 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4758 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4759 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4760 else
4761 return SLP_TREE_VEC_DEFS (slp_node)[i];
4764 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4766 void
4767 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4769 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4770 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4772 unsigned j;
4773 gimple *vec_def_stmt;
4774 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4775 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4777 else
4778 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4781 /* Get N vectorized definitions for SLP_NODE. */
4783 void
4784 vect_get_slp_defs (vec_info *,
4785 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4787 if (n == -1U)
4788 n = SLP_TREE_CHILDREN (slp_node).length ();
4790 for (unsigned i = 0; i < n; ++i)
4792 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4793 vec<tree> vec_defs = vNULL;
4794 vect_get_slp_defs (child, &vec_defs);
4795 vec_oprnds->quick_push (vec_defs);
4799 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4800 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4801 permute statements for the SLP node NODE. */
4803 bool
4804 vect_transform_slp_perm_load (vec_info *vinfo,
4805 slp_tree node, vec<tree> dr_chain,
4806 gimple_stmt_iterator *gsi, poly_uint64 vf,
4807 bool analyze_only, unsigned *n_perms)
4809 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4810 int vec_index = 0;
4811 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4812 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4813 unsigned int mask_element;
4814 machine_mode mode;
4816 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4817 return false;
4819 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4821 mode = TYPE_MODE (vectype);
4822 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4824 /* Initialize the vect stmts of NODE to properly insert the generated
4825 stmts later. */
4826 if (! analyze_only)
4827 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4828 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4829 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4831 /* Generate permutation masks for every NODE. Number of masks for each NODE
4832 is equal to GROUP_SIZE.
4833 E.g., we have a group of three nodes with three loads from the same
4834 location in each node, and the vector size is 4. I.e., we have an
4835 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4836 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4837 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4840 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4841 The last mask is illegal since we assume two operands for permute
4842 operation, and the mask element values can't be outside that range.
4843 Hence, the last mask must be converted into {2,5,5,5}.
4844 For the first two permutations we need the first and the second input
4845 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4846 we need the second and the third vectors: {b1,c1,a2,b2} and
4847 {c2,a3,b3,c3}. */
4849 int vect_stmts_counter = 0;
4850 unsigned int index = 0;
4851 int first_vec_index = -1;
4852 int second_vec_index = -1;
4853 bool noop_p = true;
4854 *n_perms = 0;
4856 vec_perm_builder mask;
4857 unsigned int nelts_to_build;
4858 unsigned int nvectors_per_build;
4859 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4860 && multiple_p (nunits, group_size));
4861 if (repeating_p)
4863 /* A single vector contains a whole number of copies of the node, so:
4864 (a) all permutes can use the same mask; and
4865 (b) the permutes only need a single vector input. */
4866 mask.new_vector (nunits, group_size, 3);
4867 nelts_to_build = mask.encoded_nelts ();
4868 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4870 else
4872 /* We need to construct a separate mask for each vector statement. */
4873 unsigned HOST_WIDE_INT const_nunits, const_vf;
4874 if (!nunits.is_constant (&const_nunits)
4875 || !vf.is_constant (&const_vf))
4876 return false;
4877 mask.new_vector (const_nunits, const_nunits, 1);
4878 nelts_to_build = const_vf * group_size;
4879 nvectors_per_build = 1;
4882 unsigned int count = mask.encoded_nelts ();
4883 mask.quick_grow (count);
4884 vec_perm_indices indices;
4886 for (unsigned int j = 0; j < nelts_to_build; j++)
4888 unsigned int iter_num = j / group_size;
4889 unsigned int stmt_num = j % group_size;
4890 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4891 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4892 if (repeating_p)
4894 first_vec_index = 0;
4895 mask_element = i;
4897 else
4899 /* Enforced before the loop when !repeating_p. */
4900 unsigned int const_nunits = nunits.to_constant ();
4901 vec_index = i / const_nunits;
4902 mask_element = i % const_nunits;
4903 if (vec_index == first_vec_index
4904 || first_vec_index == -1)
4906 first_vec_index = vec_index;
4908 else if (vec_index == second_vec_index
4909 || second_vec_index == -1)
4911 second_vec_index = vec_index;
4912 mask_element += const_nunits;
4914 else
4916 if (dump_enabled_p ())
4917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4918 "permutation requires at "
4919 "least three vectors %G",
4920 stmt_info->stmt);
4921 gcc_assert (analyze_only);
4922 return false;
4925 gcc_assert (mask_element < 2 * const_nunits);
4928 if (mask_element != index)
4929 noop_p = false;
4930 mask[index++] = mask_element;
4932 if (index == count && !noop_p)
4934 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4935 if (!can_vec_perm_const_p (mode, indices))
4937 if (dump_enabled_p ())
4939 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4940 vect_location,
4941 "unsupported vect permute { ");
4942 for (i = 0; i < count; ++i)
4944 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4945 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4947 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4949 gcc_assert (analyze_only);
4950 return false;
4953 ++*n_perms;
4956 if (index == count)
4958 if (!analyze_only)
4960 tree mask_vec = NULL_TREE;
4962 if (! noop_p)
4963 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4965 if (second_vec_index == -1)
4966 second_vec_index = first_vec_index;
4968 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4970 /* Generate the permute statement if necessary. */
4971 tree first_vec = dr_chain[first_vec_index + ri];
4972 tree second_vec = dr_chain[second_vec_index + ri];
4973 gimple *perm_stmt;
4974 if (! noop_p)
4976 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4977 tree perm_dest
4978 = vect_create_destination_var (gimple_assign_lhs (stmt),
4979 vectype);
4980 perm_dest = make_ssa_name (perm_dest);
4981 perm_stmt
4982 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4983 first_vec, second_vec,
4984 mask_vec);
4985 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4986 gsi);
4988 else
4989 /* If mask was NULL_TREE generate the requested
4990 identity transform. */
4991 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4993 /* Store the vector statement in NODE. */
4994 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4998 index = 0;
4999 first_vec_index = -1;
5000 second_vec_index = -1;
5001 noop_p = true;
5005 return true;
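/* A worked index computation (following the a0b0c0... example in the big
   comment above, no additional sources): with DR_GROUP_SIZE = 3,
   group_size = 3, const_nunits = 4, vf = 4 and load permutation
   { 0, 0, 0 }, element j = 3 gives iter_num = 1, stmt_num = 0 and
   i = 1 * 3 + 0 = 3, hence vec_index = 0 and mask_element = 3,
   completing the first mask { 0, 0, 0, 3 }.  At j = 6, i = 6 selects
   vec_index 1; as the second input vector its lane 2 is encoded as
   2 + const_nunits = 6, leading to the second mask { 3, 3, 6, 6 }.  */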
5009 /* Vectorize the SLP permutations in NODE as specified
5010 in SLP_TREE_LANE_PERMUTATION, which is a vector of pairs of SLP
5011 child number and lane number.
5012 Interleaving of two two-lane two-child SLP subtrees (not supported):
5013 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5014 A blend of two four-lane two-child SLP subtrees:
5015 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5016 Highpart of a four-lane one-child SLP subtree (not supported):
5017 [ { 0, 2 }, { 0, 3 } ]
5018 Currently only a subset of these is supported by the code generation below. */
5020 static bool
5021 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5022 slp_tree node, stmt_vector_for_cost *cost_vec)
5024 tree vectype = SLP_TREE_VECTYPE (node);
5026 /* ??? We currently only support the same vector type for all inputs
5027 and the output, while the SLP IL should really do a concat + select
5028 and thus accept arbitrary mismatches. */
5029 slp_tree child;
5030 unsigned i;
5031 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5032 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5034 if (dump_enabled_p ())
5035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5036 "Unsupported lane permutation\n");
5037 return false;
5040 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5041 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5042 if (dump_enabled_p ())
5044 dump_printf_loc (MSG_NOTE, vect_location,
5045 "vectorizing permutation");
5046 for (unsigned i = 0; i < perm.length (); ++i)
5047 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5048 dump_printf (MSG_NOTE, "\n");
5051 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5052 if (!nunits.is_constant ())
5053 return false;
5054 unsigned HOST_WIDE_INT vf = 1;
5055 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5056 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5057 return false;
5058 unsigned olanes = vf * SLP_TREE_LANES (node);
5059 gcc_assert (multiple_p (olanes, nunits));
5061 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5062 from the { SLP operand, scalar lane } permutation as recorded in the
5063 SLP node as an intermediate step. This part should already work
5064 with SLP children with arbitrary number of lanes. */
5065 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5066 auto_vec<unsigned> active_lane;
5067 vperm.create (olanes);
5068 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5069 for (unsigned i = 0; i < vf; ++i)
5071 for (unsigned pi = 0; pi < perm.length (); ++pi)
5073 std::pair<unsigned, unsigned> p = perm[pi];
5074 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5075 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5076 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5077 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5078 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5080 /* Advance to the next group. */
5081 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5082 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5085 if (dump_enabled_p ())
5087 dump_printf_loc (MSG_NOTE, vect_location, "as");
5088 for (unsigned i = 0; i < vperm.length (); ++i)
5090 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5091 dump_printf (MSG_NOTE, ",");
5092 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5093 vperm[i].first.first, vperm[i].first.second,
5094 vperm[i].second);
5096 dump_printf (MSG_NOTE, "\n");
5099 /* We can only handle two-vector permutes; everything else should
5100 be lowered on the SLP level. The following is closely inspired
5101 by vect_transform_slp_perm_load and is supposed to eventually
5102 replace it.
5103 ??? As intermediate step do code-gen in the SLP tree representation
5104 somehow? */
5105 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5106 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5107 unsigned int const_nunits = nunits.to_constant ();
5108 unsigned int index = 0;
5109 unsigned int mask_element;
5110 vec_perm_builder mask;
5111 mask.new_vector (const_nunits, const_nunits, 1);
5112 unsigned int count = mask.encoded_nelts ();
5113 mask.quick_grow (count);
5114 vec_perm_indices indices;
5115 unsigned nperms = 0;
5116 for (unsigned i = 0; i < vperm.length (); ++i)
5118 mask_element = vperm[i].second;
5119 if (first_vec.first == -1U
5120 || first_vec == vperm[i].first)
5121 first_vec = vperm[i].first;
5122 else if (second_vec.first == -1U
5123 || second_vec == vperm[i].first)
5125 second_vec = vperm[i].first;
5126 mask_element += const_nunits;
5128 else
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "permutation requires at "
5133 "least three vectors");
5134 gcc_assert (!gsi);
5135 return false;
5138 mask[index++] = mask_element;
5140 if (index == count)
5142 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5143 const_nunits);
5144 bool identity_p = indices.series_p (0, 1, 0, 1);
5145 if (!identity_p
5146 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5148 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5151 vect_location,
5152 "unsupported vect permute { ");
5153 for (i = 0; i < count; ++i)
5155 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5156 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5158 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5160 gcc_assert (!gsi);
5161 return false;
5164 if (!identity_p)
5165 nperms++;
5166 if (gsi)
5168 if (second_vec.first == -1U)
5169 second_vec = first_vec;
5171 /* Generate the permute statement if necessary. */
5172 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5173 tree first_def
5174 = vect_get_slp_vect_def (first_node, first_vec.second);
5175 gassign *perm_stmt;
5176 tree perm_dest = make_ssa_name (vectype);
5177 if (!identity_p)
5179 slp_tree second_node
5180 = SLP_TREE_CHILDREN (node)[second_vec.first];
5181 tree second_def
5182 = vect_get_slp_vect_def (second_node, second_vec.second);
5183 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5184 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5185 first_def, second_def,
5186 mask_vec);
5188 else
5189 /* We need a copy here in case the def was external. */
5190 perm_stmt = gimple_build_assign (perm_dest, first_def);
5191 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5192 /* Store the vector statement in NODE. */
5193 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5196 index = 0;
5197 first_vec = std::make_pair (-1U, -1U);
5198 second_vec = std::make_pair (-1U, -1U);
5202 if (!gsi)
5203 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5205 return true;
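/* A worked example (lanes as in the blend case documented above, no
   additional sources): for [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   with vf = 1, const_nunits = 4 and two four-lane children, each
   (operand, lane) pair maps to vector index lane / 4 = 0, so vperm is
   { {0,0},0 } { {1,0},1 } { {0,0},2 } { {1,0},3 } and a single

     VEC_PERM_EXPR <child0_def, child1_def, { 0, 5, 2, 7 }>

   is emitted, the second input's lanes being offset by const_nunits.  */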
5208 /* Vectorize SLP NODE. */
5210 static void
5211 vect_schedule_slp_node (vec_info *vinfo,
5212 slp_tree node, slp_instance instance)
5214 gimple_stmt_iterator si;
5215 int i;
5216 slp_tree child;
5218 /* For existing vectors there's nothing to do. */
5219 if (SLP_TREE_VEC_DEFS (node).exists ())
5220 return;
5222 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5224 /* Vectorize externals and constants. */
5225 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5226 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5228 /* ??? vectorizable_shift can end up using a scalar operand which is
5229 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5230 node in this case. */
5231 if (!SLP_TREE_VECTYPE (node))
5232 return;
5234 vect_create_constant_vectors (vinfo, node);
5235 return;
5238 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5240 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5241 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5243 if (dump_enabled_p ())
5244 dump_printf_loc (MSG_NOTE, vect_location,
5245 "------>vectorizing SLP node starting from: %G",
5246 stmt_info->stmt);
5248 if (STMT_VINFO_DATA_REF (stmt_info)
5249 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5251 /* Vectorized loads go before the first scalar load to make it
5252 ready early, vectorized stores go before the last scalar
5253 stmt, which is where all uses are ready. */
5254 stmt_vec_info last_stmt_info = NULL;
5255 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5256 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5257 else /* DR_IS_WRITE */
5258 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5259 si = gsi_for_stmt (last_stmt_info->stmt);
5261 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5262 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5263 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5264 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5266 /* For PHI node vectorization we do not use the insertion iterator. */
5267 si = gsi_none ();
5269 else
5271 /* Emit other stmts after the children's vectorized defs, which is
5272 the earliest possible place. */
5273 gimple *last_stmt = NULL;
5274 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5275 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5277 /* For fold-left reductions we are retaining the scalar
5278 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
5279 set, so the representation isn't perfect. Resort to the
5280 last scalar def here. */
5281 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5283 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5284 == cycle_phi_info_type);
5285 gphi *phi = as_a <gphi *>
5286 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5287 if (!last_stmt
5288 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5289 last_stmt = phi;
5291 /* We emit all vectorized stmts of a child in the same place, so
5292 the dominance-wise last one is the overall last.
5293 ??? Unless we have a load permutation applied and that
5294 figures to re-use an earlier generated load. */
5295 unsigned j;
5296 gimple *vstmt;
5297 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5298 if (!last_stmt
5299 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5300 last_stmt = vstmt;
5302 else if (!SLP_TREE_VECTYPE (child))
5304 /* For externals used unvectorized we have to look at all scalar defs. */
5305 unsigned j;
5306 tree def;
5307 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5308 if (TREE_CODE (def) == SSA_NAME
5309 && !SSA_NAME_IS_DEFAULT_DEF (def))
5311 gimple *stmt = SSA_NAME_DEF_STMT (def);
5312 if (!last_stmt
5313 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5314 last_stmt = stmt;
5317 else
5319 /* For externals we have to look at all defs since their
5320 insertion place is decided per vector. But beware
5321 of pre-existing vectors where we need to make sure
5322 we do not insert before the region boundary. */
5323 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5324 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5325 last_stmt = gsi_stmt (gsi_after_labels
5326 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5327 else
5329 unsigned j;
5330 tree vdef;
5331 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5332 if (TREE_CODE (vdef) == SSA_NAME
5333 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5335 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5336 if (!last_stmt
5337 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5338 last_stmt = vstmt;
5342 /* This can happen when all children are pre-existing vectors or
5343 constants. */
5344 if (!last_stmt)
5345 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5346 if (is_a <gphi *> (last_stmt))
5347 si = gsi_after_labels (gimple_bb (last_stmt));
5348 else
5350 si = gsi_for_stmt (last_stmt);
5351 gsi_next (&si);
5355 bool done_p = false;
5357 /* Handle purely internal nodes. */
5358 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5360 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
5361 be shared with different SLP nodes (but usually it's the same
5362 operation apart from the case the stmt is only there for denoting
5363 the actual scalar lane defs ...). So do not call vect_transform_stmt
5364 but open-code it here (partly). */
5365 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5366 gcc_assert (done);
5367 done_p = true;
5369 if (!done_p)
5370 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5373 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5374 For loop vectorization this is done in vectorizable_call, but for SLP
5375 it needs to be deferred until end of vect_schedule_slp, because multiple
5376 SLP instances may refer to the same scalar stmt. */
5378 static void
5379 vect_remove_slp_scalar_calls (vec_info *vinfo,
5380 slp_tree node, hash_set<slp_tree> &visited)
5382 gimple *new_stmt;
5383 gimple_stmt_iterator gsi;
5384 int i;
5385 slp_tree child;
5386 tree lhs;
5387 stmt_vec_info stmt_info;
5389 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5390 return;
5392 if (visited.add (node))
5393 return;
5395 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5396 vect_remove_slp_scalar_calls (vinfo, child, visited);
5398 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5400 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5401 if (!stmt || gimple_bb (stmt) == NULL)
5402 continue;
5403 if (is_pattern_stmt_p (stmt_info)
5404 || !PURE_SLP_STMT (stmt_info))
5405 continue;
5406 lhs = gimple_call_lhs (stmt);
5407 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5408 gsi = gsi_for_stmt (stmt);
5409 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5410 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5414 static void
5415 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5417 hash_set<slp_tree> visited;
5418 vect_remove_slp_scalar_calls (vinfo, node, visited);
5421 /* Vectorize the instance root. */
void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
    {
      gimple *child_stmt;
      int j;

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
        {
          tree vect_lhs = gimple_get_lhs (child_stmt);
          tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
          if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
                                          TREE_TYPE (vect_lhs)))
            vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
                               vect_lhs);
          rstmt = gimple_build_assign (root_lhs, vect_lhs);
          break;
        }
    }
  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
    {
      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      gimple *child_stmt;
      int j;
      vec<constructor_elt, va_gc> *v;
      vec_alloc (v, nelts);

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
        CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_get_lhs (child_stmt));
      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
      tree r_constructor = build_constructor (rtype, v);
      rstmt = gimple_build_assign (lhs, r_constructor);
    }

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
struct slp_scc_info
{
  bool on_stack;
  int dfs;
  int lowlink;
};
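/* These fields implement Tarjan's SCC algorithm: DFS is the preorder
   number a node receives when it is first visited, LOWLINK the smallest
   preorder number reachable from the node via nodes still on the stack.
   A node whose lowlink equals its own dfs number is the root of an SCC.  */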
/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
                   hash_map<slp_tree, slp_scc_info> &scc_info,
                   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  info->on_stack = true;
  maxdfs++;
  stack.safe_push (node);
  unsigned i;
  slp_tree child;

  /* ???  We're keeping SLP_TREE_CHILDREN of externalized nodes.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      {
        if (!child)
          continue;
        slp_scc_info *child_info = scc_info.get (child);
        if (!child_info)
          {
            vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs,
                               stack);
            /* Recursion might have re-allocated the node.  */
            info = scc_info.get (node);
            child_info = scc_info.get (child);
            info->lowlink = MIN (info->lowlink, child_info->lowlink);
          }
        else if (child_info->on_stack)
          info->lowlink = MIN (info->lowlink, child_info->dfs);
      }
  if (info->lowlink != info->dfs)
    return;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }
  /* SCC.  */
  int last_idx = stack.length () - 1;
  while (stack[last_idx] != node)
    last_idx--;
  /* We can break the cycle at PHIs that have at least one child
     already code generated.  We could then re-start the DFS walk until
     all nodes in the SCC are covered (we might get new entries for
     nodes only reachable via back edges), but it's simpler to just
     iterate and schedule those that are ready.  */
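  /* A typical SCC is a reduction cycle, sketched in GIMPLE as

         x_1 = PHI <x_3(latch), x_0(preheader)>
         ...
         x_3 = x_1 + a_2;

     Scheduling the PHI first breaks the cycle; its backedge operand is
     filled in afterwards by the fixup loop below.  */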
  auto_vec<slp_tree, 4> phis_to_fixup;
  unsigned todo = stack.length () - last_idx;
  do
    {
      for (int idx = stack.length () - 1; idx >= last_idx; --idx)
        {
          slp_tree entry = stack[idx];
          if (!entry)
            continue;
          bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
                      && is_a <gphi *>
                           (SLP_TREE_REPRESENTATIVE (entry)->stmt));
          bool ready = !phi;
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
            if (!child)
              {
                gcc_assert (phi);
                ready = true;
                break;
              }
            else if (scc_info.get (child)->on_stack)
              {
                if (!phi)
                  {
                    ready = false;
                    break;
                  }
              }
            else
              {
                if (phi)
                  {
                    ready = true;
                    break;
                  }
              }
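          /* A non-PHI is ready only when none of its children is still
             on the stack; a PHI is ready as soon as one child is already
             code generated or is a NULL backedge value that is filled in
             afterwards.  */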
          if (ready)
            {
              vect_schedule_slp_node (vinfo, entry, instance);
              scc_info.get (entry)->on_stack = false;
              stack[idx] = NULL;
              todo--;
              if (phi)
                phis_to_fixup.safe_push (entry);
            }
        }
    }
  while (todo != 0);
  /* Now fixup the backedge defs of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
        {
          unsigned dest_idx = e->dest_idx;
          child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
          if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
            continue;
          /* Simply fill in all the args.  */
          for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length ();
               ++i)
            add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
                         vect_get_slp_vect_def (child, i),
                         e, gimple_phi_arg_location (phi, dest_idx));
        }
    }

  /* Pop the SCC.  */
  stack.truncate (last_idx);
}
/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
void
vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
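  /* The SCC state is shared across all instances so that a node taking
     part in multiple SLP instances is only code generated once, for the
     first instance that reaches it.  */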
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Vectorizing SLP tree:\n");
          if (SLP_INSTANCE_ROOT_STMT (instance))
            dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
                             SLP_INSTANCE_ROOT_STMT (instance)->stmt);
          vect_print_slp_graph (MSG_NOTE, vect_location,
                                SLP_INSTANCE_TREE (instance));
        }
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way so that
         a PHI is the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
        vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (SLP_INSTANCE_ROOT_STMT (instance))
        vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove the scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts by itself.
         ???  For BB vectorization we could as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
        {
          if (!STMT_VINFO_DATA_REF (store_info)
              || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
            break;

          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);