gcc/tree-vect-slp.c
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);

object_allocator<_slp_tree> *slp_tree_pool;
void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}
void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
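/* Note: _slp_tree nodes are carved out of the global slp_tree_pool
   object allocator, so allocating and freeing individual nodes is
   cheap and whatever is left over can be released wholesale together
   with the pool at the end of a vectorization run.  */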
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  delete node;
}
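/* Note: nodes are created with a reference count of one and may be
   shared between SLP instances (the bst_map cache below adds a
   reference for every re-use), so vect_free_slp_tree only destroys a
   node once the last reference to it is dropped.  */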
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (root_stmt)
    return root_stmt->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
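/* Note: of the four overloads above, the vec<stmt_vec_info> flavors
   initialize internal nodes backed by scalar statements while the
   vec<tree> flavors initialize external nodes backed by existing SSA
   names or constants; the variants taking an explicit NODE fill in a
   preallocated (stub) node instead of allocating a fresh one.  */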
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
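/* Note on the loop above: each iteration looks for an integer mode
   spanning ELT_BYTES so that groups of scalars can be fused into wider
   integer elements; a failed iteration halves ELT_BYTES and doubles
   NVECTORS, i.e. the COUNT scalars are then built from twice as many
   intermediate vectors combined with the two recorded permutes (sel1
   interleaves the low halves of two input vectors, sel2 the high
   halves).  */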
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero SWAP
   indicates a swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; SWAP is 2 if STMT is a cond and the comparison code needs to
   be inverted.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of this node's parent, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    number_of_oprnds = gimple_phi_num_args (stmt);
  else
    return -1;

  bool swapped = (swap != 0);
  bool backedge = false;
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	{
	  oprnd = gimple_phi_arg_def (stmt, i);
	  backedge = dominated_by_p (CDI_DOMINATORS,
				     gimple_phi_arg_edge (stmt, i)->src,
				     gimple_bb (stmt_info->stmt));
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
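/* Note on the cond_expr swap map above: for SWAP == 1,
   maps[1] = {1, 0, 2, 3} exchanges the two comparison operands,
   while for SWAP == 2, maps[2] = {0, 1, 3, 2} exchanges the
   then/else arms, which is the matching transform when the
   comparison code has been inverted.  */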
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       " old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       " new vector type: %T\n", vectype);
    }
  return false;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
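/* Note: for loop vectorization MAX_NUNITS ends up as the largest
   number of vector elements among all stmts of the SLP instance,
   which together with the group size determines the required
   unrolling; BB vectorization instead rejects outright any vector
   type that the group would not fill.  */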
/* Verify that the scalar stmts STMTS are isomorphic, do not require any
   data permutation and are not of unsupported operation types.  Return
   true if so; otherwise return false and indicate in *MATCHES which
   stmts are not isomorphic to the first one.  If MATCHES[0] is false
   then this indicates the comparison could not be carried out or the
   stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, statements
	 that may throw or statements with volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: no optab.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (TREE_CODE (vec) != SSA_NAME
		  || !types_compatible_p (vectype, TREE_TYPE (vec)))
		{
		  if (is_a <bb_vec_info> (vinfo) && i != 0)
		    continue;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (phi_p
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (!phi_p
	      && TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    *two_operators = true;

  return true;
}
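/* Example of a two-operation group accepted above: the lanes
   { a[0] + b[0], a[1] - b[1], a[2] + b[2], a[3] - b[3] } mix
   PLUS_EXPR and MINUS_EXPR.  Both operations are later computed on
   the whole vector and the final node selects lanes from the two
   results with a VEC_PERM_EXPR (see the two_operators handling in
   vect_build_slp_tree_2 below).  */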
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}
typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
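/* Note: keys of this map are vectors of scalar stmts hashed by gimple
   UID (see bst_traits above); vect_build_slp_tree stores a copy of the
   stmts vector as the key, so the map owns its keys and releases them
   through bst_traits::remove at destruction time.  */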
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, npermutes, tree_size,
					 bst_map);
  if (!res_)
    {
      bool existed_p = bst_map->put (stmts, NULL);
      gcc_assert (existed_p);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      vect_free_slp_tree (res);
    }
  else
    {
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
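/* Note: seeding bst_map with the stub node before recursing is what
   makes cyclic SLP graphs possible: when discovery reaches the same
   set of stmts again via a PHI backedge it finds the not-yet-filled
   stub in the map and links to it instead of recursing forever.  */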
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation.  Otherwise
   return the filled-in NODE; *MATCHES records which scalar stmts failed
   to match, with MATCHES[0] false signaling a fatal mismatch.  */
1402 static slp_tree
1403 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1404 vec<stmt_vec_info> stmts, unsigned int group_size,
1405 poly_uint64 *max_nunits,
1406 bool *matches, unsigned *npermutes, unsigned *tree_size,
1407 scalar_stmts_to_slp_tree_map_t *bst_map)
1409 unsigned nops, i, this_tree_size = 0;
1410 poly_uint64 this_max_nunits = *max_nunits;
1412 matches[0] = false;
1414 stmt_vec_info stmt_info = stmts[0];
1415 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1416 nops = gimple_call_num_args (stmt);
1417 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1419 nops = gimple_num_ops (stmt) - 1;
1420 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1421 nops++;
1423 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1424 nops = gimple_phi_num_args (phi);
1425 else
1426 return NULL;
1428 /* If the SLP node is a PHI (induction or reduction), terminate
1429 the recursion. */
1430 bool *skip_args = XALLOCAVEC (bool, nops);
1431 memset (skip_args, 0, nops);
1432 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1433 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1435 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1436 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1437 group_size);
1438 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1439 max_nunits))
1440 return NULL;
1442 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1443 /* Induction from different IVs is not supported. */
1444 if (def_type == vect_induction_def)
1446 stmt_vec_info other_info;
1447 FOR_EACH_VEC_ELT (stmts, i, other_info)
1448 if (stmt_info != other_info)
1449 return NULL;
1451 /* Induction PHIs are leafs. */
1452 (*tree_size)++;
1453 node = vect_create_new_slp_node (node, stmts, nops);
1454 SLP_TREE_VECTYPE (node) = vectype;
1455 SLP_TREE_CHILDREN (node).quick_grow_cleared (nops);
1456 return node;
1458 else if (def_type == vect_reduction_def
1459 || def_type == vect_double_reduction_def
1460 || def_type == vect_nested_cycle)
1462 /* Else def types have to match. */
1463 stmt_vec_info other_info;
1464 bool all_same = true;
1465 FOR_EACH_VEC_ELT (stmts, i, other_info)
1467 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1468 return NULL;
1469 if (other_info != stmt_info)
1470 all_same = false;
1472 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1473 /* Reduction initial values are not explicitely represented. */
1474 if (!nested_in_vect_loop_p (loop, stmt_info))
1475 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1476 /* Reduction chain backedge defs are filled manually.
1477 ??? Need a better way to identify a SLP reduction chain PHI.
1478 Or a better overall way to SLP match those. */
1479 if (all_same && def_type == vect_reduction_def)
1480 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1482 else if (def_type != vect_internal_def)
1483 return NULL;
1487 bool two_operators = false;
1488 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1489 tree vectype = NULL_TREE;
1490 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1491 &this_max_nunits, matches, &two_operators,
1492 &vectype))
1493 return NULL;
1495 /* If the SLP node is a load, terminate the recursion unless masked. */
1496 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1497 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1499 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1501 /* Masked load. */
1502 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1503 nops = 1;
1505 else
1507 *max_nunits = this_max_nunits;
1508 (*tree_size)++;
1509 node = vect_create_new_slp_node (node, stmts, 0);
1510 SLP_TREE_VECTYPE (node) = vectype;
1511 /* And compute the load permutation. Whether it is actually
1512 a permutation depends on the unrolling factor which is
1513 decided later. */
1514 vec<unsigned> load_permutation;
1515 int j;
1516 stmt_vec_info load_info;
1517 load_permutation.create (group_size);
1518 stmt_vec_info first_stmt_info
1519 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1520 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1522 int load_place = vect_get_place_in_interleaving_chain
1523 (load_info, first_stmt_info);
1524 gcc_assert (load_place != -1);
1525 load_permutation.safe_push (load_place);
1527 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1528 return node;
1531 else if (gimple_assign_single_p (stmt_info->stmt)
1532 && !gimple_vuse (stmt_info->stmt)
1533 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1535 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1536 the same SSA name vector of a compatible type to vectype. */
1537 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1538 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1539 stmt_vec_info estmt_info;
1540 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1542 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1543 tree bfref = gimple_assign_rhs1 (estmt);
1544 HOST_WIDE_INT lane;
1545 if (!known_eq (bit_field_size (bfref),
1546 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1547 || !constant_multiple_p (bit_field_offset (bfref),
1548 bit_field_size (bfref), &lane))
1550 lperm.release ();
1551 return NULL;
1553 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1555 slp_tree vnode = vect_create_new_slp_node (vNULL);
1556 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1557 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1558 /* We are always building a permutation node even if it is an identity
1559 permute to shield the rest of the vectorizer from the odd node
1560 representing an actual vector without any scalar ops.
1561 ??? We could hide it completely with making the permute node
1562 external? */
1563 node = vect_create_new_slp_node (node, stmts, 1);
1564 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1565 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1566 SLP_TREE_VECTYPE (node) = vectype;
1567 SLP_TREE_CHILDREN (node).quick_push (vnode);
1568 return node;
1571 /* Get at the operands, verifying they are compatible. */
1572 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1573 slp_oprnd_info oprnd_info;
1574 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1576 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1577 stmts, i, &oprnds_info);
1578 if (res != 0)
1579 matches[(res == -1) ? 0 : i] = false;
1580 if (!matches[0])
1581 break;
1583 for (i = 0; i < group_size; ++i)
1584 if (!matches[i])
1586 vect_free_oprnd_info (oprnds_info);
1587 return NULL;
1589 swap = NULL;
1591 auto_vec<slp_tree, 4> children;
1593 stmt_info = stmts[0];
1595 /* Create SLP_TREE nodes for the definition node/s. */
1596 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1598 slp_tree child;
1599 unsigned int j;
1601 /* We're skipping certain operands from processing, for example
1602 outer loop reduction initial defs. */
1603 if (skip_args[i])
1605 children.safe_push (NULL);
1606 continue;
1609 if (oprnd_info->first_dt == vect_uninitialized_def)
1611 /* COND_EXPR have one too many eventually if the condition
1612 is a SSA name. */
1613 gcc_assert (i == 3 && nops == 4);
1614 continue;
1617 if (is_a <bb_vec_info> (vinfo)
1618 && oprnd_info->first_dt == vect_internal_def
1619 && !oprnd_info->any_pattern)
1621 /* For BB vectorization, if all defs are the same do not
1622 bother to continue the build along the single-lane
1623 graph but use a splat of the scalar value. */
1624 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1625 for (j = 1; j < group_size; ++j)
1626 if (oprnd_info->def_stmts[j] != first_def)
1627 break;
1628 if (j == group_size
1629 /* But avoid doing this for loads where we may be
1630 able to CSE things. */
1631 && !gimple_vuse (first_def->stmt))
1633 if (dump_enabled_p ())
1634 dump_printf_loc (MSG_NOTE, vect_location,
1635 "Using a splat of the uniform operand\n");
1636 oprnd_info->first_dt = vect_external_def;
1640 if (oprnd_info->first_dt == vect_external_def
1641 || oprnd_info->first_dt == vect_constant_def)
1643 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1644 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1645 oprnd_info->ops = vNULL;
1646 children.safe_push (invnode);
1647 continue;
1650 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1651 group_size, &this_max_nunits,
1652 matches, npermutes,
1653 &this_tree_size, bst_map)) != NULL)
1655 oprnd_info->def_stmts = vNULL;
1656 children.safe_push (child);
1657 continue;
1660 /* If the SLP build for operand zero failed and operand zero
1661 and one can be commutated try that for the scalar stmts
1662 that failed the match. */
1663 if (i == 0
1664 /* A first scalar stmt mismatch signals a fatal mismatch. */
1665 && matches[0]
1666 /* ??? For COND_EXPRs we can swap the comparison operands
1667 as well as the arms under some constraints. */
1668 && nops == 2
1669 && oprnds_info[1]->first_dt == vect_internal_def
1670 && is_gimple_assign (stmt_info->stmt)
1671 /* Swapping operands for reductions breaks assumptions later on. */
1672 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1673 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1674 /* Do so only if the number of not successful permutes was nor more
1675 than a cut-ff as re-trying the recursive match on
1676 possibly each level of the tree would expose exponential
1677 behavior. */
1678 && *npermutes < 4)
1680 /* See whether we can swap the matching or the non-matching
1681 stmt operands. */
1682 bool swap_not_matching = true;
1685 for (j = 0; j < group_size; ++j)
1687 if (matches[j] != !swap_not_matching)
1688 continue;
1689 stmt_vec_info stmt_info = stmts[j];
1690 /* Verify if we can swap operands of this stmt. */
1691 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1692 if (!stmt
1693 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1695 if (!swap_not_matching)
1696 goto fail;
1697 swap_not_matching = false;
1698 break;
1702 while (j != group_size);
1704 /* Swap mismatched definition stmts. */
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_NOTE, vect_location,
1707 "Re-trying with swapped operands of stmts ");
1708 for (j = 0; j < group_size; ++j)
1709 if (matches[j] == !swap_not_matching)
1711 std::swap (oprnds_info[0]->def_stmts[j],
1712 oprnds_info[1]->def_stmts[j]);
1713 std::swap (oprnds_info[0]->ops[j],
1714 oprnds_info[1]->ops[j]);
1715 if (dump_enabled_p ())
1716 dump_printf (MSG_NOTE, "%d ", j);
1718 if (dump_enabled_p ())
1719 dump_printf (MSG_NOTE, "\n");
1720 /* And try again with scratch 'matches' ... */
1721 bool *tem = XALLOCAVEC (bool, group_size);
1722 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1723 group_size, &this_max_nunits,
1724 tem, npermutes,
1725 &this_tree_size, bst_map)) != NULL)
1727 oprnd_info->def_stmts = vNULL;
1728 children.safe_push (child);
1729 continue;
1731 /* We do not undo the swapping here since it might still be
1732 the better order for the second operand in case we build
1733 the first one from scalars below. */
1734 ++*npermutes;
1736 fail:
1738 /* If the SLP build failed and we analyze a basic-block
1739 simply treat nodes we fail to build as externally defined
1740 (and thus build vectors from the scalar defs).
1741 The cost model will reject outright expensive cases.
1742 ??? This doesn't treat cases where permutation ultimatively
1743 fails (or we don't try permutation below). Ideally we'd
1744 even compute a permutation that will end up with the maximum
1745 SLP tree size... */
1746 if (is_a <bb_vec_info> (vinfo)
1747 /* ??? Rejecting patterns this way doesn't work. We'd have to
1748 do extra work to cancel the pattern so the uses see the
1749 scalar version. */
1750 && !is_pattern_stmt_p (stmt_info)
1751 && !oprnd_info->any_pattern)
1753 /* But if there's a leading vector sized set of matching stmts
1754 fail here so we can split the group. This matches the condition
1755 vect_analyze_slp_instance uses. */
1756 /* ??? We might want to split here and combine the results to support
1757 multiple vector sizes better. */
1758 for (j = 0; j < group_size; ++j)
1759 if (!matches[j])
1760 break;
1761 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1763 if (dump_enabled_p ())
1764 dump_printf_loc (MSG_NOTE, vect_location,
1765 "Building vector operands from scalars\n");
1766 this_tree_size++;
1767 child = vect_create_new_slp_node (oprnd_info->ops);
1768 children.safe_push (child);
1769 oprnd_info->ops = vNULL;
1770 continue;
1774 gcc_assert (child == NULL);
1775 FOR_EACH_VEC_ELT (children, j, child)
1776 if (child)
1777 vect_free_slp_tree (child);
1778 vect_free_oprnd_info (oprnds_info);
1779 return NULL;
1782 vect_free_oprnd_info (oprnds_info);
1784 /* If we have all children of a child built up from uniform scalars
1785 or does more than one possibly expensive vector construction then
1786 just throw that away, causing it built up from scalars.
1787 The exception is the SLP node for the vector store. */
1788 if (is_a <bb_vec_info> (vinfo)
1789 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1790 /* ??? Rejecting patterns this way doesn't work. We'd have to
1791 do extra work to cancel the pattern so the uses see the
1792 scalar version. */
1793 && !is_pattern_stmt_p (stmt_info))
1795 slp_tree child;
1796 unsigned j;
1797 bool all_uniform_p = true;
1798 unsigned n_vector_builds = 0;
1799 FOR_EACH_VEC_ELT (children, j, child)
1801 if (!child)
1803 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1804 all_uniform_p = false;
1805 else if (!vect_slp_tree_uniform_p (child))
1807 all_uniform_p = false;
1808 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1809 n_vector_builds++;
1812 if (all_uniform_p || n_vector_builds > 1)
1814 /* Roll back. */
1815 matches[0] = false;
1816 FOR_EACH_VEC_ELT (children, j, child)
1817 if (child)
1818 vect_free_slp_tree (child);
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE, vect_location,
1822 "Building parent vector operands from "
1823 "scalars instead\n");
1824 return NULL;
1828 *tree_size += this_tree_size + 1;
1829 *max_nunits = this_max_nunits;
1831 if (two_operators)
1833 /* ??? We'd likely want to either cache in bst_map sth like
1834 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1835 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1836 explicit stmts to put in so the keying on 'stmts' doesn't
1837 work (but we have the same issue with nodes that use 'ops'). */
1838 slp_tree one = new _slp_tree;
1839 slp_tree two = new _slp_tree;
1840 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1841 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1842 SLP_TREE_VECTYPE (one) = vectype;
1843 SLP_TREE_VECTYPE (two) = vectype;
1844 SLP_TREE_CHILDREN (one).safe_splice (children);
1845 SLP_TREE_CHILDREN (two).safe_splice (children);
1846 slp_tree child;
1847 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1848 SLP_TREE_REF_COUNT (child)++;
1850 /* Here we record the original defs since this
1851 node represents the final lane configuration. */
1852 node = vect_create_new_slp_node (node, stmts, 2);
1853 SLP_TREE_VECTYPE (node) = vectype;
1854 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1855 SLP_TREE_CHILDREN (node).quick_push (one);
1856 SLP_TREE_CHILDREN (node).quick_push (two);
1857 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1858 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1859 enum tree_code ocode = ERROR_MARK;
1860 stmt_vec_info ostmt_info;
1861 unsigned j = 0;
1862 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1864 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1865 if (gimple_assign_rhs_code (ostmt) != code0)
1867 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1868 ocode = gimple_assign_rhs_code (ostmt);
1869 j = i;
1871 else
1872 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1874 SLP_TREE_CODE (one) = code0;
1875 SLP_TREE_CODE (two) = ocode;
1876 SLP_TREE_LANES (one) = stmts.length ();
1877 SLP_TREE_LANES (two) = stmts.length ();
1878 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1879 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1880 return node;
1883 node = vect_create_new_slp_node (node, stmts, nops);
1884 SLP_TREE_VECTYPE (node) = vectype;
1885 SLP_TREE_CHILDREN (node).splice (children);
1886 return node;
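/* Worked example for the two-operator handling above (an illustration,
   not taken from the sources): for the lane stmts
     x0 = a0 + b0;  x1 = a1 - b1;  x2 = a2 + b2;  x3 = a3 - b3;
   node ONE computes PLUS_EXPR on all lanes, node TWO computes
   MINUS_EXPR on all lanes, and the VEC_PERM_EXPR parent blends them
   with the lane permutation { (0,0), (1,1), (0,2), (1,3) }, taking
   even lanes from ONE and odd lanes from TWO.  */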
1889 /* Dump a single SLP tree NODE. */
1891 static void
1892 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1893 slp_tree node)
1895 unsigned i, j;
1896 slp_tree child;
1897 stmt_vec_info stmt_info;
1898 tree op;
1900 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1901 dump_user_location_t user_loc = loc.get_user_location ();
1902 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1903 SLP_TREE_DEF_TYPE (node) == vect_external_def
1904 ? " (external)"
1905 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1906 ? " (constant)"
1907 : ""), node,
1908 estimated_poly_value (node->max_nunits),
1909 SLP_TREE_REF_COUNT (node));
1910 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1911 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1912 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1913 else
1915 dump_printf_loc (metadata, user_loc, "\t{ ");
1916 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1917 dump_printf (metadata, "%T%s ", op,
1918 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1919 dump_printf (metadata, "}\n");
1921 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1923 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1924 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1925 dump_printf (dump_kind, " %u", j);
1926 dump_printf (dump_kind, " }\n");
1928 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1930 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1931 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1932 dump_printf (dump_kind, " %u[%u]",
1933 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1934 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1935 dump_printf (dump_kind, " }\n");
1937 if (SLP_TREE_CHILDREN (node).is_empty ())
1938 return;
1939 dump_printf_loc (metadata, user_loc, "\tchildren");
1940 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1941 dump_printf (dump_kind, " %p", (void *)child);
1942 dump_printf (dump_kind, "\n");
1945 DEBUG_FUNCTION void
1946 debug (slp_tree node)
1948 debug_dump_context ctx;
1949 vect_print_slp_tree (MSG_NOTE,
1950 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1951 node);
1954 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1956 static void
1957 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1958 slp_tree node, hash_set<slp_tree> &visited)
1960 unsigned i;
1961 slp_tree child;
1963 if (visited.add (node))
1964 return;
1966 vect_print_slp_tree (dump_kind, loc, node);
1968 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1969 if (child)
1970 vect_print_slp_graph (dump_kind, loc, child, visited);
1973 static void
1974 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1975 slp_tree entry)
1977 hash_set<slp_tree> visited;
1978 vect_print_slp_graph (dump_kind, loc, entry, visited);
1981 /* Mark the tree rooted at NODE with PURE_SLP. */
1983 static void
1984 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1986 int i;
1987 stmt_vec_info stmt_info;
1988 slp_tree child;
1990 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1991 return;
1993 if (visited.add (node))
1994 return;
1996 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1997 STMT_SLP_TYPE (stmt_info) = pure_slp;
1999 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2000 if (child)
2001 vect_mark_slp_stmts (child, visited);
2004 static void
2005 vect_mark_slp_stmts (slp_tree node)
2007 hash_set<slp_tree> visited;
2008 vect_mark_slp_stmts (node, visited);
2011 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2013 static void
2014 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2016 int i;
2017 stmt_vec_info stmt_info;
2018 slp_tree child;
2020 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2021 return;
2023 if (visited.add (node))
2024 return;
2026 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2028 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2029 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2030 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2033 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2034 if (child)
2035 vect_mark_slp_stmts_relevant (child, visited);
2038 static void
2039 vect_mark_slp_stmts_relevant (slp_tree node)
2041 hash_set<slp_tree> visited;
2042 vect_mark_slp_stmts_relevant (node, visited);
2046 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2048 static void
2049 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2050 hash_set<slp_tree> &visited)
2052 if (!node || visited.add (node))
2053 return;
2055 if (SLP_TREE_CHILDREN (node).length () == 0)
2057 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2058 return;
2059 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2060 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2061 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2062 loads.safe_push (node);
2064 else
2066 unsigned i;
2067 slp_tree child;
2068 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2069 vect_gather_slp_loads (loads, child, visited);
2073 static void
2074 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2076 hash_set<slp_tree> visited;
2077 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2081 /* Find the last scalar statement in NODE. */
2083 stmt_vec_info
2084 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2086 stmt_vec_info last = NULL;
2087 stmt_vec_info stmt_vinfo;
2089 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2091 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2092 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2095 return last;
2098 /* Find the first scalar stmt in NODE. */
2100 stmt_vec_info
2101 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2103 stmt_vec_info first = NULL;
2104 stmt_vec_info stmt_vinfo;
2106 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2108 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2109 if (!first
2110 || get_later_stmt (stmt_vinfo, first) == first)
2111 first = stmt_vinfo;
2114 return first;
2117 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2118 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2119 (also containing the first GROUP1_SIZE stmts, since stores are
2120 consecutive), the second containing the remainder.
2121 Return the first stmt in the second group. */
2123 static stmt_vec_info
2124 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2126 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2127 gcc_assert (group1_size > 0);
2128 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2129 gcc_assert (group2_size > 0);
2130 DR_GROUP_SIZE (first_vinfo) = group1_size;
2132 stmt_vec_info stmt_info = first_vinfo;
2133 for (unsigned i = group1_size; i > 1; i--)
2135 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2136 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2138 /* STMT is now the last element of the first group. */
2139 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2140 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2142 DR_GROUP_SIZE (group2) = group2_size;
2143 for (stmt_info = group2; stmt_info;
2144 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2146 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2147 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2150 /* For the second group, the DR_GROUP_GAP is that before the original group,
2151 plus skipping over the first group. */
2152 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2154 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2155 DR_GROUP_GAP (first_vinfo) += group2_size;
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2159 group1_size, group2_size);
2161 return group2;
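/* Illustration with hypothetical sizes: splitting a store group
   a[0..7] with DR_GROUP_SIZE 8 at GROUP1_SIZE 4 yields a first group
   a[0..3] whose DR_GROUP_GAP is increased by 4 so it skips over the
   second group, and a second group a[4..7] whose DR_GROUP_GAP in turn
   skips over the first group (plus any original gap).  */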
2164 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2165 statements and a vector of NUNITS elements. */
2167 static poly_uint64
2168 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2170 return exact_div (common_multiple (nunits, group_size), group_size);
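/* For example (numbers for illustration only): with NUNITS 4 and
   GROUP_SIZE 6 the common multiple is 12 and the unrolling factor is
   12 / 6 == 2, i.e. two scalar iterations are needed to fill whole
   vectors; with GROUP_SIZE 2 the factor is likewise
   common_multiple (4, 2) / 2 == 2.  */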
2173 enum slp_instance_kind {
2174 slp_inst_kind_store,
2175 slp_inst_kind_reduc_group,
2176 slp_inst_kind_reduc_chain,
2177 slp_inst_kind_ctor
2180 static bool
2181 vect_analyze_slp_instance (vec_info *vinfo,
2182 scalar_stmts_to_slp_tree_map_t *bst_map,
2183 stmt_vec_info stmt_info, unsigned max_tree_size);
2185 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
2186 of kind KIND. Return true if successful. */
2188 static bool
2189 vect_build_slp_instance (vec_info *vinfo,
2190 slp_instance_kind kind,
2191 vec<stmt_vec_info> scalar_stmts,
2192 stmt_vec_info root_stmt_info,
2193 unsigned max_tree_size,
2194 scalar_stmts_to_slp_tree_map_t *bst_map,
2195 /* ??? We need stmt_info for group splitting. */
2196 stmt_vec_info stmt_info_)
2198 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "Starting SLP discovery for\n");
2202 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2203 dump_printf_loc (MSG_NOTE, vect_location,
2204 " %G", scalar_stmts[i]->stmt);
2207 /* Build the tree for the SLP instance. */
2208 unsigned int group_size = scalar_stmts.length ();
2209 bool *matches = XALLOCAVEC (bool, group_size);
2210 unsigned npermutes = 0;
2211 poly_uint64 max_nunits = 1;
2212 unsigned tree_size = 0;
2213 unsigned i;
2214 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2215 &max_nunits, matches, &npermutes,
2216 &tree_size, bst_map);
2217 if (node != NULL)
2219 /* Calculate the unrolling factor based on the smallest type. */
2220 poly_uint64 unrolling_factor
2221 = calculate_unrolling_factor (max_nunits, group_size);
2223 if (maybe_ne (unrolling_factor, 1U)
2224 && is_a <bb_vec_info> (vinfo))
2226 unsigned HOST_WIDE_INT const_max_nunits;
2227 if (!max_nunits.is_constant (&const_max_nunits)
2228 || const_max_nunits > group_size)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "Build SLP failed: store group "
2233 "size not a multiple of the vector size "
2234 "in basic block SLP\n");
2235 vect_free_slp_tree (node);
2236 return false;
2238 /* Fatal mismatch. */
2239 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_NOTE, vect_location,
2241 "SLP discovery succeeded but node needs "
2242 "splitting\n");
2243 memset (matches, true, group_size);
2244 matches[group_size / const_max_nunits * const_max_nunits] = false;
2245 vect_free_slp_tree (node);
2247 else
2249 /* Create a new SLP instance. */
2250 slp_instance new_instance = XNEW (class _slp_instance);
2251 SLP_INSTANCE_TREE (new_instance) = node;
2252 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2253 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2254 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2255 new_instance->reduc_phis = NULL;
2256 new_instance->cost_vec = vNULL;
2257 new_instance->subgraph_entries = vNULL;
2259 vect_gather_slp_loads (new_instance, node);
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "SLP size %u vs. limit %u.\n",
2263 tree_size, max_tree_size);
2265 /* Check whether any load is possibly permuted. */
2266 slp_tree load_node;
2267 bool loads_permuted = false;
2268 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2270 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2271 continue;
2272 unsigned j;
2273 stmt_vec_info load_info;
2274 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2275 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2277 loads_permuted = true;
2278 break;
2282 /* If the loads and stores can be handled with load/store-lane
2283 instructions, do not generate this SLP instance. */
2284 if (is_a <loop_vec_info> (vinfo)
2285 && loads_permuted
2286 && kind == slp_inst_kind_store
2287 && vect_store_lanes_supported
2288 (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false))
2290 slp_tree load_node;
2291 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2293 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2294 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2295 /* Use SLP for strided accesses (or if we can't load-lanes). */
2296 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2297 || ! vect_load_lanes_supported
2298 (STMT_VINFO_VECTYPE (stmt_vinfo),
2299 DR_GROUP_SIZE (stmt_vinfo), false))
2300 break;
2302 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "Built SLP cancelled: can use "
2307 "load/store-lanes\n");
2308 vect_free_slp_instance (new_instance);
2309 return false;
2313 /* Fixup SLP reduction chains. */
2314 if (kind == slp_inst_kind_reduc_chain)
2316 /* If this is a reduction chain with a conversion in front,
2317 amend the SLP tree with a node for that conversion. */
2318 gimple *scalar_def
2319 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2320 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2322 /* Get at the conversion stmt - we know it's the single use
2323 of the last stmt of the reduction chain. */
2324 use_operand_p use_p;
2325 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2326 &use_p, &scalar_def);
2327 gcc_assert (r);
2328 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2329 next_info = vect_stmt_to_vectorize (next_info);
2330 scalar_stmts = vNULL;
2331 scalar_stmts.create (group_size);
2332 for (unsigned i = 0; i < group_size; ++i)
2333 scalar_stmts.quick_push (next_info);
2334 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2335 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2336 SLP_TREE_CHILDREN (conv).quick_push (node);
2337 SLP_INSTANCE_TREE (new_instance) = conv;
2338 /* We also have to fake this conversion stmt as SLP reduction
2339 group so we don't have to mess with too much code
2340 elsewhere. */
2341 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2342 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2344 /* Fill the backedge child of the PHI SLP node. The
2345 general matching code cannot find it because the
2346 scalar code does not reflect how we vectorize the
2347 reduction. */
2348 use_operand_p use_p;
2349 imm_use_iterator imm_iter;
2350 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2351 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2352 gimple_get_lhs (scalar_def))
2353 /* There are exactly two non-debug uses, the reduction
2354 PHI and the loop-closed PHI node. */
2355 if (!is_gimple_debug (USE_STMT (use_p))
2356 && gimple_bb (USE_STMT (use_p)) == loop->header)
2358 auto_vec<stmt_vec_info, 64> phis (group_size);
2359 stmt_vec_info phi_info
2360 = vinfo->lookup_stmt (USE_STMT (use_p));
2361 for (unsigned i = 0; i < group_size; ++i)
2362 phis.quick_push (phi_info);
2363 slp_tree *phi_node = bst_map->get (phis);
2364 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2365 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2366 = SLP_INSTANCE_TREE (new_instance);
2367 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2371 vinfo->slp_instances.safe_push (new_instance);
2373 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2374 the number of scalar stmts in the root in a few places.
2375 Verify that assumption holds. */
2376 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2377 .length () == group_size);
2379 if (dump_enabled_p ())
2381 dump_printf_loc (MSG_NOTE, vect_location,
2382 "Final SLP tree for instance:\n");
2383 vect_print_slp_graph (MSG_NOTE, vect_location,
2384 SLP_INSTANCE_TREE (new_instance));
2387 return true;
2390 else
2392 /* Failed to SLP. */
2393 /* Free the allocated memory. */
2394 scalar_stmts.release ();
2397 stmt_vec_info stmt_info = stmt_info_;
2398 /* Try to break the group up into pieces. */
2399 if (kind == slp_inst_kind_store)
2401 /* ??? We could delay all the actual splitting of store-groups
2402 until after SLP discovery of the original group completed.
2403 Then we can recurse to vect_build_slp_instance directly. */
2404 for (i = 0; i < group_size; i++)
2405 if (!matches[i])
2406 break;
2408 /* For basic block SLP, try to break the group up into multiples of
2409 a vector size. */
2410 if (is_a <bb_vec_info> (vinfo)
2411 && (i > 1 && i < group_size))
2413 tree scalar_type
2414 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2415 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2416 1 << floor_log2 (i));
2417 unsigned HOST_WIDE_INT const_nunits;
2418 if (vectype
2419 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2421 /* Split into two groups at the first vector boundary. */
2422 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2423 unsigned group1_size = i & ~(const_nunits - 1);
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_NOTE, vect_location,
2427 "Splitting SLP group at stmt %u\n", i);
2428 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2429 group1_size);
2430 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2431 max_tree_size);
2432 /* Split the rest at the failure point and possibly
2433 re-analyze the remaining matching part if it has
2434 at least two lanes. */
2435 if (group1_size < i
2436 && (i + 1 < group_size
2437 || i - group1_size > 1))
2439 stmt_vec_info rest2 = rest;
2440 rest = vect_split_slp_store_group (rest, i - group1_size);
2441 if (i - group1_size > 1)
2442 res |= vect_analyze_slp_instance (vinfo, bst_map,
2443 rest2, max_tree_size);
2445 /* Re-analyze the non-matching tail if it has at least
2446 two lanes. */
2447 if (i + 1 < group_size)
2448 res |= vect_analyze_slp_instance (vinfo, bst_map,
2449 rest, max_tree_size);
2450 return res;
2454 /* For loop vectorization split into arbitrary pieces of size > 1. */
2455 if (is_a <loop_vec_info> (vinfo)
2456 && (i > 1 && i < group_size))
2458 unsigned group1_size = i;
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "Splitting SLP group at stmt %u\n", i);
2464 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2465 group1_size);
2466 /* Loop vectorization cannot handle gaps in stores, make sure
2467 the split group appears as strided. */
2468 STMT_VINFO_STRIDED_P (rest) = 1;
2469 DR_GROUP_GAP (rest) = 0;
2470 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2471 DR_GROUP_GAP (stmt_info) = 0;
2473 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2474 max_tree_size);
2475 if (i + 1 < group_size)
2476 res |= vect_analyze_slp_instance (vinfo, bst_map,
2477 rest, max_tree_size);
2479 return res;
2482 /* Even though the first vector did not all match, we might be able to SLP
2483 (some) of the remainder. FORNOW ignore this possibility. */
2486 /* Failed to SLP. */
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2489 return false;
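/* A sketch of the splitting above, with hypothetical sizes: for a
   basic-block store group of 7 stmts where MATCHES first fails at
   i == 6 and the vector type has 4 constant subparts, GROUP1_SIZE
   becomes 6 & ~3 == 4, so a[0..3] is split off and re-analyzed while
   the a[4..6] remainder is split once more at the failure point.
   For loops the split is simply at i, with both halves marked
   strided because loop vectorization cannot handle gaps in stores.  */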
2493 /* Analyze an SLP instance starting from a group of grouped stores. Call
2494 vect_build_slp_tree to build a tree of packed stmts if possible.
2495 Return FALSE if it's impossible to SLP any stmt in the loop. */
2497 static bool
2498 vect_analyze_slp_instance (vec_info *vinfo,
2499 scalar_stmts_to_slp_tree_map_t *bst_map,
2500 stmt_vec_info stmt_info, unsigned max_tree_size)
2502 unsigned int group_size;
2503 unsigned int i;
2504 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2505 vec<stmt_vec_info> scalar_stmts;
2506 slp_instance_kind kind;
2508 if (is_a <bb_vec_info> (vinfo))
2509 vect_location = stmt_info->stmt;
2510 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2512 kind = slp_inst_kind_store;
2513 group_size = DR_GROUP_SIZE (stmt_info);
2515 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2517 kind = slp_inst_kind_reduc_chain;
2518 gcc_assert (is_a <loop_vec_info> (vinfo));
2519 group_size = REDUC_GROUP_SIZE (stmt_info);
2521 else if (is_gimple_assign (stmt_info->stmt)
2522 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2524 kind = slp_inst_kind_ctor;
2525 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2527 else
2529 kind = slp_inst_kind_reduc_group;
2530 gcc_assert (is_a <loop_vec_info> (vinfo));
2531 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2534 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2535 scalar_stmts.create (group_size);
2536 stmt_vec_info next_info = stmt_info;
2537 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2539 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2540 while (next_info)
2542 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2543 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2546 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2548 /* Collect the reduction stmts and store them in
2549 SLP_TREE_SCALAR_STMTS. */
2550 while (next_info)
2552 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2553 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2555 /* Mark the first element of the reduction chain as reduction to properly
2556 transform the node. In the reduction analysis phase only the last
2557 element of the chain is marked as reduction. */
2558 STMT_VINFO_DEF_TYPE (stmt_info)
2559 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2560 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2561 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2563 else if (kind == slp_inst_kind_ctor)
2565 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2566 tree val;
2567 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2569 if (TREE_CODE (val) == SSA_NAME)
2571 gimple* def = SSA_NAME_DEF_STMT (val);
2572 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2573 /* Value is defined in another basic block. */
2574 if (!def_info)
2575 return false;
2576 def_info = vect_stmt_to_vectorize (def_info);
2577 scalar_stmts.safe_push (def_info);
2579 else
2580 return false;
2582 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_NOTE, vect_location,
2584 "Analyzing vectorizable constructor: %G\n",
2585 stmt_info->stmt);
2587 else
2589 /* Collect reduction statements. */
2590 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2591 for (i = 0; reductions.iterate (i, &next_info); i++)
2592 scalar_stmts.safe_push (next_info);
2595 /* Build the tree for the SLP instance. */
2596 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2597 kind == slp_inst_kind_ctor
2598 ? stmt_info : NULL,
2599 max_tree_size,
2600 bst_map, stmt_info);
2602 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2603 where we should do store group splitting. */
2605 return res;
2608 /* Check if there are stmts in the loop that can be vectorized using SLP.
2609 Build SLP trees of packed scalar stmts if SLP is possible. */
2611 opt_result
2612 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2614 unsigned int i;
2615 stmt_vec_info first_element;
2617 DUMP_VECT_SCOPE ("vect_analyze_slp");
2619 scalar_stmts_to_slp_tree_map_t *bst_map
2620 = new scalar_stmts_to_slp_tree_map_t ();
2622 /* Find SLP sequences starting from groups of grouped stores. */
2623 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2624 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2626 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2628 if (loop_vinfo->reduction_chains.length () > 0)
2630 /* Find SLP sequences starting from reduction chains. */
2631 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2632 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2633 max_tree_size))
2635 /* Dissolve reduction chain group. */
2636 stmt_vec_info vinfo = first_element;
2637 stmt_vec_info last = NULL;
2638 while (vinfo)
2640 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2641 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2642 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2643 last = vinfo;
2644 vinfo = next;
2646 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2647 /* It can be still vectorized as part of an SLP reduction. */
2648 loop_vinfo->reductions.safe_push (last);
2652 /* Find SLP sequences starting from groups of reductions. */
2653 if (loop_vinfo->reductions.length () > 1)
2654 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2655 max_tree_size);
2658 /* The map keeps a reference on SLP nodes built, release that. */
2659 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2660 it != bst_map->end (); ++it)
2661 if ((*it).second)
2662 vect_free_slp_tree ((*it).second);
2663 delete bst_map;
2665 return opt_result::success ();
2668 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2670 static void
2671 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2672 vec<slp_tree> &vertices, vec<int> &leafs)
2674 unsigned i;
2675 slp_tree child;
2677 if (visited.add (node))
2678 return;
2680 node->vertex = vertices.length ();
2681 vertices.safe_push (node);
2683 bool leaf = true;
2684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2685 if (child)
2687 leaf = false;
2688 vect_slp_build_vertices (visited, child, vertices, leafs);
2690 if (leaf)
2691 leafs.safe_push (node->vertex);
2694 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2696 static void
2697 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2698 vec<int> &leafs)
2700 hash_set<slp_tree> visited;
2701 unsigned i;
2702 slp_instance instance;
2703 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2704 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2705 leafs);
2708 /* Apply the bijective permutation PERM (or its reverse, if REVERSE) to VEC. */
2710 template <class T>
2711 static void
2712 vect_slp_permute (vec<unsigned> perm,
2713 vec<T> &vec, bool reverse)
2715 auto_vec<T, 64> saved;
2716 saved.create (vec.length ());
2717 for (unsigned i = 0; i < vec.length (); ++i)
2718 saved.quick_push (vec[i]);
2720 if (reverse)
2722 for (unsigned i = 0; i < vec.length (); ++i)
2723 vec[perm[i]] = saved[i];
2724 for (unsigned i = 0; i < vec.length (); ++i)
2725 gcc_assert (vec[perm[i]] == saved[i]);
2727 else
2729 for (unsigned i = 0; i < vec.length (); ++i)
2730 vec[i] = saved[perm[i]];
2731 for (unsigned i = 0; i < vec.length (); ++i)
2732 gcc_assert (vec[i] == saved[perm[i]]);
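/* A minimal sketch of the two directions above, assuming PERM = { 2, 0, 1 }
   and VEC = { a, b, c } (illustrative values only):

     vec[i] = saved[perm[i]];       forward:  VEC becomes { c, a, b }
     vec[perm[i]] = saved[i];       reverse:  VEC becomes { b, c, a }

   Applying one direction after the other restores the original order,
   which the materialization code below relies on.  */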
2736 /* Return whether permutations PERM_A and PERM_B as recorded in the
2737 PERMS vector are equal. */
2739 static bool
2740 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2741 int perm_a, int perm_b)
2743 return (perm_a == perm_b
2744 || (perms[perm_a].length () == perms[perm_b].length ()
2745 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2746 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2749 /* Optimize the SLP graph of VINFO. */
2751 void
2752 vect_optimize_slp (vec_info *vinfo)
2754 if (vinfo->slp_instances.is_empty ())
2755 return;
2757 slp_tree node;
2758 unsigned i;
2759 auto_vec<slp_tree> vertices;
2760 auto_vec<int> leafs;
2761 vect_slp_build_vertices (vinfo, vertices, leafs);
2763 struct graph *slpg = new_graph (vertices.length ());
2764 FOR_EACH_VEC_ELT (vertices, i, node)
2766 unsigned j;
2767 slp_tree child;
2768 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2769 if (child)
2770 add_edge (slpg, i, child->vertex);
2773 /* Compute (reverse) postorder on the inverted graph. */
2774 auto_vec<int> ipo;
2775 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2777 auto_sbitmap n_visited (vertices.length ());
2778 auto_sbitmap n_materialize (vertices.length ());
2779 auto_vec<int> n_perm (vertices.length ());
2780 auto_vec<vec<unsigned> > perms;
2782 bitmap_clear (n_visited);
2783 bitmap_clear (n_materialize);
2784 n_perm.quick_grow_cleared (vertices.length ());
2785 perms.safe_push (vNULL); /* zero is no permute */
2787 /* Produce initial permutations. */
2788 for (i = 0; i < leafs.length (); ++i)
2790 int idx = leafs[i];
2791 slp_tree node = vertices[idx];
2793 /* Handle externals and constants optimistically throughout the
2794 iteration. */
2795 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2796 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2797 continue;
2799 /* Loads are the only thing generating permutes and leafs do not
2800 change across iterations. */
2801 bitmap_set_bit (n_visited, idx);
2802 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2803 continue;
2805 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2806 node unpermuted, record this permute. */
2807 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2808 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2809 continue;
2810 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2811 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2812 bool any_permute = false;
2813 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2815 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2816 imin = MIN (imin, idx);
2817 imax = MAX (imax, idx);
2818 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2819 any_permute = true;
2821 /* If there's no permute no need to split one out. */
2822 if (!any_permute)
2823 continue;
2824 /* If the span doesn't match we'd disrupt VF computation, avoid
2825 that for now. */
2826 if (imax - imin + 1 != SLP_TREE_LANES (node))
2827 continue;
2829 /* For now only handle true permutes, like
2830 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2831 when permuting constants and invariants, keeping the permute
2832 bijective. */
2833 auto_sbitmap load_index (SLP_TREE_LANES (node));
2834 bitmap_clear (load_index);
2835 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2836 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2837 unsigned j;
2838 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2839 if (!bitmap_bit_p (load_index, j))
2840 break;
2841 if (j != SLP_TREE_LANES (node))
2842 continue;
2844 vec<unsigned> perm = vNULL;
2845 perm.safe_grow (SLP_TREE_LANES (node), true);
2846 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2847 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2848 perms.safe_push (perm);
2849 n_perm[idx] = perms.length () - 1;
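/* Seeding example (hypothetical lanes): a load node reading
   b[4], b[5], b[6], b[7] out of a larger group has imin == 4 and no
   permute, so nothing is recorded; one reading b[5], b[4], b[7], b[6]
   records the bijective permute { 1, 0, 3, 2 } after subtracting
   imin.  */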
2852 /* Propagate permutes along the graph and compute materialization points. */
2853 bool changed;
2854 unsigned iteration = 0;
2857 changed = false;
2858 ++iteration;
2860 for (i = vertices.length (); i > 0 ; --i)
2862 int idx = ipo[i-1];
2863 slp_tree node = vertices[idx];
2864 /* For leafs there's nothing to do - we've seeded permutes
2865 on those above. */
2866 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2867 continue;
2869 bitmap_set_bit (n_visited, idx);
2871 /* We cannot move a permute across a store. */
2872 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2873 && DR_IS_WRITE
2874 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2875 continue;
2877 int perm = -1;
2878 for (graph_edge *succ = slpg->vertices[idx].succ;
2879 succ; succ = succ->succ_next)
2881 int succ_idx = succ->dest;
2882 /* Handle unvisited nodes optimistically. */
2883 /* ??? But for constants once we want to handle non-bijective
2884 permutes we have to verify the permute, when unifying lanes,
2885 will not unify different constants. For example see
2886 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2887 if (!bitmap_bit_p (n_visited, succ_idx))
2888 continue;
2889 int succ_perm = n_perm[succ_idx];
2890 /* Once we materialize a successor's permutation its output lanes
2891 appear unpermuted to us. */
2892 if (bitmap_bit_p (n_materialize, succ_idx))
2893 succ_perm = 0;
2894 if (perm == -1)
2895 perm = succ_perm;
2896 else if (succ_perm == 0)
2898 perm = 0;
2899 break;
2901 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2903 perm = 0;
2904 break;
2908 if (perm == -1)
2909 /* Pick up pre-computed leaf values. */
2910 perm = n_perm[idx];
2911 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2913 if (iteration > 1)
2914 /* Make sure we eventually converge. */
2915 gcc_checking_assert (perm == 0);
2916 n_perm[idx] = perm;
2917 if (perm == 0)
2918 bitmap_clear_bit (n_materialize, idx);
2919 changed = true;
2922 if (perm == 0)
2923 continue;
2925 /* Elide pruning at materialization points in the first
2926 iteration so every node was visited once at least. */
2927 if (iteration == 1)
2928 continue;
2930 /* Decide on permute materialization. Look whether there's
2931 a use (pred) edge that is permuted differently from us.
2932 In that case mark ourselves so the permutation is applied. */
2933 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2934 for (graph_edge *pred = slpg->vertices[idx].pred;
2935 pred; pred = pred->pred_next)
2937 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2938 int pred_perm = n_perm[pred->src];
2939 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2941 all_preds_permuted = false;
2942 break;
2945 if (!all_preds_permuted)
2947 if (!bitmap_bit_p (n_materialize, idx))
2948 changed = true;
2949 bitmap_set_bit (n_materialize, idx);
2953 while (changed || iteration == 1);
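/* Note on convergence (informal reasoning, not from the sources):
   after the first iteration a node's permutation can only ever change
   to zero (no permutation), which the gcc_checking_assert above
   enforces, so the propagation is guaranteed to terminate.  */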
2955 /* Materialize. */
2956 for (i = 0; i < vertices.length (); ++i)
2958 int perm = n_perm[i];
2959 if (perm <= 0)
2960 continue;
2962 slp_tree node = vertices[i];
2964 /* First permute invariant/external original successors. */
2965 unsigned j;
2966 slp_tree child;
2967 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2969 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2970 continue;
2972 /* If the vector is uniform there's nothing to do. */
2973 if (vect_slp_tree_uniform_p (child))
2974 continue;
2976 /* We can end up sharing some externals via two_operator
2977 handling. Be prepared to unshare those. */
2978 if (child->refcnt != 1)
2980 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2981 SLP_TREE_CHILDREN (node)[j] = child
2982 = vect_create_new_slp_node
2983 (SLP_TREE_SCALAR_OPS (child).copy ());
2985 vect_slp_permute (perms[perm],
2986 SLP_TREE_SCALAR_OPS (child), true);
2989 if (bitmap_bit_p (n_materialize, i))
2991 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2992 /* For loads simply drop the permutation; the load permutation
2993 already performs the desired permutation. */
2995 else
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_NOTE, vect_location,
2999 "inserting permute node in place of %p\n",
3000 node);
3002 /* Make a copy of NODE and in-place change it to a
3003 VEC_PERM node to permute the lanes of the copy. */
3004 slp_tree copy = new _slp_tree;
3005 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3006 SLP_TREE_CHILDREN (node) = vNULL;
3007 SLP_TREE_SCALAR_STMTS (copy)
3008 = SLP_TREE_SCALAR_STMTS (node).copy ();
3009 vect_slp_permute (perms[perm],
3010 SLP_TREE_SCALAR_STMTS (copy), true);
3011 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3012 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3013 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3014 SLP_TREE_LANE_PERMUTATION (copy)
3015 = SLP_TREE_LANE_PERMUTATION (node);
3016 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3017 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3018 copy->refcnt = 1;
3019 copy->max_nunits = node->max_nunits;
3020 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3021 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3022 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3024 /* Now turn NODE into a VEC_PERM. */
3025 SLP_TREE_CHILDREN (node).safe_push (copy);
3026 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3027 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3028 SLP_TREE_LANE_PERMUTATION (node)
3029 .quick_push (std::make_pair (0, perms[perm][j]));
3030 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3033 else
3035 /* Apply the reverse permutation to our stmts. */
3036 vect_slp_permute (perms[perm],
3037 SLP_TREE_SCALAR_STMTS (node), true);
3038 /* And to the load permutation, which we can simply
3039 make regular by design. */
3040 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3042 /* ??? When we handle non-bijective permutes the idea
3043 is that we can force the load-permutation to be
3044 { min, min + 1, min + 2, ... max }. But then the
3045 scalar defs might no longer match the lane content
3046 which means wrong-code with live lane vectorization.
3047 So we possibly have to have NULL entries for those. */
3048 vect_slp_permute (perms[perm],
3049 SLP_TREE_LOAD_PERMUTATION (node), true);
3054 /* Free the perms vector used for propagation. */
3055 while (!perms.is_empty ())
3056 perms.pop ().release ();
3057 free_graph (slpg);
3060 /* Now elide load permutations that are not necessary. */
3061 for (i = 0; i < leafs.length (); ++i)
3063 node = vertices[leafs[i]];
3064 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3065 continue;
3067 /* In basic block vectorization we allow any subchain of an interleaving
3068 chain.
3069 FORNOW: not in loop SLP because of realignment complications. */
3070 if (is_a <bb_vec_info> (vinfo))
3072 bool subchain_p = true;
3073 stmt_vec_info next_load_info = NULL;
3074 stmt_vec_info load_info;
3075 unsigned j;
3076 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3078 if (j != 0
3079 && (next_load_info != load_info
3080 || DR_GROUP_GAP (load_info) != 1))
3082 subchain_p = false;
3083 break;
3085 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3087 if (subchain_p)
3089 SLP_TREE_LOAD_PERMUTATION (node).release ();
3090 continue;
3093 else
3095 stmt_vec_info load_info;
3096 bool this_load_permuted = false;
3097 unsigned j;
3098 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3099 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3101 this_load_permuted = true;
3102 break;
3104 stmt_vec_info first_stmt_info
3105 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3106 if (!this_load_permuted
3107 /* The load requires permutation when unrolling exposes
3108 a gap either because the group is larger than the SLP
3109 group-size or because there is a gap between the groups. */
3110 && (known_eq (LOOP_VINFO_VECT_FACTOR
3111 (as_a <loop_vec_info> (vinfo)), 1U)
3112 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3113 && DR_GROUP_GAP (first_stmt_info) == 0)))
3115 SLP_TREE_LOAD_PERMUTATION (node).release ();
3116 continue;
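/* Elision example (illustrative): in basic-block SLP a node loading
   b[1], b[2] out of the interleaving chain b[0..3] is a contiguous
   subchain, so its load permutation can be dropped.  In loop SLP an
   identity permutation is only dropped when unrolling exposes no gap,
   per the conditions above.  */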
3123 /* For each possible SLP instance decide whether to SLP it and calculate overall
3124 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3125 least one instance. */
3127 bool
3128 vect_make_slp_decision (loop_vec_info loop_vinfo)
3130 unsigned int i;
3131 poly_uint64 unrolling_factor = 1;
3132 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3133 slp_instance instance;
3134 int decided_to_slp = 0;
3136 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3138 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3140 /* FORNOW: SLP if you can. */
3141 /* All unroll factors have the form:
3143 GET_MODE_SIZE (vinfo->vector_mode) * X
3145 for some rational X, so they must have a common multiple. */
3146 unrolling_factor
3147 = force_common_multiple (unrolling_factor,
3148 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3150 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3151 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3152 loop-based vectorization. Such stmts will be marked as HYBRID. */
3153 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3154 decided_to_slp++;
3157 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3159 if (decided_to_slp && dump_enabled_p ())
3161 dump_printf_loc (MSG_NOTE, vect_location,
3162 "Decided to SLP %d instances. Unrolling factor ",
3163 decided_to_slp);
3164 dump_dec (MSG_NOTE, unrolling_factor);
3165 dump_printf (MSG_NOTE, "\n");
3168 return (decided_to_slp > 0);
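/* For instance (illustrative numbers): two SLP instances with
   unrolling factors 2 and 8 force a common loop unrolling factor of
   force_common_multiple (2, 8) == 8; since all factors have the form
   GET_MODE_SIZE (vinfo->vector_mode) * X such a common multiple
   always exists.  */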
3171 /* Private data for vect_detect_hybrid_slp. */
3172 struct vdhs_data
3174 loop_vec_info loop_vinfo;
3175 vec<stmt_vec_info> *worklist;
3178 /* Walker for walk_gimple_op. */
3180 static tree
3181 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3183 walk_stmt_info *wi = (walk_stmt_info *)data;
3184 vdhs_data *dat = (vdhs_data *)wi->info;
3186 if (wi->is_lhs)
3187 return NULL_TREE;
3189 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3190 if (!def_stmt_info)
3191 return NULL_TREE;
3192 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3193 if (PURE_SLP_STMT (def_stmt_info))
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3197 def_stmt_info->stmt);
3198 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3199 dat->worklist->safe_push (def_stmt_info);
3202 return NULL_TREE;
3205 /* Find stmts that must be both vectorized and SLPed. */
3207 void
3208 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3210 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3212 /* All stmts participating in SLP are marked pure_slp; all other
3213 stmts are loop_vect.
3214 First collect all loop_vect stmts into a worklist. */
3215 auto_vec<stmt_vec_info> worklist;
3216 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3218 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3219 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3220 gsi_next (&gsi))
3222 gphi *phi = gsi.phi ();
3223 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3224 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3225 worklist.safe_push (stmt_info);
3227 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3228 gsi_next (&gsi))
3230 gimple *stmt = gsi_stmt (gsi);
3231 if (is_gimple_debug (stmt))
3232 continue;
3233 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3234 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3236 for (gimple_stmt_iterator gsi2
3237 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3238 !gsi_end_p (gsi2); gsi_next (&gsi2))
3240 stmt_vec_info patt_info
3241 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3242 if (!STMT_SLP_TYPE (patt_info)
3243 && STMT_VINFO_RELEVANT (patt_info))
3244 worklist.safe_push (patt_info);
3246 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3248 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3249 worklist.safe_push (stmt_info);
3253 /* Now we have a worklist of non-SLP stmts; follow use->def chains and
3254 mark any SLP-vectorized stmt as hybrid.
3255 ??? We're visiting def stmts N times (once for each non-SLP and
3256 once for each hybrid-SLP use). */
3257 walk_stmt_info wi;
3258 vdhs_data dat;
3259 dat.worklist = &worklist;
3260 dat.loop_vinfo = loop_vinfo;
3261 memset (&wi, 0, sizeof (wi));
3262 wi.info = (void *)&dat;
3263 while (!worklist.is_empty ())
3265 stmt_vec_info stmt_info = worklist.pop ();
3266 /* Since SSA operands are not set up for pattern stmts we need
3267 to use walk_gimple_op. */
3268 wi.is_lhs = 0;
3269 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
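/* Hybrid example (a sketch): if the definition t = a[i] + b[i] is
   PURE_SLP but some remaining loop-vect stmt still uses t, the walker
   above marks t's definition as hybrid, i.e. it must be vectorized
   both as part of the SLP graph and by loop-based vectorization.  */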
3274 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3276 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3277 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3279 for (unsigned i = 0; i < bbs.length (); ++i)
3281 if (i != 0)
3282 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3283 gsi_next (&si))
3285 gphi *phi = si.phi ();
3286 gimple_set_uid (phi, 0);
3287 add_stmt (phi);
3289 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3290 !gsi_end_p (gsi); gsi_next (&gsi))
3292 gimple *stmt = gsi_stmt (gsi);
3293 gimple_set_uid (stmt, 0);
3294 if (is_gimple_debug (stmt))
3295 continue;
3296 add_stmt (stmt);
3302 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3303 stmts in the basic block. */
3305 _bb_vec_info::~_bb_vec_info ()
3307 /* Reset region marker. */
3308 for (unsigned i = 0; i < bbs.length (); ++i)
3310 if (i != 0)
3311 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3312 gsi_next (&si))
3314 gphi *phi = si.phi ();
3315 gimple_set_uid (phi, -1);
3317 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3318 !gsi_end_p (gsi); gsi_next (&gsi))
3320 gimple *stmt = gsi_stmt (gsi);
3321 gimple_set_uid (stmt, -1);
3326 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3327 given that its child nodes have already been processed, and that
3328 their def types currently match their SLP node's def type. */
3330 static bool
3331 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3332 slp_instance node_instance,
3333 stmt_vector_for_cost *cost_vec)
3335 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3336 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3338 /* Calculate the number of vector statements to be created for the
3339 scalar stmts in this node. For SLP reductions it is equal to the
3340 number of vector statements in the children (which has already been
3341 calculated by the recursive call). Otherwise it is the number of
3342 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3343 VF divided by the number of elements in a vector. */
3344 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3345 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3347 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3348 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3350 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3351 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3352 break;
3355 else
3357 poly_uint64 vf;
3358 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3359 vf = loop_vinfo->vectorization_factor;
3360 else
3361 vf = 1;
3362 unsigned int group_size = SLP_TREE_LANES (node);
3363 tree vectype = SLP_TREE_VECTYPE (node);
3364 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3365 = vect_get_num_vectors (vf * group_size, vectype);
3368 /* Handle purely internal nodes. */
3369 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3370 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3372 if (is_a <bb_vec_info> (vinfo)
3373 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3374 return false;
3376 bool dummy;
3377 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3378 node, node_instance, cost_vec);
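/* Example for the computation above (hypothetical values): with a
   loop vectorization factor of 4, SLP_TREE_LANES == 2 and a V4SI
   vectype, the node needs vect_get_num_vectors (4 * 2, V4SI) == 2
   vector statements.  */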
3381 /* Try to build NODE from scalars, returning true on success.
3382 NODE_INSTANCE is the SLP instance that contains NODE. */
3384 static bool
3385 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3386 slp_instance node_instance)
3388 stmt_vec_info stmt_info;
3389 unsigned int i;
3391 if (!is_a <bb_vec_info> (vinfo)
3392 || node == SLP_INSTANCE_TREE (node_instance)
3393 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3394 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3395 return false;
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "Building vector operands from scalars instead\n");
3401 /* Don't remove and free the child nodes here, since they could be
3402 referenced by other structures. The analysis and scheduling phases
3403 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3404 unsigned int group_size = SLP_TREE_LANES (node);
3405 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3406 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3407 SLP_TREE_LOAD_PERMUTATION (node).release ();
3408 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3410 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3411 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3413 return true;
3416 /* Compute the prologue cost for invariant or constant operands represented
3417 by NODE. */
3419 static void
3420 vect_prologue_cost_for_slp (slp_tree node,
3421 stmt_vector_for_cost *cost_vec)
3423 /* There's a special case of an existing vector, which costs nothing. */
3424 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3425 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3426 return;
3427 /* Without looking at the actual initializer a vector of
3428 constants can be implemented as a load from the constant pool.
3429 When all elements are the same we can use a splat. */
3430 tree vectype = SLP_TREE_VECTYPE (node);
3431 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3432 unsigned num_vects_to_check;
3433 unsigned HOST_WIDE_INT const_nunits;
3434 unsigned nelt_limit;
3435 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3436 && ! multiple_p (const_nunits, group_size))
3438 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3439 nelt_limit = const_nunits;
3441 else
3443 /* If either the vector has variable length or the vectors
3444 are composed of repeated whole groups we only need to
3445 cost construction once. All vectors will be the same. */
3446 num_vects_to_check = 1;
3447 nelt_limit = group_size;
3449 tree elt = NULL_TREE;
3450 unsigned nelt = 0;
3451 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3453 unsigned si = j % group_size;
3454 if (nelt == 0)
3455 elt = SLP_TREE_SCALAR_OPS (node)[si];
3456 /* ??? We're just tracking whether all operands of a single
3457 vector initializer are the same, ideally we'd check if
3458 we emitted the same one already. */
3459 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3460 elt = NULL_TREE;
3461 nelt++;
3462 if (nelt == nelt_limit)
3464 record_stmt_cost (cost_vec, 1,
3465 SLP_TREE_DEF_TYPE (node) == vect_external_def
3466 ? (elt ? scalar_to_vec : vec_construct)
3467 : vector_load,
3468 NULL, vectype, 0, vect_prologue);
3469 nelt = 0;
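/* Costing example (illustrative): a V4SI external node with
   SLP_TREE_SCALAR_OPS { x, x, x, x } records a single scalar_to_vec
   splat, while { x, y, x, y } records one vec_construct; with a
   variable-length vectype construction is likewise costed only once
   since all vectors built will be the same.  */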
3474 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3475 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3477 Return true if the operations are supported. */
3479 static bool
3480 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3481 slp_instance node_instance,
3482 hash_set<slp_tree> &visited,
3483 hash_set<slp_tree> &lvisited,
3484 stmt_vector_for_cost *cost_vec)
3486 int i, j;
3487 slp_tree child;
3489 /* Assume we can code-generate all invariants. */
3490 if (!node
3491 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3492 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3493 return true;
3495 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3497 if (dump_enabled_p ())
3498 dump_printf_loc (MSG_NOTE, vect_location,
3499 "Failed cyclic SLP reference in %p", node);
3500 return false;
3502 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3504 /* If we already analyzed the exact same set of scalar stmts we're done.
3505 We share the generated vector stmts for those. */
3506 if (visited.contains (node)
3507 || lvisited.add (node))
3508 return true;
3510 bool res = true;
3511 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3513 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3514 visited, lvisited, cost_vec);
3515 if (!res)
3516 break;
3519 if (res)
3520 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3521 cost_vec);
3522 if (!res)
3523 lvisited.remove (node);
3525 /* When the node can be vectorized, cost the invariant nodes it references.
3526 This is not done in DFS order to allow the referring node's
3527 vectorizable_* calls to nail down the invariant nodes' vector type
3528 and possibly unshare it if it needs a different vector type than
3529 other referrers. */
3530 if (res)
3531 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3532 if (child
3533 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3534 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3535 /* Perform usual caching, note code-generation still
3536 code-gens these nodes multiple times but we expect
3537 to CSE them later. */
3538 && !visited.contains (child)
3539 && !lvisited.add (child))
3541 /* ??? After auditing more code paths make a "default"
3542 and push the vector type from NODE to all children
3543 if it is not already set. */
3544 /* Compute the number of vectors to be generated. */
3545 tree vector_type = SLP_TREE_VECTYPE (child);
3546 if (!vector_type)
3548 /* For shifts with a scalar argument we don't need
3549 to cost or code-generate anything.
3550 ??? Represent this more explicitly. */
3551 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3552 == shift_vec_info_type)
3553 && j == 1);
3554 continue;
3556 unsigned group_size = SLP_TREE_LANES (child);
3557 poly_uint64 vf = 1;
3558 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3559 vf = loop_vinfo->vectorization_factor;
3560 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3561 = vect_get_num_vectors (vf * group_size, vector_type);
3562 /* And cost them. */
3563 vect_prologue_cost_for_slp (child, cost_vec);
3566 /* If this node or any of its children can't be vectorized, try pruning
3567 the tree here rather than felling the whole thing. */
3568 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3570 /* We'll need to revisit this for invariant costing and number
3571 of vectorized stmt setting. */
3572 res = true;
3575 return res;
3579 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3580 region and that can be vectorized using vectorizable_live_operation
3581 with STMT_VINFO_LIVE_P. Live operations that cannot be handled will
3582 cause the scalar code computing them to be retained. */
3584 static void
3585 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3586 slp_instance instance,
3587 stmt_vector_for_cost *cost_vec,
3588 hash_set<stmt_vec_info> &svisited,
3589 hash_set<slp_tree> &visited)
3591 if (visited.add (node))
3592 return;
3594 unsigned i;
3595 stmt_vec_info stmt_info;
3596 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3597 bool all_visited = true;
3598 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3600 if (svisited.contains (stmt_info))
3601 continue;
3602 all_visited = false;
3603 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3604 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3605 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3606 /* Only the pattern root stmt computes the original scalar value. */
3607 continue;
3608 bool mark_visited = true;
3609 gimple *orig_stmt = orig_stmt_info->stmt;
3610 ssa_op_iter op_iter;
3611 def_operand_p def_p;
3612 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3614 imm_use_iterator use_iter;
3615 gimple *use_stmt;
3616 stmt_vec_info use_stmt_info;
3617 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3618 if (!is_gimple_debug (use_stmt))
3620 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3621 if (!use_stmt_info
3622 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3624 STMT_VINFO_LIVE_P (stmt_info) = true;
3625 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3626 NULL, node, instance, i,
3627 false, cost_vec))
3628 /* ??? So we know we can vectorize the live stmt
3629 from one SLP node. If we cannot do so from all
3630 or none consistently we'd have to record which
3631 SLP node (and lane) we want to use for the live
3632 operation. So make sure we can code-generate
3633 from all nodes. */
3634 mark_visited = false;
3635 else
3636 STMT_VINFO_LIVE_P (stmt_info) = false;
3637 BREAK_FROM_IMM_USE_STMT (use_iter);
3640 /* We have to verify whether we can insert the lane extract
3641 before all uses. The following is a conservative approximation.
3642 We cannot put this into vectorizable_live_operation because
3643 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3644 doesn't work.
3645 Note that while the fact that we emit code for loads at the
3646 first load should make this a non-problem, leafs we construct
3647 from scalars are vectorized after the last scalar def.
3648 ??? If we'd actually compute the insert location during
3649 analysis we could use sth less conservative than the last
3650 scalar stmt in the node for the dominance check. */
3651 /* ??? What remains are "live" uses in vector CTORs in the same
3652 SLP graph, where those uses can end up code-generated
3653 right after their definition instead of close to their original
3654 use. But that would restrict us to code-generating lane-extracts
3655 from the latest stmt in a node. So we compensate for this
3656 during code-generation, simply not replacing uses in those
3657 hopefully rare cases. */
3658 if (STMT_VINFO_LIVE_P (stmt_info))
3659 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3660 if (!is_gimple_debug (use_stmt)
3661 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3662 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3663 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3665 if (dump_enabled_p ())
3666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3667 "Cannot determine insertion place for "
3668 "lane extract\n");
3669 STMT_VINFO_LIVE_P (stmt_info) = false;
3670 mark_visited = true;
3673 if (mark_visited)
3674 svisited.add (stmt_info);
3676 if (all_visited)
3677 return;
3679 slp_tree child;
3680 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3681 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3682 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3683 cost_vec, svisited, visited);
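/* As an illustration (a sketch, not the exact IL): if lane 1 of a
   vectorized node defining vect_x_4 with 32-bit elements is also used
   by a scalar stmt outside the vectorized region,
   vectorizable_live_operation materializes the lane with something like
     _5 = BIT_FIELD_REF <vect_x_4, 32, 32>;
   and the out-of-region use is rewritten to _5. */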
3686 /* Analyze statements in SLP instances of VINFO. Return true if the
3687 operations are supported. */
3689 bool
3690 vect_slp_analyze_operations (vec_info *vinfo)
3692 slp_instance instance;
3693 int i;
3695 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3697 hash_set<slp_tree> visited;
3698 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3700 hash_set<slp_tree> lvisited;
3701 stmt_vector_for_cost cost_vec;
3702 cost_vec.create (2);
3703 if (is_a <bb_vec_info> (vinfo))
3704 vect_location = instance->location ();
3705 if (!vect_slp_analyze_node_operations (vinfo,
3706 SLP_INSTANCE_TREE (instance),
3707 instance, visited, lvisited,
3708 &cost_vec)
3709 /* Instances with a root stmt require vectorized defs for the
3710 SLP tree root. */
3711 || (SLP_INSTANCE_ROOT_STMT (instance)
3712 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3713 != vect_internal_def)))
3715 slp_tree node = SLP_INSTANCE_TREE (instance);
3716 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_NOTE, vect_location,
3719 "removing SLP instance operations starting from: %G",
3720 stmt_info->stmt);
3721 vect_free_slp_instance (instance);
3722 vinfo->slp_instances.ordered_remove (i);
3723 cost_vec.release ();
3725 else
3727 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3728 x != lvisited.end(); ++x)
3729 visited.add (*x);
3730 i++;
3732 /* For BB vectorization remember the SLP graph entry
3733 cost for later. */
3734 if (is_a <bb_vec_info> (vinfo))
3735 instance->cost_vec = cost_vec;
3736 else
3738 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3739 cost_vec.release ();
3744 /* Compute vectorizable live stmts. */
3745 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3747 hash_set<stmt_vec_info> svisited;
3748 hash_set<slp_tree> visited;
3749 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3751 vect_location = instance->location ();
3752 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3753 instance, &instance->cost_vec, svisited,
3754 visited);
3758 return !vinfo->slp_instances.is_empty ();
3761 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3762 transitively compressing any chain of leaders along the way. */
3764 static slp_instance
3765 get_ultimate_leader (slp_instance instance,
3766 hash_map<slp_instance, slp_instance> &instance_leader)
3768 auto_vec<slp_instance *, 8> chain;
3769 slp_instance *tem;
3770 while (*(tem = instance_leader.get (instance)) != instance)
3772 chain.safe_push (tem);
3773 instance = *tem;
3775 while (!chain.is_empty ())
3776 *chain.pop () = instance;
3777 return instance;
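/* For example, with a recorded mapping I1 -> I2, I2 -> I3, I3 -> I3
   a call on I1 returns I3 and compresses the path so that both I1 and
   I2 now map directly to I3. */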
3780 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3782 static void
3783 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3784 slp_instance instance, slp_tree node,
3785 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3786 hash_map<slp_instance, slp_instance> &instance_leader,
3787 hash_set<slp_tree> &visited)
3789 stmt_vec_info stmt_info;
3790 unsigned i;
3792 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3794 bool existed_p;
3795 slp_instance &stmt_instance
3796 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3797 if (!existed_p)
3799 else if (stmt_instance != instance)
3801 /* If we run into a previously marked stmt make the current
3802 instance the leader of that stmt's ultimate leader. This keeps
3803 the leader chain acyclic and works even when the current instance
3804 connects two previously independent graph parts. */
3805 slp_instance stmt_leader
3806 = get_ultimate_leader (stmt_instance, instance_leader);
3807 if (stmt_leader != instance)
3808 instance_leader.put (stmt_leader, instance);
3810 stmt_instance = instance;
3813 if (visited.add (node))
3814 return;
3816 slp_tree child;
3817 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3818 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3819 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3820 instance_leader, visited);
3823 /* Partition the SLP graph into pieces that can be costed independently. */
3825 static void
3826 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3828 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3830 /* First walk the SLP graph assigning each involved scalar stmt a
3831 corresponding SLP graph entry and upon visiting a previously
3832 marked stmt, make the stmt's leader the current SLP graph entry. */
3833 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3834 hash_map<slp_instance, slp_instance> instance_leader;
3835 hash_set<slp_tree> visited;
3836 slp_instance instance;
3837 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3839 instance_leader.put (instance, instance);
3840 vect_bb_partition_graph_r (bb_vinfo,
3841 instance, SLP_INSTANCE_TREE (instance),
3842 stmt_to_instance, instance_leader,
3843 visited);
3846 /* Then collect entries to each independent subgraph. */
3847 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3849 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3850 leader->subgraph_entries.safe_push (instance);
3851 if (dump_enabled_p ()
3852 && leader != instance)
3853 dump_printf_loc (MSG_NOTE, vect_location,
3854 "instance %p is leader of %p\n",
3855 leader, instance);
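/* For example, when two instances contain the same scalar load stmts,
   walking the second instance runs into stmts already assigned to the
   first; the first's ultimate leader is then redirected to the second
   and get_ultimate_leader later resolves both to one leader, so the
   loop above pushes them into a single subgraph_entries vector that is
   costed as one unit. */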
3859 /* Compute the scalar cost of the SLP node NODE and its children
3860 and record it in COST_VEC. Do not account defs that are marked
3861 in LIFE and update LIFE according to uses of NODE. */
3863 static void
3864 vect_bb_slp_scalar_cost (vec_info *vinfo,
3865 slp_tree node, vec<bool, va_heap> *life,
3866 stmt_vector_for_cost *cost_vec,
3867 hash_set<slp_tree> &visited)
3869 unsigned i;
3870 stmt_vec_info stmt_info;
3871 slp_tree child;
3873 if (visited.add (node))
3874 return;
3876 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3878 ssa_op_iter op_iter;
3879 def_operand_p def_p;
3881 if ((*life)[i])
3882 continue;
3884 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3885 gimple *orig_stmt = orig_stmt_info->stmt;
3887 /* If there is a non-vectorized use of the defs then the scalar
3888 stmt is kept live, in which case we do not account it or any
3889 required defs in the SLP children in the scalar cost. This
3890 way vectorization looks more costly compared to the retained
3891 scalar code. */
3892 if (!STMT_VINFO_LIVE_P (stmt_info))
3894 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3896 imm_use_iterator use_iter;
3897 gimple *use_stmt;
3898 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3899 if (!is_gimple_debug (use_stmt))
3901 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3902 if (!use_stmt_info
3903 || !PURE_SLP_STMT
3904 (vect_stmt_to_vectorize (use_stmt_info)))
3906 (*life)[i] = true;
3907 BREAK_FROM_IMM_USE_STMT (use_iter);
3911 if ((*life)[i])
3912 continue;
3915 /* Count scalar stmts only once. */
3916 if (gimple_visited_p (orig_stmt))
3917 continue;
3918 gimple_set_visited (orig_stmt, true);
3920 vect_cost_for_stmt kind;
3921 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3923 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3924 kind = scalar_load;
3925 else
3926 kind = scalar_store;
3928 else if (vect_nop_conversion_p (orig_stmt_info))
3929 continue;
3930 else
3931 kind = scalar_stmt;
3932 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3935 auto_vec<bool, 20> subtree_life;
3936 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3938 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3940 /* Do not directly pass LIFE to the recursive call, copy it to
3941 confine changes in the callee to the current child/subtree. */
3942 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3944 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3945 for (unsigned j = 0;
3946 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3948 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3949 if (perm.first == i)
3950 subtree_life[perm.second] = (*life)[j];
3953 else
3955 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3956 subtree_life.safe_splice (*life);
3958 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3959 visited);
3960 subtree_life.truncate (0);
3965 /* Check if vectorization of the basic block is profitable for the
3966 subgraph denoted by SLP_INSTANCES. */
3968 static bool
3969 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3970 vec<slp_instance> slp_instances)
3972 slp_instance instance;
3973 int i;
3974 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3975 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3977 void *vect_target_cost_data = init_cost (NULL);
3979 /* Calculate scalar cost and sum the cost for the vector stmts
3980 previously collected. */
3981 stmt_vector_for_cost scalar_costs;
3982 scalar_costs.create (0);
3983 hash_set<slp_tree> visited;
3984 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3986 auto_vec<bool, 20> life;
3987 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3988 true);
3989 vect_bb_slp_scalar_cost (bb_vinfo,
3990 SLP_INSTANCE_TREE (instance),
3991 &life, &scalar_costs, visited);
3992 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3993 instance->cost_vec.release ();
3995 /* Unset the visited flags. */
3996 stmt_info_for_cost *si;
3997 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3998 gimple_set_visited (si->stmt_info->stmt, false);
4000 void *scalar_target_cost_data = init_cost (NULL);
4001 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4002 scalar_costs.release ();
4003 unsigned dummy;
4004 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4005 destroy_cost_data (scalar_target_cost_data);
4007 /* Complete the target-specific vector cost calculation. */
4008 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4009 &vec_inside_cost, &vec_epilogue_cost);
4010 destroy_cost_data (vect_target_cost_data);
4012 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4014 if (dump_enabled_p ())
4016 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4017 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4018 vec_inside_cost);
4019 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4020 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4021 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4024 /* Vectorization is profitable if its cost is no more than the cost of
4025 the scalar version. Note that we err on the vector side for equal cost
4026 because the cost estimate is otherwise quite pessimistic (constant uses
4027 are free on the scalar side but cost a load on the vector side, for
4028 example). */
4029 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4030 return false;
4032 return true;
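/* A worked example with made-up numbers: vec_inside_cost == 4,
   vec_prologue_cost == 2 and vec_epilogue_cost == 0 give a total
   vector cost of 6. With scalar_cost == 8 the subgraph is deemed
   profitable, with scalar_cost == 5 it is rejected, and the tie at
   scalar_cost == 6 is resolved in favor of the vector code as the
   comment above explains. */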
4035 /* Find any vectorizable constructors and add them to the grouped_stores
4036 array. */
4038 static void
4039 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4041 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4042 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4043 !gsi_end_p (gsi); gsi_next (&gsi))
4045 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4046 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4047 continue;
4049 tree rhs = gimple_assign_rhs1 (assign);
4050 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4051 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4052 CONSTRUCTOR_NELTS (rhs))
4053 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4054 || uniform_vector_p (rhs))
4055 continue;
4057 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4058 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
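/* As a GIMPLE sketch, a constructor like
     x_5 = {a_1, b_2, c_3, d_4};
   where the LHS vector type has exactly four subparts and the elements
   are non-vector and non-uniform is queued here as a grouped-store
   seed; a uniform {a_1, a_1, a_1, a_1} is skipped since it is better
   handled as a splat. */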
4062 /* Check if the region described by BB_VINFO can be vectorized, returning
4063 true if so. When returning false, set FATAL to true if the same failure
4064 would prevent vectorization at other vector sizes, false if it is still
4065 worth trying other sizes. N_STMTS is the number of statements in the
4066 region. */
4068 static bool
4069 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4070 vec<int> *dataref_groups)
4072 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4074 slp_instance instance;
4075 int i;
4076 poly_uint64 min_vf = 2;
4078 /* The first group of checks is independent of the vector size. */
4079 fatal = true;
4081 /* Analyze the data references. */
4083 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4085 if (dump_enabled_p ())
4086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4087 "not vectorized: unhandled data-ref in basic "
4088 "block.\n");
4089 return false;
4092 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4096 "not vectorized: unhandled data access in "
4097 "basic block.\n");
4098 return false;
4101 vect_slp_check_for_constructors (bb_vinfo);
4103 /* If there are no grouped stores and no constructors in the region
4104 there is no need to continue with pattern recog as vect_analyze_slp
4105 will fail anyway. */
4106 if (bb_vinfo->grouped_stores.is_empty ())
4108 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4110 "not vectorized: no grouped stores in "
4111 "basic block.\n");
4112 return false;
4115 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
4116 fatal = false;
4118 vect_pattern_recog (bb_vinfo);
4120 /* Check the SLP opportunities in the basic block, analyze and build SLP
4121 trees. */
4122 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4124 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4127 "Failed to SLP the basic block.\n");
4128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4129 "not vectorized: failed to find SLP opportunities "
4130 "in basic block.\n");
4132 return false;
4135 /* Optimize permutations. */
4136 vect_optimize_slp (bb_vinfo);
4138 vect_record_base_alignments (bb_vinfo);
4140 /* Analyze and verify the alignment of data references and the
4141 dependences in the SLP instances. */
4142 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4144 vect_location = instance->location ();
4145 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4146 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4148 slp_tree node = SLP_INSTANCE_TREE (instance);
4149 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4150 if (dump_enabled_p ())
4151 dump_printf_loc (MSG_NOTE, vect_location,
4152 "removing SLP instance operations starting from: %G",
4153 stmt_info->stmt);
4154 vect_free_slp_instance (instance);
4155 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4156 continue;
4159 /* Mark all the statements that we want to vectorize as pure SLP and
4160 relevant. */
4161 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4162 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4163 if (SLP_INSTANCE_ROOT_STMT (instance))
4164 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4166 i++;
4168 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4169 return false;
4171 if (!vect_slp_analyze_operations (bb_vinfo))
4173 if (dump_enabled_p ())
4174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4175 "not vectorized: bad operation in basic block.\n");
4176 return false;
4179 vect_bb_partition_graph (bb_vinfo);
4181 return true;
4184 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
4185 basic blocks in BBS, returning true on success.
4186 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4188 static bool
4189 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4190 vec<int> *dataref_groups, unsigned int n_stmts)
4192 bb_vec_info bb_vinfo;
4193 auto_vector_modes vector_modes;
4195 /* Autodetect the first vector mode we try. */
4196 machine_mode next_vector_mode = VOIDmode;
4197 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4198 unsigned int mode_i = 0;
4200 vec_info_shared shared;
4202 machine_mode autodetected_vector_mode = VOIDmode;
4203 while (1)
4205 bool vectorized = false;
4206 bool fatal = false;
4207 bb_vinfo = new _bb_vec_info (bbs, &shared);
4209 bool first_time_p = shared.datarefs.is_empty ();
4210 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4211 if (first_time_p)
4212 bb_vinfo->shared->save_datarefs ();
4213 else
4214 bb_vinfo->shared->check_datarefs ();
4215 bb_vinfo->vector_mode = next_vector_mode;
4217 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4218 && dbg_cnt (vect_slp))
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_NOTE, vect_location,
4223 "***** Analysis succeeded with vector mode"
4224 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4225 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4228 bb_vinfo->shared->check_datarefs ();
4230 unsigned i;
4231 slp_instance instance;
4232 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4234 if (instance->subgraph_entries.is_empty ())
4235 continue;
4237 vect_location = instance->location ();
4238 if (!unlimited_cost_model (NULL)
4239 && !vect_bb_vectorization_profitable_p
4240 (bb_vinfo, instance->subgraph_entries))
4242 if (dump_enabled_p ())
4243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4244 "not vectorized: vectorization is not "
4245 "profitable.\n");
4246 continue;
4249 if (!vectorized && dump_enabled_p ())
4250 dump_printf_loc (MSG_NOTE, vect_location,
4251 "Basic block will be vectorized "
4252 "using SLP\n");
4253 vectorized = true;
4255 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4257 unsigned HOST_WIDE_INT bytes;
4258 if (dump_enabled_p ())
4260 if (GET_MODE_SIZE
4261 (bb_vinfo->vector_mode).is_constant (&bytes))
4262 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4263 "basic block part vectorized using %wu "
4264 "byte vectors\n", bytes);
4265 else
4266 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4267 "basic block part vectorized using "
4268 "variable length vectors\n");
4272 else
4274 if (dump_enabled_p ())
4275 dump_printf_loc (MSG_NOTE, vect_location,
4276 "***** Analysis failed with vector mode %s\n",
4277 GET_MODE_NAME (bb_vinfo->vector_mode));
4280 if (mode_i == 0)
4281 autodetected_vector_mode = bb_vinfo->vector_mode;
4283 if (!fatal)
4284 while (mode_i < vector_modes.length ()
4285 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4287 if (dump_enabled_p ())
4288 dump_printf_loc (MSG_NOTE, vect_location,
4289 "***** The result for vector mode %s would"
4290 " be the same\n",
4291 GET_MODE_NAME (vector_modes[mode_i]));
4292 mode_i += 1;
4295 delete bb_vinfo;
4297 if (mode_i < vector_modes.length ()
4298 && VECTOR_MODE_P (autodetected_vector_mode)
4299 && (related_vector_mode (vector_modes[mode_i],
4300 GET_MODE_INNER (autodetected_vector_mode))
4301 == autodetected_vector_mode)
4302 && (related_vector_mode (autodetected_vector_mode,
4303 GET_MODE_INNER (vector_modes[mode_i]))
4304 == vector_modes[mode_i]))
4306 if (dump_enabled_p ())
4307 dump_printf_loc (MSG_NOTE, vect_location,
4308 "***** Skipping vector mode %s, which would"
4309 " repeat the analysis for %s\n",
4310 GET_MODE_NAME (vector_modes[mode_i]),
4311 GET_MODE_NAME (autodetected_vector_mode));
4312 mode_i += 1;
4315 if (vectorized
4316 || mode_i == vector_modes.length ()
4317 || autodetected_vector_mode == VOIDmode
4318 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4319 vector sizes will fail do not bother iterating. */
4320 || fatal)
4321 return vectorized;
4323 /* Try the next vector mode in the list. */
4324 next_vector_mode = vector_modes[mode_i++];
4325 if (dump_enabled_p ())
4326 dump_printf_loc (MSG_NOTE, vect_location,
4327 "***** Re-trying analysis with vector mode %s\n",
4328 GET_MODE_NAME (next_vector_mode));
4333 /* Worker for the BB vectorizer entry points below. Analyze and transform
4334 the region formed by BBS, returning true if anything was vectorized. */
4336 static bool
4337 vect_slp_bbs (vec<basic_block> bbs)
4339 vec<data_reference_p> datarefs = vNULL;
4340 auto_vec<int> dataref_groups;
4341 int insns = 0;
4342 int current_group = 0;
4344 for (unsigned i = 0; i < bbs.length (); i++)
4346 basic_block bb = bbs[i];
4347 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4348 gsi_next (&gsi))
4350 gimple *stmt = gsi_stmt (gsi);
4351 if (is_gimple_debug (stmt))
4352 continue;
4354 insns++;
4356 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4357 vect_location = stmt;
4359 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4360 &dataref_groups, current_group))
4361 ++current_group;
4365 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4368 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4369 true if anything in the basic-block was vectorized. */
4371 bool
4372 vect_slp_bb (basic_block bb)
4374 auto_vec<basic_block> bbs;
4375 bbs.safe_push (bb);
4376 return vect_slp_bbs (bbs);
4379 /* Main entry for the BB vectorizer. Analyze and transform all
4380 basic blocks in FUN, returning true if anything was vectorized. */
4382 bool
4383 vect_slp_function (function *fun)
4385 bool r = false;
4386 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4387 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4389 /* For the moment split the function into pieces to avoid making
4390 the iteration on the vector modes moot. Split at points we know
4391 we do not handle well, which are CFG merges (SLP discovery doesn't
4392 handle non-loop-header PHIs) and loop exits. Since pattern
4393 recog requires reverse iteration to visit uses before defs,
4394 simply chop the RPO into pieces. */
4395 auto_vec<basic_block> bbs;
4396 for (unsigned i = 0; i < n; i++)
4398 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4399 bool split = false;
4401 /* Split when a BB is not dominated by the first block. */
4402 if (!bbs.is_empty ()
4403 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4405 if (dump_enabled_p ())
4406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4407 "splitting region at dominance boundary bb%d\n",
4408 bb->index);
4409 split = true;
4411 /* Split when the loop determined by the first block
4412 is exited. This is because we may end up inserting
4413 invariants at the region begin. */
4414 else if (!bbs.is_empty ()
4415 && bbs[0]->loop_father != bb->loop_father
4416 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4418 if (dump_enabled_p ())
4419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4420 "splitting region at loop %d exit at bb%d\n",
4421 bbs[0]->loop_father->num, bb->index);
4422 split = true;
4425 if (split && !bbs.is_empty ())
4427 r |= vect_slp_bbs (bbs);
4428 bbs.truncate (0);
4429 bbs.quick_push (bb);
4431 else
4432 bbs.safe_push (bb);
4434 /* When a stmt ends this block and defines a value, inserting a
4435 vector containing its definition after it would require inserting
4436 on edges. Avoid this for now. */
4437 if (gimple *last = last_stmt (bb))
4438 if (gimple_get_lhs (last)
4439 && is_ctrl_altering_stmt (last))
4441 if (dump_enabled_p ())
4442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4443 "splitting region at control altering "
4444 "definition %G", last);
4445 r |= vect_slp_bbs (bbs);
4446 bbs.truncate (0);
4450 if (!bbs.is_empty ())
4451 r |= vect_slp_bbs (bbs);
4453 free (rpo);
4455 return r;
4458 /* Build a variable-length vector in which the elements in ELTS are repeated
4459 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4460 RESULTS and add any new instructions to SEQ.
4462 The approach we use is:
4464 (1) Find a vector mode VM with integer elements of mode IM.
4466 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4467 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4468 from small vectors to IM.
4470 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4472 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4473 correct byte contents.
4475 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4477 We try to find the largest IM for which this sequence works, in order
4478 to cut down on the number of interleaves. */
4480 void
4481 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4482 vec<tree> elts, unsigned int nresults,
4483 vec<tree> &results)
4485 unsigned int nelts = elts.length ();
4486 tree element_type = TREE_TYPE (vector_type);
4488 /* (1) Find a vector mode VM with integer elements of mode IM. */
4489 unsigned int nvectors = 1;
4490 tree new_vector_type;
4491 tree permutes[2];
4492 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4493 &nvectors, &new_vector_type,
4494 permutes))
4495 gcc_unreachable ();
4497 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4498 unsigned int partial_nelts = nelts / nvectors;
4499 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4501 tree_vector_builder partial_elts;
4502 auto_vec<tree, 32> pieces (nvectors * 2);
4503 pieces.quick_grow (nvectors * 2);
4504 for (unsigned int i = 0; i < nvectors; ++i)
4506 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4507 ELTS' has mode IM. */
4508 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4509 for (unsigned int j = 0; j < partial_nelts; ++j)
4510 partial_elts.quick_push (elts[i * partial_nelts + j]);
4511 tree t = gimple_build_vector (seq, &partial_elts);
4512 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4513 TREE_TYPE (new_vector_type), t);
4515 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4516 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4519 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4520 correct byte contents.
4522 We need to repeat the following operation log2(nvectors) times:
4524 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4525 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4527 However, if each input repeats every N elements and the VF is
4528 a multiple of N * 2, the HI result is the same as the LO. */
4529 unsigned int in_start = 0;
4530 unsigned int out_start = nvectors;
4531 unsigned int hi_start = nvectors / 2;
4532 /* A bound on the number of outputs needed to produce NRESULTS results
4533 in the final iteration. */
4534 unsigned int noutputs_bound = nvectors * nresults;
4535 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4537 noutputs_bound /= 2;
4538 unsigned int limit = MIN (noutputs_bound, nvectors);
4539 for (unsigned int i = 0; i < limit; ++i)
4541 if ((i & 1) != 0
4542 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4543 2 * in_repeat))
4545 pieces[out_start + i] = pieces[out_start + i - 1];
4546 continue;
4549 tree output = make_ssa_name (new_vector_type);
4550 tree input1 = pieces[in_start + (i / 2)];
4551 tree input2 = pieces[in_start + (i / 2) + hi_start];
4552 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4553 input1, input2,
4554 permutes[i & 1]);
4555 gimple_seq_add_stmt (seq, stmt);
4556 pieces[out_start + i] = output;
4558 std::swap (in_start, out_start);
4561 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4562 results.reserve (nresults);
4563 for (unsigned int i = 0; i < nresults; ++i)
4564 if (i < nvectors)
4565 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4566 pieces[in_start + i]));
4567 else
4568 results.quick_push (results[i - nvectors]);
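/* An illustration with deliberately small sizes: for ELTS = { a, b }
   and NVECTORS == 2, step (3) builds (assuming VM holds four IM-sized
   chunks)
     p0 = { a, a, a, a }   p1 = { b, b, b, b }
   and a single interleaving VEC_PERM_EXPR in step (4) yields the
   repeating byte pattern { a, b, a, b }, which step (5) view-converts
   back to VECTOR_TYPE. */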
4572 /* For constant and loop invariant defs in OP_NODE this function creates
4573 vector defs that will be used in the vectorized stmts and stores them
4574 to SLP_TREE_VEC_DEFS of OP_NODE. */
4576 static void
4577 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4579 unsigned HOST_WIDE_INT nunits;
4580 tree vec_cst;
4581 unsigned j, number_of_places_left_in_vector;
4582 tree vector_type;
4583 tree vop;
4584 int group_size = op_node->ops.length ();
4585 unsigned int vec_num, i;
4586 unsigned number_of_copies = 1;
4587 bool constant_p;
4588 gimple_seq ctor_seq = NULL;
4589 auto_vec<tree, 16> permute_results;
4591 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4592 vector_type = SLP_TREE_VECTYPE (op_node);
4594 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4595 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4596 auto_vec<tree> voprnds (number_of_vectors);
4598 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4599 created vectors. It is greater than 1 if unrolling is performed.
4601 For example, we have two scalar operands, s1 and s2 (e.g., group of
4602 strided accesses of size two), while NUNITS is four (i.e., four scalars
4603 of this type can be packed in a vector). The output vector will contain
4604 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4605 will be 2).
4607 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4608 containing the operands.
4610 For example, NUNITS is four as before, and the group size is 8
4611 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4612 {s5, s6, s7, s8}. */
4614 /* When using duplicate_and_interleave, we just need one element for
4615 each scalar statement. */
4616 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4617 nunits = group_size;
4619 number_of_copies = nunits * number_of_vectors / group_size;
4621 number_of_places_left_in_vector = nunits;
4622 constant_p = true;
4623 tree_vector_builder elts (vector_type, nunits, 1);
4624 elts.quick_grow (nunits);
4625 stmt_vec_info insert_after = NULL;
4626 for (j = 0; j < number_of_copies; j++)
4628 tree op;
4629 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4631 /* Create 'vect_ = {op0,op1,...,opn}'. */
4632 number_of_places_left_in_vector--;
4633 tree orig_op = op;
4634 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4636 if (CONSTANT_CLASS_P (op))
4638 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4640 /* Can't use VIEW_CONVERT_EXPR for booleans because
4641 of possibly different sizes of scalar value and
4642 vector element. */
4643 if (integer_zerop (op))
4644 op = build_int_cst (TREE_TYPE (vector_type), 0);
4645 else if (integer_onep (op))
4646 op = build_all_ones_cst (TREE_TYPE (vector_type));
4647 else
4648 gcc_unreachable ();
4650 else
4651 op = fold_unary (VIEW_CONVERT_EXPR,
4652 TREE_TYPE (vector_type), op);
4653 gcc_assert (op && CONSTANT_CLASS_P (op));
4655 else
4657 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4658 gimple *init_stmt;
4659 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4661 tree true_val
4662 = build_all_ones_cst (TREE_TYPE (vector_type));
4663 tree false_val
4664 = build_zero_cst (TREE_TYPE (vector_type));
4665 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4666 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4667 op, true_val,
4668 false_val);
4670 else
4672 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4673 op);
4674 init_stmt
4675 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4676 op);
4678 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4679 op = new_temp;
4682 elts[number_of_places_left_in_vector] = op;
4683 if (!CONSTANT_CLASS_P (op))
4684 constant_p = false;
4685 /* For BB vectorization we have to compute an insert location
4686 when a def is inside the analyzed region since we cannot
4687 simply insert at the BB start in this case. */
4688 stmt_vec_info opdef;
4689 if (TREE_CODE (orig_op) == SSA_NAME
4690 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4691 && is_a <bb_vec_info> (vinfo)
4692 && (opdef = vinfo->lookup_def (orig_op)))
4694 if (!insert_after)
4695 insert_after = opdef;
4696 else
4697 insert_after = get_later_stmt (insert_after, opdef);
4700 if (number_of_places_left_in_vector == 0)
4702 if (constant_p
4703 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4704 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4705 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4706 else
4708 if (permute_results.is_empty ())
4709 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4710 elts, number_of_vectors,
4711 permute_results);
4712 vec_cst = permute_results[number_of_vectors - j - 1];
4714 if (!gimple_seq_empty_p (ctor_seq))
4716 if (insert_after)
4718 gimple_stmt_iterator gsi;
4719 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4721 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4722 gsi_insert_seq_before (&gsi, ctor_seq,
4723 GSI_CONTINUE_LINKING);
4725 else if (!stmt_ends_bb_p (insert_after->stmt))
4727 gsi = gsi_for_stmt (insert_after->stmt);
4728 gsi_insert_seq_after (&gsi, ctor_seq,
4729 GSI_CONTINUE_LINKING);
4731 else
4733 /* When we want to insert after a def whose
4734 defining stmt throws, insert on the fallthru
4735 edge. */
4736 edge e = find_fallthru_edge
4737 (gimple_bb (insert_after->stmt)->succs);
4738 basic_block new_bb
4739 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4740 gcc_assert (!new_bb);
4743 else
4744 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4745 ctor_seq = NULL;
4747 voprnds.quick_push (vec_cst);
4748 insert_after = NULL;
4749 number_of_places_left_in_vector = nunits;
4750 constant_p = true;
4751 elts.new_vector (vector_type, nunits, 1);
4752 elts.quick_grow (nunits);
4757 /* Since the vectors are created in reverse order, we have to
4758 reverse them here. */
4759 vec_num = voprnds.length ();
4760 for (j = vec_num; j != 0; j--)
4762 vop = voprnds[j - 1];
4763 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4766 /* In case VF is greater than the unrolling factor needed for the SLP
4767 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
4768 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4769 to replicate the vectors. */
4770 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4771 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4772 i++)
4773 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
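/* A quick check of the replication arithmetic above: with
   GROUP_SIZE == 2, NUNITS == 4 and NUMBER_OF_VECTORS == 2 we get
   NUMBER_OF_COPIES == 4 * 2 / 2 == 4, which fills two vectors of the
   form { s1, s2, s1, s2 }, so the final while loop has nothing left
   to replicate. */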
4776 /* Get the Ith vectorized definition from SLP_NODE. */
4778 tree
4779 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4781 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4782 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4783 else
4784 return SLP_TREE_VEC_DEFS (slp_node)[i];
4787 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4789 void
4790 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4792 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4793 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4795 unsigned j;
4796 gimple *vec_def_stmt;
4797 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4798 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4800 else
4801 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4804 /* Get N vectorized definitions for SLP_NODE. */
4806 void
4807 vect_get_slp_defs (vec_info *,
4808 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4810 if (n == -1U)
4811 n = SLP_TREE_CHILDREN (slp_node).length ();
4813 for (unsigned i = 0; i < n; ++i)
4815 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4816 vec<tree> vec_defs = vNULL;
4817 vect_get_slp_defs (child, &vec_defs);
4818 vec_oprnds->quick_push (vec_defs);
4822 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4823 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4824 permute statements for the SLP node NODE. */
4826 bool
4827 vect_transform_slp_perm_load (vec_info *vinfo,
4828 slp_tree node, vec<tree> dr_chain,
4829 gimple_stmt_iterator *gsi, poly_uint64 vf,
4830 bool analyze_only, unsigned *n_perms)
4832 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4833 int vec_index = 0;
4834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4835 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4836 unsigned int mask_element;
4837 machine_mode mode;
4839 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4840 return false;
4842 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4844 mode = TYPE_MODE (vectype);
4845 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4847 /* Initialize the vect stmts of NODE to properly insert the generated
4848 stmts later. */
4849 if (! analyze_only)
4850 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4851 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4852 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4854 /* Generate permutation masks for every NODE. The number of masks for
4855 each NODE is equal to GROUP_SIZE.
4856 E.g., we have a group of three nodes with three loads from the same
4857 location in each node, and the vector size is 4. I.e., we have an
4858 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4859 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4860 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4863 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4864 The last mask is invalid since we assume two operands for the permute
4865 operation, and the mask element values can't index outside that range.
4866 Hence, the last mask must be converted into {2,5,5,5}.
4867 For the first two permutations we need the first and the second input
4868 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4869 we need the second and the third vectors: {b1,c1,a2,b2} and
4870 {c2,a3,b3,c3}. */
4872 int vect_stmts_counter = 0;
4873 unsigned int index = 0;
4874 int first_vec_index = -1;
4875 int second_vec_index = -1;
4876 bool noop_p = true;
4877 *n_perms = 0;
4879 vec_perm_builder mask;
4880 unsigned int nelts_to_build;
4881 unsigned int nvectors_per_build;
4882 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4883 && multiple_p (nunits, group_size));
4884 if (repeating_p)
4886 /* A single vector contains a whole number of copies of the node, so:
4887 (a) all permutes can use the same mask; and
4888 (b) the permutes only need a single vector input. */
4889 mask.new_vector (nunits, group_size, 3);
4890 nelts_to_build = mask.encoded_nelts ();
4891 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4893 else
4895 /* We need to construct a separate mask for each vector statement. */
4896 unsigned HOST_WIDE_INT const_nunits, const_vf;
4897 if (!nunits.is_constant (&const_nunits)
4898 || !vf.is_constant (&const_vf))
4899 return false;
4900 mask.new_vector (const_nunits, const_nunits, 1);
4901 nelts_to_build = const_vf * group_size;
4902 nvectors_per_build = 1;
4905 unsigned int count = mask.encoded_nelts ();
4906 mask.quick_grow (count);
4907 vec_perm_indices indices;
4909 for (unsigned int j = 0; j < nelts_to_build; j++)
4911 unsigned int iter_num = j / group_size;
4912 unsigned int stmt_num = j % group_size;
4913 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4914 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4915 if (repeating_p)
4917 first_vec_index = 0;
4918 mask_element = i;
4920 else
4922 /* Enforced before the loop when !repeating_p. */
4923 unsigned int const_nunits = nunits.to_constant ();
4924 vec_index = i / const_nunits;
4925 mask_element = i % const_nunits;
4926 if (vec_index == first_vec_index
4927 || first_vec_index == -1)
4929 first_vec_index = vec_index;
4931 else if (vec_index == second_vec_index
4932 || second_vec_index == -1)
4934 second_vec_index = vec_index;
4935 mask_element += const_nunits;
4937 else
4939 if (dump_enabled_p ())
4940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4941 "permutation requires at "
4942 "least three vectors %G",
4943 stmt_info->stmt);
4944 gcc_assert (analyze_only);
4945 return false;
4948 gcc_assert (mask_element < 2 * const_nunits);
4951 if (mask_element != index)
4952 noop_p = false;
4953 mask[index++] = mask_element;
4955 if (index == count && !noop_p)
4957 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4958 if (!can_vec_perm_const_p (mode, indices))
4960 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4963 vect_location,
4964 "unsupported vect permute { ");
4965 for (i = 0; i < count; ++i)
4967 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4968 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4970 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4972 gcc_assert (analyze_only);
4973 return false;
4976 ++*n_perms;
4979 if (index == count)
4981 if (!analyze_only)
4983 tree mask_vec = NULL_TREE;
4985 if (! noop_p)
4986 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4988 if (second_vec_index == -1)
4989 second_vec_index = first_vec_index;
4991 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4993 /* Generate the permute statement if necessary. */
4994 tree first_vec = dr_chain[first_vec_index + ri];
4995 tree second_vec = dr_chain[second_vec_index + ri];
4996 gimple *perm_stmt;
4997 if (! noop_p)
4999 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5000 tree perm_dest
5001 = vect_create_destination_var (gimple_assign_lhs (stmt),
5002 vectype);
5003 perm_dest = make_ssa_name (perm_dest);
5004 perm_stmt
5005 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5006 first_vec, second_vec,
5007 mask_vec);
5008 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5009 gsi);
5011 else
5012 /* If mask was NULL_TREE generate the requested
5013 identity transform. */
5014 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5016 /* Store the vector statement in NODE. */
5017 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5021 index = 0;
5022 first_vec_index = -1;
5023 second_vec_index = -1;
5024 noop_p = true;
5028 return true;
5032 /* Vectorize the SLP permutations in NODE as specified
5033 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5034 child number and lane number.
5035 Interleaving of two two-lane two-child SLP subtrees (not supported):
5036 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5037 A blend of two four-lane two-child SLP subtrees:
5038 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5039 Highpart of a four-lane one-child SLP subtree (not supported):
5040 [ { 0, 2 }, { 0, 3 } ]
5041 Currently only a subset of this is supported by the code generation below. */
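/* For the supported blend above, code generation emits roughly the
   following (an illustrative GIMPLE sketch assuming four-lane inputs):
     _5 = VEC_PERM_EXPR <child0_def_1, child1_def_2, { 0, 5, 2, 7 }>;
   where mask elements 4..7 select lanes from the second input. */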
5043 static bool
5044 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5045 slp_tree node, stmt_vector_for_cost *cost_vec)
5047 tree vectype = SLP_TREE_VECTYPE (node);
5049 /* ??? We currently only support all same vector input and output types
5050 while the SLP IL should really do a concat + select and thus accept
5051 arbitrary mismatches. */
5052 slp_tree child;
5053 unsigned i;
5054 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5055 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "Unsupported lane permutation\n");
5060 return false;
5063 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5064 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5065 if (dump_enabled_p ())
5067 dump_printf_loc (MSG_NOTE, vect_location,
5068 "vectorizing permutation");
5069 for (unsigned i = 0; i < perm.length (); ++i)
5070 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5071 dump_printf (MSG_NOTE, "\n");
5074 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5075 if (!nunits.is_constant ())
5076 return false;
5077 unsigned HOST_WIDE_INT vf = 1;
5078 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5079 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5080 return false;
5081 unsigned olanes = vf * SLP_TREE_LANES (node);
5082 gcc_assert (multiple_p (olanes, nunits));
5084 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5085 from the { SLP operand, scalar lane } permutation as recorded in the
5086 SLP node as an intermediate step. This part should already work
5087 with SLP children with an arbitrary number of lanes. */
5088 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5089 auto_vec<unsigned> active_lane;
5090 vperm.create (olanes);
5091 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5092 for (unsigned i = 0; i < vf; ++i)
5094 for (unsigned pi = 0; pi < perm.length (); ++pi)
5096 std::pair<unsigned, unsigned> p = perm[pi];
5097 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5098 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5099 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5100 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5101 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5103 /* Advance to the next group. */
5104 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5105 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5108 if (dump_enabled_p ())
5110 dump_printf_loc (MSG_NOTE, vect_location, "as");
5111 for (unsigned i = 0; i < vperm.length (); ++i)
5113 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5114 dump_printf (MSG_NOTE, ",");
5115 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5116 vperm[i].first.first, vperm[i].first.second,
5117 vperm[i].second);
5119 dump_printf (MSG_NOTE, "\n");
5122 /* We can only handle two-vector permutes; everything else should
5123 be lowered at the SLP level. The following is closely inspired
5124 by vect_transform_slp_perm_load and is supposed to eventually
5125 replace it.
5126 ??? As an intermediate step, do code-gen in the SLP tree
5127 representation somehow? */
5128 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5129 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5130 unsigned int const_nunits = nunits.to_constant ();
5131 unsigned int index = 0;
5132 unsigned int mask_element;
5133 vec_perm_builder mask;
5134 mask.new_vector (const_nunits, const_nunits, 1);
5135 unsigned int count = mask.encoded_nelts ();
5136 mask.quick_grow (count);
5137 vec_perm_indices indices;
5138 unsigned nperms = 0;
5139 for (unsigned i = 0; i < vperm.length (); ++i)
5141 mask_element = vperm[i].second;
5142 if (first_vec.first == -1U
5143 || first_vec == vperm[i].first)
5144 first_vec = vperm[i].first;
5145 else if (second_vec.first == -1U
5146 || second_vec == vperm[i].first)
5148 second_vec = vperm[i].first;
5149 mask_element += const_nunits;
5151 else
5153 if (dump_enabled_p ())
5154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5155 "permutation requires at "
5156 "least three vectors");
5157 gcc_assert (!gsi);
5158 return false;
5161 mask[index++] = mask_element;
5163 if (index == count)
5165 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5166 const_nunits);
5167 bool identity_p = indices.series_p (0, 1, 0, 1);
5168 if (!identity_p
5169 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5171 if (dump_enabled_p ())
5173 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5174 vect_location,
5175 "unsupported vect permute { ");
5176 for (i = 0; i < count; ++i)
5178 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5179 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5181 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5183 gcc_assert (!gsi);
5184 return false;
5187 if (!identity_p)
5188 nperms++;
5189 if (gsi)
5191 if (second_vec.first == -1U)
5192 second_vec = first_vec;
5194 /* Generate the permute statement if necessary. */
5195 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5196 tree first_def
5197 = vect_get_slp_vect_def (first_node, first_vec.second);
5198 gassign *perm_stmt;
5199 tree perm_dest = make_ssa_name (vectype);
5200 if (!identity_p)
5202 slp_tree second_node
5203 = SLP_TREE_CHILDREN (node)[second_vec.first];
5204 tree second_def
5205 = vect_get_slp_vect_def (second_node, second_vec.second);
5206 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5207 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5208 first_def, second_def,
5209 mask_vec);
5211 else
5212 /* We need a copy here in case the def was external. */
5213 perm_stmt = gimple_build_assign (perm_dest, first_def);
5214 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5215 /* Store the vector statement in NODE. */
5216 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5219 index = 0;
5220 first_vec = std::make_pair (-1U, -1U);
5221 second_vec = std::make_pair (-1U, -1U);
5225 if (!gsi)
5226 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5228 return true;
5231 /* Vectorize SLP NODE. */
5233 static void
5234 vect_schedule_slp_node (vec_info *vinfo,
5235 slp_tree node, slp_instance instance)
5237 gimple_stmt_iterator si;
5238 int i;
5239 slp_tree child;
5241 /* For existing vectors there's nothing to do. */
5242 if (SLP_TREE_VEC_DEFS (node).exists ())
5243 return;
5245 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5247 /* Vectorize externals and constants. */
5248 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5249 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5251 /* ??? vectorizable_shift can end up using a scalar operand which is
5252 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5253 node in this case. */
5254 if (!SLP_TREE_VECTYPE (node))
5255 return;
5257 vect_create_constant_vectors (vinfo, node);
5258 return;
5261 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5263 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5264 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5266 if (dump_enabled_p ())
5267 dump_printf_loc (MSG_NOTE, vect_location,
5268 "------>vectorizing SLP node starting from: %G",
5269 stmt_info->stmt);
5271 if (STMT_VINFO_DATA_REF (stmt_info)
5272 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5274 /* Vectorized loads go before the first scalar load to make it
5275 ready early; vectorized stores go before the last scalar
5276 stmt, which is where all uses are ready. */
5277 stmt_vec_info last_stmt_info = NULL;
5278 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5279 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5280 else /* DR_IS_WRITE */
5281 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5282 si = gsi_for_stmt (last_stmt_info->stmt);
5284 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5285 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5286 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5287 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5289 /* For PHI node vectorization we do not use the insertion iterator. */
5290 si = gsi_none ();
5292 else
5294 /* Emit other stmts after the children's vectorized defs, which is
5295 the earliest position possible. */
5296 gimple *last_stmt = NULL;
5297 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5298 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5300 /* For fold-left reductions we are retaining the scalar
5301 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5302 set so the representation isn't perfect. Resort to the
5303 last scalar def here. */
5304 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5306 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5307 == cycle_phi_info_type);
5308 gphi *phi = as_a <gphi *>
5309 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5310 if (!last_stmt
5311 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5312 last_stmt = phi;
5314 /* We emit all vectorized stmts in the same place, so the
5315 last one emitted is also the last in dominance order.
5316 ??? Unless we have a load permutation applied and that
5317 happens to re-use an earlier generated load. */
5318 unsigned j;
5319 gimple *vstmt;
5320 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5321 if (!last_stmt
5322 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5323 last_stmt = vstmt;
5325 else if (!SLP_TREE_VECTYPE (child))
5327 /* Externals without a vector type are used unvectorized; look at all their scalar defs. */
5328 unsigned j;
5329 tree def;
5330 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5331 if (TREE_CODE (def) == SSA_NAME
5332 && !SSA_NAME_IS_DEFAULT_DEF (def))
5334 gimple *stmt = SSA_NAME_DEF_STMT (def);
5335 if (!last_stmt
5336 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5337 last_stmt = stmt;
5340 else
5342 /* For externals we have to look at all defs since their
5343 insertion place is decided per vector. But beware
5344 of pre-existing vectors where we need to make sure
5345 we do not insert before the region boundary. */
5346 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5347 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5348 last_stmt = gsi_stmt (gsi_after_labels
5349 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5350 else
5352 unsigned j;
5353 tree vdef;
5354 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5355 if (TREE_CODE (vdef) == SSA_NAME
5356 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5358 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5359 if (!last_stmt
5360 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5361 last_stmt = vstmt;
5365 /* This can happen when all children are pre-existing vectors or
5366 constants. */
5367 if (!last_stmt)
5368 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5369 if (is_a <gphi *> (last_stmt))
5370 si = gsi_after_labels (gimple_bb (last_stmt));
5371 else
5373 si = gsi_for_stmt (last_stmt);
5374 gsi_next (&si);
5378 bool done_p = false;
5380 /* Handle purely internal nodes. */
5381 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5383 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
5384 be shared with different SLP nodes (but usually it's the same
5385 operation, apart from the case where the stmt is only there to denote
5386 the actual scalar lane defs ...). So do not call vect_transform_stmt
5387 but open-code it here (partly). */
5388 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5389 gcc_assert (done);
5390 done_p = true;
5392 if (!done_p)
5393 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5396 /* Replace scalar calls from SLP node NODE with assignments setting their
5397 lhs to zero. For loop vectorization this is done in vectorizable_call,
5398 but for SLP it needs to be deferred until the end of vect_schedule_slp,
5399 because multiple SLP instances may refer to the same scalar stmt. */
5401 static void
5402 vect_remove_slp_scalar_calls (vec_info *vinfo,
5403 slp_tree node, hash_set<slp_tree> &visited)
5405 gimple *new_stmt;
5406 gimple_stmt_iterator gsi;
5407 int i;
5408 slp_tree child;
5409 tree lhs;
5410 stmt_vec_info stmt_info;
5412 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5413 return;
5415 if (visited.add (node))
5416 return;
5418 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5419 vect_remove_slp_scalar_calls (vinfo, child, visited);
5421 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5423 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5424 if (!stmt || gimple_bb (stmt) == NULL)
5425 continue;
5426 if (is_pattern_stmt_p (stmt_info)
5427 || !PURE_SLP_STMT (stmt_info))
5428 continue;
5429 lhs = gimple_call_lhs (stmt);
5430 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5431 gsi = gsi_for_stmt (stmt);
5432 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5433 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5437 static void
5438 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5440 hash_set<slp_tree> visited;
5441 vect_remove_slp_scalar_calls (vinfo, node, visited);
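/* For example (a sketch), a scalar call whose computation was fully
   vectorized, say
     x_3 = sqrtf (a_1);
   is rewritten to the trivial assignment
     x_3 = 0.0;
   and later DCE removes it together with any remaining dead uses. */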
5444 /* Vectorize the instance root. */

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
    {
      gimple *child_stmt;
      int j;

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	{
	  tree vect_lhs = gimple_get_lhs (child_stmt);
	  tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	  break;
	}
    }
  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
    {
      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      gimple *child_stmt;
      int j;
      vec<constructor_elt, va_gc> *v;
      vec_alloc (v, nelts);
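
      /* Collect the vector defs into a CONSTRUCTOR of the root's wider
	 vector type.  For illustration (names invented): a 4-element
	 root CTOR x_1 = {a_1, b_2, c_3, d_4} covered by two 2-lane
	 vector defs vect_a and vect_c becomes x_1 = {vect_a, vect_c}.  */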
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	{
	  CONSTRUCTOR_APPEND_ELT (v,
				  NULL_TREE,
				  gimple_get_lhs (child_stmt));
	}
      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
      tree r_constructor = build_constructor (rtype, v);
      rstmt = gimple_build_assign (lhs, r_constructor);
    }

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
  gsi_replace (&rgsi, rstmt, true);
}

struct slp_scc_info
{
  bool on_stack;
  int dfs;
  int lowlink;
};
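
/* The fields above are the classic bookkeeping of Tarjan's SCC algorithm:
   the DFS preorder number, the smallest DFS number reachable from the
   node's DFS subtree (lowlink), and whether the node is still on the
   DFS stack.  */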

/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
		   hash_map<slp_tree, slp_scc_info> &scc_info,
		   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  info->on_stack = true;
  maxdfs++;
  stack.safe_push (node);
  unsigned i;
  slp_tree child;

  /* ???  We're keeping SLP_TREE_CHILDREN of externalized nodes.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      {
	if (!child)
	  continue;
	slp_scc_info *child_info = scc_info.get (child);
	if (!child_info)
	  {
	    vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
	    /* Recursion might have re-allocated the hash map entries,
	       so re-fetch both infos.  */
	    info = scc_info.get (node);
	    child_info = scc_info.get (child);
	    info->lowlink = MIN (info->lowlink, child_info->lowlink);
	  }
	else if (child_info->on_stack)
	  info->lowlink = MIN (info->lowlink, child_info->dfs);
      }
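
  /* Per Tarjan's algorithm NODE is the root of an SCC exactly when its
     lowlink equals its own DFS number; otherwise leave it on the stack
     for the root to process.  */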
  if (info->lowlink != info->dfs)
    return;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }
  /* SCC.  */
  int last_idx = stack.length () - 1;
  while (stack[last_idx] != node)
    last_idx--;
  /* We can break the cycle at PHIs that have at least one child
     code generated.  Then we could re-start the DFS walk until
     all nodes in the SCC are covered (we might have new entries
     for only back-reachable nodes).  But it's simpler to just
     iterate and schedule those that are ready.  */
  auto_vec<slp_tree, 4> phis_to_fixup;
  unsigned todo = stack.length () - last_idx;
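
  /* Repeatedly sweep the SCC members still on the stack and emit every
     node whose children are all code generated; PHIs count as ready
     even with pending on-stack children since their backedge arguments
     are filled in afterwards.  */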
  do
    {
      for (int idx = stack.length () - 1; idx >= last_idx; --idx)
	{
	  slp_tree entry = stack[idx];
	  if (!entry)
	    continue;
	  bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
		      && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
	  bool ready = !phi;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
	    if (!child)
	      {
		gcc_assert (phi);
		ready = true;
		break;
	      }
	    else if (scc_info.get (child)->on_stack)
	      {
		if (!phi)
		  {
		    ready = false;
		    break;
		  }
	      }
	    else
	      {
		if (phi)
		  {
		    ready = true;
		    break;
		  }
	      }
	  if (ready)
	    {
	      vect_schedule_slp_node (vinfo, entry, instance);
	      scc_info.get (entry)->on_stack = false;
	      stack[idx] = NULL;
	      todo--;
	      if (phi)
		phis_to_fixup.safe_push (entry);
	    }
	}
    }
  while (todo != 0);

  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
	{
	  unsigned dest_idx = e->dest_idx;
	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	    continue;
	  /* Simply fill all args.  */
	  for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
			 vect_get_slp_vect_def (child, i),
			 e, gimple_phi_arg_location (phi, dest_idx));
	}
    }

  /* Pop the SCC.  */
  stack.truncate (last_idx);
}

/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
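  /* SCC_INFO is shared across all instances so SLP nodes occurring in
     multiple instances are scheduled only once (see the scc_info.get
     check below).  */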
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Vectorizing SLP tree:\n");
	  if (SLP_INSTANCE_ROOT_STMT (instance))
	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
			     SLP_INSTANCE_ROOT_STMT (instance)->stmt);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (instance));
	}
      /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
	 is the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (SLP_INSTANCE_ROOT_STMT (instance))
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ???  Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ???  For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  */
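      /* Only store instances have data-ref writes as their root scalar
	 stmts; for other roots the loop below exits on the first
	 iteration.  */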
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info)
	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
	    break;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);