/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"

static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);

/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  delete node;
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
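
/* For example, for a group accessing a[0], a[1], a[3] (one element
   skipped), DR_GROUP_GAP of the member for a[3] is 2, so the places
   computed above are 0, 1 and 3, while a stmt from an unrelated chain
   yields -1.  (A sketch of the intended semantics as read from the
   loop above, assuming consecutive members carry a gap of 1.)  */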

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
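
/* As a concrete example (a sketch, assuming four-element vectors):
   fusing lanes gives two inputs A = { a0, a1, a2, a3 } and
   B = { b0, b1, b2, b3 }; indices1 selects { 0, 4, 1, 5 } yielding
   { a0, b0, a1, b1 } and indices2 selects { 2, 6, 3, 7 } yielding
   { a2, b2, a3, b3 }, i.e. the classic interleave-low/interleave-high
   pair that duplicate_and_interleave builds on.  */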

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For a SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}
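
/* To illustrate the cond_expr maps above (a sketch): for
   x = a < b ? p : q, *SWAP == 1 goes through {1, 0, 2, 3} and collects
   the defs as b, a, p, q (comparison operands swapped), while
   *SWAP == 2 goes through {0, 1, 3, 2} and collects a, b, q, p (the
   arms swapped, matching an inverted comparison code).  */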

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
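
/* E.g. for basic-block SLP of a group of four ints with 256-bit
   vectors, the natural choice V8SI has const_nunits 8, which exceeds
   the group size 4; the check above rejects that instead of unrolling
   since there is no loop to unroll in BB vectorization.  */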

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}
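
/* A typical two-operator group (a sketch):

     c[0] = a[0] + b[0];  c[1] = a[1] - b[1];
     c[2] = a[2] + b[2];  c[3] = a[3] - b[3];

   matches with first_stmt_code PLUS_EXPR and alt_stmt_code MINUS_EXPR,
   so *TWO_OPERATORS is set and the caller computes both operations on
   all lanes and blends the results with a lane permutation.  */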

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
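
/* Keyed on the set of scalar stmts, the map above memoizes both
   successful and failed (NULL) SLP builds.  This avoids re-analyzing a
   shared operand set and turns the SLP "tree" into a DAG with shared,
   reference-counted subtrees (see the refcnt handling below).  */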

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP node.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, 0);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		(load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
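
  /* For instance (a sketch), if the group starting at a[0] is used by
     this node in lane order { a[2], a[0], a[3], a[1] }, the computed
     load permutation is { 2, 0, 3, 1 }.  Whether an actual permute is
     needed is only known once the unrolling factor is decided.  */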

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPRs have one operand too many if the condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->ops);
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was not
	     more than a cut-off as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ... */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If we have all children of a non-unary child built up from
     uniform scalars then just throw that away, causing it to be
     built up from scalars instead.  */
  if (nops > 1
      && is_a <bb_vec_info> (vinfo)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      FOR_EACH_VEC_ELT (children, j, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	    || !vect_slp_tree_uniform_p (child))
	  break;
      if (!child)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	child->refcnt++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
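
/* For the PLUS/MINUS example the resulting graph is roughly

     node (VEC_PERM_EXPR, lane permutation { 0[0] 1[1] 0[2] 1[3] })
       one (PLUS_EXPR)   <- the shared operand children
       two (MINUS_EXPR)  <- same children, refcounted again

   i.e. both operations are computed on all lanes and the lane
   permutation selects even lanes from ONE and odd lanes from TWO.  */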

/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}

/* Copy the SLP subtree rooted at NODE.  */

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = new _slp_tree;
  slp_tree copy = copy_ref;
  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
  copy->max_nunits = node->max_nunits;
  copy->refcnt = 0;
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
      for (i = 0; i < group_size; ++i)
	SLP_TREE_LANE_PERMUTATION (node)[i].second
	  = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
    }
}
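
/* Note the scatter semantics above: tmp[permutation[i]] gets element I,
   so PERMUTATION maps each current lane to its new position.  E.g.
   permutation { 2, 0, 3, 1 } moves lane 0 to position 2, lane 1 to
   position 0, and so on.  */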

/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
    return false;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    return false;
  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
	  || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
	return false;
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
	if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */

  /* We have to unshare the SLP tree we modify.  */
  hash_map<slp_tree, slp_tree> map;
  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
  unshared->refcnt++;
  SLP_INSTANCE_TREE (slp_instn) = unshared;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
  node = SLP_INSTANCE_LOADS (slp_instn)[0];

  /* Do the actual re-arrangement.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
	      && DR_GROUP_GAP (first_stmt_info) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
1888 /* Gather the loads in the SLP graph rooted at NODE into the LOADS array. */
1890 static void
1891 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1892 hash_set<slp_tree> &visited)
1894 if (visited.add (node))
1895 return;
1897 if (SLP_TREE_CHILDREN (node).length () == 0)
1899 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1900 return;
1901 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1902 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1903 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1904 loads.safe_push (node);
1906 else
1908 unsigned i;
1909 slp_tree child;
1910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1911 vect_gather_slp_loads (loads, child, visited);
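/* Gather the loads in the SLP graph rooted at NODE into the
SLP_INSTANCE_LOADS array of INST. */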
1915 static void
1916 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1918 hash_set<slp_tree> visited;
1919 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1923 /* Find the last scalar stmt in NODE. */
1925 stmt_vec_info
1926 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1928 stmt_vec_info last = NULL;
1929 stmt_vec_info stmt_vinfo;
1931 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1933 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1934 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1937 return last;
1940 /* Find the first stmt in NODE. */
1942 stmt_vec_info
1943 vect_find_first_scalar_stmt_in_slp (slp_tree node)
1945 stmt_vec_info first = NULL;
1946 stmt_vec_info stmt_vinfo;
1948 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1950 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1951 if (!first
1952 || get_later_stmt (stmt_vinfo, first) == first)
1953 first = stmt_vinfo;
1956 return first;
1959 /* Split a group of stores, currently beginning at FIRST_VINFO, into
1960 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1961 (also containing the first GROUP1_SIZE stmts, since stores are
1962 consecutive), the second containing the remainder.
1963 Return the first stmt in the second group. */
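/* For example, splitting a group of eight stores s0...s7 with
GROUP1_SIZE == 4 yields the groups {s0, s1, s2, s3} and
{s4, s5, s6, s7}, with the DR_GROUP_GAP of each group adjusted so
that it skips over the elements of the other group. */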
1965 static stmt_vec_info
1966 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1968 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1969 gcc_assert (group1_size > 0);
1970 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1971 gcc_assert (group2_size > 0);
1972 DR_GROUP_SIZE (first_vinfo) = group1_size;
1974 stmt_vec_info stmt_info = first_vinfo;
1975 for (unsigned i = group1_size; i > 1; i--)
1977 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1978 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1980 /* STMT is now the last element of the first group. */
1981 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1982 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1984 DR_GROUP_SIZE (group2) = group2_size;
1985 for (stmt_info = group2; stmt_info;
1986 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1988 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1989 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1992 /* For the second group, the DR_GROUP_GAP is that before the original group,
1993 plus skipping over the first vector. */
1994 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1996 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1997 DR_GROUP_GAP (first_vinfo) += group2_size;
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2001 group1_size, group2_size);
2003 return group2;
2006 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2007 statements and a vector of NUNITS elements. */
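/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the least common
multiple is 12, so the unrolling factor is 12 / 6 = 2: the group has
to be unrolled twice before it fills whole vectors. */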
2009 static poly_uint64
2010 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2012 return exact_div (common_multiple (nunits, group_size), group_size);
2015 /* Analyze an SLP instance starting from a group of grouped stores. Call
2016 vect_build_slp_tree to build a tree of packed stmts if possible.
2017 Return FALSE if it's impossible to SLP any stmt in the loop. */
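/* As the dispatch below shows, STMT_INFO can be the first stmt of a
grouped store, the first stmt of a reduction chain, a vector
CONSTRUCTOR assignment, or a representative of the loop reductions. */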
2019 static bool
2020 vect_analyze_slp_instance (vec_info *vinfo,
2021 scalar_stmts_to_slp_tree_map_t *bst_map,
2022 stmt_vec_info stmt_info, unsigned max_tree_size)
2024 slp_instance new_instance;
2025 slp_tree node;
2026 unsigned int group_size;
2027 tree vectype, scalar_type = NULL_TREE;
2028 unsigned int i;
2029 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2030 vec<stmt_vec_info> scalar_stmts;
2031 bool constructor = false;
2033 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2035 scalar_type = TREE_TYPE (DR_REF (dr));
2036 group_size = DR_GROUP_SIZE (stmt_info);
2037 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2039 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2041 gcc_assert (is_a <loop_vec_info> (vinfo));
2042 vectype = STMT_VINFO_VECTYPE (stmt_info);
2043 group_size = REDUC_GROUP_SIZE (stmt_info);
2045 else if (is_gimple_assign (stmt_info->stmt)
2046 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2048 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2049 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2050 constructor = true;
2052 else
2054 gcc_assert (is_a <loop_vec_info> (vinfo));
2055 vectype = STMT_VINFO_VECTYPE (stmt_info);
2056 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2059 if (!vectype)
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "Build SLP failed: unsupported data-type %T\n",
2064 scalar_type);
2066 return false;
2068 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2070 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2071 scalar_stmts.create (group_size);
2072 stmt_vec_info next_info = stmt_info;
2073 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2075 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2076 while (next_info)
2078 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2079 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2082 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2084 /* Collect the reduction stmts and store them in
2085 SLP_TREE_SCALAR_STMTS. */
2086 while (next_info)
2088 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2089 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2091 /* Mark the first element of the reduction chain as reduction to properly
2092 transform the node. In the reduction analysis phase only the last
2093 element of the chain is marked as reduction. */
2094 STMT_VINFO_DEF_TYPE (stmt_info)
2095 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2096 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2097 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2099 else if (constructor)
2101 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2102 tree val;
2103 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2105 if (TREE_CODE (val) == SSA_NAME)
2107 gimple* def = SSA_NAME_DEF_STMT (val);
2108 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2109 /* Value is defined in another basic block. */
2110 if (!def_info)
2111 return false;
2112 def_info = vect_stmt_to_vectorize (def_info);
2113 scalar_stmts.safe_push (def_info);
2115 else
2116 return false;
2118 if (dump_enabled_p ())
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Analyzing vectorizable constructor: %G\n",
2121 stmt_info->stmt);
2123 else
2125 /* Collect reduction statements. */
2126 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2127 for (i = 0; reductions.iterate (i, &next_info); i++)
2128 scalar_stmts.safe_push (next_info);
2131 /* Build the tree for the SLP instance. */
2132 bool *matches = XALLOCAVEC (bool, group_size);
2133 unsigned npermutes = 0;
2134 poly_uint64 max_nunits = nunits;
2135 unsigned tree_size = 0;
2136 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2137 &max_nunits, matches, &npermutes,
2138 &tree_size, bst_map);
2139 if (node != NULL)
2141 /* Calculate the unrolling factor based on the smallest type. */
2142 poly_uint64 unrolling_factor
2143 = calculate_unrolling_factor (max_nunits, group_size);
2145 if (maybe_ne (unrolling_factor, 1U)
2146 && is_a <bb_vec_info> (vinfo))
2148 unsigned HOST_WIDE_INT const_max_nunits;
2149 if (!max_nunits.is_constant (&const_max_nunits)
2150 || const_max_nunits > group_size)
2152 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2154 "Build SLP failed: store group "
2155 "size not a multiple of the vector size "
2156 "in basic block SLP\n");
2157 vect_free_slp_tree (node, false);
2158 return false;
2160 /* Fatal mismatch. */
2161 matches[group_size / const_max_nunits * const_max_nunits] = false;
2162 vect_free_slp_tree (node, false);
2164 else
2166 /* Create a new SLP instance. */
2167 new_instance = XNEW (class _slp_instance);
2168 SLP_INSTANCE_TREE (new_instance) = node;
2169 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2170 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2171 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2173 vect_gather_slp_loads (new_instance, node);
2174 if (dump_enabled_p ())
2175 dump_printf_loc (MSG_NOTE, vect_location,
2176 "SLP size %u vs. limit %u.\n",
2177 tree_size, max_tree_size);
2179 /* Check whether any load is possibly permuted. */
2180 slp_tree load_node;
2181 bool loads_permuted = false;
2182 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2184 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2185 continue;
2186 unsigned j;
2187 stmt_vec_info load_info;
2188 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2189 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2191 loads_permuted = true;
2192 break;
2196 /* If the loads and stores can be handled with load/store-lane
2197 instructions, do not generate this SLP instance. */
2198 if (is_a <loop_vec_info> (vinfo)
2199 && loads_permuted
2200 && dr && vect_store_lanes_supported (vectype, group_size, false))
2202 slp_tree load_node;
2203 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2205 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2206 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2207 /* Use SLP for strided accesses (or if we can't load-lanes). */
2208 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2209 || ! vect_load_lanes_supported
2210 (STMT_VINFO_VECTYPE (stmt_vinfo),
2211 DR_GROUP_SIZE (stmt_vinfo), false))
2212 break;
2214 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2218 "Built SLP cancelled: can use "
2219 "load/store-lanes\n");
2220 vect_free_slp_instance (new_instance, false);
2221 return false;
2225 /* If this is a reduction chain with a conversion in front,
2226 amend the SLP tree with a node for that conversion. */
2227 if (!dr
2228 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2229 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2231 /* Get at the conversion stmt - we know it's the single use
2232 of the last stmt of the reduction chain. */
2233 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2234 use_operand_p use_p;
2235 gimple *use_stmt;
2236 bool r = single_imm_use (gimple_assign_lhs (tem),
2237 &use_p, &use_stmt);
2238 gcc_assert (r);
2239 next_info = vinfo->lookup_stmt (use_stmt);
2240 next_info = vect_stmt_to_vectorize (next_info);
2241 scalar_stmts = vNULL;
2242 scalar_stmts.create (group_size);
2243 for (unsigned i = 0; i < group_size; ++i)
2244 scalar_stmts.quick_push (next_info);
2245 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2246 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2247 SLP_TREE_CHILDREN (conv).quick_push (node);
2248 SLP_INSTANCE_TREE (new_instance) = conv;
2249 /* We also have to fake this conversion stmt as an SLP reduction
2250 group so we don't have to mess with too much code
2251 elsewhere. */
2252 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2253 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2256 vinfo->slp_instances.safe_push (new_instance);
2258 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2259 the number of scalar stmts in the root in a few places.
2260 Verify that assumption holds. */
2261 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2262 .length () == group_size);
2264 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Final SLP tree for instance:\n");
2268 vect_print_slp_graph (MSG_NOTE, vect_location,
2269 SLP_INSTANCE_TREE (new_instance));
2272 return true;
2275 else
2277 /* Failed to SLP. */
2278 /* Free the allocated memory. */
2279 scalar_stmts.release ();
2282 /* For basic block SLP, try to break the group up into multiples of the
2283 vector size. */
2284 unsigned HOST_WIDE_INT const_nunits;
2285 if (is_a <bb_vec_info> (vinfo)
2286 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2287 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2288 && nunits.is_constant (&const_nunits))
2290 /* We consider breaking the group only on VF boundaries from the existing
2291 start. */
2292 for (i = 0; i < group_size; i++)
2293 if (!matches[i]) break;
2295 if (i >= const_nunits && i < group_size)
2297 /* Split into two groups at the first vector boundary before i. */
2298 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2299 unsigned group1_size = i & ~(const_nunits - 1);
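/* E.g. with CONST_NUNITS == 4 and the first mismatch at i == 6 this
rounds down to GROUP1_SIZE == 4, keeping the first whole vector. */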
2301 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2302 group1_size);
2303 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2304 max_tree_size);
2305 /* If the first non-match was in the middle of a vector,
2306 skip the rest of that vector. */
2307 if (group1_size < i)
2309 i = group1_size + const_nunits;
2310 if (i < group_size)
2311 rest = vect_split_slp_store_group (rest, const_nunits);
2313 if (i < group_size)
2314 res |= vect_analyze_slp_instance (vinfo, bst_map,
2315 rest, max_tree_size);
2316 return res;
2318 /* Even though the first vector did not all match, we might be able to SLP
2319 (some) of the remainder. FORNOW ignore this possibility. */
2322 return false;
2326 /* Check if there are stmts in the loop that can be vectorized using SLP.
2327 Build SLP trees of packed scalar stmts if SLP is possible. */
2329 opt_result
2330 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2332 unsigned int i;
2333 stmt_vec_info first_element;
2335 DUMP_VECT_SCOPE ("vect_analyze_slp");
2337 scalar_stmts_to_slp_tree_map_t *bst_map
2338 = new scalar_stmts_to_slp_tree_map_t ();
2340 /* Find SLP sequences starting from groups of grouped stores. */
2341 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2342 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2344 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2346 if (loop_vinfo->reduction_chains.length () > 0)
2348 /* Find SLP sequences starting from reduction chains. */
2349 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2350 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2351 max_tree_size))
2353 /* Dissolve reduction chain group. */
2354 stmt_vec_info chain_vinfo = first_element;
2355 stmt_vec_info last = NULL;
2356 while (chain_vinfo)
2358 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (chain_vinfo);
2359 REDUC_GROUP_FIRST_ELEMENT (chain_vinfo) = NULL;
2360 REDUC_GROUP_NEXT_ELEMENT (chain_vinfo) = NULL;
2361 last = chain_vinfo;
2362 chain_vinfo = next;
2364 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2365 /* It can still be vectorized as part of an SLP reduction. */
2366 loop_vinfo->reductions.safe_push (last);
2370 /* Find SLP sequences starting from groups of reductions. */
2371 if (loop_vinfo->reductions.length () > 1)
2372 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2373 max_tree_size);
2376 /* The map keeps a reference on each SLP node built; release those now. */
2377 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2378 it != bst_map->end (); ++it)
2379 if ((*it).second)
2380 vect_free_slp_tree ((*it).second, false);
2381 delete bst_map;
2383 /* Optimize permutations in SLP reductions. */
2384 slp_instance instance;
2385 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2387 slp_tree node = SLP_INSTANCE_TREE (instance);
2388 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2389 /* Reduction (there are no data-refs in the root).
2390 In a reduction chain the order of the loads is not important. */
2391 if (!STMT_VINFO_DATA_REF (stmt_info)
2392 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2393 vect_attempt_slp_rearrange_stmts (instance);
2396 /* Gather all loads in the SLP graph. */
2397 hash_set<slp_tree> visited;
2398 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2399 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2400 visited);
2402 return opt_result::success ();
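/* Optimize the SLP graph of VINFO: elide load permutations that are
not needed, either because the loads form a subchain of an
interleaving chain (BB vectorization) or because the permutation is
the identity and unrolling exposes no gap (loop vectorization). */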
2405 void
2406 vect_optimize_slp (vec_info *vinfo)
2408 slp_tree node;
2409 unsigned i;
2410 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2412 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2413 continue;
2415 /* In basic block vectorization we allow any subchain of an interleaving
2416 chain.
2417 FORNOW: not in loop SLP because of realignment complications. */
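/* A subchain is a run of consecutive elements, e.g. loading elements
{2, 3} of a four-element interleaving chain; the loop below walks
DR_GROUP_NEXT_ELEMENT to verify the stmts are adjacent with gap 1. */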
2418 if (is_a <bb_vec_info> (vinfo))
2420 bool subchain_p = true;
2421 stmt_vec_info next_load_info = NULL;
2422 stmt_vec_info load_info;
2423 unsigned j;
2424 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2426 if (j != 0
2427 && (next_load_info != load_info
2428 || DR_GROUP_GAP (load_info) != 1))
2430 subchain_p = false;
2431 break;
2433 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2435 if (subchain_p)
2437 SLP_TREE_LOAD_PERMUTATION (node).release ();
2438 continue;
2441 else
2443 stmt_vec_info load_info;
2444 bool this_load_permuted = false;
2445 unsigned j;
2446 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2447 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2449 this_load_permuted = true;
2450 break;
2452 stmt_vec_info first_stmt_info
2453 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2454 if (!this_load_permuted
2455 /* The load requires permutation when unrolling exposes
2456 a gap either because the group is larger than the SLP
2457 group-size or because there is a gap between the groups. */
2458 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2459 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2460 && DR_GROUP_GAP (first_stmt_info) == 0)))
2462 SLP_TREE_LOAD_PERMUTATION (node).release ();
2463 continue;
2470 /* For each possible SLP instance decide whether to SLP it and calculate the
2471 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2472 to SLP at least one instance. */
2474 bool
2475 vect_make_slp_decision (loop_vec_info loop_vinfo)
2477 unsigned int i;
2478 poly_uint64 unrolling_factor = 1;
2479 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2480 slp_instance instance;
2481 int decided_to_slp = 0;
2483 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2485 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2487 /* FORNOW: SLP if you can. */
2488 /* All unroll factors have the form:
2490 GET_MODE_SIZE (vinfo->vector_mode) * X
2492 for some rational X, so they must have a common multiple. */
2493 unrolling_factor
2494 = force_common_multiple (unrolling_factor,
2495 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2497 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2498 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2499 loop-based vectorization. Such stmts will be marked as HYBRID. */
2500 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2501 decided_to_slp++;
2504 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2506 if (decided_to_slp && dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Decided to SLP %d instances. Unrolling factor ",
2510 decided_to_slp);
2511 dump_dec (MSG_NOTE, unrolling_factor);
2512 dump_printf (MSG_NOTE, "\n");
2515 return (decided_to_slp > 0);
2518 /* Private data for vect_detect_hybrid_slp. */
2519 struct vdhs_data
2521 loop_vec_info loop_vinfo;
2522 vec<stmt_vec_info> *worklist;
2525 /* Walker for walk_gimple_op. */
2527 static tree
2528 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2530 walk_stmt_info *wi = (walk_stmt_info *)data;
2531 vdhs_data *dat = (vdhs_data *)wi->info;
2533 if (wi->is_lhs)
2534 return NULL_TREE;
2536 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2537 if (!def_stmt_info)
2538 return NULL_TREE;
2539 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2540 if (PURE_SLP_STMT (def_stmt_info))
2542 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2544 def_stmt_info->stmt);
2545 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2546 dat->worklist->safe_push (def_stmt_info);
2549 return NULL_TREE;
2552 /* Find stmts that must be both vectorized and SLPed. */
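/* A pure SLP stmt becomes "hybrid" when its scalar def is also used
by a stmt that is not part of any SLP instance; such a stmt has to
be code generated by the loop-based vectorizer as well. */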
2554 void
2555 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2557 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2559 /* All stmts participating in SLP are marked pure_slp, all other
2560 stmts are loop_vect.
2561 First collect all loop_vect stmts into a worklist. */
2562 auto_vec<stmt_vec_info> worklist;
2563 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2565 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2566 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2567 gsi_next (&gsi))
2569 gphi *phi = gsi.phi ();
2570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2571 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2572 worklist.safe_push (stmt_info);
2574 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2575 gsi_next (&gsi))
2577 gimple *stmt = gsi_stmt (gsi);
2578 if (is_gimple_debug (stmt))
2579 continue;
2580 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2581 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2583 for (gimple_stmt_iterator gsi2
2584 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2585 !gsi_end_p (gsi2); gsi_next (&gsi2))
2587 stmt_vec_info patt_info
2588 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2589 if (!STMT_SLP_TYPE (patt_info)
2590 && STMT_VINFO_RELEVANT (patt_info))
2591 worklist.safe_push (patt_info);
2593 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2595 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2596 worklist.safe_push (stmt_info);
2600 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2601 mark any SLP vectorized stmt as hybrid.
2602 ??? We're visiting def stmts N times (once for each non-SLP and
2603 once for each hybrid-SLP use). */
2604 walk_stmt_info wi;
2605 vdhs_data dat;
2606 dat.worklist = &worklist;
2607 dat.loop_vinfo = loop_vinfo;
2608 memset (&wi, 0, sizeof (wi));
2609 wi.info = (void *)&dat;
2610 while (!worklist.is_empty ())
2612 stmt_vec_info stmt_info = worklist.pop ();
2613 /* Since SSA operands are not set up for pattern stmts we need
2614 to use walk_gimple_op. */
2615 wi.is_lhs = 0;
2616 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2621 /* Initialize a bb_vec_info struct for the statements between
2622 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2624 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2625 gimple_stmt_iterator region_end_in,
2626 vec_info_shared *shared)
2627 : vec_info (vec_info::bb, init_cost (NULL), shared),
2628 bb (gsi_bb (region_begin_in)),
2629 region_begin (region_begin_in),
2630 region_end (region_end_in)
2632 for (gimple *stmt : this->region_stmts ())
2634 gimple_set_uid (stmt, 0);
2635 if (is_gimple_debug (stmt))
2636 continue;
2637 add_stmt (stmt);
2640 bb->aux = this;
2644 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2645 stmts in the basic block. */
2647 _bb_vec_info::~_bb_vec_info ()
2649 for (gimple *stmt : this->region_stmts ())
2650 /* Reset region marker. */
2651 gimple_set_uid (stmt, -1);
2653 bb->aux = NULL;
2656 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2657 given that child nodes have already been processed, and that
2658 their def types currently match their SLP node's def type. */
2660 static bool
2661 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2662 slp_instance node_instance,
2663 stmt_vector_for_cost *cost_vec)
2665 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2666 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2668 /* Calculate the number of vector statements to be created for the
2669 scalar stmts in this node. For SLP reductions it is equal to the
2670 number of vector statements in the children (which has already been
2671 calculated by the recursive call). Otherwise it is the number of
2672 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2673 VF divided by the number of elements in a vector. */
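/* For instance, with GROUP_SIZE == 4, VF == 8 and vectors of 8
elements this yields (8 * 4) / 8 = 4 vector stmts. */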
2674 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2675 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2677 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2678 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2680 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2681 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2682 break;
2685 else
2687 poly_uint64 vf;
2688 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2689 vf = loop_vinfo->vectorization_factor;
2690 else
2691 vf = 1;
2692 unsigned int group_size = SLP_TREE_LANES (node);
2693 tree vectype = SLP_TREE_VECTYPE (node);
2694 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2695 = vect_get_num_vectors (vf * group_size, vectype);
2698 /* Handle purely internal nodes. */
2699 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2700 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
2702 bool dummy;
2703 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2704 node, node_instance, cost_vec);
2707 /* Try to build NODE from scalars, returning true on success.
2708 NODE_INSTANCE is the SLP instance that contains NODE. */
2710 static bool
2711 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2712 slp_instance node_instance)
2714 stmt_vec_info stmt_info;
2715 unsigned int i;
2717 if (!is_a <bb_vec_info> (vinfo)
2718 || node == SLP_INSTANCE_TREE (node_instance)
2719 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2720 return false;
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "Building vector operands from scalars instead\n");
2726 /* Don't remove and free the child nodes here, since they could be
2727 referenced by other structures. The analysis and scheduling phases
2728 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2729 unsigned int group_size = SLP_TREE_LANES (node);
2730 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2731 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2732 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2734 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2735 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2737 return true;
2740 /* Compute the prologue cost for invariant or constant operands represented
2741 by NODE. */
2743 static void
2744 vect_prologue_cost_for_slp (slp_tree node,
2745 stmt_vector_for_cost *cost_vec)
2747 /* Without looking at the actual initializer a vector of
2748 constants can be implemented as a load from the constant pool.
2749 When all elements are the same we can use a splat. */
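/* E.g. a constant {1, 2, 3, 4} is costed as a vector load, an
external def repeated in every lane as a scalar_to_vec (splat), and
a vector of distinct external defs as a vec_construct. */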
2750 tree vectype = SLP_TREE_VECTYPE (node);
2751 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2752 unsigned num_vects_to_check;
2753 unsigned HOST_WIDE_INT const_nunits;
2754 unsigned nelt_limit;
2755 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2756 && ! multiple_p (const_nunits, group_size))
2758 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2759 nelt_limit = const_nunits;
2761 else
2763 /* If either the vector has variable length or the vectors
2764 are composed of repeated whole groups we only need to
2765 cost construction once. All vectors will be the same. */
2766 num_vects_to_check = 1;
2767 nelt_limit = group_size;
2769 tree elt = NULL_TREE;
2770 unsigned nelt = 0;
2771 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2773 unsigned si = j % group_size;
2774 if (nelt == 0)
2775 elt = SLP_TREE_SCALAR_OPS (node)[si];
2776 /* ??? We're just tracking whether all operands of a single
2777 vector initializer are the same, ideally we'd check if
2778 we emitted the same one already. */
2779 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2780 elt = NULL_TREE;
2781 nelt++;
2782 if (nelt == nelt_limit)
2784 record_stmt_cost (cost_vec, 1,
2785 SLP_TREE_DEF_TYPE (node) == vect_external_def
2786 ? (elt ? scalar_to_vec : vec_construct)
2787 : vector_load,
2788 NULL, vectype, 0, vect_prologue);
2789 nelt = 0;
2794 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2795 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2797 Return true if the operations are supported. */
2799 static bool
2800 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2801 slp_instance node_instance,
2802 hash_set<slp_tree> &visited,
2803 hash_set<slp_tree> &lvisited,
2804 stmt_vector_for_cost *cost_vec)
2806 int i, j;
2807 slp_tree child;
2809 /* Assume we can code-generate all invariants. */
2810 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2811 return true;
2813 /* If we already analyzed the exact same set of scalar stmts we're done.
2814 We share the generated vector stmts for those.
2815 The SLP graph is acyclic, so not caching whether we failed or succeeded
2816 causes no issue: we throw away the lvisited set
2817 when we fail. */
2818 if (visited.contains (node)
2819 || lvisited.add (node))
2820 return true;
2822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2823 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2824 visited, lvisited, cost_vec))
2825 return false;
2827 bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2828 cost_vec);
2830 /* When the node can be vectorized, cost the invariant nodes it references.
2831 This is not done in DFS order so that the referring node's
2832 vectorizable_* calls can nail down the invariant node's vector type
2833 and possibly unshare it if it needs a different vector type than
2834 other referrers. */
2835 if (res)
2836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2837 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2838 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2839 /* Perform usual caching, note code-generation still
2840 code-gens these nodes multiple times but we expect
2841 to CSE them later. */
2842 && !visited.contains (child)
2843 && !lvisited.add (child))
2845 /* ??? After auditing more code paths make a "default"
2846 and push the vector type from NODE to all children
2847 if it is not already set. */
2848 /* Compute the number of vectors to be generated. */
2849 tree vector_type = SLP_TREE_VECTYPE (child);
2850 if (!vector_type)
2852 /* For shifts with a scalar argument we don't need
2853 to cost or code-generate anything.
2854 ??? Represent this more explicitly. */
2855 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
2856 == shift_vec_info_type)
2857 && j == 1);
2858 continue;
2860 unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
2861 poly_uint64 vf = 1;
2862 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2863 vf = loop_vinfo->vectorization_factor;
2864 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2865 = vect_get_num_vectors (vf * group_size, vector_type);
2866 /* And cost them. */
2867 vect_prologue_cost_for_slp (child, cost_vec);
2870 /* If this node can't be vectorized, try pruning the tree here rather
2871 than felling the whole thing. */
2872 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2874 /* We'll need to revisit this for invariant costing and for setting
2875 the number of vectorized stmts. */
2876 lvisited.remove (node);
2877 res = true;
2880 return res;
2884 /* Analyze statements in SLP instances of VINFO. Return true if the
2885 operations are supported. */
2887 bool
2888 vect_slp_analyze_operations (vec_info *vinfo)
2890 slp_instance instance;
2891 int i;
2893 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2895 hash_set<slp_tree> visited;
2896 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2898 hash_set<slp_tree> lvisited;
2899 stmt_vector_for_cost cost_vec;
2900 cost_vec.create (2);
2901 if (!vect_slp_analyze_node_operations (vinfo,
2902 SLP_INSTANCE_TREE (instance),
2903 instance, visited, lvisited,
2904 &cost_vec)
2905 /* Instances with a root stmt require vectorized defs for the
2906 SLP tree root. */
2907 || (SLP_INSTANCE_ROOT_STMT (instance)
2908 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2909 != vect_internal_def)))
2911 slp_tree node = SLP_INSTANCE_TREE (instance);
2912 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2913 if (dump_enabled_p ())
2914 dump_printf_loc (MSG_NOTE, vect_location,
2915 "removing SLP instance operations starting from: %G",
2916 stmt_info->stmt);
2917 vect_free_slp_instance (instance, false);
2918 vinfo->slp_instances.ordered_remove (i);
2919 cost_vec.release ();
2921 else
2923 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2924 x != lvisited.end(); ++x)
2925 visited.add (*x);
2926 i++;
2928 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
2929 cost_vec.release ();
2933 return !vinfo->slp_instances.is_empty ();
2937 /* Compute the scalar cost of the SLP node NODE and its children
2938 and record it in COST_VEC. Do not account defs that are marked in LIFE,
2939 and update LIFE according to uses of NODE. */
2941 static void
2942 vect_bb_slp_scalar_cost (vec_info *vinfo,
2943 slp_tree node, vec<bool, va_heap> *life,
2944 stmt_vector_for_cost *cost_vec,
2945 hash_set<slp_tree> &visited)
2947 unsigned i;
2948 stmt_vec_info stmt_info;
2949 slp_tree child;
2951 if (visited.add (node))
2952 return;
2954 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2956 gimple *stmt = stmt_info->stmt;
2957 ssa_op_iter op_iter;
2958 def_operand_p def_p;
2960 if ((*life)[i])
2961 continue;
2963 /* If there is a non-vectorized use of the defs then the scalar
2964 stmt is kept live, in which case we do not account it or any
2965 required defs in the SLP children in the scalar cost. This
2966 way we make the vectorization more costly when compared to
2967 the scalar cost. */
2968 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2970 imm_use_iterator use_iter;
2971 gimple *use_stmt;
2972 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2973 if (!is_gimple_debug (use_stmt))
2975 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2976 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2978 (*life)[i] = true;
2979 BREAK_FROM_IMM_USE_STMT (use_iter);
2983 if ((*life)[i])
2984 continue;
2986 /* Count scalar stmts only once. */
2987 if (gimple_visited_p (stmt))
2988 continue;
2989 gimple_set_visited (stmt, true);
2991 vect_cost_for_stmt kind;
2992 if (STMT_VINFO_DATA_REF (stmt_info))
2994 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2995 kind = scalar_load;
2996 else
2997 kind = scalar_store;
2999 else if (vect_nop_conversion_p (stmt_info))
3000 continue;
3001 else
3002 kind = scalar_stmt;
3003 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3006 auto_vec<bool, 20> subtree_life;
3007 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3009 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3011 /* Do not directly pass LIFE to the recursive call, copy it to
3012 confine changes in the callee to the current child/subtree. */
3013 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3015 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
3016 for (unsigned j = 0;
3017 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3019 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3020 if (perm.first == i)
3021 subtree_life[perm.second] = (*life)[j];
3024 else
3026 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3027 subtree_life.safe_splice (*life);
3029 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3030 visited);
3031 subtree_life.truncate (0);
3036 /* Check if vectorization of the basic block is profitable. */
3038 static bool
3039 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3041 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3042 slp_instance instance;
3043 int i;
3044 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3045 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3047 /* Calculate scalar cost. */
3048 stmt_vector_for_cost scalar_costs;
3049 scalar_costs.create (0);
3050 hash_set<slp_tree> visited;
3051 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3053 auto_vec<bool, 20> life;
3054 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
3055 vect_bb_slp_scalar_cost (bb_vinfo,
3056 SLP_INSTANCE_TREE (instance),
3057 &life, &scalar_costs, visited);
3059 /* Unset visited flag. */
3060 stmt_info_for_cost *si;
3061 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3062 gimple_set_visited (si->stmt_info->stmt, false);
3064 void *target_cost_data = init_cost (NULL);
3065 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3066 scalar_costs.release ();
3067 unsigned dummy;
3068 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3069 destroy_cost_data (target_cost_data);
3071 /* Complete the target-specific cost calculation. */
3072 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3073 &vec_inside_cost, &vec_epilogue_cost);
3075 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3077 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3080 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3081 vec_inside_cost);
3082 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3083 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3084 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3087 /* Vectorization is profitable if its cost is no more than the cost of the
3088 scalar version. Note that we err on the vector side for equal cost because
3089 the cost estimate is otherwise quite pessimistic (constant uses are
3090 free on the scalar side but cost a load on the vector side, for
3091 example). */
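/* E.g. with vec_inside_cost 6 and vec_outside_cost 2 against a
scalar_cost of 8, 6 + 2 > 8 is false and the block is considered
profitable to vectorize. */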
3092 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3093 return false;
3095 return true;
3098 /* Find any vectorizable constructors and add them to the grouped_stores
3099 array. */
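/* E.g. an assignment like v_1 = {_2, _3, _4, _5}; where the _N are
scalar SSA names is a candidate, while a uniform constructor such as
{_2, _2, _2, _2} is skipped by the uniform_vector_p check. */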
3101 static void
3102 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3104 for (gimple *stmt : bb_vinfo->region_stmts ())
3106 gassign *assign = dyn_cast<gassign *> (stmt);
3107 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3108 continue;
3110 tree rhs = gimple_assign_rhs1 (assign);
3111 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3112 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3113 CONSTRUCTOR_NELTS (rhs))
3114 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3115 || uniform_vector_p (rhs))
3116 continue;
3118 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3119 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3123 /* Check if the region described by BB_VINFO can be vectorized, returning
3124 true if so. When returning false, set FATAL to true if the same failure
3125 would prevent vectorization at other vector sizes, false if it is still
3126 worth trying other sizes. N_STMTS is the number of statements in the
3127 region. */
3129 static bool
3130 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3132 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3134 slp_instance instance;
3135 int i;
3136 poly_uint64 min_vf = 2;
3138 /* The first group of checks is independent of the vector size. */
3139 fatal = true;
3141 /* Analyze the data references. */
3143 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3147 "not vectorized: unhandled data-ref in basic "
3148 "block.\n");
3149 return false;
3152 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3154 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 "not vectorized: unhandled data access in "
3157 "basic block.\n");
3158 return false;
3161 vect_slp_check_for_constructors (bb_vinfo);
3163 /* If there are no grouped stores and no constructors in the region
3164 there is no need to continue with pattern recog as vect_analyze_slp
3165 will fail anyway. */
3166 if (bb_vinfo->grouped_stores.is_empty ())
3168 if (dump_enabled_p ())
3169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3170 "not vectorized: no grouped stores in "
3171 "basic block.\n");
3172 return false;
3175 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
3176 fatal = false;
3178 vect_pattern_recog (bb_vinfo);
3180 /* Check the SLP opportunities in the basic block, analyze and build SLP
3181 trees. */
3182 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3184 if (dump_enabled_p ())
3186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3187 "Failed to SLP the basic block.\n");
3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 "not vectorized: failed to find SLP opportunities "
3190 "in basic block.\n");
3192 return false;
3195 /* Optimize permutations. */
3196 vect_optimize_slp (bb_vinfo);
3198 vect_record_base_alignments (bb_vinfo);
3200 /* Analyze and verify the alignment of data references and the
3201 dependence in the SLP instances. */
3202 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3204 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3205 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3207 slp_tree node = SLP_INSTANCE_TREE (instance);
3208 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_NOTE, vect_location,
3211 "removing SLP instance operations starting from: %G",
3212 stmt_info->stmt);
3213 vect_free_slp_instance (instance, false);
3214 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3215 continue;
3218 /* Mark all the statements that we want to vectorize as pure SLP and
3219 relevant. */
3220 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3221 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3222 if (SLP_INSTANCE_ROOT_STMT (instance))
3223 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3225 i++;
3227 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3228 return false;
3230 if (!vect_slp_analyze_operations (bb_vinfo))
3232 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3234 "not vectorized: bad operation in basic block.\n");
3235 return false;
3238 /* Cost model: check if the vectorization is worthwhile. */
3239 if (!unlimited_cost_model (NULL)
3240 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3244 "not vectorized: vectorization is not "
3245 "profitable.\n");
3246 return false;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "Basic block will be vectorized using SLP\n");
3252 return true;
3255 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3256 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3257 on success. The region has N_STMTS statements and has the datarefs
3258 given by DATAREFS. */
3260 static bool
3261 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3262 gimple_stmt_iterator region_end,
3263 vec<data_reference_p> datarefs,
3264 unsigned int n_stmts)
3266 bb_vec_info bb_vinfo;
3267 auto_vector_modes vector_modes;
3269 /* Autodetect the first vector mode to try. */
3270 machine_mode next_vector_mode = VOIDmode;
3271 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3272 unsigned int mode_i = 0;
3274 vec_info_shared shared;
3276 machine_mode autodetected_vector_mode = VOIDmode;
3277 while (1)
3279 bool vectorized = false;
3280 bool fatal = false;
3281 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3283 bool first_time_p = shared.datarefs.is_empty ();
3284 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3285 if (first_time_p)
3286 bb_vinfo->shared->save_datarefs ();
3287 else
3288 bb_vinfo->shared->check_datarefs ();
3289 bb_vinfo->vector_mode = next_vector_mode;
3291 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3292 && dbg_cnt (vect_slp))
3294 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "***** Analysis succeeded with vector mode"
3298 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3299 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3302 bb_vinfo->shared->check_datarefs ();
3303 vect_schedule_slp (bb_vinfo);
3305 unsigned HOST_WIDE_INT bytes;
3306 if (dump_enabled_p ())
3308 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3309 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3310 "basic block part vectorized using %wu byte "
3311 "vectors\n", bytes);
3312 else
3313 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3314 "basic block part vectorized using variable "
3315 "length vectors\n");
3318 vectorized = true;
3320 else
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE, vect_location,
3324 "***** Analysis failed with vector mode %s\n",
3325 GET_MODE_NAME (bb_vinfo->vector_mode));
3328 if (mode_i == 0)
3329 autodetected_vector_mode = bb_vinfo->vector_mode;
3331 if (!fatal)
3332 while (mode_i < vector_modes.length ()
3333 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3335 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE, vect_location,
3337 "***** The result for vector mode %s would"
3338 " be the same\n",
3339 GET_MODE_NAME (vector_modes[mode_i]));
3340 mode_i += 1;
3343 delete bb_vinfo;
3345 if (mode_i < vector_modes.length ()
3346 && VECTOR_MODE_P (autodetected_vector_mode)
3347 && (related_vector_mode (vector_modes[mode_i],
3348 GET_MODE_INNER (autodetected_vector_mode))
3349 == autodetected_vector_mode)
3350 && (related_vector_mode (autodetected_vector_mode,
3351 GET_MODE_INNER (vector_modes[mode_i]))
3352 == vector_modes[mode_i]))
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "***** Skipping vector mode %s, which would"
3357 " repeat the analysis for %s\n",
3358 GET_MODE_NAME (vector_modes[mode_i]),
3359 GET_MODE_NAME (autodetected_vector_mode));
3360 mode_i += 1;
3363 if (vectorized
3364 || mode_i == vector_modes.length ()
3365 || autodetected_vector_mode == VOIDmode
3366 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3367 vector sizes will fail do not bother iterating. */
3368 || fatal)
3369 return vectorized;
3371 /* Try the next biggest vector size. */
3372 next_vector_mode = vector_modes[mode_i++];
3373 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "***** Re-trying analysis with vector mode %s\n",
3376 GET_MODE_NAME (next_vector_mode));
3380 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
3381 true if anything in the basic block was vectorized. */
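/* The block is scanned stmt by stmt; whenever
vect_find_stmt_data_reference fails the current region ends, the
vectorizer is run on the stmts gathered so far, and scanning resumes
after the offending stmt. */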
3383 bool
3384 vect_slp_bb (basic_block bb)
3386 gimple_stmt_iterator gsi;
3387 bool any_vectorized = false;
3389 gsi = gsi_after_labels (bb);
3390 while (!gsi_end_p (gsi))
3392 gimple_stmt_iterator region_begin = gsi;
3393 vec<data_reference_p> datarefs = vNULL;
3394 int insns = 0;
3396 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3398 gimple *stmt = gsi_stmt (gsi);
3399 if (is_gimple_debug (stmt))
3401 /* Skip leading debug stmts. */
3402 if (gsi_stmt (region_begin) == stmt)
3403 gsi_next (&region_begin);
3404 continue;
3406 insns++;
3408 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3409 vect_location = stmt;
3411 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3412 break;
3414 if (gsi_end_p (region_begin))
3415 break;
3417 /* Skip leading unhandled stmts. */
3418 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3420 gsi_next (&gsi);
3421 continue;
3424 gimple_stmt_iterator region_end = gsi;
3426 if (insns > param_slp_max_insns_in_bb)
3428 if (dump_enabled_p ())
3429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3430 "not vectorized: too many instructions in "
3431 "basic block.\n");
3433 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3434 any_vectorized = true;
3436 if (gsi_end_p (region_end))
3437 break;
3439 /* Skip the unhandled stmt. */
3440 gsi_next (&gsi);
3443 return any_vectorized;
3447 /* Build a variable-length vector in which the elements in ELTS are repeated
3448 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3449 RESULTS and add any new instructions to SEQ.
3451 The approach we use is:
3453 (1) Find a vector mode VM with integer elements of mode IM.
3455 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3456 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3457 from small vectors to IM.
3459 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3461 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3462 correct byte contents.
3464 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3466 We try to find the largest IM for which this sequence works, in order
3467 to cut down on the number of interleaves. */
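/* For example, with ELTS {a, b, c, d} and four pieces the splats
{a, a, ...}, {b, b, ...}, {c, c, ...} and {d, d, ...} are interleaved
in two rounds, first into {a, c, a, c, ...} and {b, d, b, d, ...},
then into the repeating pattern {a, b, c, d, a, b, c, d, ...}. */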
3469 void
3470 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3471 vec<tree> elts, unsigned int nresults,
3472 vec<tree> &results)
3474 unsigned int nelts = elts.length ();
3475 tree element_type = TREE_TYPE (vector_type);
3477 /* (1) Find a vector mode VM with integer elements of mode IM. */
3478 unsigned int nvectors = 1;
3479 tree new_vector_type;
3480 tree permutes[2];
3481 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3482 &nvectors, &new_vector_type,
3483 permutes))
3484 gcc_unreachable ();
3486 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3487 unsigned int partial_nelts = nelts / nvectors;
3488 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3490 tree_vector_builder partial_elts;
3491 auto_vec<tree, 32> pieces (nvectors * 2);
3492 pieces.quick_grow (nvectors * 2);
3493 for (unsigned int i = 0; i < nvectors; ++i)
3495 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3496 ELTS' has mode IM. */
3497 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3498 for (unsigned int j = 0; j < partial_nelts; ++j)
3499 partial_elts.quick_push (elts[i * partial_nelts + j]);
3500 tree t = gimple_build_vector (seq, &partial_elts);
3501 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3502 TREE_TYPE (new_vector_type), t);
3504 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3505 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3508 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3509 correct byte contents.
3511 We need to repeat the following operation log2(nvectors) times:
3513 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3514 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3516 However, if each input repeats every N elements and the VF is
3517 a multiple of N * 2, the HI result is the same as the LO. */
3518 unsigned int in_start = 0;
3519 unsigned int out_start = nvectors;
3520 unsigned int hi_start = nvectors / 2;
3521 /* A bound on the number of outputs needed to produce NRESULTS results
3522 in the final iteration. */
3523 unsigned int noutputs_bound = nvectors * nresults;
3524 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3526 noutputs_bound /= 2;
3527 unsigned int limit = MIN (noutputs_bound, nvectors);
3528 for (unsigned int i = 0; i < limit; ++i)
3530 if ((i & 1) != 0
3531 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3532 2 * in_repeat))
3534 pieces[out_start + i] = pieces[out_start + i - 1];
3535 continue;
3538 tree output = make_ssa_name (new_vector_type);
3539 tree input1 = pieces[in_start + (i / 2)];
3540 tree input2 = pieces[in_start + (i / 2) + hi_start];
3541 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3542 input1, input2,
3543 permutes[i & 1]);
3544 gimple_seq_add_stmt (seq, stmt);
3545 pieces[out_start + i] = output;
3547 std::swap (in_start, out_start);
3550 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3551 results.reserve (nresults);
3552 for (unsigned int i = 0; i < nresults; ++i)
3553 if (i < nvectors)
3554 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3555 pieces[in_start + i]));
3556 else
3557 results.quick_push (results[i - nvectors]);
3561 /* For constant and loop invariant defs in OP_NODE this function creates
3562 vector defs that will be used in the vectorized stmts and stores them
3563 to SLP_TREE_VEC_DEFS of OP_NODE. */
3565 static void
3566 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3568 unsigned HOST_WIDE_INT nunits;
3569 tree vec_cst;
3570 unsigned j, number_of_places_left_in_vector;
3571 tree vector_type;
3572 tree vop;
3573 int group_size = op_node->ops.length ();
3574 unsigned int vec_num, i;
3575 unsigned number_of_copies = 1;
3576 bool constant_p;
3577 gimple_seq ctor_seq = NULL;
3578 auto_vec<tree, 16> permute_results;
3580 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3581 vector_type = SLP_TREE_VECTYPE (op_node);
3583 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3584 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3585 auto_vec<tree> voprnds (number_of_vectors);
3587 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3588 created vectors. It is greater than 1 if unrolling is performed.
3590 For example, we have two scalar operands, s1 and s2 (e.g., group of
3591 strided accesses of size two), while NUNITS is four (i.e., four scalars
3592 of this type can be packed in a vector). The output vector will contain
3593 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3594 will be 2).
3596 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3597 containing the operands.
3599 For example, NUNITS is four as before, and the group size is 8
3600 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3601 {s5, s6, s7, s8}. */
3603 /* When using duplicate_and_interleave, we just need one element for
3604 each scalar statement. */
3605 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3606 nunits = group_size;
3608 number_of_copies = nunits * number_of_vectors / group_size;
3610 number_of_places_left_in_vector = nunits;
3611 constant_p = true;
3612 tree_vector_builder elts (vector_type, nunits, 1);
3613 elts.quick_grow (nunits);
3614 stmt_vec_info insert_after = NULL;
3615 for (j = 0; j < number_of_copies; j++)
3617 tree op;
3618 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3620 /* Create 'vect_ = {op0,op1,...,opn}'. */
3621 number_of_places_left_in_vector--;
3622 tree orig_op = op;
3623 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3625 if (CONSTANT_CLASS_P (op))
3627 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3629 /* Can't use VIEW_CONVERT_EXPR for booleans because
3630 of possibly different sizes of scalar value and
3631 vector element. */
3632 if (integer_zerop (op))
3633 op = build_int_cst (TREE_TYPE (vector_type), 0);
3634 else if (integer_onep (op))
3635 op = build_all_ones_cst (TREE_TYPE (vector_type));
3636 else
3637 gcc_unreachable ();
3639 else
3640 op = fold_unary (VIEW_CONVERT_EXPR,
3641 TREE_TYPE (vector_type), op);
3642 gcc_assert (op && CONSTANT_CLASS_P (op));
3644 else
3646 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3647 gimple *init_stmt;
3648 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3650 tree true_val
3651 = build_all_ones_cst (TREE_TYPE (vector_type));
3652 tree false_val
3653 = build_zero_cst (TREE_TYPE (vector_type));
3654 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3655 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3656 op, true_val,
3657 false_val);
3659 else
3661 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3662 op);
3663 init_stmt
3664 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3665 op);
3667 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3668 op = new_temp;
3671 elts[number_of_places_left_in_vector] = op;
3672 if (!CONSTANT_CLASS_P (op))
3673 constant_p = false;
3674 /* For BB vectorization we have to compute an insert location
3675 when a def is inside the analyzed region since we cannot
3676 simply insert at the BB start in this case. */
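/* E.g. (hypothetical): if the ops are {a_1, 7} and a_1 is defined by a
   stmt inside the analyzed BB region, the constructed vector must be
   emitted after a_1's definition; INSERT_AFTER tracks the latest such
   def via get_later_stmt below. */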
3677 stmt_vec_info opdef;
3678 if (TREE_CODE (orig_op) == SSA_NAME
3679 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3680 && is_a <bb_vec_info> (vinfo)
3681 && (opdef = vinfo->lookup_def (orig_op)))
3683 if (!insert_after)
3684 insert_after = opdef;
3685 else
3686 insert_after = get_later_stmt (insert_after, opdef);
3689 if (number_of_places_left_in_vector == 0)
3691 if (constant_p
3692 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3693 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3694 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3695 else
3697 if (permute_results.is_empty ())
3698 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3699 elts, number_of_vectors,
3700 permute_results);
3701 vec_cst = permute_results[number_of_vectors - j - 1];
3703 tree init;
3704 if (insert_after)
3706 gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
3707 /* vect_init_vector inserts before. */
3708 gsi_next (&gsi);
3709 init = vect_init_vector (vinfo, NULL, vec_cst,
3710 vector_type, &gsi);
3712 else
3713 init = vect_init_vector (vinfo, NULL, vec_cst,
3714 vector_type, NULL);
3715 if (ctor_seq != NULL)
3717 gimple_stmt_iterator gsi
3718 = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3719 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3720 ctor_seq = NULL;
3722 voprnds.quick_push (init);
3723 insert_after = NULL;
3724 number_of_places_left_in_vector = nunits;
3725 constant_p = true;
3726 elts.new_vector (vector_type, nunits, 1);
3727 elts.quick_grow (nunits);
3732 /* Since the vectors were created in reverse order, invert
3733 them now. */
3734 vec_num = voprnds.length ();
3735 for (j = vec_num; j != 0; j--)
3737 vop = voprnds[j - 1];
3738 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3741 /* If VF is greater than the unrolling factor needed for the SLP
3742 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3743 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3744 to replicate the vectors. */
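/* E.g. (illustrative): if two defs {v0, v1} were created but four
   vectors are required, the loop below extends SLP_TREE_VEC_DEFS to
   {v0, v1, v0, v1}. */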
3745 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
3746 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
3747 i++)
3748 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3751 /* Get the Ith vectorized definition from SLP_NODE. */
3753 tree
3754 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
3756 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
3757 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
3758 else
3759 return SLP_TREE_VEC_DEFS (slp_node)[i];
3762 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3764 void
3765 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
3767 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
3768 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
3770 unsigned j;
3771 gimple *vec_def_stmt;
3772 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
3773 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
3775 else
3776 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
3779 /* Get N vectorized definitions for SLP_NODE. */
3781 void
3782 vect_get_slp_defs (vec_info *,
3783 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3785 if (n == -1U)
3786 n = SLP_TREE_CHILDREN (slp_node).length ();
3788 for (unsigned i = 0; i < n; ++i)
3790 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3791 vec<tree> vec_defs = vNULL;
3792 vect_get_slp_defs (child, &vec_defs);
3793 vec_oprnds->quick_push (vec_defs);
3797 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3798 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3799 permute statements for the SLP node NODE. */
3801 bool
3802 vect_transform_slp_perm_load (vec_info *vinfo,
3803 slp_tree node, vec<tree> dr_chain,
3804 gimple_stmt_iterator *gsi, poly_uint64 vf,
3805 bool analyze_only, unsigned *n_perms)
3807 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3808 int vec_index = 0;
3809 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3810 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3811 unsigned int mask_element;
3812 machine_mode mode;
3814 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3815 return false;
3817 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3819 mode = TYPE_MODE (vectype);
3820 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3822 /* Pre-fill the vect stmts of NODE so that the generated stmts can be
3823 stored at their proper positions later. */
3824 if (! analyze_only)
3825 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3826 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3827 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3829 /* Generate permutation masks for every NODE. The number of masks for
3830 each NODE is equal to GROUP_SIZE.
3831 E.g., we have a group of three nodes with three loads from the same
3832 location in each node, and the vector size is 4. I.e., we have a
3833 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3834 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3835 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3838 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3839 The last mask is illegal since we assume two operands for the permute
3840 operation, and the mask element values can't be outside that range.
3841 Hence, the last mask must be converted into {2,5,5,5}.
3842 For the first two permutations we need the first and the second input
3843 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3844 we need the second and the third vectors: {b1,c1,a2,b2} and
3845 {c2,a3,b3,c3}. */
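/* A worked check of the conversion above (not from the original
   sources): the a's occupy indices 0,3,6,9 of the flat sequence, so the
   third output vector a2a3a3a3 selects {6,9,9,9}. With NUNITS == 4,
   index 6 lands in input vector 1 and index 9 in input vector 2, so
   relative to the pair {vec1, vec2} the elements become 6 % 4 == 2 and
   9 % 4 + 4 == 5, i.e. the converted mask {2,5,5,5} quoted above. */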
3847 int vect_stmts_counter = 0;
3848 unsigned int index = 0;
3849 int first_vec_index = -1;
3850 int second_vec_index = -1;
3851 bool noop_p = true;
3852 *n_perms = 0;
3854 vec_perm_builder mask;
3855 unsigned int nelts_to_build;
3856 unsigned int nvectors_per_build;
3857 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3858 && multiple_p (nunits, group_size));
3859 if (repeating_p)
3861 /* A single vector contains a whole number of copies of the node, so:
3862 (a) all permutes can use the same mask; and
3863 (b) the permutes only need a single vector input. */
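/* E.g. (hypothetical): for a load permutation {1,0} with GROUP_SIZE ==
   DR_GROUP_SIZE == 2 and V8HI, every vector uses the same single-input
   mask {1,0,3,2,5,4,7,6}, built below from GROUP_SIZE patterns with
   three stepped elements each. */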
3864 mask.new_vector (nunits, group_size, 3);
3865 nelts_to_build = mask.encoded_nelts ();
3866 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3868 else
3870 /* We need to construct a separate mask for each vector statement. */
3871 unsigned HOST_WIDE_INT const_nunits, const_vf;
3872 if (!nunits.is_constant (&const_nunits)
3873 || !vf.is_constant (&const_vf))
3874 return false;
3875 mask.new_vector (const_nunits, const_nunits, 1);
3876 nelts_to_build = const_vf * group_size;
3877 nvectors_per_build = 1;
3880 unsigned int count = mask.encoded_nelts ();
3881 mask.quick_grow (count);
3882 vec_perm_indices indices;
3884 for (unsigned int j = 0; j < nelts_to_build; j++)
3886 unsigned int iter_num = j / group_size;
3887 unsigned int stmt_num = j % group_size;
3888 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3889 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3890 if (repeating_p)
3892 first_vec_index = 0;
3893 mask_element = i;
3895 else
3897 /* Enforced before the loop when !repeating_p. */
3898 unsigned int const_nunits = nunits.to_constant ();
3899 vec_index = i / const_nunits;
3900 mask_element = i % const_nunits;
3901 if (vec_index == first_vec_index
3902 || first_vec_index == -1)
3904 first_vec_index = vec_index;
3906 else if (vec_index == second_vec_index
3907 || second_vec_index == -1)
3909 second_vec_index = vec_index;
3910 mask_element += const_nunits;
3912 else
3914 if (dump_enabled_p ())
3915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3916 "permutation requires at "
3917 "least three vectors %G",
3918 stmt_info->stmt);
3919 gcc_assert (analyze_only);
3920 return false;
3923 gcc_assert (mask_element < 2 * const_nunits);
3926 if (mask_element != index)
3927 noop_p = false;
3928 mask[index++] = mask_element;
3930 if (index == count && !noop_p)
3932 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3933 if (!can_vec_perm_const_p (mode, indices))
3935 if (dump_enabled_p ())
3937 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3938 vect_location,
3939 "unsupported vect permute { ");
3940 for (i = 0; i < count; ++i)
3942 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3943 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3945 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3947 gcc_assert (analyze_only);
3948 return false;
3951 ++*n_perms;
3954 if (index == count)
3956 if (!analyze_only)
3958 tree mask_vec = NULL_TREE;
3960 if (! noop_p)
3961 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3963 if (second_vec_index == -1)
3964 second_vec_index = first_vec_index;
3966 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3968 /* Generate the permute statement if necessary. */
3969 tree first_vec = dr_chain[first_vec_index + ri];
3970 tree second_vec = dr_chain[second_vec_index + ri];
3971 gimple *perm_stmt;
3972 if (! noop_p)
3974 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3975 tree perm_dest
3976 = vect_create_destination_var (gimple_assign_lhs (stmt),
3977 vectype);
3978 perm_dest = make_ssa_name (perm_dest);
3979 perm_stmt
3980 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3981 first_vec, second_vec,
3982 mask_vec);
3983 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
3984 gsi);
3986 else
3987 /* If the mask was NULL_TREE, generate the requested
3988 identity transform. */
3989 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
3991 /* Store the vector statement in NODE. */
3992 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
3996 index = 0;
3997 first_vec_index = -1;
3998 second_vec_index = -1;
3999 noop_p = true;
4003 return true;
4007 /* Vectorize the SLP permutations in NODE as specified
4008 in SLP_TREE_LANE_PERMUTATION, which is a vector of pairs of SLP
4009 child number and lane number.
4010 Interleaving of two two-lane two-child SLP subtrees (not supported):
4011 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4012 A blend of two four-lane two-child SLP subtrees:
4013 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4014 Highpart of a four-lane one-child SLP subtree (not supported):
4015 [ { 0, 2 }, { 0, 3 } ]
4016 Currently only a subset of these is supported by the code generated below. */
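/* A worked sketch of the supported blend above: with V4SI children the
   loop below yields the two-input mask {0,5,2,7}; lanes 0 and 2 come
   from the first input vector and lanes 1 and 3 from lanes 1 and 3 of
   the second one, offset by NUNITS == 4. */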
4018 static bool
4019 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4020 slp_tree node, stmt_vector_for_cost *cost_vec)
4022 tree vectype = SLP_TREE_VECTYPE (node);
4024 /* ??? We currently only support the case where all vector input and
4025 output types are the same, while the SLP IL should really do a
4026 concat + select and thus accept arbitrary mismatches. */
4027 slp_tree child;
4028 unsigned i;
4029 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4030 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4032 if (dump_enabled_p ())
4033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4034 "Unsupported lane permutation\n");
4035 return false;
4038 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4039 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4040 if (dump_enabled_p ())
4042 dump_printf_loc (MSG_NOTE, vect_location,
4043 "vectorizing permutation");
4044 for (unsigned i = 0; i < perm.length (); ++i)
4045 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4046 dump_printf (MSG_NOTE, "\n");
4049 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4050 if (!nunits.is_constant ())
4051 return false;
4052 unsigned HOST_WIDE_INT vf = 1;
4053 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4054 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4055 return false;
4056 unsigned olanes = vf * SLP_TREE_LANES (node);
4057 gcc_assert (multiple_p (olanes, nunits));
4059 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
4060 from the { SLP operand, scalar lane } permutation as recorded in the
4061 SLP node as an intermediate step. This part should already work
4062 with SLP children with an arbitrary number of lanes. */
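/* E.g. (hypothetical numbers): for a child with V4SI vectype, scalar
   lane p.second == 5 with ACTIVE_LANE == 0 maps to vector index
   vi == 5 / 4 == 1 and vector lane vl == 5 % 4 == 1 below. */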
4063 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4064 auto_vec<unsigned> active_lane;
4065 vperm.create (olanes);
4066 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
4067 for (unsigned i = 0; i < vf; ++i)
4069 for (unsigned pi = 0; pi < perm.length (); ++pi)
4071 std::pair<unsigned, unsigned> p = perm[pi];
4072 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4073 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4074 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4075 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4076 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4078 /* Advance to the next group. */
4079 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4080 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4083 if (dump_enabled_p ())
4085 dump_printf_loc (MSG_NOTE, vect_location, "as");
4086 for (unsigned i = 0; i < vperm.length (); ++i)
4088 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4089 dump_printf (MSG_NOTE, ",");
4090 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4091 vperm[i].first.first, vperm[i].first.second,
4092 vperm[i].second);
4094 dump_printf (MSG_NOTE, "\n");
4097 /* We can only handle two-vector permutes; everything else should
4098 be lowered on the SLP level. The following is closely inspired
4099 by vect_transform_slp_perm_load and is supposed to eventually
4100 replace it.
4101 ??? As an intermediate step, do code-gen in the SLP tree representation
4102 somehow? */
4103 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4104 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4105 unsigned int const_nunits = nunits.to_constant ();
4106 unsigned int index = 0;
4107 unsigned int mask_element;
4108 vec_perm_builder mask;
4109 mask.new_vector (const_nunits, const_nunits, 1);
4110 unsigned int count = mask.encoded_nelts ();
4111 mask.quick_grow (count);
4112 vec_perm_indices indices;
4113 unsigned nperms = 0;
4114 for (unsigned i = 0; i < vperm.length (); ++i)
4116 mask_element = vperm[i].second;
4117 if (first_vec.first == -1U
4118 || first_vec == vperm[i].first)
4119 first_vec = vperm[i].first;
4120 else if (second_vec.first == -1U
4121 || second_vec == vperm[i].first)
4123 second_vec = vperm[i].first;
4124 mask_element += const_nunits;
4126 else
4128 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130 "permutation requires at "
4131 "least three vectors");
4132 gcc_assert (!gsi);
4133 return false;
4136 mask[index++] = mask_element;
4138 if (index == count)
4140 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4141 const_nunits);
4142 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4144 if (dump_enabled_p ())
4146 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4147 vect_location,
4148 "unsupported vect permute { ");
4149 for (i = 0; i < count; ++i)
4151 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4152 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4154 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4156 gcc_assert (!gsi);
4157 return false;
4160 nperms++;
4161 if (gsi)
4163 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4165 if (second_vec.first == -1U)
4166 second_vec = first_vec;
4168 /* Generate the permute statement if necessary. */
4169 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4170 tree first_def
4171 = vect_get_slp_vect_def (first_node, first_vec.second);
4172 slp_tree second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
4173 tree second_def
4174 = vect_get_slp_vect_def (second_node, second_vec.second);
4175 tree perm_dest = make_ssa_name (vectype);
4176 gassign *perm_stmt
4177 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4178 first_def, second_def,
4179 mask_vec);
4180 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4181 /* Store the vector statement in NODE. */
4182 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4185 index = 0;
4186 first_vec = std::make_pair (-1U, -1U);
4187 second_vec = std::make_pair (-1U, -1U);
4191 if (!gsi)
4192 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4194 return true;
4197 /* Vectorize SLP instance tree in postorder. */
4199 static void
4200 vect_schedule_slp_instance (vec_info *vinfo,
4201 slp_tree node, slp_instance instance)
4203 gimple_stmt_iterator si;
4204 int i;
4205 slp_tree child;
4207 /* See if we have already vectorized the node in the graph of the
4208 SLP instance. */
4209 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4210 && SLP_TREE_VEC_STMTS (node).exists ())
4211 || SLP_TREE_VEC_DEFS (node).exists ())
4212 return;
4214 /* Vectorize externals and constants. */
4215 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4216 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4218 /* ??? vectorizable_shift can end up using a scalar operand which is
4219 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4220 node in this case. */
4221 if (!SLP_TREE_VECTYPE (node))
4222 return;
4224 vect_create_constant_vectors (vinfo, node);
4225 return;
4228 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4229 vect_schedule_slp_instance (vinfo, child, instance);
4231 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4232 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4234 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4235 if (dump_enabled_p ())
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "------>vectorizing SLP node starting from: %G",
4238 stmt_info->stmt);
4240 if (STMT_VINFO_DATA_REF (stmt_info)
4241 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4243 /* Vectorized loads go before the first scalar load to make the result
4244 available early; vectorized stores go before the last scalar
4245 stmt, which is where all uses are ready. */
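/* E.g. (illustrative): for the scalar store group a[0] = x_1; a[1] = y_2;
   the vector store is emitted before a[1] = y_2, the point where both
   x_1 and y_2 are available. */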
4246 stmt_vec_info last_stmt_info = NULL;
4247 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4248 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4249 else /* DR_IS_WRITE */
4250 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4251 si = gsi_for_stmt (last_stmt_info->stmt);
4253 else if (SLP_TREE_CHILDREN (node).is_empty ())
4255 /* This happens for reduction and induction PHIs where we do not use the
4256 insertion iterator. */
4257 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4258 == cycle_phi_info_type
4259 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4260 == induc_vec_info_type));
4261 si = gsi_none ();
4263 else
4265 /* Emit other stmts after the children's vectorized defs, which is the
4266 earliest position possible. */
4267 gimple *last_stmt = NULL;
4268 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4269 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4271 /* For fold-left reductions we are retaining the scalar
4272 reduction PHI, but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
4273 set, so the representation isn't perfect. Resort to the
4274 last scalar def here. */
4275 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4277 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4278 == cycle_phi_info_type);
4279 gphi *phi = as_a <gphi *>
4280 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4281 if (!last_stmt
4282 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4283 last_stmt = phi;
4285 /* We are emitting all vectorized stmts in the same place, so
4286 the last one pushed is also the last one in the IL.
4287 ??? Unless we have a load permutation applied and that
4288 happens to re-use an earlier generated load. */
4289 unsigned j;
4290 gimple *vstmt;
4291 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4292 if (!last_stmt
4293 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4294 last_stmt = vstmt;
4296 else if (!SLP_TREE_VECTYPE (child))
4298 /* For externals without a vector type the scalar defs are used unvectorized, so look at all of them. */
4299 unsigned j;
4300 tree def;
4301 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4302 if (TREE_CODE (def) == SSA_NAME
4303 && !SSA_NAME_IS_DEFAULT_DEF (def))
4305 gimple *stmt = SSA_NAME_DEF_STMT (def);
4306 if (!last_stmt
4307 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4308 last_stmt = stmt;
4311 else
4313 /* For externals we have to look at all defs since their
4314 insertion place is decided per vector. */
4315 unsigned j;
4316 tree vdef;
4317 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4318 if (TREE_CODE (vdef) == SSA_NAME)
4320 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4321 if (!last_stmt
4322 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4323 last_stmt = vstmt;
4326 if (is_a <gphi *> (last_stmt))
4327 si = gsi_after_labels (gimple_bb (last_stmt));
4328 else
4330 si = gsi_for_stmt (last_stmt);
4331 gsi_next (&si);
4335 bool done_p = false;
4337 /* Handle purely internal nodes. */
4338 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4340 /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
4341 be shared by different SLP nodes (but usually it's the same
4342 operation, apart from the case where the stmt is only there to denote
4343 the actual scalar lane defs ...). So do not call vect_transform_stmt
4344 but open-code it here (partly). */
4345 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4346 gcc_assert (done);
4347 done_p = true;
4349 if (!done_p)
4350 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4353 /* Replace scalar calls from SLP node NODE with assignments of zero to
4354 their lhs. For loop vectorization this is done in vectorizable_call, but
4355 for SLP it needs to be deferred until the end of vect_schedule_slp,
4356 because multiple SLP instances may refer to the same scalar stmt. */
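/* E.g. (hypothetical GIMPLE): once an SLP instance containing

     x_3 = __builtin_sqrtf (a_2);

   has been vectorized, the scalar call is replaced below by

     x_3 = 0.0;

   leaving a dead assignment for DCE to clean up. */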
4358 static void
4359 vect_remove_slp_scalar_calls (vec_info *vinfo,
4360 slp_tree node, hash_set<slp_tree> &visited)
4362 gimple *new_stmt;
4363 gimple_stmt_iterator gsi;
4364 int i;
4365 slp_tree child;
4366 tree lhs;
4367 stmt_vec_info stmt_info;
4369 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4370 return;
4372 if (visited.add (node))
4373 return;
4375 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4376 vect_remove_slp_scalar_calls (vinfo, child, visited);
4378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4380 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4381 if (!stmt || gimple_bb (stmt) == NULL)
4382 continue;
4383 if (is_pattern_stmt_p (stmt_info)
4384 || !PURE_SLP_STMT (stmt_info))
4385 continue;
4386 lhs = gimple_call_lhs (stmt);
4387 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4388 gsi = gsi_for_stmt (stmt);
4389 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4390 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4394 static void
4395 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4397 hash_set<slp_tree> visited;
4398 vect_remove_slp_scalar_calls (vinfo, node, visited);
4401 /* Vectorize the instance root. */
4403 void
4404 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4406 gassign *rstmt = NULL;
4408 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4410 gimple *child_stmt;
4411 int j;
4413 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4415 tree vect_lhs = gimple_get_lhs (child_stmt);
4416 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4417 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4418 TREE_TYPE (vect_lhs)))
4419 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4420 vect_lhs);
4421 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4422 break;
4425 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4427 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4428 gimple *child_stmt;
4429 int j;
4430 vec<constructor_elt, va_gc> *v;
4431 vec_alloc (v, nelts);
4433 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4435 CONSTRUCTOR_APPEND_ELT (v,
4436 NULL_TREE,
4437 gimple_get_lhs (child_stmt));
4439 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4440 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4441 tree r_constructor = build_constructor (rtype, v);
4442 rstmt = gimple_build_assign (lhs, r_constructor);
4445 gcc_assert (rstmt);
4447 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4448 gsi_replace (&rgsi, rstmt, true);
4451 /* Generate vector code for all SLP instances in the loop/basic block. */
4453 void
4454 vect_schedule_slp (vec_info *vinfo)
4456 vec<slp_instance> slp_instances;
4457 slp_instance instance;
4458 unsigned int i;
4460 slp_instances = vinfo->slp_instances;
4461 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4463 slp_tree node = SLP_INSTANCE_TREE (instance);
4464 /* Schedule the tree of INSTANCE. */
4465 vect_schedule_slp_instance (vinfo, node, instance);
4467 if (SLP_INSTANCE_ROOT_STMT (instance))
4468 vectorize_slp_instance_root_stmt (node, instance);
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "vectorizing stmts using SLP.\n");
4475 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4477 slp_tree root = SLP_INSTANCE_TREE (instance);
4478 stmt_vec_info store_info;
4479 unsigned int j;
4481 /* Remove scalar call stmts. Do not do this for basic-block
4482 vectorization as not all uses may be vectorized.
4483 ??? Why should this be necessary? DCE should be able to
4484 remove the stmts itself.
4485 ??? For BB vectorization we could just as well remove scalar
4486 stmts starting from the SLP tree root if they have no
4487 uses. */
4488 if (is_a <loop_vec_info> (vinfo))
4489 vect_remove_slp_scalar_calls (vinfo, root);
4491 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4493 if (!STMT_VINFO_DATA_REF (store_info))
4494 break;
4496 if (SLP_INSTANCE_ROOT_STMT (instance))
4497 continue;
4499 store_info = vect_orig_stmt (store_info);
4500 /* Free the attached stmt_vec_info and remove the stmt. */
4501 vinfo->remove_stmt (store_info);