gcc/tree-vect-slp.c (blob 458a2c5bb301157f4f843776943435eba8c49d65)
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"

static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);

/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  delete node;
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
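
/* Clarifying example (not in the original sources): an external node with
   scalar ops { x_1, x_1, x_1, x_1 } is uniform and can later be code
   generated as a single splat, whereas { x_1, y_2, x_1, y_2 } is not.
   A node representing a pre-existing vector has no scalar ops at all and
   is conservatively reported as non-uniform above.  */
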
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
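
/* Illustrative example (not in the original sources), assuming the usual
   convention that DR_GROUP_GAP of a non-leading group member counts the
   distance in elements from the previous member: for a group accessing
   a[0], a[1] and a[3], the gaps are 1 and 2 and the places returned for
   the three stmts are 0, 1 and 3 respectively.  */
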
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
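
/* A worked example (not in the original sources): with 8 lanes per vector
   the two masks built above encode the classic interleave-low and
   interleave-high selectors
       indices1 = { 0, 8, 1, 9, 2, 10, 3, 11 }
       indices2 = { 4, 12, 5, 13, 6, 14, 7, 15 }
   i.e. lanes are drawn alternately from the two input vectors, which is
   what duplicate_and_interleave needs to spread COUNT scalar values
   across NVECTORS result vectors.  */
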
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of the comparison needs
   to be inverted.  If there is any operand swap in this function, *SWAP is
   set to a non-zero value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
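
/* Illustrative example (not in the original sources): for the pair of
   conditional stmts
       x_3 = a_1 >= b_2 ? c_1 : d_1;
       y_4 = b_2 <= a_1 ? c_2 : d_2;
   the second stmt matches the first with *SWAP == 1 (the two comparison
   operands are swapped), while
       y_4 = a_1 < b_2 ? d_2 : c_2;
   matches with *SWAP == 2 (the comparison code is inverted, which also
   swaps the selected arms).  */
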
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For a SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;
      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of the comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (TREE_CODE (vec) != SSA_NAME
		  || !types_compatible_p (vectype, TREE_TYPE (vec)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}
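
/* Illustrative example (not in the original sources): for the lane stmts
       a0 = b0 + c0;  a1 = b1 - c1;  a2 = b2 + c2;  a3 = b3 - c3;
   vect_build_slp_tree_1 records PLUS_EXPR as first_stmt_code, MINUS_EXPR
   as alt_stmt_code and sets *TWO_OPERATORS.  The caller then builds one
   child computing all lanes with PLUS_EXPR, another computing all lanes
   with MINUS_EXPR, and blends them with a VEC_PERM_EXPR node whose lane
   permutation picks lanes 0 and 2 from the first child and lanes 1 and 3
   from the second.  */
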
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
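
/* Clarifying note (not in the original sources): this map keys an SLP
   subtree on the exact vector of scalar stmts it was built from.  Both
   successful builds and failures are recorded -- a failure is entered as
   a NULL slp_tree -- so vect_build_slp_tree below can re-use an already
   built subtree and avoid re-analyzing a stmt set that is known not to
   form one.  */
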
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return the built SLP node; MATCHES records for each lane whether it
   matched the first lane, which tells the caller where a mismatch was
   found.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, 0);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR has one operand too many here if the condition
	     is a SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->ops);
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was no more
	     than a cut-off as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of this node were built up from uniform scalars
     then just throw them away, causing this node to be built up from
     scalars as well.  The exception is the SLP node for the vector
     store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      FOR_EACH_VEC_ELT (children, j, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	    || !vect_slp_tree_uniform_p (child))
	  break;
      if (!child)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	child->refcnt++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}

/* Copy the SLP subtree rooted at NODE.  */

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = new _slp_tree;
  slp_tree copy = copy_ref;
  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
  copy->max_nunits = node->max_nunits;
  copy->refcnt = 0;
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}

/* Rearrange the statements of NODE according to PERMUTATION.  */
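
/* Clarifying example (not in the original sources): the loops below store
   element I of the old order into slot PERMUTATION[I] of the new order.
   So with PERMUTATION = { 2, 0, 1 } the stmt currently in lane 0 moves
   to lane 2, lane 1 moves to lane 0 and lane 2 moves to lane 1.  */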
1829 static void
1830 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1831 vec<unsigned> permutation,
1832 hash_set<slp_tree> &visited)
1834 unsigned int i;
1835 slp_tree child;
1837 if (visited.add (node))
1838 return;
1840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1841 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1843 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1845 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1846 vec<stmt_vec_info> tmp_stmts;
1847 tmp_stmts.create (group_size);
1848 tmp_stmts.quick_grow (group_size);
1849 stmt_vec_info stmt_info;
1850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1851 tmp_stmts[permutation[i]] = stmt_info;
1852 SLP_TREE_SCALAR_STMTS (node).release ();
1853 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1855 if (SLP_TREE_SCALAR_OPS (node).exists ())
1857 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1858 vec<tree> tmp_ops;
1859 tmp_ops.create (group_size);
1860 tmp_ops.quick_grow (group_size);
1861 tree op;
1862 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1863 tmp_ops[permutation[i]] = op;
1864 SLP_TREE_SCALAR_OPS (node).release ();
1865 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1867 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1869 gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
1870 for (i = 0; i < group_size; ++i)
1871 SLP_TREE_LANE_PERMUTATION (node)[i].second
1872 = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
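/* An illustrative sketch, not part of the vectorizer, of the scatter
   semantics used above.  tmp_stmts[permutation[i]] = stmt_info means
   PERMUTATION maps the old lane index to the new one, so lane I of the
   input ends up in lane PERMUTATION[I] of the output:

     // Standalone C sketch: apply a lane permutation by scattering.
     void
     apply_lane_permutation (int *dst, const int *src,
			     const unsigned *perm, unsigned n)
     {
       for (unsigned i = 0; i < n; ++i)
	 dst[perm[i]] = src[i];   // lane i moves to lane perm[i]
     }

   E.g. src = {A,B,C,D} and perm = {2,0,3,1} yield dst = {B,D,A,C}.  */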
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
    return false;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    return false;
  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
	  || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
	return false;
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
	if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them and that there is an actual permutation.  */
  bool any_permute = false;
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx != i)
	any_permute = true;
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;
      bitmap_set_bit (load_index, lidx);
    }
  if (!any_permute)
    return false;
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */

  /* We have to unshare the SLP tree we modify.  */
  hash_map<slp_tree, slp_tree> map;
  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
  unshared->refcnt++;
  SLP_INSTANCE_TREE (slp_instn) = unshared;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
  node = SLP_INSTANCE_LOADS (slp_instn)[0];

  /* Do the actual re-arrangement.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
	      && DR_GROUP_GAP (first_stmt_info) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
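/* A worked example of the validity check above, with assumed inputs not
   tied to any testcase: for GROUP_SIZE 4 the load permutation {2,0,3,1}
   qualifies (all indices distinct, all below 4, every index covered,
   and at least one lidx != i), so the loads can be reordered and the
   permutation dropped.  {0,1,2,3} fails the any_permute test, {0,0,2,3}
   hits the duplicate-index bitmap check and {0,1,2,4} the
   lidx >= group_size check.  A standalone sketch of the same test:

     // Illustrative only; the sketch caps n at 64 instead of using an
     // sbitmap as the code above does.
     bool
     permutation_valid_p (const unsigned *perm, unsigned n)
     {
       bool any_permute = false;
       bool seen[64] = { false };
       for (unsigned i = 0; i < n; ++i)
	 {
	   if (perm[i] != i)
	     any_permute = true;
	   if (perm[i] >= n || seen[perm[i]])
	     return false;
	   seen[perm[i]] = true;
	 }
       return any_permute;
     }
 */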
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
}
/* Find the last scalar stmt in NODE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}

/* Find the first scalar stmt in NODE.  */

stmt_vec_info
vect_find_first_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info first = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      if (!first
	  || get_later_stmt (stmt_vinfo, first) == first)
	first = stmt_vinfo;
    }

  return first;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
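/* A worked example of the gap bookkeeping above, under assumed numbers
   not taken from any testcase: splitting a store group of size 7 with
   DR_GROUP_GAP 0 at GROUP1_SIZE 4 leaves a first group of size 4 whose
   gap becomes 0 + 3 = 3 (it must now skip the second group's elements
   when unrolled) and a second group of size 3 whose gap becomes
   0 + 4 = 4 (the gap before the original group plus the first group's
   elements).  Note the second group's gap is computed before the first
   group's gap is bumped, so it uses the original gap value.  */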
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
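/* Two worked instances of this formula, assuming constant NUNITS just
   for illustration: NUNITS 4 and GROUP_SIZE 6 give
   common_multiple (4, 6) = 12 and an unrolling factor of 12 / 6 = 2,
   i.e. two scalar iterations are needed to fill whole vectors; NUNITS 4
   and GROUP_SIZE 8 give common_multiple (4, 8) = 8 and a factor of 1.  */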
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size;
  tree vectype, scalar_type = NULL_TREE;
  unsigned int i;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  vec<stmt_vec_info> scalar_stmts;
  bool constructor = false;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      scalar_type = TREE_TYPE (DR_REF (dr));
      group_size = DR_GROUP_SIZE (stmt_info);
      vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = REDUC_GROUP_SIZE (stmt_info);
    }
  else if (is_gimple_assign (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
    {
      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
      constructor = true;
    }
  else
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type %T\n",
			 scalar_type);

      return false;
    }
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  stmt_vec_info next_info = stmt_info;
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
	{
	  scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
	}
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Collect the reduction stmts and store them in
	 SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
	{
	  scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else if (constructor)
    {
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      tree val;
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
	{
	  if (TREE_CODE (val) == SSA_NAME)
	    {
	      gimple *def = SSA_NAME_DEF_STMT (val);
	      stmt_vec_info def_info = vinfo->lookup_stmt (def);
	      /* Value is defined in another basic block.  */
	      if (!def_info)
		return false;
	      def_info = vect_stmt_to_vectorize (def_info);
	      scalar_stmts.safe_push (def_info);
	    }
	  else
	    return false;
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 stmt_info->stmt);
    }
  else
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next_info); i++)
	scalar_stmts.safe_push (next_info);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  poly_uint64 max_nunits = nunits;
  unsigned tree_size = 0;
  node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
			      &max_nunits, matches, &npermutes,
			      &tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node, false);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  matches[0] = true;
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node, false);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  vect_gather_slp_loads (new_instance, node);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  /* Check whether any load is possibly permuted.  */
	  slp_tree load_node;
	  bool loads_permuted = false;
	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
	    {
	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
		continue;
	      unsigned j;
	      stmt_vec_info load_info;
	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
		  {
		    loads_permuted = true;
		    break;
		  }
	    }

	  /* If the loads and stores can be handled with load/store-lane
	     instructions do not generate this SLP instance.  */
	  if (is_a <loop_vec_info> (vinfo)
	      && loads_permuted
	      && dr && vect_store_lanes_supported (vectype, group_size, false))
	    {
	      slp_tree load_node;
	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
		{
		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
		  /* Use SLP for strided accesses (or if we can't load-lanes).  */
		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
		      || ! vect_load_lanes_supported
			    (STMT_VINFO_VECTYPE (stmt_vinfo),
			     DR_GROUP_SIZE (stmt_vinfo), false))
		    break;
		}
	      if (i == SLP_INSTANCE_LOADS (new_instance).length ())
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Built SLP cancelled: can use "
				     "load/store-lanes\n");
		  vect_free_slp_instance (new_instance, false);
		  return false;
		}
	    }

	  /* If this is a reduction chain with a conversion in front
	     amend the SLP tree with a node for that.  */
	  if (!dr
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
	    {
	      /* Get at the conversion stmt - we know it's the single use
		 of the last stmt of the reduction chain.  */
	      gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	      use_operand_p use_p;
	      gimple *use_stmt;
	      bool r = single_imm_use (gimple_assign_lhs (tem),
				       &use_p, &use_stmt);
	      gcc_assert (r);
	      next_info = vinfo->lookup_stmt (use_stmt);
	      next_info = vect_stmt_to_vectorize (next_info);
	      scalar_stmts = vNULL;
	      scalar_stmts.create (group_size);
	      for (unsigned i = 0; i < group_size; ++i)
		scalar_stmts.quick_push (next_info);
	      slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
	      SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
	      SLP_TREE_CHILDREN (conv).quick_push (node);
	      SLP_INSTANCE_TREE (new_instance) = conv;
	      /* We also have to fake this conversion stmt as SLP reduction
		 group so we don't have to mess with too much code
		 elsewhere.  */
	      REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
	      REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
			.length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance:\n");
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.  */
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info)
      && nunits.is_constant (&const_nunits))
    {
      /* We consider breaking the group only on VF boundaries from the existing
	 start.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i])
	  break;

      if (i >= const_nunits && i < group_size)
	{
	  /* Split into two groups at the first vector boundary before i.  */
	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	  unsigned group1_size = i & ~(const_nunits - 1);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Splitting SLP group at stmt %u\n", i);
	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							   group1_size);
	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						max_tree_size);
	  /* If the first non-match was in the middle of a vector,
	     skip the rest of that vector.  */
	  if (group1_size < i)
	    {
	      i = group1_size + const_nunits;
	      if (i < group_size)
		rest = vect_split_slp_store_group (rest, const_nunits);
	    }
	  if (i < group_size)
	    res |= vect_analyze_slp_instance (vinfo, bst_map,
					      rest, max_tree_size);
	  return res;
	}
      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
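/* A worked example of the split-point computation above, with assumed
   numbers: for const_nunits 4 (a power of two, as asserted) and a first
   mismatch at i = 6 in a group of 8, group1_size = 6 & ~3 = 4, so the
   group splits into stmts [0,4) and [4,8) and both halves are
   re-analyzed separately.  A standalone sketch of the rounding:

     // Illustrative only: round the first mismatch down to a vector
     // boundary; nunits must be a power of two.
     unsigned
     split_point (unsigned first_mismatch, unsigned nunits)
     {
       return first_mismatch & ~(nunits - 1);
     }
 */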
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      if (loop_vinfo->reduction_chains.length () > 0)
	{
	  /* Find SLP sequences starting from reduction chains.  */
	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	    if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					     max_tree_size))
	      {
		/* Dissolve reduction chain group.  */
		stmt_vec_info vinfo = first_element;
		stmt_vec_info last = NULL;
		while (vinfo)
		  {
		    stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		    REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		    REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		    last = vinfo;
		    vinfo = next;
		  }
		STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
		/* It can still be vectorized as part of an SLP reduction.  */
		loop_vinfo->reductions.safe_push (last);
	      }
	}

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				   max_tree_size);
    }

  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second, false);
  delete bst_map;

  /* Optimize permutations in SLP reductions.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      /* Reduction (there are no data-refs in the root).
	 In a reduction chain the order of the loads is not important.  */
      if (!STMT_VINFO_DATA_REF (stmt_info)
	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	vect_attempt_slp_rearrange_stmts (instance);
    }

  /* Gather all loads in the SLP graph.  */
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
			   visited);

  return opt_result::success ();
}
/* Optimize the SLP graph of VINFO by eliding load permutations that
   turn out not to be required.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
    {
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (vinfo))
	{
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
      else
	{
	  stmt_vec_info load_info;
	  bool this_load_permuted = false;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
	      {
		this_load_permuted = true;
		break;
	      }
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR
			      (as_a <loop_vec_info> (vinfo)), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
    }
}
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple.  */
      unrolling_factor
	= force_common_multiple (unrolling_factor,
				 SLP_INSTANCE_UNROLLING_FACTOR (instance));

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances.  Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
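/* A worked example of the unrolling-factor merge above, with assumed
   per-instance factors: merging instances whose factors are 2 and 3
   yields force_common_multiple (2, 3) = 6, so the loop must be unrolled
   six times for every SLP instance to operate on whole vectors; factors
   1 and 4 simply yield 4.  */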
/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  loop_vec_info loop_vinfo;
  vec<stmt_vec_info> *worklist;
};

/* Walker for walk_gimple_op.  */

static tree
vect_detect_hybrid_slp (tree *tp, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *)data;
  vdhs_data *dat = (vdhs_data *)wi->info;

  if (wi->is_lhs)
    return NULL_TREE;

  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
  if (!def_stmt_info)
    return NULL_TREE;
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
  if (PURE_SLP_STMT (def_stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
			 def_stmt_info->stmt);
      STMT_SLP_TYPE (def_stmt_info) = hybrid;
      dat->worklist->safe_push (def_stmt_info);
    }

  return NULL_TREE;
}
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.  */
  auto_vec<stmt_vec_info> worklist;
  for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    worklist.safe_push (stmt_info);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    worklist.safe_push (patt_info);
		}
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    worklist.safe_push (stmt_info);
	}
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ???  We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      wi.is_lhs = 0;
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
    }
}
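/* An illustrative source-level sketch, not taken from the testsuite, of
   the kind of situation this pass handles:

     for (int i = 0; i < n; i += 2)
       {
	 a[i] = b[i] + c[i];		// SLP pair together with the
	 a[i + 1] = b[i + 1] + c[i + 1];	// next statement
	 s += b[i];			// non-SLP use of an SLP def
       }

   Here s += b[i] is vectorized by the loop vectorizer while the load
   from b[i] belongs to the SLP tree rooted at the grouped store, so the
   walker above would mark the load's definition hybrid: it has to stay
   available to both vectorization schemes.  */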
/* Initialize a bb_vec_info struct for the statements between
   REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */

_bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
			    gimple_stmt_iterator region_end_in,
			    vec_info_shared *shared)
  : vec_info (vec_info::bb, init_cost (NULL), shared),
    bb (gsi_bb (region_begin_in)),
    region_begin (region_begin_in),
    region_end (region_end_in)
{
  for (gimple *stmt : this->region_stmts ())
    {
      gimple_set_uid (stmt, 0);
      if (is_gimple_debug (stmt))
	continue;
      add_stmt (stmt);
    }

  bb->aux = this;
}

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  for (gimple *stmt : this->region_stmts ())
    /* Reset region marker.  */
    gimple_set_uid (stmt, -1);

  bb->aux = NULL;
}
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i])
	    == vect_internal_def)
	  {
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
	    break;
	  }
    }
  else
    {
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}
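/* A worked instance of the computation above, with assumed numbers:
   for VF 4, SLP_TREE_LANES 2 and a four-lane vector type such as V4SI,
   vect_get_num_vectors (4 * 2, V4SI) = 8 / 4 = 2 vector statements
   cover the group for a full vectorized iteration.  In the SLP
   reduction case the count is instead inherited from the first
   internal child, as the code above shows.  */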
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands from scalars instead\n");

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned num_vects_to_check;
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      nelt_limit = const_nunits;
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      num_vects_to_check = 1;
      nelt_limit = group_size;
    }
  tree elt = NULL_TREE;
  unsigned nelt = 0;
  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
    {
      unsigned si = j % group_size;
      if (nelt == 0)
	elt = SLP_TREE_SCALAR_OPS (node)[si];
      /* ???  We're just tracking whether all operands of a single
	 vector initializer are the same, ideally we'd check if
	 we emitted the same one already.  */
      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
	elt = NULL_TREE;
      nelt++;
      if (nelt == nelt_limit)
	{
	  record_stmt_cost (cost_vec, 1,
			    SLP_TREE_DEF_TYPE (node) == vect_external_def
			    ? (elt ? scalar_to_vec : vec_construct)
			    : vector_load,
			    NULL, vectype, 0, vect_prologue);
	  nelt = 0;
	}
    }
}
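/* Two assumed examples of how the loop above costs initializers: for an
   external V4SI operand {x, x, x, x} all lanes compare equal, ELT stays
   non-NULL and one scalar_to_vec (a splat) is recorded; for {x, y, x, y}
   ELT is reset to NULL_TREE and one vec_construct is recorded instead.
   Constant nodes are costed as a vector_load from the constant pool
   either way.  */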
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited,
				  hash_set<slp_tree> &lvisited,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return true;

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.
     The SLP graph is acyclic so not caching whether we failed or succeeded
     doesn't result in any issue since we throw away the lvisited set
     when we fail.  */
  if (visited.contains (node)
      || lvisited.contains (node))
    return true;

  bool res = true;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited, lvisited, cost_vec);
      if (!res)
	break;
    }

  if (res)
    {
      res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
						cost_vec);
      if (res)
	lvisited.add (node);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
	   || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited.contains (child)
	  && !lvisited.add (child))
	{
	  /* ???  After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ???  Represent this more explicitly.  */
	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
			   == shift_vec_info_type)
			  && j == 1);
	      continue;
	    }
	  unsigned group_size = SLP_TREE_LANES (child);
	  poly_uint64 vf = 1;
	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    vf = loop_vinfo->vectorization_factor;
	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	    = vect_get_num_vectors (vf * group_size, vector_type);
	  /* And cost them.  */
	  vect_prologue_cost_for_slp (child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Live operations that are not handled cause
   the scalar code computing them to be retained.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_set<stmt_vec_info> &svisited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (svisited.contains (orig_stmt_info))
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt))
	      {
		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
		if (!use_stmt_info
		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  {
		    STMT_VINFO_LIVE_P (stmt_info) = true;
		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
						     NULL, node, instance, i,
						     false, cost_vec))
		      /* ???  So we know we can vectorize the live stmt
			 from one SLP node.  If we cannot do so from all
			 or none consistently we'd have to record which
			 SLP node (and lane) we want to use for the live
			 operation.  So make sure we can code-generate
			 from all nodes.  */
		      mark_visited = false;
		    else
		      STMT_VINFO_LIVE_P (stmt_info) = false;
		    BREAK_FROM_IMM_USE_STMT (use_iter);
		  }
	      }

	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem, leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ???  If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ???  What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
				     "lane extract\n");
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;
		}
	}
      if (mark_visited)
	svisited.add (orig_stmt_info);
    }

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
				   cost_vec, svisited);
}
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      hash_set<slp_tree> lvisited;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, lvisited,
					     &cost_vec)
	  /* Instances with a root stmt require vectorized defs for the
	     SLP tree root.  */
	  || (SLP_INSTANCE_ROOT_STMT (instance)
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def)))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance, false);
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	}
      else
	{
	  for (hash_set<slp_tree>::iterator x = lvisited.begin ();
	       x != lvisited.end (); ++x)
	    visited.add (*x);
	  i++;

	  /* Remember the SLP graph entry cost for later.  */
	  instance->cost_vec = cost_vec;
	}
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      hash_set<stmt_vec_info> svisited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				     instance, &instance->cost_vec, svisited);
    }

  return !vinfo->slp_instances.is_empty ();
}
/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
   closing the eventual chain.  */

static slp_instance
get_ultimate_leader (slp_instance instance,
		     hash_map<slp_instance, slp_instance> &instance_leader)
{
  auto_vec<slp_instance *, 8> chain;
  slp_instance *tem;
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
      instance = *tem;
    }
  while (!chain.is_empty ())
    *chain.pop () = instance;
  return instance;
}
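/* This is plain union-find path compression over the INSTANCE_LEADER
   map.  An illustrative standalone sketch on an int array, not part of
   the vectorizer:

     // leader[i] == i marks a root; after the call every node on the
     // walked chain points directly at the root.
     int
     ultimate_leader (int *leader, int i)
     {
       int root = i;
       while (leader[root] != root)
	 root = leader[root];
       while (leader[i] != root)
	 {
	   int next = leader[i];
	   leader[i] = root;
	   i = next;
	 }
       return root;
     }
 */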
/* Worker of vect_bb_partition_graph, recurse on NODE.  */

static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader)
{
  stmt_vec_info stmt_info;
  unsigned i;
  bool all = true;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      bool existed_p;
      slp_instance &stmt_instance
	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
      if (!existed_p)
	all = false;
      else if (stmt_instance != instance)
	{
	  /* If we're running into a previously marked stmt make us the
	     leader of the current ultimate leader.  This keeps the
	     leader chain acyclic and works even when the current instance
	     connects two previously independent graph parts.  */
	  slp_instance stmt_leader
	    = get_ultimate_leader (stmt_instance, instance_leader);
	  if (stmt_leader != instance)
	    instance_leader.put (stmt_leader, instance);
	}
      stmt_instance = instance;
    }
  /* If all stmts were already visited so were the children; otherwise
     we have to recurse on them.  */
  if (all)
    return;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 instance_leader);
}
/* Partition the SLP graph into pieces that can be costed independently.  */

static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmt's leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
				 instance, SLP_INSTANCE_TREE (instance),
				 stmt_to_instance, instance_leader);
    }

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
			 leader, instance);
    }
}
/* Compute the scalar cost of the SLP node NODE and its children
   and record it in COST_VEC.  Do not account defs that are marked
   in LIFE and update LIFE according to uses of NODE.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if ((*life)[i])
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	    {
	      imm_use_iterator use_iter;
	      gimple *use_stmt;
	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
		if (!is_gimple_debug (use_stmt))
		  {
		    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
		    if (!use_stmt_info
			|| !PURE_SLP_STMT
			      (vect_stmt_to_vectorize (use_stmt_info)))
		      {
			(*life)[i] = true;
			BREAK_FROM_IMM_USE_STMT (use_iter);
		      }
		  }
	    }
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	    {
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   visited);
	  subtree_life.truncate (0);
	}
    }
}
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  void *vect_target_cost_data = init_cost (NULL);

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs;
  scalar_costs.create (0);
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, visited);
      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
      instance->cost_vec.release ();
    }
  /* Unset visited flag.  */
  stmt_info_for_cost *si;
  FOR_EACH_VEC_ELT (scalar_costs, i, si)
    gimple_set_visited (si->stmt_info->stmt, false);

  void *scalar_target_cost_data = init_cost (NULL);
  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
  scalar_costs.release ();
  unsigned dummy;
  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
  destroy_cost_data (scalar_target_cost_data);

  /* Complete the target-specific vector cost calculation.  */
  finish_cost (vect_target_cost_data, &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);
  destroy_cost_data (vect_target_cost_data);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost does not exceed the cost of
     the scalar version.  Note that we err on the vector side for equal
     cost because the cost estimate is otherwise quite pessimistic
     (constant uses are free on the scalar side but cost a load on the
     vector side for example).  */
  if (vec_outside_cost + vec_inside_cost > scalar_cost)
    return false;

  return true;
}
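/* A worked example of the final comparison, with assumed costs: for
   vec_inside_cost 6, vec_prologue_cost 2, vec_epilogue_cost 0 and
   scalar_cost 8, vec_outside_cost is 2 and 6 + 2 > 8 is false, so the
   subgraph is considered profitable (ties go to the vector side, as
   explained above).  With scalar_cost 7 the same vector costs would be
   rejected.  */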
/* For each SLP subgraph determine profitability and remove unprofitable
   parts.  Returns true if any subgraph that is profitable to vectorize
   remains.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  slp_instance instance;
  unsigned i;

  auto_vec<slp_instance> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length ());
  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
    if (!instance->subgraph_entries.is_empty ())
      subgraphs.quick_push (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0);
  for (i = 0; i < subgraphs.length ();)
    {
      instance = subgraphs[i];
      if (!vect_bb_vectorization_profitable_p (bb_vinfo,
					       instance->subgraph_entries))
	{
	  /* ???  We need to think of providing better dump/opt-report
	     locations here.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: vectorization is not "
			     "profitable.\n");

	  slp_instance entry;
	  unsigned j;
	  FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
	    if (entry != instance)
	      vect_free_slp_instance (entry, false);
	  vect_free_slp_instance (instance, false);
	  subgraphs.ordered_remove (i);
	}
      else
	{
	  slp_instance entry;
	  unsigned j;
	  FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
	    BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry);
	  ++i;
	}
    }
  return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty ();
}
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (gimple *stmt : bb_vinfo->region_stmts ())
    {
      gassign *assign = dyn_cast<gassign *> (stmt);
      if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
	continue;

      tree rhs = gimple_assign_rhs1 (assign);
      if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
	  || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
		       CONSTRUCTOR_NELTS (rhs))
	  || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
	  || uniform_vector_p (rhs))
	continue;

      stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
    }
}
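/* An illustrative source-level example, not taken from the testsuite,
   of a constructor this function accepts:

     typedef int v4si __attribute__((vector_size (16)));
     v4si
     build (int a, int b, int c, int d)
     {
       return (v4si) { a, b, c, d };   // CONSTRUCTOR with 4 SSA elements
     }

   A constructor is skipped when the element count does not match the
   vector type, when the elements are themselves vectors, or when it is
   uniform (a splat), since those cases are handled elsewhere.  */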
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
		       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");
      return false;
    }

  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance, false);
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as pure SLP and
	 relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      if (SLP_INSTANCE_ROOT_STMT (instance))
	STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");
      return false;
    }

  vect_bb_partition_graph (bb_vinfo);

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization is not "
			 "profitable.\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Basic block will be vectorized using SLP\n");
  return true;
}
3636 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3637 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3638 on success. The region has N_STMTS statements and has the datarefs
3639 given by DATAREFS. */
3641 static bool
3642 vect_slp_region (gimple_stmt_iterator region_begin,
3643 gimple_stmt_iterator region_end,
3644 vec<data_reference_p> datarefs,
3645 vec<int> *dataref_groups,
3646 unsigned int n_stmts)
3648 bb_vec_info bb_vinfo;
3649 auto_vector_modes vector_modes;
3651 /* Autodetect the first vector size we try. */
3652 machine_mode next_vector_mode = VOIDmode;
3653 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3654 unsigned int mode_i = 0;
3656 vec_info_shared shared;
3658 machine_mode autodetected_vector_mode = VOIDmode;
3659 while (1)
3661 bool vectorized = false;
3662 bool fatal = false;
3663 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3665 bool first_time_p = shared.datarefs.is_empty ();
3666 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3667 if (first_time_p)
3668 bb_vinfo->shared->save_datarefs ();
3669 else
3670 bb_vinfo->shared->check_datarefs ();
3671 bb_vinfo->vector_mode = next_vector_mode;
3673 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
3674 && dbg_cnt (vect_slp))
3676 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_NOTE, vect_location,
3679 "***** Analysis succeeded with vector mode"
3680 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3681 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3684 bb_vinfo->shared->check_datarefs ();
3685 vect_schedule_slp (bb_vinfo);
3687 unsigned HOST_WIDE_INT bytes;
3688 if (dump_enabled_p ())
3690 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3691 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3692 "basic block part vectorized using %wu byte "
3693 "vectors\n", bytes);
3694 else
3695 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3696 "basic block part vectorized using variable "
3697 "length vectors\n");
3700 vectorized = true;
3702 else
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_NOTE, vect_location,
3706 "***** Analysis failed with vector mode %s\n",
3707 GET_MODE_NAME (bb_vinfo->vector_mode));
3710 if (mode_i == 0)
3711 autodetected_vector_mode = bb_vinfo->vector_mode;
3713 if (!fatal)
3714 while (mode_i < vector_modes.length ()
3715 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_NOTE, vect_location,
3719 "***** The result for vector mode %s would"
3720 " be the same\n",
3721 GET_MODE_NAME (vector_modes[mode_i]));
3722 mode_i += 1;
3725 delete bb_vinfo;
3727 if (mode_i < vector_modes.length ()
3728 && VECTOR_MODE_P (autodetected_vector_mode)
3729 && (related_vector_mode (vector_modes[mode_i],
3730 GET_MODE_INNER (autodetected_vector_mode))
3731 == autodetected_vector_mode)
3732 && (related_vector_mode (autodetected_vector_mode,
3733 GET_MODE_INNER (vector_modes[mode_i]))
3734 == vector_modes[mode_i]))
3736 if (dump_enabled_p ())
3737 dump_printf_loc (MSG_NOTE, vect_location,
3738 "***** Skipping vector mode %s, which would"
3739 " repeat the analysis for %s\n",
3740 GET_MODE_NAME (vector_modes[mode_i]),
3741 GET_MODE_NAME (autodetected_vector_mode));
3742 mode_i += 1;
3745 if (vectorized
3746 || mode_i == vector_modes.length ()
3747 || autodetected_vector_mode == VOIDmode
3748 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3749 vector sizes will fail do not bother iterating. */
3750 || fatal)
3751 return vectorized;
3753 /* Try the next biggest vector size. */
3754 next_vector_mode = vector_modes[mode_i++];
3755 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_NOTE, vect_location,
3757 "***** Re-trying analysis with vector mode %s\n",
3758 GET_MODE_NAME (next_vector_mode));
3762 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3763 true if anything in the basic-block was vectorized. */
3765 bool
3766 vect_slp_bb (basic_block bb)
3768 vec<data_reference_p> datarefs = vNULL;
3769 vec<int> dataref_groups = vNULL;
3770 int insns = 0;
3771 int current_group = 0;
3772 gimple_stmt_iterator region_begin = gsi_start_nondebug_after_labels_bb (bb);
3773 gimple_stmt_iterator region_end = gsi_last_bb (bb);
3774 if (!gsi_end_p (region_end))
3775 gsi_next (&region_end);
3777 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
3778 gsi_next (&gsi))
3780 gimple *stmt = gsi_stmt (gsi);
3781 if (is_gimple_debug (stmt))
3782 continue;
3784 insns++;
3786 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3787 vect_location = stmt;
3789 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
3790 &dataref_groups, current_group))
3791 ++current_group;
3793 if (insns > param_slp_max_insns_in_bb)
3795 if (dump_enabled_p ())
3796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3797 "not vectorized: too many instructions in "
3798 "basic block.\n");
3802 return vect_slp_region (region_begin, region_end, datarefs,
3803 &dataref_groups, insns);
3807 /* Build a variable-length vector in which the elements in ELTS are repeated
3808 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3809 RESULTS and add any new instructions to SEQ.
3811 The approach we use is:
3813 (1) Find a vector mode VM with integer elements of mode IM.
3815 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3816 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3817 from small vectors to IM.
3819 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3821 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3822 correct byte contents.
3824 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3826 We try to find the largest IM for which this sequence works, in order
3827 to cut down on the number of interleaves. */
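/* A purely illustrative walk-through (the concrete modes here are
   assumptions about a variable-length target, not something this
   function fixes): for ELTS = {a,b,c,d} of type SI and
   VECTOR_TYPE = VNx4SI, step (1) might pick IM = DI and VM = VNx2DI.
   Step (2) then view-converts {a,b} and {c,d} into DI values x0 and x1,
   step (3) duplicates them into {x0,x0,...} and {x1,x1,...}, step (4)
   interleaves those into {x0,x1,x0,x1,...} and step (5) view-converts
   the result back to {a,b,c,d,a,b,c,d,...}.  */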
3829 void
3830 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3831 vec<tree> elts, unsigned int nresults,
3832 vec<tree> &results)
3834 unsigned int nelts = elts.length ();
3835 tree element_type = TREE_TYPE (vector_type);
3837 /* (1) Find a vector mode VM with integer elements of mode IM. */
3838 unsigned int nvectors = 1;
3839 tree new_vector_type;
3840 tree permutes[2];
3841 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3842 &nvectors, &new_vector_type,
3843 permutes))
3844 gcc_unreachable ();
3846 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3847 unsigned int partial_nelts = nelts / nvectors;
3848 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3850 tree_vector_builder partial_elts;
3851 auto_vec<tree, 32> pieces (nvectors * 2);
3852 pieces.quick_grow (nvectors * 2);
3853 for (unsigned int i = 0; i < nvectors; ++i)
3855 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3856 ELTS' has mode IM. */
3857 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3858 for (unsigned int j = 0; j < partial_nelts; ++j)
3859 partial_elts.quick_push (elts[i * partial_nelts + j]);
3860 tree t = gimple_build_vector (seq, &partial_elts);
3861 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3862 TREE_TYPE (new_vector_type), t);
3864 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3865 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3868 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3869 correct byte contents.
3871 We need to repeat the following operation log2(nvectors) times:
3873 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3874 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3876 However, if each input repeats every N elements and the VF is
3877 a multiple of N * 2, the HI result is the same as the LO. */
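/* For illustration: with NVECTORS == 4, the first iteration below
   produces out[0] and out[1] from (in[0], in[2]) and out[2] and out[3]
   from (in[1], in[3]), using the LO and HI permutes respectively; the
   second iteration interleaves those outputs again, for log2 (NVECTORS)
   rounds in total.  */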
3878 unsigned int in_start = 0;
3879 unsigned int out_start = nvectors;
3880 unsigned int hi_start = nvectors / 2;
3881 /* A bound on the number of outputs needed to produce NRESULTS results
3882 in the final iteration. */
3883 unsigned int noutputs_bound = nvectors * nresults;
3884 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3886 noutputs_bound /= 2;
3887 unsigned int limit = MIN (noutputs_bound, nvectors);
3888 for (unsigned int i = 0; i < limit; ++i)
3890 if ((i & 1) != 0
3891 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3892 2 * in_repeat))
3894 pieces[out_start + i] = pieces[out_start + i - 1];
3895 continue;
3898 tree output = make_ssa_name (new_vector_type);
3899 tree input1 = pieces[in_start + (i / 2)];
3900 tree input2 = pieces[in_start + (i / 2) + hi_start];
3901 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3902 input1, input2,
3903 permutes[i & 1]);
3904 gimple_seq_add_stmt (seq, stmt);
3905 pieces[out_start + i] = output;
3907 std::swap (in_start, out_start);
3910 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3911 results.reserve (nresults);
3912 for (unsigned int i = 0; i < nresults; ++i)
3913 if (i < nvectors)
3914 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3915 pieces[in_start + i]));
3916 else
3917 results.quick_push (results[i - nvectors]);
3921 /* For constant and loop invariant defs in OP_NODE this function creates
3922 vector defs that will be used in the vectorized stmts and stores them
3923 to SLP_TREE_VEC_DEFS of OP_NODE. */
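/* For example (illustrative): for an OP_NODE with ops {3, 5}, a V4SI
   vectype and a single vector stmt, this builds the one constant def
   { 3, 5, 3, 5 }.  */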
3925 static void
3926 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3928 unsigned HOST_WIDE_INT nunits;
3929 tree vec_cst;
3930 unsigned j, number_of_places_left_in_vector;
3931 tree vector_type;
3932 tree vop;
3933 int group_size = op_node->ops.length ();
3934 unsigned int vec_num, i;
3935 unsigned number_of_copies = 1;
3936 bool constant_p;
3937 gimple_seq ctor_seq = NULL;
3938 auto_vec<tree, 16> permute_results;
3940 /* SLP_TREE_VECTYPE (op_node) is expected to be set correctly at this point. */
3941 vector_type = SLP_TREE_VECTYPE (op_node);
3943 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3944 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3945 auto_vec<tree> voprnds (number_of_vectors);
3947 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3948 created vectors. It is greater than 1 if unrolling is performed.
3950 For example, we have two scalar operands, s1 and s2 (e.g., group of
3951 strided accesses of size two), while NUNITS is four (i.e., four scalars
3952 of this type can be packed in a vector). The output vector will contain
3953 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3954 will be 2).
3956 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3957 containing the operands.
3959 For example, NUNITS is four as before, and the group size is 8
3960 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3961 {s5, s6, s7, s8}. */
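/* Tying the examples above to the formula below: with NUNITS == 4,
   GROUP_SIZE == 2 and NUMBER_OF_VECTORS == 1 we get
   NUMBER_OF_COPIES == 4 * 1 / 2 == 2, and with GROUP_SIZE == 8 and
   NUMBER_OF_VECTORS == 2 we get NUMBER_OF_COPIES == 4 * 2 / 8 == 1.  */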
3963 /* When using duplicate_and_interleave, we just need one element for
3964 each scalar statement. */
3965 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3966 nunits = group_size;
3968 number_of_copies = nunits * number_of_vectors / group_size;
3970 number_of_places_left_in_vector = nunits;
3971 constant_p = true;
3972 tree_vector_builder elts (vector_type, nunits, 1);
3973 elts.quick_grow (nunits);
3974 stmt_vec_info insert_after = NULL;
3975 for (j = 0; j < number_of_copies; j++)
3977 tree op;
3978 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3980 /* Create 'vect_ = {op0,op1,...,opn}'. */
3981 number_of_places_left_in_vector--;
3982 tree orig_op = op;
3983 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3985 if (CONSTANT_CLASS_P (op))
3987 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3989 /* Can't use VIEW_CONVERT_EXPR for booleans because
3990 of possibly different sizes of scalar value and
3991 vector element. */
3992 if (integer_zerop (op))
3993 op = build_int_cst (TREE_TYPE (vector_type), 0);
3994 else if (integer_onep (op))
3995 op = build_all_ones_cst (TREE_TYPE (vector_type));
3996 else
3997 gcc_unreachable ();
3999 else
4000 op = fold_unary (VIEW_CONVERT_EXPR,
4001 TREE_TYPE (vector_type), op);
4002 gcc_assert (op && CONSTANT_CLASS_P (op));
4004 else
4006 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4007 gimple *init_stmt;
4008 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4010 tree true_val
4011 = build_all_ones_cst (TREE_TYPE (vector_type));
4012 tree false_val
4013 = build_zero_cst (TREE_TYPE (vector_type));
4014 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4015 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4016 op, true_val,
4017 false_val);
4019 else
4021 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4022 op);
4023 init_stmt
4024 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4025 op);
4027 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4028 op = new_temp;
4031 elts[number_of_places_left_in_vector] = op;
4032 if (!CONSTANT_CLASS_P (op))
4033 constant_p = false;
4034 /* For BB vectorization we have to compute an insert location
4035 when a def is inside the analyzed region since we cannot
4036 simply insert at the BB start in this case. */
4037 stmt_vec_info opdef;
4038 if (TREE_CODE (orig_op) == SSA_NAME
4039 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4040 && is_a <bb_vec_info> (vinfo)
4041 && (opdef = vinfo->lookup_def (orig_op)))
4043 if (!insert_after)
4044 insert_after = opdef;
4045 else
4046 insert_after = get_later_stmt (insert_after, opdef);
4049 if (number_of_places_left_in_vector == 0)
4051 if (constant_p
4052 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4053 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4054 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4055 else
4057 if (permute_results.is_empty ())
4058 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4059 elts, number_of_vectors,
4060 permute_results);
4061 vec_cst = permute_results[number_of_vectors - j - 1];
4063 if (!gimple_seq_empty_p (ctor_seq))
4065 if (insert_after)
4067 gimple_stmt_iterator gsi
4068 = gsi_for_stmt (insert_after->stmt);
4069 gsi_insert_seq_after (&gsi, ctor_seq,
4070 GSI_CONTINUE_LINKING);
4072 else
4073 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4074 ctor_seq = NULL;
4076 voprnds.quick_push (vec_cst);
4077 insert_after = NULL;
4078 number_of_places_left_in_vector = nunits;
4079 constant_p = true;
4080 elts.new_vector (vector_type, nunits, 1);
4081 elts.quick_grow (nunits);
4086 /* Since the vectors were created in reverse order, we now invert
4087 them. */
4088 vec_num = voprnds.length ();
4089 for (j = vec_num; j != 0; j--)
4091 vop = voprnds[j - 1];
4092 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4095 /* If VF is greater than the unrolling factor needed for the SLP group
4096 of stmts, the number of vectors to be created exceeds
4097 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
4098 have to replicate the vectors. */
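/* E.g. if NUMBER_OF_VECTORS is 4 but only the defs {v0, v1} were
   created above, the loop below extends them cyclically to
   {v0, v1, v0, v1}.  */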
4099 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4100 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4101 i++)
4102 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4105 /* Get the Ith vectorized definition from SLP_NODE. */
4107 tree
4108 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4110 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4111 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4112 else
4113 return SLP_TREE_VEC_DEFS (slp_node)[i];
4116 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4118 void
4119 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4121 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4122 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4124 unsigned j;
4125 gimple *vec_def_stmt;
4126 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4127 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4129 else
4130 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4133 /* Get N vectorized definitions for SLP_NODE. */
4135 void
4136 vect_get_slp_defs (vec_info *,
4137 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4139 if (n == -1U)
4140 n = SLP_TREE_CHILDREN (slp_node).length ();
4142 for (unsigned i = 0; i < n; ++i)
4144 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4145 vec<tree> vec_defs = vNULL;
4146 vect_get_slp_defs (child, &vec_defs);
4147 vec_oprnds->quick_push (vec_defs);
4151 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4152 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4153 permute statements for the SLP node NODE. */
4155 bool
4156 vect_transform_slp_perm_load (vec_info *vinfo,
4157 slp_tree node, vec<tree> dr_chain,
4158 gimple_stmt_iterator *gsi, poly_uint64 vf,
4159 bool analyze_only, unsigned *n_perms)
4161 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4162 int vec_index = 0;
4163 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4164 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4165 unsigned int mask_element;
4166 machine_mode mode;
4168 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4169 return false;
4171 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4173 mode = TYPE_MODE (vectype);
4174 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4176 /* Initialize the vect stmts of NODE so that the generated stmts can be
4177 inserted at the proper positions later. */
4178 if (! analyze_only)
4179 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4180 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4181 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4183 /* Generate permutation masks for every NODE. Number of masks for each NODE
4184 is equal to GROUP_SIZE.
4185 E.g., we have a group of three nodes with three loads from the same
4186 location in each node, and the vector size is 4. I.e., we have an
4187 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4188 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4189 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4192 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4193 The last mask is illegal since we assume two operands for a permute
4194 operation, and the mask element values can't be outside that range.
4195 Hence, the last mask must be converted into {2,5,5,5}.
4196 For the first two permutations we need the first and the second input
4197 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4198 we need the second and the third vectors: {b1,c1,a2,b2} and
4199 {c2,a3,b3,c3}. */
4201 int vect_stmts_counter = 0;
4202 unsigned int index = 0;
4203 int first_vec_index = -1;
4204 int second_vec_index = -1;
4205 bool noop_p = true;
4206 *n_perms = 0;
4208 vec_perm_builder mask;
4209 unsigned int nelts_to_build;
4210 unsigned int nvectors_per_build;
4211 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4212 && multiple_p (nunits, group_size));
4213 if (repeating_p)
4215 /* A single vector contains a whole number of copies of the node, so:
4216 (a) all permutes can use the same mask; and
4217 (b) the permutes only need a single vector input. */
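/* Illustrative example: with GROUP_SIZE == 2 and a load permutation
   {1, 0}, the encoded mask built below is {1, 0, 3, 2, 5, 4}, standing
   for {1, 0, 3, 2, 5, 4, 7, 6, ...} for any number of lanes, and the
   same mask is used for every vector statement.  */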
4218 mask.new_vector (nunits, group_size, 3);
4219 nelts_to_build = mask.encoded_nelts ();
4220 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4222 else
4224 /* We need to construct a separate mask for each vector statement. */
4225 unsigned HOST_WIDE_INT const_nunits, const_vf;
4226 if (!nunits.is_constant (&const_nunits)
4227 || !vf.is_constant (&const_vf))
4228 return false;
4229 mask.new_vector (const_nunits, const_nunits, 1);
4230 nelts_to_build = const_vf * group_size;
4231 nvectors_per_build = 1;
4234 unsigned int count = mask.encoded_nelts ();
4235 mask.quick_grow (count);
4236 vec_perm_indices indices;
4238 for (unsigned int j = 0; j < nelts_to_build; j++)
4240 unsigned int iter_num = j / group_size;
4241 unsigned int stmt_num = j % group_size;
4242 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4243 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4244 if (repeating_p)
4246 first_vec_index = 0;
4247 mask_element = i;
4249 else
4251 /* Enforced before the loop when !repeating_p. */
4252 unsigned int const_nunits = nunits.to_constant ();
4253 vec_index = i / const_nunits;
4254 mask_element = i % const_nunits;
4255 if (vec_index == first_vec_index
4256 || first_vec_index == -1)
4258 first_vec_index = vec_index;
4260 else if (vec_index == second_vec_index
4261 || second_vec_index == -1)
4263 second_vec_index = vec_index;
4264 mask_element += const_nunits;
4266 else
4268 if (dump_enabled_p ())
4269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4270 "permutation requires at "
4271 "least three vectors %G",
4272 stmt_info->stmt);
4273 gcc_assert (analyze_only);
4274 return false;
4277 gcc_assert (mask_element < 2 * const_nunits);
4280 if (mask_element != index)
4281 noop_p = false;
4282 mask[index++] = mask_element;
4284 if (index == count && !noop_p)
4286 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4287 if (!can_vec_perm_const_p (mode, indices))
4289 if (dump_enabled_p ())
4291 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4292 vect_location,
4293 "unsupported vect permute { ");
4294 for (i = 0; i < count; ++i)
4296 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4297 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4299 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4301 gcc_assert (analyze_only);
4302 return false;
4305 ++*n_perms;
4308 if (index == count)
4310 if (!analyze_only)
4312 tree mask_vec = NULL_TREE;
4314 if (! noop_p)
4315 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4317 if (second_vec_index == -1)
4318 second_vec_index = first_vec_index;
4320 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4322 /* Generate the permute statement if necessary. */
4323 tree first_vec = dr_chain[first_vec_index + ri];
4324 tree second_vec = dr_chain[second_vec_index + ri];
4325 gimple *perm_stmt;
4326 if (! noop_p)
4328 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4329 tree perm_dest
4330 = vect_create_destination_var (gimple_assign_lhs (stmt),
4331 vectype);
4332 perm_dest = make_ssa_name (perm_dest);
4333 perm_stmt
4334 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4335 first_vec, second_vec,
4336 mask_vec);
4337 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4338 gsi);
4340 else
4341 /* If mask was NULL_TREE generate the requested
4342 identity transform. */
4343 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4345 /* Store the vector statement in NODE. */
4346 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4350 index = 0;
4351 first_vec_index = -1;
4352 second_vec_index = -1;
4353 noop_p = true;
4357 return true;
4361 /* Vectorize the SLP permutations in NODE as specified
4362 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4363 child number and lane number.
4364 Interleaving of two two-lane two-child SLP subtrees (not supported):
4365 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4366 A blend of two four-lane two-child SLP subtrees:
4367 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4368 Highpart of a four-lane one-child SLP subtree (not supported):
4369 [ { 0, 2 }, { 0, 3 } ]
4370 Currently only a subset of these is supported by the code generation below. */
4372 static bool
4373 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4374 slp_tree node, stmt_vector_for_cost *cost_vec)
4376 tree vectype = SLP_TREE_VECTYPE (node);
4378 /* ??? We currently only support the case where all vector input and output
4379 types are the same, while the SLP IL should really do a concat + select
4380 and thus accept arbitrary mismatches. */
4381 slp_tree child;
4382 unsigned i;
4383 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4384 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4386 if (dump_enabled_p ())
4387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4388 "Unsupported lane permutation\n");
4389 return false;
4392 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4393 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4394 if (dump_enabled_p ())
4396 dump_printf_loc (MSG_NOTE, vect_location,
4397 "vectorizing permutation");
4398 for (unsigned i = 0; i < perm.length (); ++i)
4399 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4400 dump_printf (MSG_NOTE, "\n");
4403 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4404 if (!nunits.is_constant ())
4405 return false;
4406 unsigned HOST_WIDE_INT vf = 1;
4407 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4408 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4409 return false;
4410 unsigned olanes = vf * SLP_TREE_LANES (node);
4411 gcc_assert (multiple_p (olanes, nunits));
4413 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
4414 from the { SLP operand, scalar lane } permutation as recorded in the
4415 SLP node as an intermediate step. This part should already work
4416 with SLP children with an arbitrary number of lanes. */
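/* Illustrative example: for the blend [ { 0, 0 }, { 1, 1 }, { 0, 2 },
   { 1, 3 } ] of two four-lane children with VF == 1, the scalar lane
   { 0, 2 } becomes { { operand 0, vector 0 }, lane 2 } since lane 2
   falls into the first vector of that child.  */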
4417 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4418 auto_vec<unsigned> active_lane;
4419 vperm.create (olanes);
4420 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
4421 for (unsigned i = 0; i < vf; ++i)
4423 for (unsigned pi = 0; pi < perm.length (); ++pi)
4425 std::pair<unsigned, unsigned> p = perm[pi];
4426 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4427 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4428 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4429 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4430 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4432 /* Advance to the next group. */
4433 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4434 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4437 if (dump_enabled_p ())
4439 dump_printf_loc (MSG_NOTE, vect_location, "as");
4440 for (unsigned i = 0; i < vperm.length (); ++i)
4442 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4443 dump_printf (MSG_NOTE, ",");
4444 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4445 vperm[i].first.first, vperm[i].first.second,
4446 vperm[i].second);
4448 dump_printf (MSG_NOTE, "\n");
4451 /* We can only handle two-vector permutes; everything else should
4452 be lowered on the SLP level. The following is closely inspired
4453 by vect_transform_slp_perm_load and is supposed to eventually
4454 replace it.
4455 ??? As intermediate step do code-gen in the SLP tree representation
4456 somehow? */
4457 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4458 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4459 unsigned int const_nunits = nunits.to_constant ();
4460 unsigned int index = 0;
4461 unsigned int mask_element;
4462 vec_perm_builder mask;
4463 mask.new_vector (const_nunits, const_nunits, 1);
4464 unsigned int count = mask.encoded_nelts ();
4465 mask.quick_grow (count);
4466 vec_perm_indices indices;
4467 unsigned nperms = 0;
4468 for (unsigned i = 0; i < vperm.length (); ++i)
4470 mask_element = vperm[i].second;
4471 if (first_vec.first == -1U
4472 || first_vec == vperm[i].first)
4473 first_vec = vperm[i].first;
4474 else if (second_vec.first == -1U
4475 || second_vec == vperm[i].first)
4477 second_vec = vperm[i].first;
4478 mask_element += const_nunits;
4480 else
4482 if (dump_enabled_p ())
4483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4484 "permutation requires at "
4485 "least three vectors");
4486 gcc_assert (!gsi);
4487 return false;
4490 mask[index++] = mask_element;
4492 if (index == count)
4494 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4495 const_nunits);
4496 bool identity_p = indices.series_p (0, 1, 0, 1);
4497 if (!identity_p
4498 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4500 if (dump_enabled_p ())
4502 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4503 vect_location,
4504 "unsupported vect permute { ");
4505 for (i = 0; i < count; ++i)
4507 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4508 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4510 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4512 gcc_assert (!gsi);
4513 return false;
4516 if (!identity_p)
4517 nperms++;
4518 if (gsi)
4520 if (second_vec.first == -1U)
4521 second_vec = first_vec;
4523 /* Generate the permute statement if necessary. */
4524 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4525 tree first_def
4526 = vect_get_slp_vect_def (first_node, first_vec.second);
4527 gassign *perm_stmt;
4528 tree perm_dest = make_ssa_name (vectype);
4529 if (!identity_p)
4531 slp_tree second_node
4532 = SLP_TREE_CHILDREN (node)[second_vec.first];
4533 tree second_def
4534 = vect_get_slp_vect_def (second_node, second_vec.second);
4535 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4536 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4537 first_def, second_def,
4538 mask_vec);
4540 else
4541 /* We need a copy here in case the def was external. */
4542 perm_stmt = gimple_build_assign (perm_dest, first_def);
4543 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4544 /* Store the vector statement in NODE. */
4545 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4548 index = 0;
4549 first_vec = std::make_pair (-1U, -1U);
4550 second_vec = std::make_pair (-1U, -1U);
4554 if (!gsi)
4555 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4557 return true;
4560 /* Vectorize SLP instance tree in postorder. */
4562 static void
4563 vect_schedule_slp_instance (vec_info *vinfo,
4564 slp_tree node, slp_instance instance)
4566 gimple_stmt_iterator si;
4567 int i;
4568 slp_tree child;
4570 /* See if we have already vectorized the node in the graph of the
4571 SLP instance. */
4572 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4573 && SLP_TREE_VEC_STMTS (node).exists ())
4574 || SLP_TREE_VEC_DEFS (node).exists ())
4575 return;
4577 /* Vectorize externals and constants. */
4578 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4579 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4581 /* ??? vectorizable_shift can end up using a scalar operand which is
4582 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4583 node in this case. */
4584 if (!SLP_TREE_VECTYPE (node))
4585 return;
4587 vect_create_constant_vectors (vinfo, node);
4588 return;
4591 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4592 vect_schedule_slp_instance (vinfo, child, instance);
4594 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4595 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4597 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4598 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE, vect_location,
4600 "------>vectorizing SLP node starting from: %G",
4601 stmt_info->stmt);
4603 if (STMT_VINFO_DATA_REF (stmt_info)
4604 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4606 /* Vectorized loads go before the first scalar load to make it
4607 ready early; vectorized stores go before the last scalar
4608 stmt, which is where all uses are ready. */
4609 stmt_vec_info last_stmt_info = NULL;
4610 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4611 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4612 else /* DR_IS_WRITE */
4613 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4614 si = gsi_for_stmt (last_stmt_info->stmt);
4616 else if (SLP_TREE_CHILDREN (node).is_empty ())
4618 /* This happens for reduction and induction PHIs where we do not use the
4619 insertion iterator. */
4620 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4621 == cycle_phi_info_type
4622 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4623 == induc_vec_info_type));
4624 si = gsi_none ();
4626 else
4628 /* Emit other stmts after the children's vectorized defs, which is the
4629 earliest position possible. */
4630 gimple *last_stmt = NULL;
4631 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4632 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4634 /* For fold-left reductions we are retaining the scalar
4635 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4636 set so the representation isn't perfect. Resort to the
4637 last scalar def here. */
4638 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4640 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4641 == cycle_phi_info_type);
4642 gphi *phi = as_a <gphi *>
4643 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4644 if (!last_stmt
4645 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4646 last_stmt = phi;
4648 /* All vectorized stmts of the child are emitted in the same place,
4649 so the last one pushed is also the last one in the IL.
4650 ??? Unless we have a load permutation applied and that
4651 figures to re-use an earlier generated load. */
4652 unsigned j;
4653 gimple *vstmt;
4654 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4655 if (!last_stmt
4656 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4657 last_stmt = vstmt;
4659 else if (!SLP_TREE_VECTYPE (child))
4661 /* Externals without a vector type are used unvectorized; look at all their scalar defs. */
4662 unsigned j;
4663 tree def;
4664 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4665 if (TREE_CODE (def) == SSA_NAME
4666 && !SSA_NAME_IS_DEFAULT_DEF (def))
4668 gimple *stmt = SSA_NAME_DEF_STMT (def);
4669 if (!last_stmt
4670 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4671 last_stmt = stmt;
4674 else
4676 /* For externals we have to look at all defs since their
4677 insertion place is decided per vector. But beware
4678 of pre-existing vectors where we need to make sure
4679 we do not insert before the region boundary. */
4680 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
4681 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
4682 last_stmt = gsi_stmt (as_a <bb_vec_info> (vinfo)->region_begin);
4683 else
4685 unsigned j;
4686 tree vdef;
4687 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4688 if (TREE_CODE (vdef) == SSA_NAME
4689 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
4691 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4692 if (!last_stmt
4693 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4694 last_stmt = vstmt;
4698 /* This can happen when all children are pre-existing vectors or
4699 constants. */
4700 if (!last_stmt)
4701 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
4702 if (is_a <gphi *> (last_stmt))
4703 si = gsi_after_labels (gimple_bb (last_stmt));
4704 else
4706 si = gsi_for_stmt (last_stmt);
4707 gsi_next (&si);
4711 bool done_p = false;
4713 /* Handle purely internal nodes. */
4714 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4716 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
4717 be shared with different SLP nodes (but usually it's the same
4718 operation, apart from the case where the stmt is only there to denote
4719 the actual scalar lane defs ...). So do not call vect_transform_stmt
4720 but open-code it here (partly). */
4721 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4722 gcc_assert (done);
4723 done_p = true;
4725 if (!done_p)
4726 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4729 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4730 For loop vectorization this is done in vectorizable_call, but for SLP
4731 it needs to be deferred until the end of vect_schedule_slp, because multiple
4732 SLP instances may refer to the same scalar stmt. */
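/* Illustration: a vectorized scalar call such as
   x_1 = __builtin_sqrtf (a_2) is rewritten to x_1 = 0.0f (a zero of the
   lhs type), leaving a trivially dead stmt whose value lives on in the
   vectorized code.  */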
4734 static void
4735 vect_remove_slp_scalar_calls (vec_info *vinfo,
4736 slp_tree node, hash_set<slp_tree> &visited)
4738 gimple *new_stmt;
4739 gimple_stmt_iterator gsi;
4740 int i;
4741 slp_tree child;
4742 tree lhs;
4743 stmt_vec_info stmt_info;
4745 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4746 return;
4748 if (visited.add (node))
4749 return;
4751 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4752 vect_remove_slp_scalar_calls (vinfo, child, visited);
4754 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4756 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4757 if (!stmt || gimple_bb (stmt) == NULL)
4758 continue;
4759 if (is_pattern_stmt_p (stmt_info)
4760 || !PURE_SLP_STMT (stmt_info))
4761 continue;
4762 lhs = gimple_call_lhs (stmt);
4763 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4764 gsi = gsi_for_stmt (stmt);
4765 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4766 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4770 static void
4771 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4773 hash_set<slp_tree> visited;
4774 vect_remove_slp_scalar_calls (vinfo, node, visited);
4777 /* Vectorize the instance root. */
4779 void
4780 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4782 gassign *rstmt = NULL;
4784 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4786 gimple *child_stmt;
4787 int j;
4789 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4791 tree vect_lhs = gimple_get_lhs (child_stmt);
4792 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4793 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4794 TREE_TYPE (vect_lhs)))
4795 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4796 vect_lhs);
4797 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4798 break;
4801 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4803 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4804 gimple *child_stmt;
4805 int j;
4806 vec<constructor_elt, va_gc> *v;
4807 vec_alloc (v, nelts);
4809 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4811 CONSTRUCTOR_APPEND_ELT (v,
4812 NULL_TREE,
4813 gimple_get_lhs (child_stmt));
4815 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4816 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4817 tree r_constructor = build_constructor (rtype, v);
4818 rstmt = gimple_build_assign (lhs, r_constructor);
4821 gcc_assert (rstmt);
4823 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4824 gsi_replace (&rgsi, rstmt, true);
4827 /* Generate vector code for all SLP instances in the loop/basic block. */
4829 void
4830 vect_schedule_slp (vec_info *vinfo)
4832 vec<slp_instance> slp_instances;
4833 slp_instance instance;
4834 unsigned int i;
4836 slp_instances = vinfo->slp_instances;
4837 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4839 slp_tree node = SLP_INSTANCE_TREE (instance);
4840 if (dump_enabled_p ())
4842 dump_printf_loc (MSG_NOTE, vect_location,
4843 "Vectorizing SLP tree:\n");
4844 if (SLP_INSTANCE_ROOT_STMT (instance))
4845 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
4846 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
4847 vect_print_slp_graph (MSG_NOTE, vect_location,
4848 SLP_INSTANCE_TREE (instance));
4850 /* Schedule the tree of INSTANCE. */
4851 vect_schedule_slp_instance (vinfo, node, instance);
4853 if (SLP_INSTANCE_ROOT_STMT (instance))
4854 vectorize_slp_instance_root_stmt (node, instance);
4856 if (dump_enabled_p ())
4857 dump_printf_loc (MSG_NOTE, vect_location,
4858 "vectorizing stmts using SLP.\n");
4861 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4863 slp_tree root = SLP_INSTANCE_TREE (instance);
4864 stmt_vec_info store_info;
4865 unsigned int j;
4867 /* For reductions set the latch values of the vectorized PHIs. */
4868 if (instance->reduc_phis
4869 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4870 (instance->reduc_phis)) != FOLD_LEFT_REDUCTION
4871 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4872 (instance->reduc_phis)) != EXTRACT_LAST_REDUCTION)
4874 slp_tree slp_node = root;
4875 slp_tree phi_node = instance->reduc_phis;
4876 gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
4877 edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
4878 gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
4879 == SLP_TREE_VEC_STMTS (slp_node).length ());
4880 for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
4881 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]),
4882 vect_get_slp_vect_def (slp_node, j),
4883 e, gimple_phi_arg_location (phi, e->dest_idx));
4886 /* Remove scalar call stmts. Do not do this for basic-block
4887 vectorization as not all uses may be vectorized.
4888 ??? Why should this be necessary? DCE should be able to
4889 remove the stmts itself.
4890 ??? For BB vectorization we can as well remove scalar
4891 stmts starting from the SLP tree root if they have no
4892 uses. */
4893 if (is_a <loop_vec_info> (vinfo))
4894 vect_remove_slp_scalar_calls (vinfo, root);
4896 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4898 if (!STMT_VINFO_DATA_REF (store_info))
4899 break;
4901 if (SLP_INSTANCE_ROOT_STMT (instance))
4902 continue;
4904 store_info = vect_orig_stmt (store_info);
4905 /* Free the attached stmt_vec_info and remove the stmt. */
4906 vinfo->remove_stmt (store_info);