/* gcc/tree-vect-slp.c  */
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
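
/* Note: the STMT_VINFO_NUM_SLP_USES increments above pair with the
   decrements in vect_free_slp_tree.  The count tracks how many SLP nodes
   still reference a scalar stmt; vect_get_and_check_slp_defs relies on it
   to refuse swapping the operands of a stmt that is already shared by
   another SLP node.  */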
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
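
/* Illustrative example: for an interleaving chain that accesses
   a[0], a[1] and a[3] (so DR_GROUP_GAP is 1 for a[1] and 2 for a[3]),
   the place of the a[3] access is 0 + 1 + 2 == 3, i.e. its element
   offset from the start of the chain.  */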
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (GET_MODE_SIZE (vinfo->vector_mode), elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
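
/* Worked example for the selectors built above: with nelts == 4 they
   expand to
     sel1 = { 0, 4, 1, 5 }	(interleave of the two low halves)
     sel2 = { 2, 6, 3, 7 }	(interleave of the two high halves)
   which lets duplicate_and_interleave spread COUNT scalars across the
   lanes of NVECTORS vectors by repeated interleaving.  */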
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates that a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond and the operands of its comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of its comparison needs
   to be inverted.  If there is any operand swap in this function, *SWAP is
   set to a non-zero value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from the first
         stmt's.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;
  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          /* For the swapping logic below force vect_reduction_def
             for the reduction op in a SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             end in the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && !STMT_VINFO_DATA_REF (stmt_info)
                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                    && def_stmt_info
                    && !STMT_VINFO_DATA_REF (def_stmt_info)
                    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                        == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type)
              || (!STMT_VINFO_DATA_REF (stmt_info)
                  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  && ((!def_stmt_info
                       || STMT_VINFO_DATA_REF (def_stmt_info)
                       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                           != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                      != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_external_def:
          /* Make sure to demote the overall operand to external.  */
          oprnd_info->first_dt = vect_external_def;
          /* Fallthru.  */
        case vect_constant_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_internal_def:
        case vect_reduction_def:
          if (oprnd_info->first_dt == vect_reduction_def
              && !STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && !STMT_VINFO_DATA_REF (def_stmt_info)
              && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              /* For a SLP reduction chain we want to duplicate the
                 reduction to each of the chain members.  That gets
                 us a sane SLP graph (still the stmts are not 100%
                 correct wrt the initial values).  */
              gcc_assert (!first);
              oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
              oprnd_info->ops.quick_push (oprnd_info->ops[0]);
              break;
            }
          /* Fallthru.  */
        case vect_induction_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }
  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          /* If there are already uses of this stmt in a SLP instance then
             we've committed to the operand order and can't swap it.  */
          if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: cannot swap operands of "
                                 "shared stmt %G", stmt_info->stmt);
              return -1;
            }

          /* To get rid of this swapping we have to move the stmt code
             to the SLP tree as well (and gather it here per stmt).  */
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              if (STMT_VINFO_REDUC_IDX (stmt_info) == 1)
                STMT_VINFO_REDUC_IDX (stmt_info) = 2;
              else if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
                STMT_VINFO_REDUC_IDX (stmt_info) = 1;
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          /* Commutative ops need not reflect swapping, ops are in
             the SLP tree.  */
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}
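
/* Example of the operand swapping done above: for the group
   { x0 = a0 + b0;  x1 = b1 + a1; } the defs of operand zero of the
   second stmt do not match the first, so the commutative operands of
   the second stmt are recorded swapped in OPRNDS_INFO (the IL itself is
   only rewritten in the cond_expr case).  */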
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
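
/* For example, a basic-block SLP group of two ints with a four-lane
   vector type would require unrolling (four lanes for two scalars),
   which is not possible for a basic block, so the build is rejected
   above.  */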
/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
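
/* Worked example: for the two-operator group { a0 + b0, a1 - b1 } with
   GROUP_SIZE == 2 and a four-lane vectype, the selector built above is
   { 0, 5, 2, 7 }: even result lanes come from the vectorized PLUS and
   odd lanes from the vectorized MINUS, the classic addsub blend.  */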
/* Verify whether the scalar stmts STMTS are isomorphic, whether they
   require data permutation, and whether they are of supported types of
   operation.  Return true if they can form an SLP node, otherwise return
   false and indicate in *MATCHES which stmts are not isomorphic to the
   first one.  If MATCHES[0] is false then this indicates that the
   comparison could not be carried out or that the stmts will never be
   vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set
   SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the
   code of its comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can
   be swapped to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                        (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }
      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }
      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Not a grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}
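
/* The BST_MAP memoizes both successful and failed builds for a given
   scalar stmt vector: a recorded NULL short-circuits rebuilds that are
   known to fail, while a recorded node is re-used with its refcount
   bumped, so the SLP "tree" is really a DAG with shared subtrees.  */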
/* Recursively build an SLP tree for the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   data permutation, or are of unsupported types of operation.  On
   success return the built SLP (sub-)tree; *MATCHES indicates which
   stmts matched the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;
  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          return node;
        }
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }
  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* A COND_EXPR eventually has one operand too many if the
             condition is an SSA name.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (is_a <bb_vec_info> (vinfo)
              && !SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }
      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }
      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was not more
             than a cut-off, as re-trying the recursive match on each
             level of the tree could expose exponential behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If we have all children of child built up from scalars then
                 just throw that away and build it up this node from scalars.  */
              if (is_a <bb_vec_info> (vinfo)
                  && !SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands from "
                                         "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }
fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
                   estimated_poly_value (node->max_nunits));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        dump_printf (metadata, "%T%s ", op,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
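
/* Illustrative example: if every load node carries the load permutation
   { 2, 0, 3, 1 }, the scalar stmts of all nodes are rearranged so that
   stmt I moves to position permutation[I]; afterwards the loads appear
   in element order and their permutations can usually be dropped.  */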
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
        return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
              group_info = DR_GROUP_FIRST_ELEMENT (group_info);
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of DR_GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "BB vectorization with gaps at the end of "
                                     "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                     vect_location,
                                     "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }
1900 /* For loop vectorization verify we can generate the permutation. Be
1901 conservative about the vectorization factor; there are permutations
1902 that will use three vector inputs only starting from a specific factor
1903 and the vectorization factor is not yet final.
1904 ??? The SLP instance unrolling factor might not be the maximum one. */
1905 unsigned n_perms;
1906 poly_uint64 test_vf
1907 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1908 LOOP_VINFO_VECT_FACTOR
1909 (STMT_VINFO_LOOP_VINFO (stmt_info)));
1910 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1911 if (node->load_permutation.exists ()
1912 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1913 slp_instn, true, &n_perms))
1914 return false;
1916 return true;
1920 /* Find the last scalar statement in the SLP tree rooted at NODE. */
1922 stmt_vec_info
1923 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1925 stmt_vec_info last = NULL;
1926 stmt_vec_info stmt_vinfo;
1928 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1930 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1931 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1934 return last;
1937 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1938 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1939 (also containing the first GROUP1_SIZE stmts, since stores are
1940 consecutive), the second containing the remainder.
1941 Return the first stmt in the second group. */
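/* For example, splitting a group of size 7 with GROUP1_SIZE == 4 leaves a
   second group of size 3: the second group's DR_GROUP_GAP becomes the
   original gap plus 4 (skipping over the first group), while the first
   group's DR_GROUP_GAP grows by 3 (skipping over the second group).  */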
1943 static stmt_vec_info
1944 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1946 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1947 gcc_assert (group1_size > 0);
1948 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1949 gcc_assert (group2_size > 0);
1950 DR_GROUP_SIZE (first_vinfo) = group1_size;
1952 stmt_vec_info stmt_info = first_vinfo;
1953 for (unsigned i = group1_size; i > 1; i--)
1955 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1956 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1958 /* STMT is now the last element of the first group. */
1959 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1960 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1962 DR_GROUP_SIZE (group2) = group2_size;
1963 for (stmt_info = group2; stmt_info;
1964 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1966 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1967 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1970 /* For the second group, the DR_GROUP_GAP is that before the original group,
1971 plus skipping over the first group. */
1972 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1974 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1975 DR_GROUP_GAP (first_vinfo) += group2_size;
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1979 group1_size, group2_size);
1981 return group2;
1984 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1985 statements and a vector of NUNITS elements. */
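/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple is
   12, so the SLP instance has to be unrolled 12 / 6 == 2 times to fill
   whole vectors.  */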
1987 static poly_uint64
1988 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1990 return exact_div (common_multiple (nunits, group_size), group_size);
1993 /* Analyze an SLP instance starting from a group of grouped stores. Call
1994 vect_build_slp_tree to build a tree of packed stmts if possible.
1995 Return FALSE if it's impossible to SLP any stmt in the loop. */
1997 static bool
1998 vect_analyze_slp_instance (vec_info *vinfo,
1999 stmt_vec_info stmt_info, unsigned max_tree_size)
2001 slp_instance new_instance;
2002 slp_tree node;
2003 unsigned int group_size;
2004 tree vectype, scalar_type = NULL_TREE;
2005 unsigned int i;
2006 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2007 vec<stmt_vec_info> scalar_stmts;
2008 bool constructor = false;
2010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2012 scalar_type = TREE_TYPE (DR_REF (dr));
2013 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2014 group_size = DR_GROUP_SIZE (stmt_info);
2016 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2018 gcc_assert (is_a <loop_vec_info> (vinfo));
2019 vectype = STMT_VINFO_VECTYPE (stmt_info);
2020 group_size = REDUC_GROUP_SIZE (stmt_info);
2022 else if (is_gimple_assign (stmt_info->stmt)
2023 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2025 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2026 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2027 constructor = true;
2029 else
2031 gcc_assert (is_a <loop_vec_info> (vinfo));
2032 vectype = STMT_VINFO_VECTYPE (stmt_info);
2033 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2036 if (!vectype)
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 "Build SLP failed: unsupported data-type %T\n",
2041 scalar_type);
2043 return false;
2045 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2047 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2048 scalar_stmts.create (group_size);
2049 stmt_vec_info next_info = stmt_info;
2050 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2053 while (next_info)
2055 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2056 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2059 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2061 /* Collect the reduction stmts and store them in
2062 SLP_TREE_SCALAR_STMTS. */
2063 while (next_info)
2065 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2066 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2068 /* Mark the first element of the reduction chain as reduction to properly
2069 transform the node. In the reduction analysis phase only the last
2070 element of the chain is marked as reduction. */
2071 STMT_VINFO_DEF_TYPE (stmt_info)
2072 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2073 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2074 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2076 else if (constructor)
2078 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2079 tree val;
2080 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2082 if (TREE_CODE (val) == SSA_NAME)
2084 gimple* def = SSA_NAME_DEF_STMT (val);
2085 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2086 /* Value is defined in another basic block. */
2087 if (!def_info)
2088 return false;
2089 scalar_stmts.safe_push (def_info);
2091 else
2092 return false;
2095 else
2097 /* Collect reduction statements. */
2098 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2099 for (i = 0; reductions.iterate (i, &next_info); i++)
2100 scalar_stmts.safe_push (next_info);
2103 /* Build the tree for the SLP instance. */
2104 bool *matches = XALLOCAVEC (bool, group_size);
2105 unsigned npermutes = 0;
2106 scalar_stmts_to_slp_tree_map_t *bst_map
2107 = new scalar_stmts_to_slp_tree_map_t ();
2108 poly_uint64 max_nunits = nunits;
2109 unsigned tree_size = 0;
2110 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2111 &max_nunits, matches, &npermutes,
2112 &tree_size, bst_map);
2113 /* The map keeps a reference on SLP nodes built, release that. */
2114 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2115 it != bst_map->end (); ++it)
2116 if ((*it).second)
2117 vect_free_slp_tree ((*it).second, false);
2118 delete bst_map;
2119 if (node != NULL)
2121 /* If this is a reduction chain with a conversion in front
2122 amend the SLP tree with a node for that. */
2123 if (!dr
2124 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2125 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2127 /* Get at the conversion stmt - we know it's the single use
2128 of the last stmt of the reduction chain. */
2129 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2130 use_operand_p use_p;
2131 gimple *use_stmt;
2132 bool r = single_imm_use (gimple_assign_lhs (tem), &use_p, &use_stmt);
2133 gcc_assert (r);
2134 next_info = vinfo->lookup_stmt (use_stmt);
2135 next_info = vect_stmt_to_vectorize (next_info);
2136 scalar_stmts = vNULL;
2137 scalar_stmts.create (group_size);
2138 for (unsigned i = 0; i < group_size; ++i)
2139 scalar_stmts.quick_push (next_info);
2140 slp_tree conv = vect_create_new_slp_node (scalar_stmts);
2141 SLP_TREE_CHILDREN (conv).quick_push (node);
2142 node = conv;
2143 /* We also have to fake this conversion stmt as SLP reduction group
2144 so we don't have to mess with too much code elsewhere. */
2145 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2146 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2149 /* Calculate the unrolling factor based on the smallest type. */
2150 poly_uint64 unrolling_factor
2151 = calculate_unrolling_factor (max_nunits, group_size);
2153 if (maybe_ne (unrolling_factor, 1U)
2154 && is_a <bb_vec_info> (vinfo))
2156 unsigned HOST_WIDE_INT const_max_nunits;
2157 if (!max_nunits.is_constant (&const_max_nunits)
2158 || const_max_nunits > group_size)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "Build SLP failed: store group "
2163 "size not a multiple of the vector size "
2164 "in basic block SLP\n");
2165 vect_free_slp_tree (node, false);
2166 return false;
2168 /* Fatal mismatch. */
2169 matches[group_size / const_max_nunits * const_max_nunits] = false;
2170 vect_free_slp_tree (node, false);
2172 else
2174 /* Create a new SLP instance. */
2175 new_instance = XNEW (class _slp_instance);
2176 SLP_INSTANCE_TREE (new_instance) = node;
2177 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2178 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2179 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2180 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2182 vect_gather_slp_loads (new_instance, node);
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "SLP size %u vs. limit %u.\n",
2186 tree_size, max_tree_size);
2188 /* Compute the load permutation. */
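/* Each load node records, for every scalar load, its position in the
   interleaving chain rooted at DR_GROUP_FIRST_ELEMENT.  E.g. loads of
   a[2], a[3], a[0], a[1] yield the permutation {2, 3, 0, 1}; since the
   positions differ from the element indices the node is marked as
   permuted.  */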
2189 slp_tree load_node;
2190 bool loads_permuted = false;
2191 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2193 vec<unsigned> load_permutation;
2194 int j;
2195 stmt_vec_info load_info;
2196 bool this_load_permuted = false;
2197 load_permutation.create (group_size);
2198 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2199 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2200 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2202 int load_place = vect_get_place_in_interleaving_chain
2203 (load_info, first_stmt_info);
2204 gcc_assert (load_place != -1);
2205 if (load_place != j)
2206 this_load_permuted = true;
2207 load_permutation.safe_push (load_place);
2209 if (!this_load_permuted
2210 /* The load requires permutation when unrolling exposes
2211 a gap either because the group is larger than the SLP
2212 group-size or because there is a gap between the groups. */
2213 && (known_eq (unrolling_factor, 1U)
2214 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2215 && DR_GROUP_GAP (first_stmt_info) == 0)))
2217 load_permutation.release ();
2218 continue;
2220 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2221 loads_permuted = true;
2224 if (loads_permuted)
2226 if (!vect_supported_load_permutation_p (new_instance))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "Build SLP failed: unsupported load "
2231 "permutation %G", stmt_info->stmt);
2232 vect_free_slp_instance (new_instance, false);
2233 return false;
2237 /* If the loads and stores can be handled with load/store-lanes
2238 instructions, do not generate this SLP instance. */
2239 if (is_a <loop_vec_info> (vinfo)
2240 && loads_permuted
2241 && dr && vect_store_lanes_supported (vectype, group_size, false))
2243 slp_tree load_node;
2244 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2246 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2247 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2248 /* Use SLP for strided accesses (or if we can't load-lanes). */
2249 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2250 || ! vect_load_lanes_supported
2251 (STMT_VINFO_VECTYPE (stmt_vinfo),
2252 DR_GROUP_SIZE (stmt_vinfo), false))
2253 break;
2255 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "Built SLP cancelled: can use "
2260 "load/store-lanes\n");
2261 vect_free_slp_instance (new_instance, false);
2262 return false;
2266 vinfo->slp_instances.safe_push (new_instance);
2268 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE, vect_location,
2271 "Final SLP tree for instance:\n");
2272 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2275 return true;
2278 else
2280 /* Failed to SLP. */
2281 /* Free the allocated memory. */
2282 scalar_stmts.release ();
2285 /* For basic block SLP, try to break the group up into multiples of the
2286 vector size. */
2287 unsigned HOST_WIDE_INT const_nunits;
2288 if (is_a <bb_vec_info> (vinfo)
2289 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2290 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2291 && nunits.is_constant (&const_nunits))
2293 /* We consider breaking the group only on VF boundaries from the existing
2294 start. */
2295 for (i = 0; i < group_size; i++)
2296 if (!matches[i]) break;
2298 if (i >= const_nunits && i < group_size)
2300 /* Split into two groups at the first vector boundary before i. */
2301 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2302 unsigned group1_size = i & ~(const_nunits - 1);
2304 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2305 group1_size);
2306 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2307 max_tree_size);
2308 /* If the first non-match was in the middle of a vector,
2309 skip the rest of that vector. */
2310 if (group1_size < i)
2312 i = group1_size + const_nunits;
2313 if (i < group_size)
2314 rest = vect_split_slp_store_group (rest, const_nunits);
2316 if (i < group_size)
2317 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2318 return res;
2320 /* Even though the first vector did not all match, we might be able to SLP
2321 (some) of the remainder. FORNOW ignore this possibility. */
2324 return false;
2328 /* Check if there are stmts in the loop that can be vectorized using SLP.
2329 Build SLP trees of packed scalar stmts if SLP is possible. */
2331 opt_result
2332 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2334 unsigned int i;
2335 stmt_vec_info first_element;
2337 DUMP_VECT_SCOPE ("vect_analyze_slp");
2339 /* Find SLP sequences starting from groups of grouped stores. */
2340 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2341 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2343 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2345 if (loop_vinfo->reduction_chains.length () > 0)
2347 /* Find SLP sequences starting from reduction chains. */
2348 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2349 if (! vect_analyze_slp_instance (vinfo, first_element,
2350 max_tree_size))
2352 /* Dissolve reduction chain group. */
2353 stmt_vec_info vinfo = first_element;
2354 stmt_vec_info last = NULL;
2355 while (vinfo)
2357 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2358 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2359 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2360 last = vinfo;
2361 vinfo = next;
2363 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2364 /* It can still be vectorized as part of an SLP reduction. */
2365 loop_vinfo->reductions.safe_push (last);
2369 /* Find SLP sequences starting from groups of reductions. */
2370 if (loop_vinfo->reductions.length () > 1)
2371 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2372 max_tree_size);
2375 return opt_result::success ();
2379 /* For each possible SLP instance decide whether to SLP it and calculate the
2380 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2381 to SLP at least one instance. */
2383 bool
2384 vect_make_slp_decision (loop_vec_info loop_vinfo)
2386 unsigned int i;
2387 poly_uint64 unrolling_factor = 1;
2388 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2389 slp_instance instance;
2390 int decided_to_slp = 0;
2392 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2394 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2396 /* FORNOW: SLP if you can. */
2397 /* All unroll factors have the form:
2399 GET_MODE_SIZE (vinfo->vector_mode) * X
2401 for some rational X, so they must have a common multiple. */
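/* E.g. instances with unrolling factors 2 and 4 result in a loop
   unrolling factor of 4.  */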
2402 unrolling_factor
2403 = force_common_multiple (unrolling_factor,
2404 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2406 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2407 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2408 loop-based vectorization. Such stmts will be marked as HYBRID. */
2409 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2410 decided_to_slp++;
2413 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2415 if (decided_to_slp && dump_enabled_p ())
2417 dump_printf_loc (MSG_NOTE, vect_location,
2418 "Decided to SLP %d instances. Unrolling factor ",
2419 decided_to_slp);
2420 dump_dec (MSG_NOTE, unrolling_factor);
2421 dump_printf (MSG_NOTE, "\n");
2424 return (decided_to_slp > 0);
2428 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2429 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2431 static void
2432 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2433 hash_map<slp_tree, unsigned> &visited)
2435 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2436 imm_use_iterator imm_iter;
2437 gimple *use_stmt;
2438 stmt_vec_info use_vinfo;
2439 slp_tree child;
2440 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2441 int j;
2443 /* We need to union stype over the incoming graph edges but we still
2444 want to limit recursion to stay O(N+E). */
2445 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
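/* Only the last visit (when the visit count reaches the node's refcnt)
   re-examines the uses below and recurses into the children; earlier
   visits merely merge STYPE into the stmt's SLP type.  */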
2447 /* Propagate hybrid down the SLP tree. */
2448 if (stype == hybrid)
2450 else if (HYBRID_SLP_STMT (stmt_vinfo))
2451 stype = hybrid;
2452 else if (!only_edge)
2454 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2455 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2456 /* If we get a pattern stmt here we have to use the LHS of the
2457 original stmt for immediate uses. */
2458 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2459 tree def;
2460 if (gimple_code (stmt) == GIMPLE_PHI)
2461 def = gimple_phi_result (stmt);
2462 else
2463 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2464 if (def)
2465 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2467 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2468 if (!use_vinfo)
2469 continue;
2470 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2471 if (!STMT_SLP_TYPE (use_vinfo)
2472 && (STMT_VINFO_RELEVANT (use_vinfo)
2473 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2474 && !(gimple_code (use_stmt) == GIMPLE_PHI
2475 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2477 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2479 "def in non-SLP stmt: %G", use_stmt);
2480 stype = hybrid;
2485 if (stype == hybrid
2486 && !HYBRID_SLP_STMT (stmt_vinfo))
2488 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2490 stmt_vinfo->stmt);
2491 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2494 if (!only_edge)
2495 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2496 if (SLP_TREE_DEF_TYPE (child) != vect_external_def
2497 && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
2498 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2501 static void
2502 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2504 hash_map<slp_tree, unsigned> visited;
2505 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2508 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2510 static tree
2511 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2513 walk_stmt_info *wi = (walk_stmt_info *)data;
2514 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2516 if (wi->is_lhs)
2517 return NULL_TREE;
2519 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2520 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2522 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2524 def_stmt_info->stmt);
2525 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2528 return NULL_TREE;
2531 static tree
2532 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2533 walk_stmt_info *wi)
2535 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2536 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2537 /* If the stmt is in an SLP instance then this isn't a reason
2538 to mark use definitions in other SLP instances as hybrid. */
2539 if (! STMT_SLP_TYPE (use_vinfo)
2540 && (STMT_VINFO_RELEVANT (use_vinfo)
2541 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2542 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2543 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2545 else
2546 *handled = true;
2547 return NULL_TREE;
2550 /* Find stmts that must be both vectorized and SLPed. */
2552 void
2553 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2555 unsigned int i;
2556 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2557 slp_instance instance;
2559 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2561 /* First walk all pattern stmts in the loop and mark defs of uses as
2562 hybrid because immediate uses in them are not recorded. */
2563 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2565 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2566 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2567 gsi_next (&gsi))
2569 gimple *stmt = gsi_stmt (gsi);
2570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2571 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2573 walk_stmt_info wi;
2574 memset (&wi, 0, sizeof (wi));
2575 wi.info = loop_vinfo;
2576 gimple_stmt_iterator gsi2
2577 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2578 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2579 vect_detect_hybrid_slp_1, &wi);
2580 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2581 vect_detect_hybrid_slp_2,
2582 vect_detect_hybrid_slp_1, &wi);
2587 /* Then walk the SLP instance trees marking stmts with uses in
2588 non-SLP stmts as hybrid, also propagating hybrid down the
2589 SLP tree, collecting the above info on-the-fly. */
2590 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2592 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2593 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2594 i, pure_slp);
2599 /* Initialize a bb_vec_info struct for the statements between
2600 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2602 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2603 gimple_stmt_iterator region_end_in,
2604 vec_info_shared *shared)
2605 : vec_info (vec_info::bb, init_cost (NULL), shared),
2606 bb (gsi_bb (region_begin_in)),
2607 region_begin (region_begin_in),
2608 region_end (region_end_in)
2610 gimple_stmt_iterator gsi;
2612 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2613 gsi_next (&gsi))
2615 gimple *stmt = gsi_stmt (gsi);
2616 gimple_set_uid (stmt, 0);
2617 add_stmt (stmt);
2620 bb->aux = this;
2624 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2625 stmts in the basic block. */
2627 _bb_vec_info::~_bb_vec_info ()
2629 for (gimple_stmt_iterator si = region_begin;
2630 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2631 /* Reset region marker. */
2632 gimple_set_uid (gsi_stmt (si), -1);
2634 bb->aux = NULL;
2637 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2638 given that child nodes have already been processed, and that
2639 their def types currently match their SLP node's def type. */
2641 static bool
2642 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2643 slp_instance node_instance,
2644 stmt_vector_for_cost *cost_vec)
2646 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2647 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2649 /* For BB vectorization vector types are assigned here.
2650 Memory accesses already got their vector type assigned
2651 in vect_analyze_data_refs. */
2652 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2653 if (bb_vinfo
2654 && ! STMT_VINFO_DATA_REF (stmt_info))
2656 tree vectype, nunits_vectype;
2657 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2658 &nunits_vectype))
2659 /* We checked this when building the node. */
2660 gcc_unreachable ();
2661 if (vectype == boolean_type_node)
2663 vectype = vect_get_mask_type_for_stmt (stmt_info);
2664 if (!vectype)
2665 /* vect_get_mask_type_for_stmt has already explained the
2666 failure. */
2667 return false;
2670 stmt_vec_info sstmt_info;
2671 unsigned int i;
2672 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2673 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2676 /* Calculate the number of vector statements to be created for the
2677 scalar stmts in this node. For SLP reductions it is equal to the
2678 number of vector statements in the children (which has already been
2679 calculated by the recursive call). Otherwise it is the number of
2680 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2681 VF divided by the number of elements in a vector. */
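/* For example, with DR_GROUP_SIZE == 6, VF == 4 and a V8HI vector type
   we need 6 * 4 / 8 == 3 vector statements.  */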
2682 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2683 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2685 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2686 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2688 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2689 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2690 break;
2693 else
2695 poly_uint64 vf;
2696 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2697 vf = loop_vinfo->vectorization_factor;
2698 else
2699 vf = 1;
2700 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2701 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2702 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2703 = vect_get_num_vectors (vf * group_size, vectype);
2706 bool dummy;
2707 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2710 /* Try to build NODE from scalars, returning true on success.
2711 NODE_INSTANCE is the SLP instance that contains NODE. */
2713 static bool
2714 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2715 slp_instance node_instance)
2717 stmt_vec_info stmt_info;
2718 unsigned int i;
2720 if (!is_a <bb_vec_info> (vinfo)
2721 || node == SLP_INSTANCE_TREE (node_instance)
2722 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2723 return false;
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "Building vector operands from scalars instead\n");
2729 /* Don't remove and free the child nodes here, since they could be
2730 referenced by other structures. The analysis and scheduling phases
2731 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2732 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2733 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2734 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2735 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2737 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2738 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2740 return true;
2743 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2744 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2746 Return true if the operations are supported. */
2748 static bool
2749 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2750 slp_instance node_instance,
2751 scalar_stmts_to_slp_tree_map_t *visited,
2752 scalar_stmts_to_slp_tree_map_t *lvisited,
2753 stmt_vector_for_cost *cost_vec)
2755 int i, j;
2756 slp_tree child;
2758 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2759 return true;
2761 /* If we already analyzed the exact same set of scalar stmts we're done.
2762 We share the generated vector stmts for those. */
2763 slp_tree *leader;
2764 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2765 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2767 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2768 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2769 /* Cope with cases in which we made a late decision to build the
2770 node from scalars. */
2771 if (SLP_TREE_DEF_TYPE (*leader) == vect_external_def
2772 && vect_slp_convert_to_external (vinfo, node, node_instance))
2774 else
2775 gcc_assert (SLP_TREE_DEF_TYPE (node) == SLP_TREE_DEF_TYPE (*leader));
2776 return true;
2779 /* The SLP graph is acyclic so not caching whether we failed or succeeded
2780 doesn't result in any issue since we throw away the lvisited set
2781 when we fail. */
2782 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2784 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2785 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2786 visited, lvisited, cost_vec))
2787 return false;
2789 /* ??? We have to catch late the case where the same first scalar stmt
2790 appears in multiple SLP children with different def types, and fail.
2791 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2792 necessarily match them when it is vect_internal_def. */
2793 auto_vec<vect_def_type, 4> dt;
2794 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2795 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2796 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2797 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2799 /* Push SLP node def-type to stmt operands. */
2800 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2801 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2802 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2803 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2804 = SLP_TREE_DEF_TYPE (child);
2806 /* Check everything worked out. */
2807 bool res = true;
2808 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2809 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2811 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2813 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2814 != SLP_TREE_DEF_TYPE (child))
2815 res = false;
2817 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2818 != dt[j])
2819 res = false;
2821 if (!res && dump_enabled_p ())
2822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2823 "not vectorized: same operand with different "
2824 "def type in stmt.\n");
2826 if (res)
2827 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2828 cost_vec);
2830 /* Restore def-types. */
2831 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2832 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2833 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2835 /* If this node can't be vectorized, try pruning the tree here rather
2836 than felling the whole thing. */
2837 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2838 res = true;
2840 return res;
2844 /* Analyze statements in SLP instances of VINFO. Return true if the
2845 operations are supported. */
2847 bool
2848 vect_slp_analyze_operations (vec_info *vinfo)
2850 slp_instance instance;
2851 int i;
2853 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2855 scalar_stmts_to_slp_tree_map_t *visited
2856 = new scalar_stmts_to_slp_tree_map_t ();
2857 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2859 scalar_stmts_to_slp_tree_map_t lvisited;
2860 stmt_vector_for_cost cost_vec;
2861 cost_vec.create (2);
2862 if (!vect_slp_analyze_node_operations (vinfo,
2863 SLP_INSTANCE_TREE (instance),
2864 instance, visited, &lvisited,
2865 &cost_vec))
2867 slp_tree node = SLP_INSTANCE_TREE (instance);
2868 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2869 if (dump_enabled_p ())
2870 dump_printf_loc (MSG_NOTE, vect_location,
2871 "removing SLP instance operations starting from: %G",
2872 stmt_info->stmt);
2873 vect_free_slp_instance (instance, false);
2874 vinfo->slp_instances.ordered_remove (i);
2875 cost_vec.release ();
2877 else
2879 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2880 x != lvisited.end(); ++x)
2881 visited->put ((*x).first.copy (), (*x).second);
2882 i++;
2884 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2885 cost_vec.release ();
2888 delete visited;
2890 return !vinfo->slp_instances.is_empty ();
2894 /* Compute the scalar cost of the SLP node NODE and its children
2895 and record it in COST_VEC. Do not account defs that are marked in LIFE
2896 and update LIFE according to uses of NODE. */
2898 static void
2899 vect_bb_slp_scalar_cost (basic_block bb,
2900 slp_tree node, vec<bool, va_heap> *life,
2901 stmt_vector_for_cost *cost_vec,
2902 hash_set<slp_tree> &visited)
2904 unsigned i;
2905 stmt_vec_info stmt_info;
2906 slp_tree child;
2908 if (visited.add (node))
2909 return;
2911 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2913 gimple *stmt = stmt_info->stmt;
2914 vec_info *vinfo = stmt_info->vinfo;
2915 ssa_op_iter op_iter;
2916 def_operand_p def_p;
2918 if ((*life)[i])
2919 continue;
2921 /* If there is a non-vectorized use of the defs then the scalar
2922 stmt is kept live in which case we do not account it or any
2923 required defs in the SLP children in the scalar cost. This
2924 way we make the vectorization more costly when compared to
2925 the scalar cost. */
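/* For example, if the result of an SLP addition also feeds a stmt
   outside the SLP instance, the scalar addition has to be kept anyway,
   so its cost is not credited to the scalar side.  */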
2926 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2928 imm_use_iterator use_iter;
2929 gimple *use_stmt;
2930 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2931 if (!is_gimple_debug (use_stmt))
2933 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2934 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2936 (*life)[i] = true;
2937 BREAK_FROM_IMM_USE_STMT (use_iter);
2941 if ((*life)[i])
2942 continue;
2944 /* Count scalar stmts only once. */
2945 if (gimple_visited_p (stmt))
2946 continue;
2947 gimple_set_visited (stmt, true);
2949 vect_cost_for_stmt kind;
2950 if (STMT_VINFO_DATA_REF (stmt_info))
2952 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2953 kind = scalar_load;
2954 else
2955 kind = scalar_store;
2957 else if (vect_nop_conversion_p (stmt_info))
2958 continue;
2959 else
2960 kind = scalar_stmt;
2961 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2964 auto_vec<bool, 20> subtree_life;
2965 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2967 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2969 /* Do not directly pass LIFE to the recursive call, copy it to
2970 confine changes in the callee to the current child/subtree. */
2971 subtree_life.safe_splice (*life);
2972 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2973 visited);
2974 subtree_life.truncate (0);
2979 static void
2980 vect_bb_slp_scalar_cost (basic_block bb,
2981 slp_tree node, vec<bool, va_heap> *life,
2982 stmt_vector_for_cost *cost_vec)
2984 hash_set<slp_tree> visited;
2985 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2988 /* Check if vectorization of the basic block is profitable. */
2990 static bool
2991 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2993 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2994 slp_instance instance;
2995 int i;
2996 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2997 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2999 /* Calculate scalar cost. */
3000 stmt_vector_for_cost scalar_costs;
3001 scalar_costs.create (0);
3002 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3004 auto_vec<bool, 20> life;
3005 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3006 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3007 SLP_INSTANCE_TREE (instance),
3008 &life, &scalar_costs);
3010 void *target_cost_data = init_cost (NULL);
3011 add_stmt_costs (target_cost_data, &scalar_costs);
3012 scalar_costs.release ();
3013 unsigned dummy;
3014 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3015 destroy_cost_data (target_cost_data);
3017 /* Unset visited flag. */
3018 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3019 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3020 gimple_set_visited (gsi_stmt (gsi), false);
3022 /* Complete the target-specific cost calculation. */
3023 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3024 &vec_inside_cost, &vec_epilogue_cost);
3026 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3028 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3031 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3032 vec_inside_cost);
3033 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3034 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3035 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3038 /* Vectorization is profitable if its cost is no more than the cost of the
3039 scalar version. Note that we err on the vector side for equal cost because
3040 the cost estimate is otherwise quite pessimistic (constant uses are
3041 free on the scalar side but cost a load on the vector side for
3042 example). */
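/* E.g. with a prologue cost of 2, an inside cost of 6 and a scalar cost
   of 8, 2 + 6 > 8 is false and the block is vectorized.  */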
3043 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3044 return false;
3046 return true;
3049 /* Find any vectorizable constructors and add them to the grouped_stores
3050 array. */
3052 static void
3053 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3055 gimple_stmt_iterator gsi;
3057 for (gsi = bb_vinfo->region_begin;
3058 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3060 gimple *stmt = gsi_stmt (gsi);
3062 if (is_gimple_assign (stmt)
3063 && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
3064 && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
3065 && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE)
3067 tree rhs = gimple_assign_rhs1 (stmt);
3069 if (CONSTRUCTOR_NELTS (rhs) == 0)
3070 continue;
3072 poly_uint64 subparts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs));
3074 if (maybe_ne (subparts, CONSTRUCTOR_NELTS (rhs)))
3075 continue;
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "Found vectorizable constructor: %G\n", stmt);
3080 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3081 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3086 /* Check if the region described by BB_VINFO can be vectorized, returning
3087 true if so. When returning false, set FATAL to true if the same failure
3088 would prevent vectorization at other vector sizes, false if it is still
3089 worth trying other sizes. N_STMTS is the number of statements in the
3090 region. */
3092 static bool
3093 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3095 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3097 slp_instance instance;
3098 int i;
3099 poly_uint64 min_vf = 2;
3101 /* The first group of checks is independent of the vector size. */
3102 fatal = true;
3104 /* Analyze the data references. */
3106 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3108 if (dump_enabled_p ())
3109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3110 "not vectorized: unhandled data-ref in basic "
3111 "block.\n");
3112 return false;
3115 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3117 if (dump_enabled_p ())
3118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3119 "not vectorized: not enough data-refs in "
3120 "basic block.\n");
3121 return false;
3124 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3126 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3128 "not vectorized: unhandled data access in "
3129 "basic block.\n");
3130 return false;
3133 vect_slp_check_for_constructors (bb_vinfo);
3135 /* If there are no grouped stores in the region there is no need
3136 to continue with pattern recog as vect_analyze_slp will fail
3137 anyway. */
3138 if (bb_vinfo->grouped_stores.is_empty ())
3140 if (dump_enabled_p ())
3141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142 "not vectorized: no grouped stores in "
3143 "basic block.\n");
3144 return false;
3147 /* The rest of the analysis below depends on the vector size in some way,
so from here on a failure need not be fatal. */
3148 fatal = false;
3150 vect_pattern_recog (bb_vinfo);
3152 /* Check the SLP opportunities in the basic block, analyze and build SLP
3153 trees. */
3154 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3156 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 "Failed to SLP the basic block.\n");
3160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3161 "not vectorized: failed to find SLP opportunities "
3162 "in basic block.\n");
3164 return false;
3167 vect_record_base_alignments (bb_vinfo);
3169 /* Analyze and verify the alignment of data references and the
3170 dependence in the SLP instances. */
3171 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3173 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3174 || ! vect_slp_analyze_instance_dependence (instance))
3176 slp_tree node = SLP_INSTANCE_TREE (instance);
3177 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_NOTE, vect_location,
3180 "removing SLP instance operations starting from: %G",
3181 stmt_info->stmt);
3182 vect_free_slp_instance (instance, false);
3183 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3184 continue;
3187 /* Mark all the statements that we want to vectorize as pure SLP and
3188 relevant. */
3189 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3190 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3191 if (SLP_INSTANCE_ROOT_STMT (instance))
3192 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3194 i++;
3196 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3197 return false;
3199 if (!vect_slp_analyze_operations (bb_vinfo))
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3203 "not vectorized: bad operation in basic block.\n");
3204 return false;
3207 /* Cost model: check if the vectorization is worthwhile. */
3208 if (!unlimited_cost_model (NULL)
3209 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3211 if (dump_enabled_p ())
3212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3213 "not vectorized: vectorization is not "
3214 "profitable.\n");
3215 return false;
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "Basic block will be vectorized using SLP\n");
3221 return true;
3224 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3225 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3226 on success. The region has N_STMTS statements and has the datarefs
3227 given by DATAREFS. */
3229 static bool
3230 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3231 gimple_stmt_iterator region_end,
3232 vec<data_reference_p> datarefs,
3233 unsigned int n_stmts)
3235 bb_vec_info bb_vinfo;
3236 auto_vector_modes vector_modes;
3238 /* Autodetect the first vector mode we try. */
3239 machine_mode next_vector_mode = VOIDmode;
3240 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3241 unsigned int mode_i = 0;
3243 vec_info_shared shared;
3245 machine_mode autodetected_vector_mode = VOIDmode;
3246 while (1)
3248 bool vectorized = false;
3249 bool fatal = false;
3250 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3252 bool first_time_p = shared.datarefs.is_empty ();
3253 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3254 if (first_time_p)
3255 bb_vinfo->shared->save_datarefs ();
3256 else
3257 bb_vinfo->shared->check_datarefs ();
3258 bb_vinfo->vector_mode = next_vector_mode;
3260 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3261 && dbg_cnt (vect_slp))
3263 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "***** Analysis succeeded with vector mode"
3267 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3268 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3271 bb_vinfo->shared->check_datarefs ();
3272 vect_schedule_slp (bb_vinfo);
3274 unsigned HOST_WIDE_INT bytes;
3275 if (dump_enabled_p ())
3277 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3278 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3279 "basic block part vectorized using %wu byte "
3280 "vectors\n", bytes);
3281 else
3282 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3283 "basic block part vectorized using variable "
3284 "length vectors\n");
3287 vectorized = true;
3289 else
3291 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "***** Analysis failed with vector mode %s\n",
3294 GET_MODE_NAME (bb_vinfo->vector_mode));
3297 if (mode_i == 0)
3298 autodetected_vector_mode = bb_vinfo->vector_mode;
3300 if (!fatal)
3301 while (mode_i < vector_modes.length ()
3302 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_NOTE, vect_location,
3306 "***** The result for vector mode %s would"
3307 " be the same\n",
3308 GET_MODE_NAME (vector_modes[mode_i]));
3309 mode_i += 1;
3312 delete bb_vinfo;
3314 if (mode_i < vector_modes.length ()
3315 && VECTOR_MODE_P (autodetected_vector_mode)
3316 && (related_vector_mode (vector_modes[mode_i],
3317 GET_MODE_INNER (autodetected_vector_mode))
3318 == autodetected_vector_mode)
3319 && (related_vector_mode (autodetected_vector_mode,
3320 GET_MODE_INNER (vector_modes[mode_i]))
3321 == vector_modes[mode_i]))
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_NOTE, vect_location,
3325 "***** Skipping vector mode %s, which would"
3326 " repeat the analysis for %s\n",
3327 GET_MODE_NAME (vector_modes[mode_i]),
3328 GET_MODE_NAME (autodetected_vector_mode));
3329 mode_i += 1;
3332 if (vectorized
3333 || mode_i == vector_modes.length ()
3334 || autodetected_vector_mode == VOIDmode
3335 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3336 vector sizes will fail do not bother iterating. */
3337 || fatal)
3338 return vectorized;
3340 /* Try the next biggest vector size. */
3341 next_vector_mode = vector_modes[mode_i++];
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE, vect_location,
3344 "***** Re-trying analysis with vector mode %s\n",
3345 GET_MODE_NAME (next_vector_mode));
3349 /* Main entry for the BB vectorizer. Analyze and transform BB; return
3350 true if anything in the basic block was vectorized. */
3352 bool
3353 vect_slp_bb (basic_block bb)
3355 gimple_stmt_iterator gsi;
3356 bool any_vectorized = false;
3358 gsi = gsi_start_bb (bb);
3359 while (!gsi_end_p (gsi))
3361 gimple_stmt_iterator region_begin = gsi;
3362 vec<data_reference_p> datarefs = vNULL;
3363 int insns = 0;
3365 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3367 gimple *stmt = gsi_stmt (gsi);
3368 if (is_gimple_debug (stmt))
3369 continue;
3370 insns++;
3372 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3373 vect_location = stmt;
3375 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3376 break;
3379 /* Skip leading unhandled stmts. */
3380 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3382 gsi_next (&gsi);
3383 continue;
3386 gimple_stmt_iterator region_end = gsi;
3388 if (insns > param_slp_max_insns_in_bb)
3390 if (dump_enabled_p ())
3391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3392 "not vectorized: too many instructions in "
3393 "basic block.\n");
3395 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3396 any_vectorized = true;
3398 if (gsi_end_p (region_end))
3399 break;
3401 /* Skip the unhandled stmt. */
3402 gsi_next (&gsi);
3405 return any_vectorized;
3409 /* Return true if the vector type chosen for a constant operand of STMT_VINFO is a boolean vector. */
3411 static bool
3412 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3414 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3415 tree op, vectype;
3416 enum vect_def_type dt;
3418 /* For comparisons and COND_EXPRs the type is chosen depending
3419 on the other, non-constant comparison operand. */
3420 if (TREE_CODE_CLASS (code) == tcc_comparison)
3422 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3423 op = gimple_assign_rhs1 (stmt);
3425 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3426 gcc_unreachable ();
3428 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3431 if (code == COND_EXPR)
3433 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3434 tree cond = gimple_assign_rhs1 (stmt);
3436 if (TREE_CODE (cond) == SSA_NAME)
3437 op = cond;
3438 else
3439 op = TREE_OPERAND (cond, 0);
3441 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3442 gcc_unreachable ();
3444 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3447 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3450 /* Build a variable-length vector in which the elements in ELTS are repeated
3451 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3452 RESULTS and add any new instructions to SEQ.
3454 The approach we use is:
3456 (1) Find a vector mode VM with integer elements of mode IM.
3458 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3459 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3460 from small vectors to IM.
3462 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3464 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3465 correct byte contents.
3467 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3469 We try to find the largest IM for which this sequence works, in order
3470 to cut down on the number of interleaves. */
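/* For example, duplicating ELTS == {a, b, c, d} when IM holds two
   elements gives NVECTORS == 2: ELTS' is {VCE({a,b}), VCE({c,d})}, each
   ELTS'[I] is duplicated across a VM vector, and one lo/hi VEC_PERM_EXPR
   pass interleaves them so the bytes read a, b, c, d, a, b, c, d, ...
   before the final VIEW_CONVERT_EXPR.  */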
3472 void
3473 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3474 vec<tree> elts, unsigned int nresults,
3475 vec<tree> &results)
3477 unsigned int nelts = elts.length ();
3478 tree element_type = TREE_TYPE (vector_type);
3480 /* (1) Find a vector mode VM with integer elements of mode IM. */
3481 unsigned int nvectors = 1;
3482 tree new_vector_type;
3483 tree permutes[2];
3484 if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type),
3485 &nvectors, &new_vector_type,
3486 permutes))
3487 gcc_unreachable ();
3489 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3490 unsigned int partial_nelts = nelts / nvectors;
3491 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3493 tree_vector_builder partial_elts;
3494 auto_vec<tree, 32> pieces (nvectors * 2);
3495 pieces.quick_grow (nvectors * 2);
3496 for (unsigned int i = 0; i < nvectors; ++i)
3498 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3499 ELTS' has mode IM. */
3500 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3501 for (unsigned int j = 0; j < partial_nelts; ++j)
3502 partial_elts.quick_push (elts[i * partial_nelts + j]);
3503 tree t = gimple_build_vector (seq, &partial_elts);
3504 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3505 TREE_TYPE (new_vector_type), t);
3507 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3508 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3511 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3512 correct byte contents.
3514 We need to repeat the following operation log2(nvectors) times:
3516 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3517 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3519 However, if each input repeats every N elements and the VF is
3520 a multiple of N * 2, the HI result is the same as the LO. */
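/* For example, with NVECTORS == 4 the first pass interleaves pieces
   (0, 2) and (1, 3) using the lo and hi permutes; those outputs feed a
   second pass, after which piece 0 of the final outputs repeats the
   full NELTS-element sequence.  */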
3521 unsigned int in_start = 0;
3522 unsigned int out_start = nvectors;
3523 unsigned int hi_start = nvectors / 2;
3524 /* A bound on the number of outputs needed to produce NRESULTS results
3525 in the final iteration. */
3526 unsigned int noutputs_bound = nvectors * nresults;
3527 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3529 noutputs_bound /= 2;
3530 unsigned int limit = MIN (noutputs_bound, nvectors);
3531 for (unsigned int i = 0; i < limit; ++i)
3533 if ((i & 1) != 0
3534 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3535 2 * in_repeat))
3537 pieces[out_start + i] = pieces[out_start + i - 1];
3538 continue;
3541 tree output = make_ssa_name (new_vector_type);
3542 tree input1 = pieces[in_start + (i / 2)];
3543 tree input2 = pieces[in_start + (i / 2) + hi_start];
3544 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3545 input1, input2,
3546 permutes[i & 1]);
3547 gimple_seq_add_stmt (seq, stmt);
3548 pieces[out_start + i] = output;
3550 std::swap (in_start, out_start);
3553 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3554 results.reserve (nresults);
3555 for (unsigned int i = 0; i < nresults; ++i)
3556 if (i < nvectors)
3557 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3558 pieces[in_start + i]));
3559 else
3560 results.quick_push (results[i - nvectors]);
3564 /* For constant and loop invariant defs of SLP_NODE this function returns
3565 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3566 OP_NODE determines the node for the operand containing the scalar
3567 operands. */
3569 static void
3570 vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
3571 vec<tree> *vec_oprnds)
3573 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3574 vec_info *vinfo = stmt_vinfo->vinfo;
3575 unsigned HOST_WIDE_INT nunits;
3576 tree vec_cst;
3577 unsigned j, number_of_places_left_in_vector;
3578 tree vector_type;
3579 tree vop;
3580 int group_size = op_node->ops.length ();
3581 unsigned int vec_num, i;
3582 unsigned number_of_copies = 1;
3583 bool constant_p;
3584 tree neutral_op = NULL;
3585 gimple_seq ctor_seq = NULL;
3586 auto_vec<tree, 16> permute_results;
3588 /* ??? SLP analysis should compute the vector type for the
3589 constant / invariant and store it in the SLP node. */
3590 tree op = op_node->ops[0];
3591 /* Check if vector type is a boolean vector. */
3592 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3593 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3594 && vect_mask_constant_operand_p (stmt_vinfo))
3595 vector_type = truth_type_for (stmt_vectype);
3596 else
3597 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op));
3599 /* ??? For lane-reducing ops we should also have the required number
3600 of vector stmts initialized rather than second-guessing here. */
3601 unsigned int number_of_vectors;
3602 if (is_gimple_assign (stmt_vinfo->stmt)
3603 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR
3604 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR
3605 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR))
3606 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3607 else
3608 number_of_vectors
3609 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3610 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3611 vector_type);
3612 vec_oprnds->create (number_of_vectors);
3613 auto_vec<tree> voprnds (number_of_vectors);
3615 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3616 created vectors. It is greater than 1 if unrolling is performed.
3618 For example, we have two scalar operands, s1 and s2 (e.g., group of
3619 strided accesses of size two), while NUNITS is four (i.e., four scalars
3620 of this type can be packed in a vector). The output vector will contain
3621 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3622 will be 2).
3624 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3625 containing the operands.
3627 For example, NUNITS is four as before, and the group size is 8
3628 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3629 {s5, s6, s7, s8}. */
3631 /* When using duplicate_and_interleave, we just need one element for
3632 each scalar statement. */
3633 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3634 nunits = group_size;
3636 number_of_copies = nunits * number_of_vectors / group_size;
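/* E.g., for the first example above: nunits == 4, number_of_vectors == 1
   and group_size == 2 give number_of_copies == 4 * 1 / 2 == 2. */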
3638 number_of_places_left_in_vector = nunits;
3639 constant_p = true;
3640 tree_vector_builder elts (vector_type, nunits, 1);
3641 elts.quick_grow (nunits);
3642 bool place_after_defs = false;
3643 for (j = 0; j < number_of_copies; j++)
3645 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3647 /* Create 'vect_ = {op0,op1,...,opn}'. */
3648 number_of_places_left_in_vector--;
3649 tree orig_op = op;
3650 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3652 if (CONSTANT_CLASS_P (op))
3654 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3656 /* Can't use VIEW_CONVERT_EXPR for booleans because
3657 the scalar value and the vector element may
3658 have different sizes. */
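/* For instance (an illustrative case, not from the code below): an
   8-bit boolean 1 cannot be view-converted to a 64-bit mask element;
   the element is instead built directly, 0 for false and the
   all-ones value for true. */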
3659 if (integer_zerop (op))
3660 op = build_int_cst (TREE_TYPE (vector_type), 0);
3661 else if (integer_onep (op))
3662 op = build_all_ones_cst (TREE_TYPE (vector_type));
3663 else
3664 gcc_unreachable ();
3666 else
3667 op = fold_unary (VIEW_CONVERT_EXPR,
3668 TREE_TYPE (vector_type), op);
3669 gcc_assert (op && CONSTANT_CLASS_P (op));
3671 else
3673 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3674 gimple *init_stmt;
3675 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3677 tree true_val
3678 = build_all_ones_cst (TREE_TYPE (vector_type));
3679 tree false_val
3680 = build_zero_cst (TREE_TYPE (vector_type));
3681 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3682 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3683 op, true_val,
3684 false_val);
3686 else
3688 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3689 op);
3690 init_stmt
3691 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3692 op);
3694 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3695 op = new_temp;
3698 elts[number_of_places_left_in_vector] = op;
3699 if (!CONSTANT_CLASS_P (op))
3700 constant_p = false;
3701 if (TREE_CODE (orig_op) == SSA_NAME
3702 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3703 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3704 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3705 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3706 place_after_defs = true;
3708 if (number_of_places_left_in_vector == 0)
3710 if (constant_p
3711 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3712 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3713 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3714 else
3716 if (permute_results.is_empty ())
3717 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3718 elts, number_of_vectors,
3719 permute_results);
3720 vec_cst = permute_results[number_of_vectors - j - 1];
3722 tree init;
3723 gimple_stmt_iterator gsi;
3724 if (place_after_defs)
3726 stmt_vec_info last_stmt_info
3727 = vect_find_last_scalar_stmt_in_slp (slp_node);
3728 gsi = gsi_for_stmt (last_stmt_info->stmt);
3729 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3730 &gsi);
3732 else
3733 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3734 NULL);
3735 if (ctor_seq != NULL)
3737 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3738 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3739 ctor_seq = NULL;
3741 voprnds.quick_push (init);
3742 place_after_defs = false;
3743 number_of_places_left_in_vector = nunits;
3744 constant_p = true;
3745 elts.new_vector (vector_type, nunits, 1);
3746 elts.quick_grow (nunits);
3751 /* Since the vectors are created in the reverse order, we should invert
3752 them. */
3753 vec_num = voprnds.length ();
3754 for (j = vec_num; j != 0; j--)
3756 vop = voprnds[j - 1];
3757 vec_oprnds->quick_push (vop);
3760 /* If VF is greater than the unrolling factor needed for the SLP
3761 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3762 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3763 to replicate the vectors. */
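/* E.g. (illustrative): if the code above produced { v0, v1 } but
   NUMBER_OF_VECTORS is 4, the loop below extends the result to
   { v0, v1, v0, v1 }. */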
3764 while (number_of_vectors > vec_oprnds->length ())
3766 tree neutral_vec = NULL;
3768 if (neutral_op)
3770 if (!neutral_vec)
3771 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3773 vec_oprnds->quick_push (neutral_vec);
3775 else
3777 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3778 vec_oprnds->quick_push (vop);
3784 /* Get the vectorized definitions from SLP_NODE, which already contains
3785 the corresponding vectorized def-stmts. */
3787 static void
3788 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3790 stmt_vec_info vec_def_stmt_info;
3791 unsigned int i;
3793 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3795 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3796 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3800 /* Get N vectorized definitions for SLP_NODE.
3801 If the scalar definitions are loop invariants or constants, collect them and
3802 call vect_get_constant_vectors() to create vector stmts.
3803 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3804 must be stored in the corresponding child of SLP_NODE, and we call
3805 vect_get_slp_vect_defs () to retrieve them. */
3807 void
3808 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3810 if (n == -1U)
3811 n = SLP_TREE_CHILDREN (slp_node).length ();
3813 for (unsigned i = 0; i < n; ++i)
3815 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3817 vec<tree> vec_defs = vNULL;
3819 /* For each operand we check if it has vectorized definitions in a child
3820 node or we need to create them (for invariants and constants). */
3821 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3823 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3824 vect_get_slp_vect_defs (child, &vec_defs);
3826 else
3827 vect_get_constant_vectors (child, slp_node, &vec_defs);
3829 vec_oprnds->quick_push (vec_defs);
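/* A hypothetical caller sketch (not part of this file): a vectorizable_*
   routine handling a two-operand statement could fetch its vector
   operands as

     vec<vec<tree> > vec_defs = vNULL;
     vec_defs.create (2);
     vect_get_slp_defs (slp_node, &vec_defs, 2);
     vec<tree> vec_oprnds0 = vec_defs[0];
     vec<tree> vec_oprnds1 = vec_defs[1];

   and then emit one vector statement per element of VEC_OPRNDS0/1. */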
3833 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3834 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3835 permute statements for the SLP node NODE of the SLP instance
3836 SLP_NODE_INSTANCE. */
3838 bool
3839 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3840 gimple_stmt_iterator *gsi, poly_uint64 vf,
3841 slp_instance slp_node_instance, bool analyze_only,
3842 unsigned *n_perms)
3844 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3845 vec_info *vinfo = stmt_info->vinfo;
3846 int vec_index = 0;
3847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3848 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3849 unsigned int mask_element;
3850 machine_mode mode;
3852 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3853 return false;
3855 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3857 mode = TYPE_MODE (vectype);
3858 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3860 /* Initialize the vect stmts of NODE so that the generated stmts
3861 can be inserted at the proper places later. */
3862 if (! analyze_only)
3863 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3864 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3865 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3867 /* Generate permutation masks for every NODE. The number of masks for each
3868 NODE is equal to GROUP_SIZE.
3869 E.g., we have a group of three nodes with three loads from the same
3870 location in each node, and the vector size is 4. I.e., we have an
3871 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3872 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3873 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3876 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3877 The last mask is illegal since we assume two operands for a permute
3878 operation, and the mask element values can't index outside that range.
3879 Hence, the last mask must be converted into {2,5,5,5}.
3880 For the first two permutations we need the first and the second input
3881 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3882 we need the second and the third vectors: {b1,c1,a2,b2} and
3883 {c2,a3,b3,c3}. */
3885 int vect_stmts_counter = 0;
3886 unsigned int index = 0;
3887 int first_vec_index = -1;
3888 int second_vec_index = -1;
3889 bool noop_p = true;
3890 *n_perms = 0;
3892 vec_perm_builder mask;
3893 unsigned int nelts_to_build;
3894 unsigned int nvectors_per_build;
3895 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3896 && multiple_p (nunits, group_size));
3897 if (repeating_p)
3899 /* A single vector contains a whole number of copies of the node, so:
3900 (a) all permutes can use the same mask; and
3901 (b) the permutes only need a single vector input. */
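/* For example (illustrative): group_size == 2 with load permutation
   { 1, 0 } encodes the stepped mask { 1, 0, 3, 2, 5, 4, ... }, i.e.
   "swap adjacent pairs", which is valid for any nunits that is a
   multiple of 2. */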
3902 mask.new_vector (nunits, group_size, 3);
3903 nelts_to_build = mask.encoded_nelts ();
3904 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3906 else
3908 /* We need to construct a separate mask for each vector statement. */
3909 unsigned HOST_WIDE_INT const_nunits, const_vf;
3910 if (!nunits.is_constant (&const_nunits)
3911 || !vf.is_constant (&const_vf))
3912 return false;
3913 mask.new_vector (const_nunits, const_nunits, 1);
3914 nelts_to_build = const_vf * group_size;
3915 nvectors_per_build = 1;
3918 unsigned int count = mask.encoded_nelts ();
3919 mask.quick_grow (count);
3920 vec_perm_indices indices;
3922 for (unsigned int j = 0; j < nelts_to_build; j++)
3924 unsigned int iter_num = j / group_size;
3925 unsigned int stmt_num = j % group_size;
3926 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3927 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
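/* E.g. (illustrative, !repeating_p): DR_GROUP_SIZE == 2 with load
   permutation { 1 }, group_size == 1, const_nunits == 4 and
   const_vf == 4 give i == 1, 3, 5, 7 for j == 0..3, i.e. the
   odd-element extraction mask { 1, 3, 5, 7 } over two input
   vectors. */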
3928 if (repeating_p)
3930 first_vec_index = 0;
3931 mask_element = i;
3933 else
3935 /* Enforced before the loop when !repeating_p. */
3936 unsigned int const_nunits = nunits.to_constant ();
3937 vec_index = i / const_nunits;
3938 mask_element = i % const_nunits;
3939 if (vec_index == first_vec_index
3940 || first_vec_index == -1)
3942 first_vec_index = vec_index;
3944 else if (vec_index == second_vec_index
3945 || second_vec_index == -1)
3947 second_vec_index = vec_index;
3948 mask_element += const_nunits;
3950 else
3952 if (dump_enabled_p ())
3953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3954 "permutation requires at "
3955 "least three vectors %G",
3956 stmt_info->stmt);
3957 gcc_assert (analyze_only);
3958 return false;
3961 gcc_assert (mask_element < 2 * const_nunits);
3964 if (mask_element != index)
3965 noop_p = false;
3966 mask[index++] = mask_element;
3968 if (index == count && !noop_p)
3970 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3971 if (!can_vec_perm_const_p (mode, indices))
3973 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3976 vect_location,
3977 "unsupported vect permute { ");
3978 for (i = 0; i < count; ++i)
3980 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3981 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3983 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3985 gcc_assert (analyze_only);
3986 return false;
3989 ++*n_perms;
3992 if (index == count)
3994 if (!analyze_only)
3996 tree mask_vec = NULL_TREE;
3998 if (! noop_p)
3999 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4001 if (second_vec_index == -1)
4002 second_vec_index = first_vec_index;
4004 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4006 /* Generate the permute statement if necessary. */
4007 tree first_vec = dr_chain[first_vec_index + ri];
4008 tree second_vec = dr_chain[second_vec_index + ri];
4009 stmt_vec_info perm_stmt_info;
4010 if (! noop_p)
4012 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4013 tree perm_dest
4014 = vect_create_destination_var (gimple_assign_lhs (stmt),
4015 vectype);
4016 perm_dest = make_ssa_name (perm_dest);
4017 gassign *perm_stmt
4018 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4019 first_vec, second_vec,
4020 mask_vec);
4021 perm_stmt_info
4022 = vect_finish_stmt_generation (stmt_info, perm_stmt,
4023 gsi);
4025 else
4026 /* If the mask was NULL_TREE, generate the
4027 requested identity transform. */
4028 perm_stmt_info = vinfo->lookup_def (first_vec);
4030 /* Store the vector statement in NODE. */
4031 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4032 = perm_stmt_info;
4036 index = 0;
4037 first_vec_index = -1;
4038 second_vec_index = -1;
4039 noop_p = true;
4043 return true;
4046 /* Vectorize SLP instance tree in postorder. */
4048 static void
4049 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
4050 scalar_stmts_to_slp_tree_map_t *bst_map)
4052 gimple_stmt_iterator si;
4053 stmt_vec_info stmt_info;
4054 unsigned int group_size;
4055 tree vectype;
4056 int i, j;
4057 slp_tree child;
4059 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4060 return;
4062 /* See if we have already vectorized the node in the graph of the
4063 SLP instance. */
4064 if (SLP_TREE_VEC_STMTS (node).exists ())
4065 return;
4067 /* See if we have already vectorized the same set of stmts; if so, reuse
4068 those vectorized stmts across instances. */
4069 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
4071 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
4072 return;
4075 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
4076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4077 vect_schedule_slp_instance (child, instance, bst_map);
4079 /* Push SLP node def-type to stmts. */
4080 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4081 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4083 stmt_vec_info child_stmt_info;
4084 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4085 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4088 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4090 /* VECTYPE is the type of the destination. */
4091 vectype = STMT_VINFO_VECTYPE (stmt_info);
4092 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4093 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4095 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4096 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4098 if (dump_enabled_p ())
4099 dump_printf_loc (MSG_NOTE, vect_location,
4100 "------>vectorizing SLP node starting from: %G",
4101 stmt_info->stmt);
4103 /* Vectorized stmts go before the last scalar stmt, which is where
4104 all uses are ready. */
4105 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4106 si = gsi_for_stmt (last_stmt_info->stmt);
4108 /* Handle two-operation SLP nodes by vectorizing the group with
4109 both operations and then performing a merge. */
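/* For example (illustrative): the scalar group { a0 - b0, a1 + b1,
   a2 - b2, a3 + b3 } is vectorized once as a vector subtraction V0
   and once as a vector addition V1; the VEC_PERM_EXPR built below
   then merges them as { V0[0], V1[1], V0[2], V1[3] }. */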
4110 if (SLP_TREE_TWO_OPERATORS (node))
4112 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4113 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4114 enum tree_code ocode = ERROR_MARK;
4115 stmt_vec_info ostmt_info;
4116 vec_perm_builder mask (group_size, group_size, 1);
4117 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4119 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4120 if (gimple_assign_rhs_code (ostmt) != code0)
4122 mask.quick_push (1);
4123 ocode = gimple_assign_rhs_code (ostmt);
4125 else
4126 mask.quick_push (0);
4128 if (ocode != ERROR_MARK)
4130 vec<stmt_vec_info> v0;
4131 vec<stmt_vec_info> v1;
4132 unsigned j;
4133 tree tmask = NULL_TREE;
4134 vect_transform_stmt (stmt_info, &si, node, instance);
4135 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4136 SLP_TREE_VEC_STMTS (node).truncate (0);
4137 gimple_assign_set_rhs_code (stmt, ocode);
4138 vect_transform_stmt (stmt_info, &si, node, instance);
4139 gimple_assign_set_rhs_code (stmt, code0);
4140 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4141 SLP_TREE_VEC_STMTS (node).truncate (0);
4142 tree meltype = build_nonstandard_integer_type
4143 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4144 tree mvectype = get_same_sized_vectype (meltype, vectype);
4145 unsigned k = 0, l;
4146 for (j = 0; j < v0.length (); ++j)
4148 /* Enforced by vect_build_slp_tree, which rejects variable-length
4149 vectors for SLP_TREE_TWO_OPERATORS. */
4150 unsigned int const_nunits = nunits.to_constant ();
4151 tree_vector_builder melts (mvectype, const_nunits, 1);
4152 for (l = 0; l < const_nunits; ++l)
4154 if (k >= group_size)
4155 k = 0;
4156 tree t = build_int_cst (meltype,
4157 mask[k++] * const_nunits + l);
4158 melts.quick_push (t);
4160 tmask = melts.build ();
4162 /* ??? Not all targets support a VEC_PERM_EXPR with a
4163 constant mask that would translate to a vec_merge RTX
4164 (with their vec_perm_const_ok). We can either not
4165 vectorize in that case or let veclower do its job.
4166 Unfortunately that isn't too great and at least for
4167 plus/minus we'd eventually like to match targets'
4168 vector addsub instructions. */
4169 gimple *vstmt;
4170 vstmt = gimple_build_assign (make_ssa_name (vectype),
4171 VEC_PERM_EXPR,
4172 gimple_assign_lhs (v0[j]->stmt),
4173 gimple_assign_lhs (v1[j]->stmt),
4174 tmask);
4175 SLP_TREE_VEC_STMTS (node).quick_push
4176 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4178 v0.release ();
4179 v1.release ();
4180 return;
4183 vect_transform_stmt (stmt_info, &si, node, instance);
4185 /* Restore stmt def-types. */
4186 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4187 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4189 stmt_vec_info child_stmt_info;
4190 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4191 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4195 /* Replace scalar calls from SLP node NODE with assignments of zero to their lhs.
4196 For loop vectorization this is done in vectorizable_call, but for SLP
4197 it needs to be deferred until end of vect_schedule_slp, because multiple
4198 SLP instances may refer to the same scalar stmt. */
4200 static void
4201 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4203 gimple *new_stmt;
4204 gimple_stmt_iterator gsi;
4205 int i;
4206 slp_tree child;
4207 tree lhs;
4208 stmt_vec_info stmt_info;
4210 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4211 return;
4213 if (visited.add (node))
4214 return;
4216 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4217 vect_remove_slp_scalar_calls (child, visited);
4219 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4221 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4222 if (!stmt || gimple_bb (stmt) == NULL)
4223 continue;
4224 if (is_pattern_stmt_p (stmt_info)
4225 || !PURE_SLP_STMT (stmt_info))
4226 continue;
4227 lhs = gimple_call_lhs (stmt);
4228 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4229 gsi = gsi_for_stmt (stmt);
4230 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4231 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
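/* E.g. (illustrative): a pure-SLP scalar call x_3 = foo (a_1) whose
   uses have all been vectorized is rewritten to x_3 = 0.0 (for a
   float lhs), which is then trivially dead. */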
4235 static void
4236 vect_remove_slp_scalar_calls (slp_tree node)
4238 hash_set<slp_tree> visited;
4239 vect_remove_slp_scalar_calls (node, visited);
4242 /* Vectorize the instance root. */
4244 void
4245 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4247 gassign *rstmt = NULL;
4249 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4251 stmt_vec_info child_stmt_info;
4252 int j;
4254 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4256 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4257 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4258 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4259 break;
4262 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4264 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4265 stmt_vec_info child_stmt_info;
4266 int j;
4267 vec<constructor_elt, va_gc> *v;
4268 vec_alloc (v, nelts);
4270 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4272 CONSTRUCTOR_APPEND_ELT (v,
4273 NULL_TREE,
4274 gimple_get_lhs (child_stmt_info->stmt));
4276 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4277 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4278 tree r_constructor = build_constructor (rtype, v);
4279 rstmt = gimple_build_assign (lhs, r_constructor);
4282 gcc_assert (rstmt);
4284 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4285 gsi_replace (&rgsi, rstmt, true);
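/* E.g. (illustrative): for a BB SLP instance rooted at the CONSTRUCTOR
   x_1 = { a_2, b_3, c_4, d_5 } vectorized into a single statement
   vect__6 = ..., the root is replaced by x_1 = vect__6; with multiple
   vector statements, a new CONSTRUCTOR of their lhs values is built
   instead. */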
4288 /* Generate vector code for all SLP instances in the loop/basic block. */
4290 void
4291 vect_schedule_slp (vec_info *vinfo)
4293 vec<slp_instance> slp_instances;
4294 slp_instance instance;
4295 unsigned int i;
4297 scalar_stmts_to_slp_tree_map_t *bst_map
4298 = new scalar_stmts_to_slp_tree_map_t ();
4299 slp_instances = vinfo->slp_instances;
4300 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4302 slp_tree node = SLP_INSTANCE_TREE (instance);
4303 /* Schedule the tree of INSTANCE. */
4304 vect_schedule_slp_instance (node, instance, bst_map);
4306 if (SLP_INSTANCE_ROOT_STMT (instance))
4307 vectorize_slp_instance_root_stmt (node, instance);
4309 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_NOTE, vect_location,
4311 "vectorizing stmts using SLP.\n");
4313 delete bst_map;
4315 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4317 slp_tree root = SLP_INSTANCE_TREE (instance);
4318 stmt_vec_info store_info;
4319 unsigned int j;
4321 /* Remove scalar call stmts. Do not do this for basic-block
4322 vectorization as not all uses may be vectorized.
4323 ??? Why should this be necessary? DCE should be able to
4324 remove the stmts itself.
4325 ??? For BB vectorization we could just as well remove scalar
4326 stmts starting from the SLP tree root if they have no
4327 uses. */
4328 if (is_a <loop_vec_info> (vinfo))
4329 vect_remove_slp_scalar_calls (root);
4331 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4332 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4334 if (!STMT_VINFO_DATA_REF (store_info))
4335 break;
4337 if (SLP_INSTANCE_ROOT_STMT (instance))
4338 continue;
4340 store_info = vect_orig_stmt (store_info);
4341 /* Free the attached stmt_vec_info and remove the stmt. */
4342 vinfo->remove_stmt (store_info);