PR tree-optimization/84740
[official-gcc.git] / gcc / tree-vect-slp.c
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  gimple *stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    /* After transform some stmts are removed and thus their vinfo is gone.  */
    if (vinfo_for_stmt (stmt))
      {
        gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
        STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
      }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}
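
/* Note (illustrative, not from the sources): for a COND_EXPR assignment
   like a = c < d ? e : f there are three RHS operands, but the embedded
   comparison contributes two SLP operands (c and d), which is why NOPS
   is incremented once more above.  */
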
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
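
/* For example (illustrative only): in an interleaving chain that accesses
   a[0], a[2], a[3], the element a[2] carries GROUP_GAP 2 and a[3] carries
   GROUP_GAP 1, so looking up a[3] starting from a[0] accumulates
   0 + 2 + 1 == 3, its position in the underlying access pattern.  */
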
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
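
/* A hypothetical walk through the loop above (assuming 128-bit vectors):
   for COUNT == 4 and a 32-bit ELT_MODE the first iteration tries a single
   vector of one 128-bit integer element, which is usually not a supported
   vector mode; halving the element size then yields two vectors of two
   64-bit elements each, whose interleaving permutes most targets can do,
   so the function would return true with *NVECTORS_OUT == 2.  */
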
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero *SWAP
   on entry indicates that a swap is required for cond_expr stmts:
   *SWAP is 1 if STMT is a cond and the operands of its comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of its comparison needs
   to be inverted.  If any operands are swapped in this function, *SWAP is
   set to a non-zero value on return.

   Return -1 on a fatal error, 1 if the error could be corrected by swapping
   operands of this node's parent, and 0 if everything is OK.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<gimple *> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
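          /* Row 0 is the identity; row 1 (*SWAP == 1) swaps the two
             comparison operands; row 2 (*SWAP == 2) swaps the then/else
             arms.  The comparison code itself is adjusted when the swap
             is actually applied further below (illustrative note).  */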
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && vect_stmt_in_region_p (vinfo, def_stmt)
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
            case GIMPLE_ASSIGN:
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !current_vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: invalid type of def "
                                   "for variable-length SLP ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: cannot swap operands of "
                               "shared stmt ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          return -1;
        }

      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "swapped operands to match def types in ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }

  *swap = swapped;
  return 0;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for VINFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
   as for vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type in ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: unrolling required "
                       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation, and are not of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates that the comparison could not be
   carried out or that the stmts will never be vectorized by SLP.

   Note a COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<gimple *> stmts, unsigned int group_size,
                       unsigned nops, poly_uint64 *max_nunits,
                       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
                                   max_nunits))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple *first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned HOST_WIDE_INT count;
      if (!nunits.is_constant (&count))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operations "
                             "not allowed with variable-length SLP.\n");
          return false;
        }
      vec_perm_builder sel (count, count, 1);
      for (i = 0; i < count; ++i)
        {
          unsigned int elt = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            elt += count;
          sel.quick_push (elt);
        }
      vec_perm_indices indices (sel, 2, count);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
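
/* Illustrative example (not from the sources): a group such as
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   is accepted above as a two-operator node; the vectorizer later emits
   both a vector PLUS and a vector MINUS and blends the lanes using the
   permute whose availability was just checked.  */
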
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
static scalar_stmts_set_t *bst_fail;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<gimple *> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits, vec<slp_tree> *loads,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
                                        loads, matches, npermutes, tree_size,
                                        max_tree_size);
  /* When the SLP build fails for a stmt set, record this; otherwise the
     SLP build can be exponential in time when we allow constructing
     parts from scalars, see PR81723.  */
  if (! res)
    {
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}
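
/* Note that the vector spliced into BST_FAIL above is a copy owned by
   the hash set; per the bst_traits comment, entries are only released
   when the set itself is destroyed.  */
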
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if the def-stmts are not isomorphic, require data
   permutation, or are of an unsupported type of operation.  Otherwise
   return the built SLP tree, recording in MATCHES which stmts failed to
   match.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
                                   max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          FOR_EACH_VEC_ELT (stmts, i, stmt)
            if (stmt != stmts[0])
              return NULL;
        }
      else
        {
          /* Else def types have to match.  */
          FOR_EACH_VEC_ELT (stmts, i, stmt)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
                continue;
              if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
                return NULL;
            }
        }
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap,
                              stmts, group_size, nops,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        &this_loads, matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build this node up from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p
                    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_loads.truncate (old_nloads);
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          /* Do so only if the number of unsuccessful permutes was
             no more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  gimple *stmt = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  if (!is_gimple_assign (stmt)
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                  /* Verify if we can safely swap or if we committed to a
                     specific operand order already.
                     ??? Instead of modifying GIMPLE stmts here we could
                     record whether we want to swap operands in the SLP
                     node and temporarily do that when processing it
                     (or wrap operand accessors in a helper).  */
                  else if (swap[j] != 0
                           || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
                    {
                      if (!swap_not_matching)
                        {
                          if (dump_enabled_p ())
                            {
                              dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                               vect_location,
                                               "Build SLP failed: cannot swap "
                                               "operands of shared stmt ");
                              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
                                                TDF_SLIM, stmts[j], 0);
                            }
                          goto fail;
                        }
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            &this_loads, tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  gimple_set_plf (stmt, GF_PLF_1, false);
                }
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  if (matches[j] == !swap_not_matching)
                    {
                      /* Avoid swapping operands twice.  */
                      if (gimple_plf (stmt, GF_PLF_1))
                        continue;
                      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                         gimple_assign_rhs2_ptr (stmt));
                      gimple_set_plf (stmt, GF_PLF_1, true);
                    }
                }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  {
                    gimple *stmt = stmts[j];
                    gcc_assert (gimple_plf (stmt, GF_PLF_1)
                                == (matches[j] == !swap_not_matching));
                  }

              /* If we have all children of child built up from scalars then
                 just throw that away and build this node up from scalars.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p
                        (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_loads.truncate (old_nloads);
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "Building parent vector operands from "
                                       "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
              && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
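
/* For example (illustrative only): if every load node has load
   permutation {1, 0, 3, 2}, the scalar stmts of all nodes in the
   instance are rearranged with that permutation; afterwards the loads
   appear in memory order and the recorded permutations can be
   dropped.  */
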
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info
                = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
              group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "BB vectorization with gaps at the end of "
                                   "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
                             LOOP_VINFO_VECT_FACTOR
                             (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
                                          slp_instn, true, &n_perms))
      return false;

  return true;
}
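
/* Illustrative example (not from the sources): a basic-block SLP node
   whose loads are a[1], a[2] from an interleaving chain a[0]..a[3]
   forms a subchain with gap 1 between the elements, so its load
   permutation is released above and the loads are treated as a
   contiguous sub-access.  */
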
/* Find the last scalar stmt in the SLP node NODE (e.g. the last store of
   an SLP instance).  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost,
                         scalar_stmts_set_t* visited)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* If we already costed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
    return;

  visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost, visited);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
        = (STMT_VINFO_STRIDED_P (stmt_info)
           ? VMAT_STRIDED_SLP
           : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost,
                               memory_access_type, VLS_STORE,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
              /* If the load is permuted then the alignment is determined by
                 the first group element not by the first scalar stmt DR.  */
              stmt = GROUP_FIRST_ELEMENT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
              /* Record the cost for the permutation.  */
              unsigned n_perms;
              vect_transform_slp_perm_load (node, vNULL, NULL,
                                            ncopies_for_cost, instance, true,
                                            &n_perms);
              record_stmt_cost (body_cost_vec, n_perms, vec_perm,
                                stmt_info, 0, vect_body);
              unsigned assumed_nunits
                = vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
              /* And adjust the number of loads performed.  This handles
                 redundancies as well as loads that are later dead.  */
              auto_sbitmap perm (GROUP_SIZE (stmt_info));
              bitmap_clear (perm);
              for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
                bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
              ncopies_for_cost = 0;
              bool load_seen = false;
              for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
                {
                  if (i % assumed_nunits == 0)
                    {
                      if (load_seen)
                        ncopies_for_cost++;
                      load_seen = false;
                    }
                  if (bitmap_bit_p (perm, i))
                    load_seen = true;
                }
              if (load_seen)
                ncopies_for_cost++;
              gcc_assert (ncopies_for_cost
                          <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
                              + assumed_nunits - 1) / assumed_nunits);
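              /* For example (illustrative only): with GROUP_SIZE 8,
                 ASSUMED_NUNITS 4 and a permutation that uses only
                 elements 0..3, the second vector of the group is never
                 live, so the loop above costs a single load per
                 unrolled copy.  */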
1880 poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1881 ncopies_for_cost *= estimated_poly_value (uf);
1883 /* Record the cost for the vector loads. */
1884 vect_model_load_cost (stmt_info, ncopies_for_cost,
1885 memory_access_type, node, prologue_cost_vec,
1886 body_cost_vec);
1887 return;
1890 else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1892 /* ncopies_for_cost is the number of IVs we generate. */
1893 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1894 stmt_info, 0, vect_body);
1896 /* Prologue cost for the initial values and step vector. */
1897 record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1898 CONSTANT_CLASS_P
1899 (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1900 (stmt_info))
1901 ? vector_load : vec_construct,
1902 stmt_info, 0, vect_prologue);
1903 record_stmt_cost (prologue_cost_vec, 1,
1904 CONSTANT_CLASS_P
1905 (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1906 ? vector_load : vec_construct,
1907 stmt_info, 0, vect_prologue);
1909 /* ??? No easy way to get at the actual number of vector stmts
1910 to be generated and thus the derived IVs.  */
1912 else
1914 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1915 stmt_info, 0, vect_body);
1916 if (SLP_TREE_TWO_OPERATORS (node))
1918 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1919 stmt_info, 0, vect_body);
1920 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1921 stmt_info, 0, vect_body);
1925 /* Push SLP node def-type to stmts. */
1926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1927 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1928 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1929 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1931 /* Scan operands and account for prologue cost of constants/externals.
1932 ??? This over-estimates cost for multiple uses and should be
1933 re-engineered. */
1934 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1935 lhs = gimple_get_lhs (stmt);
1936 for (i = 0; i < gimple_num_ops (stmt); ++i)
1938 tree op = gimple_op (stmt, i);
1939 gimple *def_stmt;
1940 enum vect_def_type dt;
1941 if (!op || op == lhs)
1942 continue;
1943 if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
1944 && (dt == vect_constant_def || dt == vect_external_def))
1946 /* Without looking at the actual initializer a vector of
1947 constants can be implemented as a load from the constant pool.
1948 When all elements are the same we can use a splat. */
1949 tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
1950 unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
1951 unsigned num_vects_to_check;
1952 unsigned HOST_WIDE_INT const_nunits;
1953 unsigned nelt_limit;
1954 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
1955 && ! multiple_p (const_nunits, group_size))
1957 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
1958 nelt_limit = const_nunits;
1960 else
1962 /* If either the vector has variable length or the vectors
1963 are composed of repeated whole groups we only need to
1964 cost construction once. All vectors will be the same. */
1965 num_vects_to_check = 1;
1966 nelt_limit = group_size;
1968 tree elt = NULL_TREE;
1969 unsigned nelt = 0;
1970 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
1972 unsigned si = j % group_size;
1973 if (nelt == 0)
1974 elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
1975 /* ??? We're just tracking whether all operands of a single
1976 vector initializer are the same; ideally we'd check if
1977 we emitted the same one already. */
1978 else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
1979 elt = NULL_TREE;
1980 nelt++;
1981 if (nelt == nelt_limit)
1983 /* ??? We need to pass down stmt_info for a vector type
1984 even if it points to the wrong stmt. */
1985 record_stmt_cost (prologue_cost_vec, 1,
1986 dt == vect_external_def
1987 ? (elt ? scalar_to_vec : vec_construct)
1988 : vector_load,
1989 stmt_info, 0, vect_prologue);
1990 nelt = 0;
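/* As an illustration, with GROUP_SIZE == 2 and (say) V4SI vectors each
   vector holds two whole copies of the group, so a single check with
   NELT_LIMIT == 2 suffices: equal external defs then cost one
   scalar_to_vec splat, unequal ones a vec_construct, and constant
   operands a single load from the constant pool.  */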
1996 /* Restore stmt def-types. */
1997 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1998 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1999 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
2000 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
2003 /* Compute the cost for the SLP instance INSTANCE. */
2005 static void
2006 vect_analyze_slp_cost (slp_instance instance, void *data, scalar_stmts_set_t *visited)
2008 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
2009 unsigned ncopies_for_cost;
2010 stmt_info_for_cost *si;
2011 unsigned i;
2013 /* Calculate the number of vector stmts to create based on the unrolling
2014 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
2015 GROUP_SIZE / NUNITS otherwise).  */
2016 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
2017 slp_tree node = SLP_INSTANCE_TREE (instance);
2018 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2019 /* Get the estimated vectorization factor, which is always one for
2020 basic-block vectorization. */
2021 unsigned int assumed_vf;
2022 if (STMT_VINFO_LOOP_VINFO (stmt_info))
2023 assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
2024 else
2025 assumed_vf = 1;
2026 /* For reductions look at a reduction operand in case the reduction
2027 operation is widening like DOT_PROD or SAD. */
2028 tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
2029 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2032 switch (gimple_assign_rhs_code (stmt))
2034 case DOT_PROD_EXPR:
2035 case SAD_EXPR:
2036 vectype_for_cost = get_vectype_for_scalar_type
2037 (TREE_TYPE (gimple_assign_rhs1 (stmt)));
2038 break;
2039 default:;
2042 unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
2043 ncopies_for_cost = (least_common_multiple (assumed_nunits,
2044 group_size * assumed_vf)
2045 / assumed_nunits);
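/* For example, a group of two stmts with an assumed VF of 4 and
   four-element vectors gives least_common_multiple (4, 2 * 4) / 4 == 2
   vector stmts per SLP node.  (Illustrative numbers only.)  */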
2047 prologue_cost_vec.create (10);
2048 body_cost_vec.create (10);
2049 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
2050 &prologue_cost_vec, &body_cost_vec,
2051 ncopies_for_cost, visited);
2053 /* Record the prologue costs, which were delayed until we were
2054 sure that SLP was successful. */
2055 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
2057 struct _stmt_vec_info *stmt_info
2058 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2059 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2060 si->misalign, vect_prologue);
2063 /* Record the instance's instructions in the target cost model. */
2064 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2066 struct _stmt_vec_info *stmt_info
2067 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2068 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2069 si->misalign, vect_body);
2072 prologue_cost_vec.release ();
2073 body_cost_vec.release ();
2076 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2077 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2078 the first GROUP1_SIZE stmts, since stores are consecutive), the second
2079 containing the remainder.
2080 Return the first stmt in the second group. */
2082 static gimple *
2083 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2085 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2086 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2087 gcc_assert (group1_size > 0);
2088 int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2089 gcc_assert (group2_size > 0);
2090 GROUP_SIZE (first_vinfo) = group1_size;
2092 gimple *stmt = first_stmt;
2093 for (unsigned i = group1_size; i > 1; i--)
2095 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2096 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2098 /* STMT is now the last element of the first group. */
2099 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2100 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2102 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2103 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2105 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2106 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2109 /* For the second group, the GROUP_GAP is that before the original group,
2110 plus skipping over the first vector. */
2111 GROUP_GAP (vinfo_for_stmt (group2)) =
2112 GROUP_GAP (first_vinfo) + group1_size;
2114 /* GROUP_GAP of the first group now has to skip over the second group too. */
2115 GROUP_GAP (first_vinfo) += group2_size;
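/* For instance, splitting a group of eight stores with no initial gap
   at GROUP1_SIZE == 4 leaves two groups of four whose GROUP_GAPs are
   both 4: each group skips over the other one's stmts.  */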
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2119 group1_size, group2_size);
2121 return group2;
2124 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2125 statements and a vector of NUNITS elements. */
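/* For example, V4SI vectors and a group of six stmts give
   common_multiple (4, 6) / 6 == 2: two copies of the group fill three
   vectors exactly.  */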
2127 static poly_uint64
2128 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2130 return exact_div (common_multiple (nunits, group_size), group_size);
2133 /* Analyze an SLP instance starting from a group of grouped stores. Call
2134 vect_build_slp_tree to build a tree of packed stmts if possible.
2135 Return FALSE if it's impossible to SLP any stmt in the loop. */
2137 static bool
2138 vect_analyze_slp_instance (vec_info *vinfo,
2139 gimple *stmt, unsigned max_tree_size)
2141 slp_instance new_instance;
2142 slp_tree node;
2143 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2144 tree vectype, scalar_type = NULL_TREE;
2145 gimple *next;
2146 unsigned int i;
2147 vec<slp_tree> loads;
2148 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2149 vec<gimple *> scalar_stmts;
2151 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2153 if (dr)
2155 scalar_type = TREE_TYPE (DR_REF (dr));
2156 vectype = get_vectype_for_scalar_type (scalar_type);
2158 else
2160 gcc_assert (is_a <loop_vec_info> (vinfo));
2161 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2164 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2166 else
2168 gcc_assert (is_a <loop_vec_info> (vinfo));
2169 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2170 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2173 if (!vectype)
2175 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "Build SLP failed: unsupported data-type ");
2179 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2180 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2183 return false;
2185 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2187 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2188 scalar_stmts.create (group_size);
2189 next = stmt;
2190 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2192 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2193 while (next)
2195 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2196 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2197 scalar_stmts.safe_push (
2198 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2199 else
2200 scalar_stmts.safe_push (next);
2201 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2203 /* Mark the first element of the reduction chain as reduction to properly
2204 transform the node. In the reduction analysis phase only the last
2205 element of the chain is marked as reduction. */
2206 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2207 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2209 else
2211 /* Collect reduction statements. */
2212 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2213 for (i = 0; reductions.iterate (i, &next); i++)
2214 scalar_stmts.safe_push (next);
2217 loads.create (group_size);
2219 /* Build the tree for the SLP instance. */
2220 bool *matches = XALLOCAVEC (bool, group_size);
2221 unsigned npermutes = 0;
2222 bst_fail = new scalar_stmts_set_t ();
2223 poly_uint64 max_nunits = nunits;
2224 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2225 &max_nunits, &loads, matches, &npermutes,
2226 NULL, max_tree_size);
2227 delete bst_fail;
2228 if (node != NULL)
2230 /* Calculate the unrolling factor based on the smallest type. */
2231 poly_uint64 unrolling_factor
2232 = calculate_unrolling_factor (max_nunits, group_size);
2234 if (maybe_ne (unrolling_factor, 1U)
2235 && is_a <bb_vec_info> (vinfo))
2237 unsigned HOST_WIDE_INT const_max_nunits;
2238 if (!max_nunits.is_constant (&const_max_nunits)
2239 || const_max_nunits > group_size)
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "Build SLP failed: store group "
2244 "size not a multiple of the vector size "
2245 "in basic block SLP\n");
2246 vect_free_slp_tree (node);
2247 loads.release ();
2248 return false;
2250 /* Fatal mismatch. */
2251 matches[group_size / const_max_nunits * const_max_nunits] = false;
2252 vect_free_slp_tree (node);
2253 loads.release ();
2255 else
2257 /* Create a new SLP instance. */
2258 new_instance = XNEW (struct _slp_instance);
2259 SLP_INSTANCE_TREE (new_instance) = node;
2260 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2261 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2262 SLP_INSTANCE_LOADS (new_instance) = loads;
2264 /* Compute the load permutation. */
2265 slp_tree load_node;
2266 bool loads_permuted = false;
2267 FOR_EACH_VEC_ELT (loads, i, load_node)
2269 vec<unsigned> load_permutation;
2270 int j;
2271 gimple *load, *first_stmt;
2272 bool this_load_permuted = false;
2273 load_permutation.create (group_size);
2274 first_stmt = GROUP_FIRST_ELEMENT
2275 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2276 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2278 int load_place = vect_get_place_in_interleaving_chain
2279 (load, first_stmt);
2280 gcc_assert (load_place != -1);
2281 if (load_place != j)
2282 this_load_permuted = true;
2283 load_permutation.safe_push (load_place);
2285 if (!this_load_permuted
2286 /* The load requires permutation when unrolling exposes
2287 a gap either because the group is larger than the SLP
2288 group-size or because there is a gap between the groups. */
2289 && (known_eq (unrolling_factor, 1U)
2290 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2291 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2293 load_permutation.release ();
2294 continue;
2296 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2297 loads_permuted = true;
2300 if (loads_permuted)
2302 if (!vect_supported_load_permutation_p (new_instance))
2304 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 "Build SLP failed: unsupported load "
2308 "permutation ");
2309 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2310 TDF_SLIM, stmt, 0);
2312 vect_free_slp_instance (new_instance);
2313 return false;
2317 /* If the loads and stores can be handled with load/store-lanes
2318 instructions do not generate this SLP instance. */
2319 if (is_a <loop_vec_info> (vinfo)
2320 && loads_permuted
2321 && dr && vect_store_lanes_supported (vectype, group_size, false))
2323 slp_tree load_node;
2324 FOR_EACH_VEC_ELT (loads, i, load_node)
2326 gimple *first_stmt = GROUP_FIRST_ELEMENT
2327 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2328 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2329 /* Use SLP for strided accesses (or if we
2330 can't load-lanes). */
2331 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2332 || ! vect_load_lanes_supported
2333 (STMT_VINFO_VECTYPE (stmt_vinfo),
2334 GROUP_SIZE (stmt_vinfo), false))
2335 break;
2337 if (i == loads.length ())
2339 if (dump_enabled_p ())
2340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341 "Built SLP cancelled: can use "
2342 "load/store-lanes\n");
2343 vect_free_slp_instance (new_instance);
2344 return false;
2348 vinfo->slp_instances.safe_push (new_instance);
2350 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "Final SLP tree for instance:\n");
2354 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2357 return true;
2360 else
2362 /* Failed to SLP. */
2363 /* Free the allocated memory. */
2364 scalar_stmts.release ();
2365 loads.release ();
2368 /* For basic block SLP, try to break the group up into multiples of the
2369 vector size. */
2370 unsigned HOST_WIDE_INT const_nunits;
2371 if (is_a <bb_vec_info> (vinfo)
2372 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2373 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2374 && nunits.is_constant (&const_nunits))
2376 /* We consider breaking the group only on VF boundaries from the existing
2377 start. */
2378 for (i = 0; i < group_size; i++)
2379 if (!matches[i]) break;
2381 if (i >= const_nunits && i < group_size)
2383 /* Split into two groups at the first vector boundary before i. */
2384 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2385 unsigned group1_size = i & ~(const_nunits - 1);
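/* E.g. with const_nunits == 4 a first mismatch at i == 6 gives
   group1_size == 4, i.e. the split happens at the last vector
   boundary before the mismatch.  */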
2387 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2388 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2389 /* If the first non-match was in the middle of a vector,
2390 skip the rest of that vector. */
2391 if (group1_size < i)
2393 i = group1_size + const_nunits;
2394 if (i < group_size)
2395 rest = vect_split_slp_store_group (rest, const_nunits);
2397 if (i < group_size)
2398 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2399 return res;
2401 /* Even though the first vector did not all match, we might be able to SLP
2402 (some) of the remainder. FORNOW ignore this possibility. */
2405 return false;
2409 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2410 trees of packed scalar stmts if SLP is possible. */
2412 bool
2413 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2415 unsigned int i;
2416 gimple *first_element;
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2421 /* Find SLP sequences starting from groups of grouped stores. */
2422 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2423 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2425 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2427 if (loop_vinfo->reduction_chains.length () > 0)
2429 /* Find SLP sequences starting from reduction chains. */
2430 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2431 if (! vect_analyze_slp_instance (vinfo, first_element,
2432 max_tree_size))
2434 /* Dissolve reduction chain group. */
2435 gimple *next, *stmt = first_element;
2436 while (stmt)
2438 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2439 next = GROUP_NEXT_ELEMENT (vinfo);
2440 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2441 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2442 stmt = next;
2444 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2445 = vect_internal_def;
2449 /* Find SLP sequences starting from groups of reductions. */
2450 if (loop_vinfo->reductions.length () > 1)
2451 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2452 max_tree_size);
2455 return true;
2459 /* For each possible SLP instance decide whether to SLP it and calculate overall
2460 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2461 least one instance. */
2463 bool
2464 vect_make_slp_decision (loop_vec_info loop_vinfo)
2466 unsigned int i;
2467 poly_uint64 unrolling_factor = 1;
2468 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2469 slp_instance instance;
2470 int decided_to_slp = 0;
2472 if (dump_enabled_p ())
2473 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2474 "\n");
2476 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2478 /* FORNOW: SLP if you can. */
2479 /* All unroll factors have the form current_vector_size * X for some
2480 rational X, so they must have a common multiple. */
2481 unrolling_factor
2482 = force_common_multiple (unrolling_factor,
2483 SLP_INSTANCE_UNROLLING_FACTOR (instance));
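/* For example, instances with unrolling factors 2 and 8 force an
   overall unrolling factor of 8.  */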
2485 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2486 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2487 loop-based vectorization. Such stmts will be marked as HYBRID. */
2488 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2489 decided_to_slp++;
2492 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2494 if (decided_to_slp && dump_enabled_p ())
2496 dump_printf_loc (MSG_NOTE, vect_location,
2497 "Decided to SLP %d instances. Unrolling factor ",
2498 decided_to_slp);
2499 dump_dec (MSG_NOTE, unrolling_factor);
2500 dump_printf (MSG_NOTE, "\n");
2503 return (decided_to_slp > 0);
2507 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2508 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2510 static void
2511 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2513 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2514 imm_use_iterator imm_iter;
2515 gimple *use_stmt;
2516 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2517 slp_tree child;
2518 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2520 int j;
2522 /* Propagate hybrid down the SLP tree. */
2523 if (stype == hybrid)
2525 else if (HYBRID_SLP_STMT (stmt_vinfo))
2526 stype = hybrid;
2527 else
2529 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2530 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2531 /* If we get a pattern stmt here we have to use the LHS of the
2532 original stmt for immediate uses. */
2533 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2534 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2535 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2536 tree def;
2537 if (gimple_code (stmt) == GIMPLE_PHI)
2538 def = gimple_phi_result (stmt);
2539 else
2540 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2541 if (def)
2542 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2544 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2545 continue;
2546 use_vinfo = vinfo_for_stmt (use_stmt);
2547 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2548 && STMT_VINFO_RELATED_STMT (use_vinfo))
2549 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2550 if (!STMT_SLP_TYPE (use_vinfo)
2551 && (STMT_VINFO_RELEVANT (use_vinfo)
2552 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2553 && !(gimple_code (use_stmt) == GIMPLE_PHI
2554 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2556 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2559 "def in non-SLP stmt: ");
2560 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2562 stype = hybrid;
2567 if (stype == hybrid
2568 && !HYBRID_SLP_STMT (stmt_vinfo))
2570 if (dump_enabled_p ())
2572 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2573 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2575 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2578 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2579 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2580 vect_detect_hybrid_slp_stmts (child, i, stype);
2583 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2585 static tree
2586 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2588 walk_stmt_info *wi = (walk_stmt_info *)data;
2589 struct loop *loopp = (struct loop *)wi->info;
2591 if (wi->is_lhs)
2592 return NULL_TREE;
2594 if (TREE_CODE (*tp) == SSA_NAME
2595 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2597 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2598 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2599 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2601 if (dump_enabled_p ())
2603 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2604 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2606 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2610 return NULL_TREE;
2613 static tree
2614 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2615 walk_stmt_info *)
2617 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2618 /* If the stmt is in a SLP instance then this isn't a reason
2619 to mark use definitions in other SLP instances as hybrid. */
2620 if (! STMT_SLP_TYPE (use_vinfo)
2621 && (STMT_VINFO_RELEVANT (use_vinfo)
2622 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2623 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2624 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2626 else
2627 *handled = true;
2628 return NULL_TREE;
2631 /* Find stmts that must be both vectorized and SLPed. */
2633 void
2634 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2636 unsigned int i;
2637 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2638 slp_instance instance;
2640 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2642 "\n");
2644 /* First walk all pattern stmts in the loop and mark defs of uses as
2645 hybrid because immediate uses in them are not recorded. */
2646 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2648 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2649 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2650 gsi_next (&gsi))
2652 gimple *stmt = gsi_stmt (gsi);
2653 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2654 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2656 walk_stmt_info wi;
2657 memset (&wi, 0, sizeof (wi));
2658 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2659 gimple_stmt_iterator gsi2
2660 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2661 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2662 vect_detect_hybrid_slp_1, &wi);
2663 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2664 vect_detect_hybrid_slp_2,
2665 vect_detect_hybrid_slp_1, &wi);
2670 /* Then walk the SLP instance trees marking stmts with uses in
2671 non-SLP stmts as hybrid, also propagating hybrid down the
2672 SLP tree, collecting the above info on-the-fly. */
2673 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2675 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2676 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2677 i, pure_slp);
2682 /* Initialize a bb_vec_info struct for the statements between
2683 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2685 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2686 gimple_stmt_iterator region_end_in)
2687 : vec_info (vec_info::bb, init_cost (NULL)),
2688 bb (gsi_bb (region_begin_in)),
2689 region_begin (region_begin_in),
2690 region_end (region_end_in)
2692 gimple_stmt_iterator gsi;
2694 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2695 gsi_next (&gsi))
2697 gimple *stmt = gsi_stmt (gsi);
2698 gimple_set_uid (stmt, 0);
2699 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2702 bb->aux = this;
2706 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2707 stmts in the basic block. */
2709 _bb_vec_info::~_bb_vec_info ()
2711 for (gimple_stmt_iterator si = region_begin;
2712 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2714 gimple *stmt = gsi_stmt (si);
2715 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2717 if (stmt_info)
2718 /* Free stmt_vec_info. */
2719 free_stmt_vec_info (stmt);
2721 /* Reset region marker. */
2722 gimple_set_uid (stmt, -1);
2725 bb->aux = NULL;
2729 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2730 the subtree.  NODE_INSTANCE contains NODE, and VINFO contains NODE_INSTANCE.
2732 Return true if the operations are supported. */
2734 static bool
2735 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2736 slp_instance node_instance)
2738 bool dummy;
2739 int i, j;
2740 gimple *stmt;
2741 slp_tree child;
2743 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2744 return true;
2746 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2747 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2748 return false;
2750 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2751 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2752 gcc_assert (stmt_info);
2753 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2755 /* For BB vectorization vector types are assigned here.
2756 Memory accesses already got their vector type assigned
2757 in vect_analyze_data_refs. */
2758 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2759 if (bb_vinfo
2760 && ! STMT_VINFO_DATA_REF (stmt_info))
2762 gcc_assert (PURE_SLP_STMT (stmt_info));
2764 tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2765 if (dump_enabled_p ())
2767 dump_printf_loc (MSG_NOTE, vect_location,
2768 "get vectype for scalar type: ");
2769 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2770 dump_printf (MSG_NOTE, "\n");
2773 tree vectype = get_vectype_for_scalar_type (scalar_type);
2774 if (!vectype)
2776 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 "not SLPed: unsupported data-type ");
2780 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2781 scalar_type);
2782 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2784 return false;
2787 if (dump_enabled_p ())
2789 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2791 dump_printf (MSG_NOTE, "\n");
2794 gimple *sstmt;
2795 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2796 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2799 /* Calculate the number of vector statements to be created for the
2800 scalar stmts in this node. For SLP reductions it is equal to the
2801 number of vector statements in the children (which has already been
2802 calculated by the recursive call). Otherwise it is the number of
2803 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2804 VF divided by the number of elements in a vector. */
2805 if (GROUP_FIRST_ELEMENT (stmt_info)
2806 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2807 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2808 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2809 else
2811 poly_uint64 vf;
2812 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2813 vf = loop_vinfo->vectorization_factor;
2814 else
2815 vf = 1;
2816 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2818 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2819 = vect_get_num_vectors (vf * group_size, vectype);
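/* For example, a VF of 4 and a group of two stmts produce eight scalar
   results per iteration, i.e. two vector stmts for a four-element
   vector type.  */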
2822 /* Push SLP node def-type to stmt operands. */
2823 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2824 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2825 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2826 = SLP_TREE_DEF_TYPE (child);
2827 bool res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2828 /* Restore def-types. */
2829 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2830 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2831 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2832 = vect_internal_def;
2833 if (! res)
2834 return false;
2836 return true;
2840 /* Analyze statements in SLP instances of VINFO. Return true if the
2841 operations are supported. */
2843 bool
2844 vect_slp_analyze_operations (vec_info *vinfo)
2846 slp_instance instance;
2847 int i;
2849 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE, vect_location,
2851 "=== vect_slp_analyze_operations ===\n");
2853 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2855 if (!vect_slp_analyze_node_operations (vinfo,
2856 SLP_INSTANCE_TREE (instance),
2857 instance))
2859 dump_printf_loc (MSG_NOTE, vect_location,
2860 "removing SLP instance operations starting from: ");
2861 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2862 SLP_TREE_SCALAR_STMTS
2863 (SLP_INSTANCE_TREE (instance))[0], 0);
2864 vect_free_slp_instance (instance);
2865 vinfo->slp_instances.ordered_remove (i);
2867 else
2868 i++;
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "=== vect_analyze_slp_cost ===\n");
2875 /* Compute the costs of the SLP instances. */
2876 scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2877 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
2878 vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
2879 delete visited;
2881 return !vinfo->slp_instances.is_empty ();
2885 /* Compute the scalar cost of the SLP node NODE and its children
2886 and return it. Do not account defs that are marked in LIFE and
2887 update LIFE according to uses of NODE. */
2889 static unsigned
2890 vect_bb_slp_scalar_cost (basic_block bb,
2891 slp_tree node, vec<bool, va_heap> *life)
2893 unsigned scalar_cost = 0;
2894 unsigned i;
2895 gimple *stmt;
2896 slp_tree child;
2898 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2900 unsigned stmt_cost;
2901 ssa_op_iter op_iter;
2902 def_operand_p def_p;
2903 stmt_vec_info stmt_info;
2905 if ((*life)[i])
2906 continue;
2908 /* If there is a non-vectorized use of the defs then the scalar
2909 stmt is kept live in which case we do not account it or any
2910 required defs in the SLP children in the scalar cost. This
2911 way we make the vectorization more costly when compared to
2912 the scalar cost. */
2913 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2915 imm_use_iterator use_iter;
2916 gimple *use_stmt;
2917 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2918 if (!is_gimple_debug (use_stmt)
2919 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2920 use_stmt)
2921 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2923 (*life)[i] = true;
2924 BREAK_FROM_IMM_USE_STMT (use_iter);
2927 if ((*life)[i])
2928 continue;
2930 /* Count scalar stmts only once. */
2931 if (gimple_visited_p (stmt))
2932 continue;
2933 gimple_set_visited (stmt, true);
2935 stmt_info = vinfo_for_stmt (stmt);
2936 if (STMT_VINFO_DATA_REF (stmt_info))
2938 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2939 stmt_cost = vect_get_stmt_cost (scalar_load);
2940 else
2941 stmt_cost = vect_get_stmt_cost (scalar_store);
2943 else
2944 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2946 scalar_cost += stmt_cost;
2949 auto_vec<bool, 20> subtree_life;
2950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2952 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2954 /* Do not directly pass LIFE to the recursive call, copy it to
2955 confine changes in the callee to the current child/subtree. */
2956 subtree_life.safe_splice (*life);
2957 scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2958 subtree_life.truncate (0);
2962 return scalar_cost;
2965 /* Check if vectorization of the basic block is profitable. */
2967 static bool
2968 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2970 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2971 slp_instance instance;
2972 int i;
2973 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2974 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2976 /* Calculate scalar cost. */
2977 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2979 auto_vec<bool, 20> life;
2980 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2981 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2982 SLP_INSTANCE_TREE (instance),
2983 &life);
2986 /* Unset visited flag. */
2987 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2988 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2989 gimple_set_visited (gsi_stmt (gsi), false);
2991 /* Complete the target-specific cost calculation. */
2992 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2993 &vec_inside_cost, &vec_epilogue_cost);
2995 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2997 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3000 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3001 vec_inside_cost);
3002 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3003 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3004 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3007 /* Vectorization is profitable if its cost is no more than the cost of
3008 the scalar version.  Note that we err on the vector side for equal cost because
3009 the cost estimate is otherwise quite pessimistic (constant uses are
3010 free on the scalar side but cost a load on the vector side for
3011 example). */
3012 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3013 return false;
3015 return true;
3018 /* Check if the basic block can be vectorized. Returns a bb_vec_info
3019 if so and sets fatal to true if failure is independent of
3020 current_vector_size. */
3022 static bb_vec_info
3023 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
3024 gimple_stmt_iterator region_end,
3025 vec<data_reference_p> datarefs, int n_stmts,
3026 bool &fatal)
3028 bb_vec_info bb_vinfo;
3029 slp_instance instance;
3030 int i;
3031 poly_uint64 min_vf = 2;
3033 /* The first group of checks is independent of the vector size. */
3034 fatal = true;
3036 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3038 if (dump_enabled_p ())
3039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040 "not vectorized: too many instructions in "
3041 "basic block.\n");
3042 free_data_refs (datarefs);
3043 return NULL;
3046 bb_vinfo = new _bb_vec_info (region_begin, region_end);
3047 if (!bb_vinfo)
3048 return NULL;
3050 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3052 /* Analyze the data references. */
3054 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "not vectorized: unhandled data-ref in basic "
3059 "block.\n");
3061 delete bb_vinfo;
3062 return NULL;
3065 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3067 if (dump_enabled_p ())
3068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3069 "not vectorized: not enough data-refs in "
3070 "basic block.\n");
3072 delete bb_vinfo;
3073 return NULL;
3076 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 "not vectorized: unhandled data access in "
3081 "basic block.\n");
3083 delete bb_vinfo;
3084 return NULL;
3087 /* If there are no grouped stores in the region there is no need
3088 to continue with pattern recog as vect_analyze_slp will fail
3089 anyway. */
3090 if (bb_vinfo->grouped_stores.is_empty ())
3092 if (dump_enabled_p ())
3093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3094 "not vectorized: no grouped stores in "
3095 "basic block.\n");
3097 delete bb_vinfo;
3098 return NULL;
3101 /* The rest of the analysis below depends on the vector size in some way,
so failures are no longer fatal.  */
3102 fatal = false;
3104 vect_pattern_recog (bb_vinfo);
3106 /* Check the SLP opportunities in the basic block, analyze and build SLP
3107 trees. */
3108 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3110 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3113 "Failed to SLP the basic block.\n");
3114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3115 "not vectorized: failed to find SLP opportunities "
3116 "in basic block.\n");
3119 delete bb_vinfo;
3120 return NULL;
3123 vect_record_base_alignments (bb_vinfo);
3125 /* Analyze and verify the alignment of data references and the
3126 dependence in the SLP instances. */
3127 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3129 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3130 || ! vect_slp_analyze_instance_dependence (instance))
3132 dump_printf_loc (MSG_NOTE, vect_location,
3133 "removing SLP instance operations starting from: ");
3134 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3135 SLP_TREE_SCALAR_STMTS
3136 (SLP_INSTANCE_TREE (instance))[0], 0);
3137 vect_free_slp_instance (instance);
3138 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3139 continue;
3142 /* Mark all the statements that we want to vectorize as pure SLP and
3143 relevant. */
3144 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3145 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3147 i++;
3149 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3151 delete bb_vinfo;
3152 return NULL;
3155 if (!vect_slp_analyze_operations (bb_vinfo))
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 "not vectorized: bad operation in basic block.\n");
3161 delete bb_vinfo;
3162 return NULL;
3165 /* Cost model: check if the vectorization is worthwhile. */
3166 if (!unlimited_cost_model (NULL)
3167 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3169 if (dump_enabled_p ())
3170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3171 "not vectorized: vectorization is not "
3172 "profitable.\n");
3174 delete bb_vinfo;
3175 return NULL;
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_NOTE, vect_location,
3180 "Basic block will be vectorized using SLP\n");
3182 return bb_vinfo;
3186 /* Main entry for the BB vectorizer.  Analyze and transform BB, returning
3187 true if anything in the basic-block was vectorized. */
3189 bool
3190 vect_slp_bb (basic_block bb)
3192 bb_vec_info bb_vinfo;
3193 gimple_stmt_iterator gsi;
3194 bool any_vectorized = false;
3195 auto_vector_sizes vector_sizes;
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3200 /* Autodetect first vector size we try. */
3201 current_vector_size = 0;
3202 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3203 unsigned int next_size = 0;
3205 gsi = gsi_start_bb (bb);
3207 poly_uint64 autodetected_vector_size = 0;
3208 while (1)
3210 if (gsi_end_p (gsi))
3211 break;
3213 gimple_stmt_iterator region_begin = gsi;
3214 vec<data_reference_p> datarefs = vNULL;
3215 int insns = 0;
3217 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3219 gimple *stmt = gsi_stmt (gsi);
3220 if (is_gimple_debug (stmt))
3221 continue;
3222 insns++;
3224 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3225 vect_location = gimple_location (stmt);
3227 if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3228 break;
3231 /* Skip leading unhandled stmts. */
3232 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3234 gsi_next (&gsi);
3235 continue;
3238 gimple_stmt_iterator region_end = gsi;
3240 bool vectorized = false;
3241 bool fatal = false;
3242 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3243 datarefs, insns, fatal);
3244 if (bb_vinfo
3245 && dbg_cnt (vect_slp))
3247 if (dump_enabled_p ())
3248 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3250 vect_schedule_slp (bb_vinfo);
3252 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_NOTE, vect_location,
3254 "basic block part vectorized\n");
3256 vectorized = true;
3258 delete bb_vinfo;
3260 any_vectorized |= vectorized;
3262 if (next_size == 0)
3263 autodetected_vector_size = current_vector_size;
3265 if (next_size < vector_sizes.length ()
3266 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3267 next_size += 1;
3269 if (vectorized
3270 || next_size == vector_sizes.length ()
3271 || known_eq (current_vector_size, 0U)
3272 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3273 vector sizes will fail do not bother iterating. */
3274 || fatal)
3276 if (gsi_end_p (region_end))
3277 break;
3279 /* Skip the unhandled stmt. */
3280 gsi_next (&gsi);
3282 /* And reset vector sizes. */
3283 current_vector_size = 0;
3284 next_size = 0;
3286 else
3288 /* Try the next biggest vector size. */
3289 current_vector_size = vector_sizes[next_size++];
3290 if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "***** Re-trying analysis with "
3294 "vector size ");
3295 dump_dec (MSG_NOTE, current_vector_size);
3296 dump_printf (MSG_NOTE, "\n");
3299 /* Start over. */
3300 gsi = region_begin;
3304 return any_vectorized;
3308 /* Return true if the vector type of the boolean constant that is operand
3309 OPNUM of statement STMT is a boolean vector.  */
3311 static bool
3312 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3314 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3315 enum tree_code code = gimple_expr_code (stmt);
3316 tree op, vectype;
3317 gimple *def_stmt;
3318 enum vect_def_type dt;
3320 /* For comparison and COND_EXPR type is chosen depending
3321 on the other comparison operand. */
3322 if (TREE_CODE_CLASS (code) == tcc_comparison)
3324 if (opnum)
3325 op = gimple_assign_rhs1 (stmt);
3326 else
3327 op = gimple_assign_rhs2 (stmt);
3329 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3330 &dt, &vectype))
3331 gcc_unreachable ();
3333 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3336 if (code == COND_EXPR)
3338 tree cond = gimple_assign_rhs1 (stmt);
3340 if (TREE_CODE (cond) == SSA_NAME)
3341 op = cond;
3342 else if (opnum)
3343 op = TREE_OPERAND (cond, 1);
3344 else
3345 op = TREE_OPERAND (cond, 0);
3347 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3348 &dt, &vectype))
3349 gcc_unreachable ();
3351 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3354 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3357 /* Build a variable-length vector in which the elements in ELTS are repeated
3358 to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
3359 RESULTS and add any new instructions to SEQ.
3361 The approach we use is:
3363 (1) Find a vector mode VM with integer elements of mode IM.
3365 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3366 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3367 from small vectors to IM.
3369 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3371 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3372 correct byte contents.
3374 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3376 We try to find the largest IM for which this sequence works, in order
3377 to cut down on the number of interleaves. */
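/* As an illustration (with made-up element values), for
   ELTS == { a, b, c, d } and NVECTORS == 2, steps (2) and (3) produce
   the pieces (a, b, a, b, ...) and (c, d, c, d, ...); a single
   interleaving VEC_PERM_EXPR in step (4) then yields
   (a, b, c, d, a, b, c, d, ...), which step (5) view-converts back to
   VECTOR_TYPE.  */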
3379 void
3380 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3381 unsigned int nresults, vec<tree> &results)
3383 unsigned int nelts = elts.length ();
3384 tree element_type = TREE_TYPE (vector_type);
3386 /* (1) Find a vector mode VM with integer elements of mode IM. */
3387 unsigned int nvectors = 1;
3388 tree new_vector_type;
3389 tree permutes[2];
3390 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3391 &nvectors, &new_vector_type,
3392 permutes))
3393 gcc_unreachable ();
3395 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3396 unsigned int partial_nelts = nelts / nvectors;
3397 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3399 tree_vector_builder partial_elts;
3400 auto_vec<tree, 32> pieces (nvectors * 2);
3401 pieces.quick_grow (nvectors * 2);
3402 for (unsigned int i = 0; i < nvectors; ++i)
3404 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3405 ELTS' has mode IM. */
3406 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3407 for (unsigned int j = 0; j < partial_nelts; ++j)
3408 partial_elts.quick_push (elts[i * partial_nelts + j]);
3409 tree t = gimple_build_vector (seq, &partial_elts);
3410 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3411 TREE_TYPE (new_vector_type), t);
3413 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3414 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3417 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3418 correct byte contents.
3420 We need to repeat the following operation log2(nvectors) times:
3422 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3423 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3425 However, if each input repeats every N elements and the VF is
3426 a multiple of N * 2, the HI result is the same as the LO. */
3427 unsigned int in_start = 0;
3428 unsigned int out_start = nvectors;
3429 unsigned int hi_start = nvectors / 2;
3430 /* A bound on the number of outputs needed to produce NRESULTS results
3431 in the final iteration. */
3432 unsigned int noutputs_bound = nvectors * nresults;
3433 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3435 noutputs_bound /= 2;
3436 unsigned int limit = MIN (noutputs_bound, nvectors);
3437 for (unsigned int i = 0; i < limit; ++i)
3439 if ((i & 1) != 0
3440 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3441 2 * in_repeat))
3443 pieces[out_start + i] = pieces[out_start + i - 1];
3444 continue;
3447 tree output = make_ssa_name (new_vector_type);
3448 tree input1 = pieces[in_start + (i / 2)];
3449 tree input2 = pieces[in_start + (i / 2) + hi_start];
3450 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3451 input1, input2,
3452 permutes[i & 1]);
3453 gimple_seq_add_stmt (seq, stmt);
3454 pieces[out_start + i] = output;
3456 std::swap (in_start, out_start);
3459 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3460 results.reserve (nresults);
3461 for (unsigned int i = 0; i < nresults; ++i)
3462 if (i < nvectors)
3463 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3464 pieces[in_start + i]));
3465 else
3466 results.quick_push (results[i - nvectors]);
3470 /* For constant and loop invariant defs of SLP_NODE this function returns
3471 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3472 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3473 scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.  */
3477 static void
3478 vect_get_constant_vectors (tree op, slp_tree slp_node,
3479 vec<tree> *vec_oprnds,
3480 unsigned int op_num, unsigned int number_of_vectors)
3482 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3483 gimple *stmt = stmts[0];
3484 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3485 unsigned HOST_WIDE_INT nunits;
3486 tree vec_cst;
3487 unsigned j, number_of_places_left_in_vector;
3488 tree vector_type;
3489 tree vop;
3490 int group_size = stmts.length ();
3491 unsigned int vec_num, i;
3492 unsigned number_of_copies = 1;
3493 vec<tree> voprnds;
3494 voprnds.create (number_of_vectors);
3495 bool constant_p, is_store;
3496 tree neutral_op = NULL;
3497 enum tree_code code = gimple_expr_code (stmt);
3498 gimple_seq ctor_seq = NULL;
3499 auto_vec<tree, 16> permute_results;
3501 /* Check if vector type is a boolean vector. */
3502 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3503 && vect_mask_constant_operand_p (stmt, op_num))
3504 vector_type
3505 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3506 else
3507 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3509 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3511 is_store = true;
3512 op = gimple_assign_rhs1 (stmt);
3514 else
3515 is_store = false;
3517 gcc_assert (op);
3519 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3520 created vectors. It is greater than 1 if unrolling is performed.
3522 For example, we have two scalar operands, s1 and s2 (e.g., group of
3523 strided accesses of size two), while NUNITS is four (i.e., four scalars
3524 of this type can be packed in a vector). The output vector will contain
3525 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3526 will be 2).
3528 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3529 containing the operands.
3531 For example, NUNITS is four as before, and the group size is 8
3532 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3533 {s5, s6, s7, s8}. */
3535 /* When using duplicate_and_interleave, we just need one element for
3536 each scalar statement. */
3537 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3538 nunits = group_size;
3540 number_of_copies = nunits * number_of_vectors / group_size;
3542 number_of_places_left_in_vector = nunits;
3543 constant_p = true;
3544 tree_vector_builder elts (vector_type, nunits, 1);
3545 elts.quick_grow (nunits);
3546 bool place_after_defs = false;
3547 for (j = 0; j < number_of_copies; j++)
3549 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3551 if (is_store)
3552 op = gimple_assign_rhs1 (stmt);
3553 else
3555 switch (code)
3557 case COND_EXPR:
3559 tree cond = gimple_assign_rhs1 (stmt);
3560 if (TREE_CODE (cond) == SSA_NAME)
3561 op = gimple_op (stmt, op_num + 1);
3562 else if (op_num == 0 || op_num == 1)
3563 op = TREE_OPERAND (cond, op_num);
3564 else
3566 if (op_num == 2)
3567 op = gimple_assign_rhs2 (stmt);
3568 else
3569 op = gimple_assign_rhs3 (stmt);
3572 break;
3574 case CALL_EXPR:
3575 op = gimple_call_arg (stmt, op_num);
3576 break;
3578 case LSHIFT_EXPR:
3579 case RSHIFT_EXPR:
3580 case LROTATE_EXPR:
3581 case RROTATE_EXPR:
3582 op = gimple_op (stmt, op_num + 1);
3583 /* Unlike the other binary operators, shifts/rotates have
3584 the shift count being int, instead of the same type as
3585 the lhs, so make sure the scalar is the right type if
3586 we are dealing with vectors of
3587 long long/long/short/char. */
3588 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3589 op = fold_convert (TREE_TYPE (vector_type), op);
3590 break;
3592 default:
3593 op = gimple_op (stmt, op_num + 1);
3594 break;
3598 /* Create 'vect_ = {op0,op1,...,opn}'. */
3599 number_of_places_left_in_vector--;
3600 tree orig_op = op;
3601 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3603 if (CONSTANT_CLASS_P (op))
3605 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3607 /* Can't use VIEW_CONVERT_EXPR for booleans because
3608 of possibly different sizes of scalar value and
3609 vector element. */
3610 if (integer_zerop (op))
3611 op = build_int_cst (TREE_TYPE (vector_type), 0);
3612 else if (integer_onep (op))
3613 op = build_all_ones_cst (TREE_TYPE (vector_type));
3614 else
3615 gcc_unreachable ();
3617 else
3618 op = fold_unary (VIEW_CONVERT_EXPR,
3619 TREE_TYPE (vector_type), op);
3620 gcc_assert (op && CONSTANT_CLASS_P (op));
3622 else
3624 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3625 gimple *init_stmt;
3626 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3628 tree true_val
3629 = build_all_ones_cst (TREE_TYPE (vector_type));
3630 tree false_val
3631 = build_zero_cst (TREE_TYPE (vector_type));
3632 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3633 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3634 op, true_val,
3635 false_val);
3637 else
3639 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3640 op);
3641 init_stmt
3642 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3643 op);
3645 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3646 op = new_temp;
3649 elts[number_of_places_left_in_vector] = op;
3650 if (!CONSTANT_CLASS_P (op))
3651 constant_p = false;
3652 if (TREE_CODE (orig_op) == SSA_NAME
3653 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3654 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3655 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3656 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3657 place_after_defs = true;
3659 if (number_of_places_left_in_vector == 0)
3661 if (constant_p
3662 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3663 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3664 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3665 else
3667 if (vec_oprnds->is_empty ())
3668 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3669 number_of_vectors,
3670 permute_results);
3671 vec_cst = permute_results[number_of_vectors - j - 1];
3673 tree init;
3674 gimple_stmt_iterator gsi;
3675 if (place_after_defs)
3677 gsi = gsi_for_stmt
3678 (vect_find_last_scalar_stmt_in_slp (slp_node));
3679 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3681 else
3682 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3683 if (ctor_seq != NULL)
3685 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3686 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3687 ctor_seq = NULL;
3689 voprnds.quick_push (init);
3690 place_after_defs = false;
3691 number_of_places_left_in_vector = nunits;
3692 constant_p = true;
3693 elts.new_vector (vector_type, nunits, 1);
3694 elts.quick_grow (nunits);
3699 /* Since the vectors are created in reverse order, we now invert them.  */
3701 vec_num = voprnds.length ();
3702 for (j = vec_num; j != 0; j--)
3704 vop = voprnds[j - 1];
3705 vec_oprnds->quick_push (vop);
3708 voprnds.release ();
3710 /* If the VF is greater than the unrolling factor needed for the SLP
3711 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3712 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3713 to replicate the vectors. */
3714 while (number_of_vectors > vec_oprnds->length ())
3716 tree neutral_vec = NULL;
3718 if (neutral_op)
3720 if (!neutral_vec)
3721 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3723 vec_oprnds->quick_push (neutral_vec);
3725 else
3727 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3728 vec_oprnds->quick_push (vop);
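/* For example, if NUMBER_OF_VECTORS is 4 but only v1 and v2 were
   built above, the final result is v1, v2, v1, v2.  */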
3734 /* Get vectorized definitions from SLP_NODE that contains corresponding
3735 vectorized def-stmts. */
3737 static void
3738 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3740 tree vec_oprnd;
3741 gimple *vec_def_stmt;
3742 unsigned int i;
3744 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3746 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3748 gcc_assert (vec_def_stmt);
3749 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3750 vec_oprnd = gimple_phi_result (vec_def_stmt);
3751 else
3752 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3753 vec_oprnds->quick_push (vec_oprnd);
3758 /* Get vectorized definitions for SLP_NODE.
3759 If the scalar definitions are loop invariants or constants, collect them and
3760 call vect_get_constant_vectors() to create vector stmts.
3761 Otherwise, the def-stmts must already have been vectorized and the vectorized
3762 stmts stored in the corresponding child of SLP_NODE, in which case we call
3763 vect_get_slp_vect_defs () to retrieve them. */
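/* Illustrative example: for a group of stmts a[i] = b[i] + x with
   loop-invariant x, the defs of b[i] come vectorized from a child node
   via vect_get_slp_vect_defs (), while x has no child node and its
   vectors are built by vect_get_constant_vectors ().  */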
3765 void
3766 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3767 vec<vec<tree> > *vec_oprnds)
3769 gimple *first_stmt;
3770 int number_of_vects = 0, i;
3771 unsigned int child_index = 0;
3772 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3773 slp_tree child = NULL;
3774 vec<tree> vec_defs;
3775 tree oprnd;
3776 bool vectorized_defs;
3778 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3779 FOR_EACH_VEC_ELT (ops, i, oprnd)
3781 /* For each operand we check if it has vectorized definitions in a child
3782 node or we need to create them (for invariants and constants). We
3783 check if the LHS of the first stmt of the next child matches OPRND.
3784 If it does, we have found the correct child. Otherwise, we call
3785 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
3786 this child node is checked again for the next operand. */
3787 vectorized_defs = false;
3788 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3790 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3792 /* We have to check both pattern and original def, if available. */
3793 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3795 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3796 gimple *related
3797 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3798 tree first_def_op;
3800 if (gimple_code (first_def) == GIMPLE_PHI)
3801 first_def_op = gimple_phi_result (first_def);
3802 else
3803 first_def_op = gimple_get_lhs (first_def);
3804 if (operand_equal_p (oprnd, first_def_op, 0)
3805 || (related
3806 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3808 /* The number of vector defs is determined by the number of
3809 vector statements in the node from which we get those
3810 statements. */
3811 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3812 vectorized_defs = true;
3813 child_index++;
3816 else
3817 child_index++;
3820 if (!vectorized_defs)
3822 if (i == 0)
3824 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3825 /* Number of vector stmts was calculated according to LHS in
3826 vect_schedule_slp_instance (), fix it by replacing LHS with
3827 RHS, if necessary. See vect_get_smallest_scalar_type () for
3828 details. */
3829 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3830 &rhs_size_unit);
3831 if (rhs_size_unit != lhs_size_unit)
3833 number_of_vects *= rhs_size_unit;
3834 number_of_vects /= lhs_size_unit;
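/* Worked example with hypothetical types: if the LHS is a 2-byte short
   and the RHS operand a 4-byte int, number_of_vects is scaled by
   4 / 2, i.e. doubled, because only half as many ints as shorts fit
   into each vector.  */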
3839 /* Allocate memory for vectorized defs. */
3840 vec_defs = vNULL;
3841 vec_defs.create (number_of_vects);
3843 /* For reduction defs we call vect_get_constant_vectors (), since we are
3844 looking for initial loop invariant values. */
3845 if (vectorized_defs)
3846 /* The defs are already vectorized. */
3847 vect_get_slp_vect_defs (child, &vec_defs);
3848 else
3849 /* Build vectors from scalar defs. */
3850 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3851 number_of_vects);
3853 vec_oprnds->quick_push (vec_defs);
3857 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3858 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3859 permute statements for the SLP node NODE of the SLP instance
3860 SLP_NODE_INSTANCE. */
3862 bool
3863 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3864 gimple_stmt_iterator *gsi, poly_uint64 vf,
3865 slp_instance slp_node_instance, bool analyze_only,
3866 unsigned *n_perms)
3868 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3869 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3870 tree mask_element_type = NULL_TREE, mask_type;
3871 int vec_index = 0;
3872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3873 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3874 unsigned int mask_element;
3875 machine_mode mode;
3876 unsigned HOST_WIDE_INT nunits, const_vf;
3878 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3879 return false;
3881 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3883 mode = TYPE_MODE (vectype);
3885 /* At the moment, all permutations are represented using per-element
3886 indices, so we can't cope with variable vector lengths or
3887 vectorization factors. */
3888 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3889 || !vf.is_constant (&const_vf))
3890 return false;
3892 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3893 same size as the vector element being permuted. */
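/* E.g. (illustrative): permuting a V4SF vector uses mask elements that
   are 32-bit integers, so the mask vector has the same number of
   elements and element size as the data vector.  */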
3894 mask_element_type = lang_hooks.types.type_for_mode
3895 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3896 mask_type = get_vectype_for_scalar_type (mask_element_type);
3897 vec_perm_builder mask (nunits, nunits, 1);
3898 mask.quick_grow (nunits);
3899 vec_perm_indices indices;
3901 /* Initialize the vect stmts of NODE to properly insert the generated
3902 stmts later. */
3903 if (! analyze_only)
3904 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3905 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3906 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3908 /* Generate permutation masks for every NODE. Number of masks for each NODE
3909 is equal to GROUP_SIZE.
3910 E.g., we have a group of three nodes with three loads from the same
3911 location in each node, and the vector size is 4. I.e., we have a
3912 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3913 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3914 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3915 ... and likewise for c's.
3917 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3918 The last mask is illegal since we assume two operands for the permute
3919 operation, and the mask element values can't be outside that range.
3920 Hence, the last mask must be converted into {2,5,5,5}.
3921 For the first two permutations we need the first and the second input
3922 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3923 we need the second and the third vectors: {b1,c1,a2,b2} and
3924 {c2,a3,b3,c3}. */
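/* To trace one mask element through the code below (illustrative,
   using the numbers above): the third 'a' is overall element 6, so
   with nunits == 4 we get vec_index = 6 / 4 = 1 and mask_element
   = 6 % 4 = 2; vector 1 acts as the second permute input there, so
   nunits is added, giving the 6 in the second mask {3,3,6,6}.  */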
3926 int vect_stmts_counter = 0;
3927 unsigned int index = 0;
3928 int first_vec_index = -1;
3929 int second_vec_index = -1;
3930 bool noop_p = true;
3931 *n_perms = 0;
3933 for (unsigned int j = 0; j < const_vf; j++)
3935 for (int k = 0; k < group_size; k++)
3937 unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3938 + j * STMT_VINFO_GROUP_SIZE (stmt_info));
3939 vec_index = i / nunits;
3940 mask_element = i % nunits;
3941 if (vec_index == first_vec_index
3942 || first_vec_index == -1)
3944 first_vec_index = vec_index;
3946 else if (vec_index == second_vec_index
3947 || second_vec_index == -1)
3949 second_vec_index = vec_index;
3950 mask_element += nunits;
3952 else
3954 if (dump_enabled_p ())
3956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3957 "permutation requires at "
3958 "least three vectors ");
3959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3960 stmt, 0);
3962 gcc_assert (analyze_only);
3963 return false;
3966 gcc_assert (mask_element < 2 * nunits);
3967 if (mask_element != index)
3968 noop_p = false;
3969 mask[index++] = mask_element;
3971 if (index == nunits && !noop_p)
3973 indices.new_vector (mask, 2, nunits);
3974 if (!can_vec_perm_const_p (mode, indices))
3976 if (dump_enabled_p ())
3978 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3979 vect_location,
3980 "unsupported vect permute { ");
3981 for (i = 0; i < nunits; ++i)
3983 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3984 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3986 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3988 gcc_assert (analyze_only);
3989 return false;
3992 ++*n_perms;
3995 if (index == nunits)
3997 if (!analyze_only)
3999 tree mask_vec = NULL_TREE;
4001 if (! noop_p)
4002 mask_vec = vec_perm_indices_to_tree (mask_type, indices);
4004 if (second_vec_index == -1)
4005 second_vec_index = first_vec_index;
4007 /* Generate the permute statement if necessary. */
4008 tree first_vec = dr_chain[first_vec_index];
4009 tree second_vec = dr_chain[second_vec_index];
4010 gimple *perm_stmt;
4011 if (! noop_p)
4013 tree perm_dest
4014 = vect_create_destination_var (gimple_assign_lhs (stmt),
4015 vectype);
4016 perm_dest = make_ssa_name (perm_dest);
4017 perm_stmt = gimple_build_assign (perm_dest,
4018 VEC_PERM_EXPR,
4019 first_vec, second_vec,
4020 mask_vec);
4021 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4023 else
4024 /* If the mask was NULL_TREE, generate the requested
4025 identity transform. */
4026 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4028 /* Store the vector statement in NODE. */
4029 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4032 index = 0;
4033 first_vec_index = -1;
4034 second_vec_index = -1;
4035 noop_p = true;
4040 return true;
4043 typedef hash_map <vec <gimple *>, slp_tree,
4044 simple_hashmap_traits <bst_traits, slp_tree> >
4045 scalar_stmts_to_slp_tree_map_t;
4047 /* Vectorize SLP instance tree in postorder. */
4049 static bool
4050 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
4051 scalar_stmts_to_slp_tree_map_t *bst_map)
4053 gimple *stmt;
4054 bool grouped_store, is_store;
4055 gimple_stmt_iterator si;
4056 stmt_vec_info stmt_info;
4057 unsigned int group_size;
4058 tree vectype;
4059 int i, j;
4060 slp_tree child;
4062 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4063 return false;
4065 /* If we have already vectorized the same set of stmts, reuse their
4066 vectorized stmts. */
4067 slp_tree &leader
4068 = bst_map->get_or_insert (SLP_TREE_SCALAR_STMTS (node).copy ());
4069 if (leader)
4071 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (leader));
4072 return false;
4075 leader = node;
4076 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4077 vect_schedule_slp_instance (child, instance, bst_map);
4079 /* Push SLP node def-type to stmts. */
4080 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4081 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4082 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4083 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
4085 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
4086 stmt_info = vinfo_for_stmt (stmt);
4088 /* VECTYPE is the type of the destination. */
4089 vectype = STMT_VINFO_VECTYPE (stmt_info);
4090 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4091 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4093 if (!SLP_TREE_VEC_STMTS (node).exists ())
4094 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4096 if (dump_enabled_p ())
4098 dump_printf_loc (MSG_NOTE, vect_location,
4099 "------>vectorizing SLP node starting from: ");
4100 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4103 /* Vectorized stmts go before the last scalar stmt which is where
4104 all uses are ready. */
4105 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
4107 /* Mark the first element of the reduction chain as a reduction to properly
4108 transform the node. In the analysis phase only the last element of the
4109 chain is marked as a reduction. */
4110 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
4111 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
4113 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
4114 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4117 /* Handle two-operation SLP nodes by vectorizing the group with
4118 both operations and then performing a merge. */
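/* Illustrative example: for the group {a0+b0, a1-b1, a2+b2, a3-b3}
   with nunits == 4, the node is vectorized once as PLUS and once as
   MINUS; the 0/1 mask {0,1,0,1} built below becomes the permute
   selector {0,5,2,7}, taking even lanes from the PLUS result and odd
   lanes from the MINUS result.  */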
4119 if (SLP_TREE_TWO_OPERATORS (node))
4121 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4122 enum tree_code ocode = ERROR_MARK;
4123 gimple *ostmt;
4124 vec_perm_builder mask (group_size, group_size, 1);
4125 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
4126 if (gimple_assign_rhs_code (ostmt) != code0)
4128 mask.quick_push (1);
4129 ocode = gimple_assign_rhs_code (ostmt);
4131 else
4132 mask.quick_push (0);
4133 if (ocode != ERROR_MARK)
4135 vec<gimple *> v0;
4136 vec<gimple *> v1;
4137 unsigned j;
4138 tree tmask = NULL_TREE;
4139 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4140 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4141 SLP_TREE_VEC_STMTS (node).truncate (0);
4142 gimple_assign_set_rhs_code (stmt, ocode);
4143 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4144 gimple_assign_set_rhs_code (stmt, code0);
4145 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4146 SLP_TREE_VEC_STMTS (node).truncate (0);
4147 tree meltype = build_nonstandard_integer_type
4148 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4149 tree mvectype = get_same_sized_vectype (meltype, vectype);
4150 unsigned k = 0, l;
4151 for (j = 0; j < v0.length (); ++j)
4153 /* Enforced by vect_build_slp_tree, which rejects variable-length
4154 vectors for SLP_TREE_TWO_OPERATORS. */
4155 unsigned int const_nunits = nunits.to_constant ();
4156 tree_vector_builder melts (mvectype, const_nunits, 1);
4157 for (l = 0; l < const_nunits; ++l)
4159 if (k >= group_size)
4160 k = 0;
4161 tree t = build_int_cst (meltype,
4162 mask[k++] * const_nunits + l);
4163 melts.quick_push (t);
4165 tmask = melts.build ();
4167 /* ??? Not all targets support a VEC_PERM_EXPR with a
4168 constant mask that would translate to a vec_merge RTX
4169 (with their vec_perm_const_ok). We can either not
4170 vectorize in that case or let veclower do its job.
4171 Unfortunately that isn't too great and at least for
4172 plus/minus we'd eventually like to match the target's
4173 vector addsub instructions. */
4174 gimple *vstmt;
4175 vstmt = gimple_build_assign (make_ssa_name (vectype),
4176 VEC_PERM_EXPR,
4177 gimple_assign_lhs (v0[j]),
4178 gimple_assign_lhs (v1[j]), tmask);
4179 vect_finish_stmt_generation (stmt, vstmt, &si);
4180 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
4182 v0.release ();
4183 v1.release ();
4184 return false;
4187 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4189 /* Restore stmt def-types. */
4190 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4191 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4192 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4193 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
4195 return is_store;
4198 /* Replace scalar calls from SLP node NODE with assignments of zero to their lhs.
4199 For loop vectorization this is done in vectorizable_call, but for SLP
4200 it needs to be deferred until the end of vect_schedule_slp, because multiple
4201 SLP instances may refer to the same scalar stmt. */
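/* E.g. (illustrative): after the group containing x_1 = foo (y_2) has
   been vectorized, the scalar call is rewritten in place as x_1 = 0.0,
   keeping the lhs and its stmt_vec_info while dropping the call.  */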
4203 static void
4204 vect_remove_slp_scalar_calls (slp_tree node)
4206 gimple *stmt, *new_stmt;
4207 gimple_stmt_iterator gsi;
4208 int i;
4209 slp_tree child;
4210 tree lhs;
4211 stmt_vec_info stmt_info;
4213 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4214 return;
4216 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4217 vect_remove_slp_scalar_calls (child);
4219 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4221 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4222 continue;
4223 stmt_info = vinfo_for_stmt (stmt);
4224 if (stmt_info == NULL
4225 || is_pattern_stmt_p (stmt_info)
4226 || !PURE_SLP_STMT (stmt_info))
4227 continue;
4228 lhs = gimple_call_lhs (stmt);
4229 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4230 set_vinfo_for_stmt (new_stmt, stmt_info);
4231 set_vinfo_for_stmt (stmt, NULL);
4232 STMT_VINFO_STMT (stmt_info) = new_stmt;
4233 gsi = gsi_for_stmt (stmt);
4234 gsi_replace (&gsi, new_stmt, false);
4235 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4239 /* Generate vector code for all SLP instances in the loop/basic block. */
4241 bool
4242 vect_schedule_slp (vec_info *vinfo)
4244 vec<slp_instance> slp_instances;
4245 slp_instance instance;
4246 unsigned int i;
4247 bool is_store = false;
4250 scalar_stmts_to_slp_tree_map_t *bst_map
4251 = new scalar_stmts_to_slp_tree_map_t ();
4252 slp_instances = vinfo->slp_instances;
4253 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4255 /* Schedule the tree of INSTANCE. */
4256 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4257 instance, bst_map);
4258 if (dump_enabled_p ())
4259 dump_printf_loc (MSG_NOTE, vect_location,
4260 "vectorizing stmts using SLP.\n");
4262 delete bst_map;
4264 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4266 slp_tree root = SLP_INSTANCE_TREE (instance);
4267 gimple *store;
4268 unsigned int j;
4269 gimple_stmt_iterator gsi;
4271 /* Remove scalar call stmts. Do not do this for basic-block
4272 vectorization as not all uses may be vectorized.
4273 ??? Why should this be necessary? DCE should be able to
4274 remove the stmts itself.
4275 ??? For BB vectorization we could also remove scalar
4276 stmts starting from the SLP tree root if they have no
4277 uses. */
4278 if (is_a <loop_vec_info> (vinfo))
4279 vect_remove_slp_scalar_calls (root);
4281 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
4282 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4284 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
4285 break;
4287 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
4288 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
4289 /* Free the attached stmt_vec_info and remove the stmt. */
4290 gsi = gsi_for_stmt (store);
4291 unlink_stmt_vdef (store);
4292 gsi_remove (&gsi, true);
4293 release_defs (store);
4294 free_stmt_vec_info (store);
4298 return is_store;