PR tree-optimization/81184
[official-gcc.git] / gcc / tree-vect-slp.c
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  gimple *stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    /* After transform some stmts are removed and thus their vinfo is gone.  */
    if (vinfo_for_stmt (stmt))
      {
        gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
        STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
      }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}
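
/* For illustration (hypothetical GIMPLE): given the assignment

     x_1 = a_2 < b_3 ? c_4 : d_5;

   gimple_num_ops is 4, so NOPS starts at 3 and the COND_EXPR adjustment
   above makes it 4: the embedded comparison contributes two operands
   (a_2, b_3) in addition to the two arms (c_4, d_5), matching how
   vect_get_and_check_slp_defs walks the operands below.  */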
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
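
/* For illustration (hypothetical accesses): if the interleaving chain
   consists of the loads a[0], a[1], a[3], then GROUP_GAP of the vinfo
   for a[1] is 1 (no gap) and for a[3] is 2, so the place of a[3] is
   0 + 1 + 2 == 3, i.e. its element index within the chain including
   the slot left by the unused a[2].  */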
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
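
/* For illustration: with nelts == 4 the two encodings above extrapolate
   to indices1 == { 0, 4, 1, 5 } and indices2 == { 2, 6, 3, 7 }, i.e.
   the low-half and high-half interleave of two input vectors, which is
   what duplicate_and_interleave needs to spread COUNT distinct values
   across the intermediate vectors.  */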
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<gimple *> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && vect_stmt_in_region_p (vinfo, def_stmt)
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
            case GIMPLE_ASSIGN:
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !current_vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: invalid type of def "
                                   "for variable-length SLP ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: cannot swap operands of "
                               "shared stmt ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          return -1;
        }

      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "swapped operands to match def types in ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }

  *swap = swapped;
  return 0;
}
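
/* For illustration (hypothetical stmts): for the group

     { x_1 = a_1 + 3;  y_1 = 4 + b_1; }

   operand zero of the second stmt is a constant while operand zero of
   the first is an SSA name, so the def types mismatch at i == 0.
   Because PLUS_EXPR is commutative the "goto again" path re-reads the
   second stmt's operands in swapped order, both operands then match,
   and the final block above swaps rhs1/rhs2 in the GIMPLE IL.  */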
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for VINFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
   as for vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type in ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: unrolling required "
                       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
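
/* For illustration: a basic-block SLP group of two int stores with a
   V4SI vector type has const_nunits == 4 > group_size == 2, so filling
   a vector would need unrolling, which only loop SLP can do; the check
   above therefore rejects it before *max_nunits is updated.  */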
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<gimple *> stmts, unsigned int group_size,
                       unsigned nops, poly_uint64 *max_nunits,
                       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
                                   max_nunits))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple *first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned HOST_WIDE_INT count;
      if (!nunits.is_constant (&count))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operations "
                             "not allowed with variable-length SLP.\n");
          return false;
        }
      vec_perm_builder sel (count, count, 1);
      for (i = 0; i < count; ++i)
        {
          unsigned int elt = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            elt += count;
          sel.quick_push (elt);
        }
      vec_perm_indices indices (sel, 2, count);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
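
/* For illustration (hypothetical group): the four stmts

     c[0] = a[0] + b[0];  c[1] = a[1] - b[1];
     c[2] = a[2] + b[2];  c[3] = a[3] - b[3];

   match with first_stmt_code PLUS_EXPR and alt_stmt_code MINUS_EXPR.
   With V4SI (count == 4) the blend built above is sel == { 0, 5, 2, 7 },
   picking even elements from the vector PLUS result and odd elements
   from the vector MINUS result; if the target cannot do that constant
   permute, exactly the MINUS stmts are marked as mismatched.  */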
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
static scalar_stmts_set_t *bst_fail;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<gimple *> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits, vec<slp_tree> *loads,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
                                        loads, matches, npermutes, tree_size,
                                        max_tree_size);
  /* When SLP build fails for stmts record this, otherwise SLP build
     can be exponential in time when we allow to construct parts from
     scalars, see PR81723.  */
  if (! res)
    {
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}
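
/* For illustration: the cache keys on the stmt sequence itself (hashed
   via gimple_uid in bst_traits::hash above), so if the recursion
   reaches the same operand stmt set along two different paths of the
   tree, the second attempt returns NULL immediately instead of
   re-exploring a subtree that is already known not to build.  */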
/* Recursively build an SLP tree starting from the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise return
   the built SLP tree; *MATCHES records which stmts in the group failed to
   match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
                                   max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          FOR_EACH_VEC_ELT (stmts, i, stmt)
            if (stmt != stmts[0])
              return NULL;
        }
      else
        {
          /* Else def types have to match.  */
          FOR_EACH_VEC_ELT (stmts, i, stmt)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
                continue;
              if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
                return NULL;
            }
        }
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap,
                              stmts, group_size, nops,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        &this_loads, matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p
                    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_loads.truncate (old_nloads);
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && ! two_operators
          /* Do so only if the number of not successful permutes was no more
             than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Verify if we can safely swap or if we committed to a specific
             operand order already.  */
          for (j = 0; j < group_size; ++j)
            if (!matches[j]
                && (swap[j] != 0
                    || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmts[j]))))
              {
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: cannot swap operands "
                                     "of shared stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[j], 0);
                  }
                goto fail;
              }

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            &this_loads, tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  gimple_set_plf (stmt, GF_PLF_1, false);
                }
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  if (!matches[j])
                    {
                      /* Avoid swapping operands twice.  */
                      if (gimple_plf (stmt, GF_PLF_1))
                        continue;
                      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                         gimple_assign_rhs2_ptr (stmt));
                      gimple_set_plf (stmt, GF_PLF_1, true);
                    }
                }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  {
                    gimple *stmt = stmts[j];
                    gcc_assert (gimple_plf (stmt, GF_PLF_1) == ! matches[j]);
                  }

              /* If we have all children of child built up from scalars then
                 just throw that away and build it up this node from scalars.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p
                        (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_loads.truncate (old_nloads);
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "Building parent vector operands from "
                                       "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
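
/* For illustration (hypothetical lanes): if operand zero has def-stmts
   { d0, d1 } and operand one { e0, e1 }, and lane 1 failed to match,
   the retry above swaps d1 and e1 between oprnds_info[0] and
   oprnds_info[1], rebuilds the child with a scratch MATCHES array, and
   on success swaps rhs1/rhs2 of exactly the mismatched (and not yet
   swapped) scalar stmts so the GIMPLE IL agrees with the SLP tree.  */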
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}
/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
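
/* For illustration: with scalar stmts { s0, s1, s2, s3 } and
   PERMUTATION { 2, 0, 1, 3 }, s0 moves to slot 2, s1 to slot 0 and
   s2 to slot 1, giving { s1, s2, s0, s3 }; note the permutation is
   applied as a scatter (tmp_stmts[permutation[i]] = stmt), not a
   gather.  */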
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
              && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info
                = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
              group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "BB vectorization with gaps at the end of "
                                   "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
                             LOOP_VINFO_VECT_FACTOR
                             (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
                                          slp_instn, true, &n_perms))
      return false;

  return true;
}
/* Find the last scalar stmt in NODE, taking pattern stmts into account.  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}
/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost,
                         scalar_stmts_set_t* visited)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* If we already costed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
    return;

  visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost, visited);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
        = (STMT_VINFO_STRIDED_P (stmt_info)
           ? VMAT_STRIDED_SLP
           : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost,
                               memory_access_type, VLS_STORE,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
              /* If the load is permuted then the alignment is determined by
                 the first group element not by the first scalar stmt DR.  */
              stmt = GROUP_FIRST_ELEMENT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
              /* Record the cost for the permutation.  */
              unsigned n_perms;
              vect_transform_slp_perm_load (node, vNULL, NULL,
                                            ncopies_for_cost, instance, true,
                                            &n_perms);
              record_stmt_cost (body_cost_vec, n_perms, vec_perm,
                                stmt_info, 0, vect_body);
              unsigned assumed_nunits
                = vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
              /* And adjust the number of loads performed.  This handles
                 redundancies as well as loads that are later dead.  */
              auto_sbitmap perm (GROUP_SIZE (stmt_info));
              bitmap_clear (perm);
              for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
                bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
              ncopies_for_cost = 0;
              bool load_seen = false;
              for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
                {
                  if (i % assumed_nunits == 0)
                    {
                      if (load_seen)
                        ncopies_for_cost++;
                      load_seen = false;
                    }
                  if (bitmap_bit_p (perm, i))
                    load_seen = true;
                }
              if (load_seen)
                ncopies_for_cost++;
              gcc_assert (ncopies_for_cost
                          <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
                              + assumed_nunits - 1) / assumed_nunits);
              poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
              ncopies_for_cost *= estimated_poly_value (uf);
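              /* For illustration: with GROUP_SIZE 4, assumed_nunits 2 and
                 a load permutation referencing only elements 0 and 1, the
                 bitmap scan above sees no element in the second
                 vector-sized slice, so only one of the two possible vector
                 loads is costed; a permutation touching elements 0 and 3
                 would cost both.  */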
1854 /* Record the cost for the vector loads. */
1855 vect_model_load_cost (stmt_info, ncopies_for_cost,
1856 memory_access_type, node, prologue_cost_vec,
1857 body_cost_vec);
1858 return;
1861 else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1863 /* ncopies_for_cost is the number of IVs we generate. */
1864 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1865 stmt_info, 0, vect_body);
1867 /* Prologue cost for the initial values and step vector. */
1868 record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1869 CONSTANT_CLASS_P
1870 (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1871 (stmt_info))
1872 ? vector_load : vec_construct,
1873 stmt_info, 0, vect_prologue);
1874 record_stmt_cost (prologue_cost_vec, 1,
1875 CONSTANT_CLASS_P
1876 (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1877 ? vector_load : vec_construct,
1878 stmt_info, 0, vect_prologue);
1880 /* ??? No easy way to get at the actual number of vector stmts
1881 to be geneated and thus the derived IVs. */
1883 else
1885 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1886 stmt_info, 0, vect_body);
1887 if (SLP_TREE_TWO_OPERATORS (node))
1889 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1890 stmt_info, 0, vect_body);
1891 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1892 stmt_info, 0, vect_body);
1896 /* Push SLP node def-type to stmts. */
1897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1898 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1899 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1900 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1902 /* Scan operands and account for prologue cost of constants/externals.
1903 ??? This over-estimates cost for multiple uses and should be
1904 re-engineered. */
1905 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1906 lhs = gimple_get_lhs (stmt);
1907 for (i = 0; i < gimple_num_ops (stmt); ++i)
1909 tree op = gimple_op (stmt, i);
1910 gimple *def_stmt;
1911 enum vect_def_type dt;
1912 if (!op || op == lhs)
1913 continue;
1914 if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt))
1916 /* Without looking at the actual initializer a vector of
1917 constants can be implemented as a load from the constant pool.
1918 ??? We need to pass down stmt_info for a vector type
1919 even if it points to the wrong stmt. */
1920 if (dt == vect_constant_def)
1921 record_stmt_cost (prologue_cost_vec, 1, vector_load,
1922 stmt_info, 0, vect_prologue);
1923 else if (dt == vect_external_def)
1924 record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1925 stmt_info, 0, vect_prologue);
1929 /* Restore stmt def-types. */
1930 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1931 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1932 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1933 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
1936 /* Compute the cost for the SLP instance INSTANCE. */
1938 static void
1939 vect_analyze_slp_cost (slp_instance instance, void *data)
1941 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1942 unsigned ncopies_for_cost;
1943 stmt_info_for_cost *si;
1944 unsigned i;
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_NOTE, vect_location,
1948 "=== vect_analyze_slp_cost ===\n");
1950 /* Calculate the number of vector stmts to create based on the unrolling
1951 factor (the number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1952 GROUP_SIZE / NUNITS otherwise). */
1953 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1954 slp_tree node = SLP_INSTANCE_TREE (instance);
1955 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1956 /* Get the estimated vectorization factor, which is always one for
1957 basic-block vectorization. */
1958 unsigned int assumed_vf;
1959 if (STMT_VINFO_LOOP_VINFO (stmt_info))
1960 assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
1961 else
1962 assumed_vf = 1;
1963 /* For reductions look at a reduction operand in case the reduction
1964 operation is widening like DOT_PROD or SAD. */
1965 tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
1966 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1968 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1969 switch (gimple_assign_rhs_code (stmt))
1971 case DOT_PROD_EXPR:
1972 case SAD_EXPR:
1973 vectype_for_cost = get_vectype_for_scalar_type
1974 (TREE_TYPE (gimple_assign_rhs1 (stmt)));
1975 break;
1976 default:;
1979 unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
1980 ncopies_for_cost = (least_common_multiple (assumed_nunits,
1981 group_size * assumed_vf)
1982 / assumed_nunits);
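/* A worked example with illustrative numbers: ASSUMED_NUNITS 4,
   GROUP_SIZE 3 and ASSUMED_VF 2 give
   least_common_multiple (4, 3 * 2) / 4 == 12 / 4 == 3 copies.  */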
1984 prologue_cost_vec.create (10);
1985 body_cost_vec.create (10);
1986 scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
1987 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
1988 &prologue_cost_vec, &body_cost_vec,
1989 ncopies_for_cost, visited);
1990 delete visited;
1992 /* Record the prologue costs, which were delayed until we were
1993 sure that SLP was successful. */
1994 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1996 struct _stmt_vec_info *stmt_info
1997 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1998 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1999 si->misalign, vect_prologue);
2002 /* Record the instance's instructions in the target cost model. */
2003 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2005 struct _stmt_vec_info *stmt_info
2006 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2007 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2008 si->misalign, vect_body);
2011 prologue_cost_vec.release ();
2012 body_cost_vec.release ();
2015 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2016 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (containing
2017 the first GROUP1_SIZE stmts, since stores are consecutive), the second
2018 containing the remainder.
2019 Return the first stmt in the second group. */
2021 static gimple *
2022 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2024 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2025 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2026 gcc_assert (group1_size > 0);
2027 int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2028 gcc_assert (group2_size > 0);
2029 GROUP_SIZE (first_vinfo) = group1_size;
2031 gimple *stmt = first_stmt;
2032 for (unsigned i = group1_size; i > 1; i--)
2034 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2035 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2037 /* STMT is now the last element of the first group. */
2038 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2039 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2041 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2042 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2044 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2045 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2048 /* For the second group, the GROUP_GAP is that before the original group,
2049 plus skipping over the first vector. */
2050 GROUP_GAP (vinfo_for_stmt (group2)) =
2051 GROUP_GAP (first_vinfo) + group1_size;
2053 /* GROUP_GAP of the first group now has to skip over the second group too. */
2054 GROUP_GAP (first_vinfo) += group2_size;
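/* For example (hypothetical, assuming the original group had no gap):
   splitting a group of 8 consecutive stores into 4 + 4 gives both
   subgroups GROUP_GAP 4 -- each subgroup has to skip over the
   elements of the other.  */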
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2058 group1_size, group2_size);
2060 return group2;
2063 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2064 statements and a vector of NUNITS elements. */
2066 static poly_uint64
2067 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2069 return exact_div (common_multiple (nunits, group_size), group_size);
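/* E.g. (illustrative): NUNITS 4 and GROUP_SIZE 6 give
   common_multiple (4, 6) / 6 == 12 / 6 == 2, i.e. two iterations are
   needed to fill whole vectors.  */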
2072 /* Analyze an SLP instance starting from a group of grouped stores. Call
2073 vect_build_slp_tree to build a tree of packed stmts if possible.
2074 Return FALSE if it's impossible to SLP any stmt in the loop. */
2076 static bool
2077 vect_analyze_slp_instance (vec_info *vinfo,
2078 gimple *stmt, unsigned max_tree_size)
2080 slp_instance new_instance;
2081 slp_tree node;
2082 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2083 tree vectype, scalar_type = NULL_TREE;
2084 gimple *next;
2085 unsigned int i;
2086 vec<slp_tree> loads;
2087 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2088 vec<gimple *> scalar_stmts;
2090 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2092 if (dr)
2094 scalar_type = TREE_TYPE (DR_REF (dr));
2095 vectype = get_vectype_for_scalar_type (scalar_type);
2097 else
2099 gcc_assert (is_a <loop_vec_info> (vinfo));
2100 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2103 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2105 else
2107 gcc_assert (is_a <loop_vec_info> (vinfo));
2108 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2109 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2112 if (!vectype)
2114 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "Build SLP failed: unsupported data-type ");
2118 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2119 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2122 return false;
2124 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2126 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2127 scalar_stmts.create (group_size);
2128 next = stmt;
2129 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2131 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2132 while (next)
2134 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2135 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2136 scalar_stmts.safe_push (
2137 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2138 else
2139 scalar_stmts.safe_push (next);
2140 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2142 /* Mark the first element of the reduction chain as reduction to properly
2143 transform the node. In the reduction analysis phase only the last
2144 element of the chain is marked as reduction. */
2145 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2146 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2148 else
2150 /* Collect reduction statements. */
2151 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2152 for (i = 0; reductions.iterate (i, &next); i++)
2153 scalar_stmts.safe_push (next);
2156 loads.create (group_size);
2158 /* Build the tree for the SLP instance. */
2159 bool *matches = XALLOCAVEC (bool, group_size);
2160 unsigned npermutes = 0;
2161 bst_fail = new scalar_stmts_set_t ();
2162 poly_uint64 max_nunits = nunits;
2163 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2164 &max_nunits, &loads, matches, &npermutes,
2165 NULL, max_tree_size);
2166 delete bst_fail;
2167 if (node != NULL)
2169 /* Calculate the unrolling factor based on the smallest type. */
2170 poly_uint64 unrolling_factor
2171 = calculate_unrolling_factor (max_nunits, group_size);
2173 if (maybe_ne (unrolling_factor, 1U)
2174 && is_a <bb_vec_info> (vinfo))
2176 unsigned HOST_WIDE_INT const_max_nunits;
2177 if (!max_nunits.is_constant (&const_max_nunits)
2178 || const_max_nunits > group_size)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "Build SLP failed: store group "
2183 "size not a multiple of the vector size "
2184 "in basic block SLP\n");
2185 vect_free_slp_tree (node);
2186 loads.release ();
2187 return false;
2189 /* Fatal mismatch. */
2190 matches[group_size / const_max_nunits * const_max_nunits] = false;
2191 vect_free_slp_tree (node);
2192 loads.release ();
2194 else
2196 /* Create a new SLP instance. */
2197 new_instance = XNEW (struct _slp_instance);
2198 SLP_INSTANCE_TREE (new_instance) = node;
2199 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2200 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2201 SLP_INSTANCE_LOADS (new_instance) = loads;
2203 /* Compute the load permutation. */
2204 slp_tree load_node;
2205 bool loads_permuted = false;
2206 FOR_EACH_VEC_ELT (loads, i, load_node)
2208 vec<unsigned> load_permutation;
2209 int j;
2210 gimple *load, *first_stmt;
2211 bool this_load_permuted = false;
2212 load_permutation.create (group_size);
2213 first_stmt = GROUP_FIRST_ELEMENT
2214 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2215 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2217 int load_place = vect_get_place_in_interleaving_chain
2218 (load, first_stmt);
2219 gcc_assert (load_place != -1);
2220 if (load_place != j)
2221 this_load_permuted = true;
2222 load_permutation.safe_push (load_place);
2224 if (!this_load_permuted
2225 /* The load requires permutation when unrolling exposes
2226 a gap either because the group is larger than the SLP
2227 group-size or because there is a gap between the groups. */
2228 && (known_eq (unrolling_factor, 1U)
2229 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2230 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2232 load_permutation.release ();
2233 continue;
2235 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2236 loads_permuted = true;
2239 if (loads_permuted)
2241 if (!vect_supported_load_permutation_p (new_instance))
2243 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Build SLP failed: unsupported load "
2247 "permutation ");
2248 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2249 TDF_SLIM, stmt, 0);
2251 vect_free_slp_instance (new_instance);
2252 return false;
2256 /* If the loads and stores can be handled with load/store-lanes
2257 instructions, do not generate this SLP instance. */
2258 if (is_a <loop_vec_info> (vinfo)
2259 && loads_permuted
2260 && dr && vect_store_lanes_supported (vectype, group_size, false))
2262 slp_tree load_node;
2263 FOR_EACH_VEC_ELT (loads, i, load_node)
2265 gimple *first_stmt = GROUP_FIRST_ELEMENT
2266 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2267 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2268 /* Use SLP for strided accesses (or if we can't
2269 use load-lanes). */
2270 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2271 || ! vect_load_lanes_supported
2272 (STMT_VINFO_VECTYPE (stmt_vinfo),
2273 GROUP_SIZE (stmt_vinfo), false))
2274 break;
2276 if (i == loads.length ())
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "Built SLP cancelled: can use "
2281 "load/store-lanes\n");
2282 vect_free_slp_instance (new_instance);
2283 return false;
2287 vinfo->slp_instances.safe_push (new_instance);
2289 if (dump_enabled_p ())
2291 dump_printf_loc (MSG_NOTE, vect_location,
2292 "Final SLP tree for instance:\n");
2293 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2296 return true;
2299 else
2301 /* Failed to SLP. */
2302 /* Free the allocated memory. */
2303 scalar_stmts.release ();
2304 loads.release ();
2307 /* For basic block SLP, try to break the group up into multiples of the
2308 vector size. */
2309 unsigned HOST_WIDE_INT const_nunits;
2310 if (is_a <bb_vec_info> (vinfo)
2311 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2312 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2313 && nunits.is_constant (&const_nunits))
2315 /* We consider breaking the group only on VF boundaries from the existing
2316 start. */
2317 for (i = 0; i < group_size; i++)
2318 if (!matches[i]) break;
2320 if (i >= const_nunits && i < group_size)
2322 /* Split into two groups at the first vector boundary before i. */
2323 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2324 unsigned group1_size = i & ~(const_nunits - 1);
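/* E.g. (illustrative): with CONST_NUNITS 4 a first mismatch at i == 5
   yields group1_size == 5 & ~3 == 4, splitting right after the first
   full vector.  */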
2326 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2327 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2328 /* If the first non-match was in the middle of a vector,
2329 skip the rest of that vector. */
2330 if (group1_size < i)
2332 i = group1_size + const_nunits;
2333 if (i < group_size)
2334 rest = vect_split_slp_store_group (rest, const_nunits);
2336 if (i < group_size)
2337 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2338 return res;
2340 /* Even though the first vector did not all match, we might be able to SLP
2341 (some of) the remainder. FORNOW ignore this possibility. */
2344 return false;
2348 /* Check if there are stmts in the loop that can be vectorized using SLP.
2349 Build SLP trees of packed scalar stmts if SLP is possible. */
2351 bool
2352 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2354 unsigned int i;
2355 gimple *first_element;
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2360 /* Find SLP sequences starting from groups of grouped stores. */
2361 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2362 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2364 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2366 if (loop_vinfo->reduction_chains.length () > 0)
2368 /* Find SLP sequences starting from reduction chains. */
2369 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2370 if (! vect_analyze_slp_instance (vinfo, first_element,
2371 max_tree_size))
2373 /* Dissolve reduction chain group. */
2374 gimple *next, *stmt = first_element;
2375 while (stmt)
2377 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2378 next = GROUP_NEXT_ELEMENT (vinfo);
2379 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2380 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2381 stmt = next;
2383 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2384 = vect_internal_def;
2388 /* Find SLP sequences starting from groups of reductions. */
2389 if (loop_vinfo->reductions.length () > 1)
2390 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2391 max_tree_size);
2394 return true;
2398 /* For each possible SLP instance decide whether to SLP it and calculate the
2399 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2400 to SLP at least one instance. */
2402 bool
2403 vect_make_slp_decision (loop_vec_info loop_vinfo)
2405 unsigned int i;
2406 poly_uint64 unrolling_factor = 1;
2407 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2408 slp_instance instance;
2409 int decided_to_slp = 0;
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2413 "\n");
2415 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2417 /* FORNOW: SLP if you can. */
2418 /* All unroll factors have the form current_vector_size * X for some
2419 rational X, so they must have a common multiple. */
2420 unrolling_factor
2421 = force_common_multiple (unrolling_factor,
2422 SLP_INSTANCE_UNROLLING_FACTOR (instance));
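/* E.g. (illustrative): instances with unrolling factors 2 and 4 force
   an overall factor of 4, their least common multiple.  */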
2424 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2425 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2426 loop-based vectorization. Such stmts will be marked as HYBRID. */
2427 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2428 decided_to_slp++;
2431 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2433 if (decided_to_slp && dump_enabled_p ())
2435 dump_printf_loc (MSG_NOTE, vect_location,
2436 "Decided to SLP %d instances. Unrolling factor ",
2437 decided_to_slp);
2438 dump_dec (MSG_NOTE, unrolling_factor);
2439 dump_printf (MSG_NOTE, "\n");
2442 return (decided_to_slp > 0);
2446 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2447 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2449 static void
2450 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2452 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2453 imm_use_iterator imm_iter;
2454 gimple *use_stmt;
2455 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2456 slp_tree child;
2457 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2458 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2459 int j;
2461 /* Propagate hybrid down the SLP tree. */
2462 if (stype == hybrid)
2464 else if (HYBRID_SLP_STMT (stmt_vinfo))
2465 stype = hybrid;
2466 else
2468 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2469 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2470 /* If we get a pattern stmt here we have to use the LHS of the
2471 original stmt for immediate uses. */
2472 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2473 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2474 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2475 tree def;
2476 if (gimple_code (stmt) == GIMPLE_PHI)
2477 def = gimple_phi_result (stmt);
2478 else
2479 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2480 if (def)
2481 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2483 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2484 continue;
2485 use_vinfo = vinfo_for_stmt (use_stmt);
2486 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2487 && STMT_VINFO_RELATED_STMT (use_vinfo))
2488 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2489 if (!STMT_SLP_TYPE (use_vinfo)
2490 && (STMT_VINFO_RELEVANT (use_vinfo)
2491 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2492 && !(gimple_code (use_stmt) == GIMPLE_PHI
2493 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2495 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2498 "def in non-SLP stmt: ");
2499 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2501 stype = hybrid;
2506 if (stype == hybrid
2507 && !HYBRID_SLP_STMT (stmt_vinfo))
2509 if (dump_enabled_p ())
2511 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2512 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2514 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2517 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2518 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2519 vect_detect_hybrid_slp_stmts (child, i, stype);
2522 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2524 static tree
2525 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2527 walk_stmt_info *wi = (walk_stmt_info *)data;
2528 struct loop *loopp = (struct loop *)wi->info;
2530 if (wi->is_lhs)
2531 return NULL_TREE;
2533 if (TREE_CODE (*tp) == SSA_NAME
2534 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2536 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2537 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2538 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2540 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2543 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2545 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2549 return NULL_TREE;
2552 static tree
2553 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2554 walk_stmt_info *)
2556 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2557 /* If the stmt is in an SLP instance then this isn't a reason
2558 to mark use definitions in other SLP instances as hybrid. */
2559 if (! STMT_SLP_TYPE (use_vinfo)
2560 && (STMT_VINFO_RELEVANT (use_vinfo)
2561 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2562 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2563 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2565 else
2566 *handled = true;
2567 return NULL_TREE;
2570 /* Find stmts that must be both vectorized and SLPed. */
2572 void
2573 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2575 unsigned int i;
2576 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2577 slp_instance instance;
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2581 "\n");
2583 /* First walk all pattern stmts in the loop and mark the defs of their
2584 uses as hybrid, because immediate uses in pattern stmts are not recorded. */
2585 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2587 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2588 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2589 gsi_next (&gsi))
2591 gimple *stmt = gsi_stmt (gsi);
2592 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2593 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2595 walk_stmt_info wi;
2596 memset (&wi, 0, sizeof (wi));
2597 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2598 gimple_stmt_iterator gsi2
2599 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2600 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2601 vect_detect_hybrid_slp_1, &wi);
2602 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2603 vect_detect_hybrid_slp_2,
2604 vect_detect_hybrid_slp_1, &wi);
2609 /* Then walk the SLP instance trees marking stmts with uses in
2610 non-SLP stmts as hybrid, also propagating hybrid down the
2611 SLP tree, collecting the above info on-the-fly. */
2612 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2614 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2615 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2616 i, pure_slp);
2621 /* Initialize a bb_vec_info struct for the statements between
2622 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2624 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2625 gimple_stmt_iterator region_end_in)
2626 : vec_info (vec_info::bb, init_cost (NULL)),
2627 bb (gsi_bb (region_begin_in)),
2628 region_begin (region_begin_in),
2629 region_end (region_end_in)
2631 gimple_stmt_iterator gsi;
2633 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2634 gsi_next (&gsi))
2636 gimple *stmt = gsi_stmt (gsi);
2637 gimple_set_uid (stmt, 0);
2638 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2641 bb->aux = this;
2645 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2646 stmts in the basic block. */
2648 _bb_vec_info::~_bb_vec_info ()
2650 for (gimple_stmt_iterator si = region_begin;
2651 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2653 gimple *stmt = gsi_stmt (si);
2654 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2656 if (stmt_info)
2657 /* Free stmt_vec_info. */
2658 free_stmt_vec_info (stmt);
2660 /* Reset region marker. */
2661 gimple_set_uid (stmt, -1);
2664 bb->aux = NULL;
2668 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2669 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2671 Return true if the operations are supported. */
2673 static bool
2674 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2675 slp_instance node_instance)
2677 bool dummy;
2678 int i, j;
2679 gimple *stmt;
2680 slp_tree child;
2682 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2683 return true;
2685 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2686 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2687 return false;
2689 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2690 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2691 gcc_assert (stmt_info);
2692 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2694 /* For BB vectorization vector types are assigned here.
2695 Memory accesses already got their vector type assigned
2696 in vect_analyze_data_refs. */
2697 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2698 if (bb_vinfo
2699 && ! STMT_VINFO_DATA_REF (stmt_info))
2701 gcc_assert (PURE_SLP_STMT (stmt_info));
2703 tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2704 if (dump_enabled_p ())
2706 dump_printf_loc (MSG_NOTE, vect_location,
2707 "get vectype for scalar type: ");
2708 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2709 dump_printf (MSG_NOTE, "\n");
2712 tree vectype = get_vectype_for_scalar_type (scalar_type);
2713 if (!vectype)
2715 if (dump_enabled_p ())
2717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2718 "not SLPed: unsupported data-type ");
2719 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2720 scalar_type);
2721 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2723 return false;
2726 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
2729 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2730 dump_printf (MSG_NOTE, "\n");
2733 gimple *sstmt;
2734 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2735 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2738 /* Calculate the number of vector statements to be created for the
2739 scalar stmts in this node. For SLP reductions it is equal to the
2740 number of vector statements in the children (which has already been
2741 calculated by the recursive call). Otherwise it is the number of
2742 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2743 VF divided by the number of elements in a vector. */
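/* A worked example with illustrative numbers: GROUP_SIZE 4, VF 2 and
   4-element vectors give vect_get_num_vectors (2 * 4, vectype) == 2
   vector stmts.  */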
2744 if (GROUP_FIRST_ELEMENT (stmt_info)
2745 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2746 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2747 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2748 else
2750 poly_uint64 vf;
2751 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2752 vf = loop_vinfo->vectorization_factor;
2753 else
2754 vf = 1;
2755 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2756 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2757 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2758 = vect_get_num_vectors (vf * group_size, vectype);
2761 /* Push SLP node def-type to stmt operands. */
2762 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2763 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2765 = SLP_TREE_DEF_TYPE (child);
2766 bool res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2767 /* Restore def-types. */
2768 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2769 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2770 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2771 = vect_internal_def;
2772 if (! res)
2773 return false;
2775 return true;
2779 /* Analyze statements in SLP instances of VINFO. Return true if the
2780 operations are supported. */
2782 bool
2783 vect_slp_analyze_operations (vec_info *vinfo)
2785 slp_instance instance;
2786 int i;
2788 if (dump_enabled_p ())
2789 dump_printf_loc (MSG_NOTE, vect_location,
2790 "=== vect_slp_analyze_operations ===\n");
2792 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2794 if (!vect_slp_analyze_node_operations (vinfo,
2795 SLP_INSTANCE_TREE (instance),
2796 instance))
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "removing SLP instance operations starting from: ");
2800 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2801 SLP_TREE_SCALAR_STMTS
2802 (SLP_INSTANCE_TREE (instance))[0], 0);
2803 vect_free_slp_instance (instance);
2804 vinfo->slp_instances.ordered_remove (i);
2806 else
2808 /* Compute the costs of the SLP instance. */
2809 vect_analyze_slp_cost (instance, vinfo->target_cost_data);
2810 i++;
2814 return !vinfo->slp_instances.is_empty ();
2818 /* Compute the scalar cost of the SLP node NODE and its children
2819 and return it. Do not account for defs that are marked in LIFE, and
2820 update LIFE according to the uses of NODE. */
2822 static unsigned
2823 vect_bb_slp_scalar_cost (basic_block bb,
2824 slp_tree node, vec<bool, va_heap> *life)
2826 unsigned scalar_cost = 0;
2827 unsigned i;
2828 gimple *stmt;
2829 slp_tree child;
2831 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2833 unsigned stmt_cost;
2834 ssa_op_iter op_iter;
2835 def_operand_p def_p;
2836 stmt_vec_info stmt_info;
2838 if ((*life)[i])
2839 continue;
2841 /* If there is a non-vectorized use of the defs then the scalar
2842 stmt is kept live, in which case we do not account for it or any
2843 required defs in the SLP children in the scalar cost. This
2844 way we make the vectorization more costly when compared to
2845 the scalar cost. */
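/* For example (hypothetical): a load whose result also feeds a scalar
   stmt outside the vectorized region stays live, so neither it nor the
   defs it requires are counted as saved by vectorization.  */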
2846 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2848 imm_use_iterator use_iter;
2849 gimple *use_stmt;
2850 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2851 if (!is_gimple_debug (use_stmt)
2852 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2853 use_stmt)
2854 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2856 (*life)[i] = true;
2857 BREAK_FROM_IMM_USE_STMT (use_iter);
2860 if ((*life)[i])
2861 continue;
2863 /* Count scalar stmts only once. */
2864 if (gimple_visited_p (stmt))
2865 continue;
2866 gimple_set_visited (stmt, true);
2868 stmt_info = vinfo_for_stmt (stmt);
2869 if (STMT_VINFO_DATA_REF (stmt_info))
2871 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2872 stmt_cost = vect_get_stmt_cost (scalar_load);
2873 else
2874 stmt_cost = vect_get_stmt_cost (scalar_store);
2876 else
2877 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2879 scalar_cost += stmt_cost;
2882 auto_vec<bool, 20> subtree_life;
2883 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2885 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2887 /* Do not directly pass LIFE to the recursive call, copy it to
2888 confine changes in the callee to the current child/subtree. */
2889 subtree_life.safe_splice (*life);
2890 scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2891 subtree_life.truncate (0);
2895 return scalar_cost;
2898 /* Check if vectorization of the basic block is profitable. */
2900 static bool
2901 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2903 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2904 slp_instance instance;
2905 int i;
2906 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2907 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2909 /* Calculate scalar cost. */
2910 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2912 auto_vec<bool, 20> life;
2913 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2914 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2915 SLP_INSTANCE_TREE (instance),
2916 &life);
2919 /* Unset visited flag. */
2920 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2921 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2922 gimple_set_visited (gsi_stmt (gsi), false);
2924 /* Complete the target-specific cost calculation. */
2925 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2926 &vec_inside_cost, &vec_epilogue_cost);
2928 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2930 if (dump_enabled_p ())
2932 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2933 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2934 vec_inside_cost);
2935 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2936 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2937 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2940 /* Vectorization is profitable if its cost is no more than the cost of the
2941 scalar version. Note that we err on the vector side for equal cost because
2942 the cost estimate is otherwise quite pessimistic (constant uses are
2943 free on the scalar side but cost a load on the vector side, for
2944 example). */
2945 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2946 return false;
2948 return true;
2951 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2952 if so and sets fatal to true if failure is independent of
2953 current_vector_size. */
2955 static bb_vec_info
2956 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2957 gimple_stmt_iterator region_end,
2958 vec<data_reference_p> datarefs, int n_stmts,
2959 bool &fatal)
2961 bb_vec_info bb_vinfo;
2962 slp_instance instance;
2963 int i;
2964 poly_uint64 min_vf = 2;
2966 /* The first group of checks is independent of the vector size. */
2967 fatal = true;
2969 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2971 if (dump_enabled_p ())
2972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2973 "not vectorized: too many instructions in "
2974 "basic block.\n");
2975 free_data_refs (datarefs);
2976 return NULL;
2979 bb_vinfo = new _bb_vec_info (region_begin, region_end);
2980 if (!bb_vinfo)
2981 return NULL;
2983 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2985 /* Analyze the data references. */
2987 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "not vectorized: unhandled data-ref in basic "
2992 "block.\n");
2994 delete bb_vinfo;
2995 return NULL;
2998 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "not vectorized: not enough data-refs in "
3003 "basic block.\n");
3005 delete bb_vinfo;
3006 return NULL;
3009 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3011 if (dump_enabled_p ())
3012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3013 "not vectorized: unhandled data access in "
3014 "basic block.\n");
3016 delete bb_vinfo;
3017 return NULL;
3020 /* If there are no grouped stores in the region there is no need
3021 to continue with pattern recog as vect_analyze_slp will fail
3022 anyway. */
3023 if (bb_vinfo->grouped_stores.is_empty ())
3025 if (dump_enabled_p ())
3026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3027 "not vectorized: no grouped stores in "
3028 "basic block.\n");
3030 delete bb_vinfo;
3031 return NULL;
3034 /* The rest of the analysis below depends on the vector size in some way. */
3035 fatal = false;
3037 vect_pattern_recog (bb_vinfo);
3039 /* Check the SLP opportunities in the basic block, analyze and build SLP
3040 trees. */
3041 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3043 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 "Failed to SLP the basic block.\n");
3047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3048 "not vectorized: failed to find SLP opportunities "
3049 "in basic block.\n");
3052 delete bb_vinfo;
3053 return NULL;
3056 vect_record_base_alignments (bb_vinfo);
3058 /* Analyze and verify the alignment of data references and the
3059 dependence in the SLP instances. */
3060 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3062 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3063 || ! vect_slp_analyze_instance_dependence (instance))
3065 dump_printf_loc (MSG_NOTE, vect_location,
3066 "removing SLP instance operations starting from: ");
3067 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3068 SLP_TREE_SCALAR_STMTS
3069 (SLP_INSTANCE_TREE (instance))[0], 0);
3070 vect_free_slp_instance (instance);
3071 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3072 continue;
3075 /* Mark all the statements that we want to vectorize as pure SLP and
3076 relevant. */
3077 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3078 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3080 i++;
3082 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3084 delete bb_vinfo;
3085 return NULL;
3088 if (!vect_slp_analyze_operations (bb_vinfo))
3090 if (dump_enabled_p ())
3091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3092 "not vectorized: bad operation in basic block.\n");
3094 delete bb_vinfo;
3095 return NULL;
3098 /* Cost model: check if the vectorization is worthwhile. */
3099 if (!unlimited_cost_model (NULL)
3100 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3102 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3104 "not vectorized: vectorization is not "
3105 "profitable.\n");
3107 delete bb_vinfo;
3108 return NULL;
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "Basic block will be vectorized using SLP\n");
3115 return bb_vinfo;
3119 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3120 true if anything in the basic-block was vectorized. */
3122 bool
3123 vect_slp_bb (basic_block bb)
3125 bb_vec_info bb_vinfo;
3126 gimple_stmt_iterator gsi;
3127 bool any_vectorized = false;
3128 auto_vector_sizes vector_sizes;
3130 if (dump_enabled_p ())
3131 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3133 /* Autodetect first vector size we try. */
3134 current_vector_size = 0;
3135 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3136 unsigned int next_size = 0;
3138 gsi = gsi_start_bb (bb);
3140 poly_uint64 autodetected_vector_size = 0;
3141 while (1)
3143 if (gsi_end_p (gsi))
3144 break;
3146 gimple_stmt_iterator region_begin = gsi;
3147 vec<data_reference_p> datarefs = vNULL;
3148 int insns = 0;
3150 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3152 gimple *stmt = gsi_stmt (gsi);
3153 if (is_gimple_debug (stmt))
3154 continue;
3155 insns++;
3157 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3158 vect_location = gimple_location (stmt);
3160 if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3161 break;
3164 /* Skip leading unhandled stmts. */
3165 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3167 gsi_next (&gsi);
3168 continue;
3171 gimple_stmt_iterator region_end = gsi;
3173 bool vectorized = false;
3174 bool fatal = false;
3175 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3176 datarefs, insns, fatal);
3177 if (bb_vinfo
3178 && dbg_cnt (vect_slp))
3180 if (dump_enabled_p ())
3181 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3183 vect_schedule_slp (bb_vinfo);
3185 if (dump_enabled_p ())
3186 dump_printf_loc (MSG_NOTE, vect_location,
3187 "basic block part vectorized\n");
3189 vectorized = true;
3191 delete bb_vinfo;
3193 any_vectorized |= vectorized;
3195 if (next_size == 0)
3196 autodetected_vector_size = current_vector_size;
3198 if (next_size < vector_sizes.length ()
3199 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3200 next_size += 1;
3202 if (vectorized
3203 || next_size == vector_sizes.length ()
3204 || known_eq (current_vector_size, 0U)
3205 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3206 vector sizes will fail, do not bother iterating. */
3207 || fatal)
3209 if (gsi_end_p (region_end))
3210 break;
3212 /* Skip the unhandled stmt. */
3213 gsi_next (&gsi);
3215 /* And reset vector sizes. */
3216 current_vector_size = 0;
3217 next_size = 0;
3219 else
3221 /* Try the next biggest vector size. */
3222 current_vector_size = vector_sizes[next_size++];
3223 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_NOTE, vect_location,
3226 "***** Re-trying analysis with "
3227 "vector size ");
3228 dump_dec (MSG_NOTE, current_vector_size);
3229 dump_printf (MSG_NOTE, "\n");
3232 /* Start over. */
3233 gsi = region_begin;
3237 return any_vectorized;
3241 /* Return true if the vector type of the boolean constant which is the
3242 OPNUM operand in statement STMT is a boolean vector. */
3244 static bool
3245 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3247 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3248 enum tree_code code = gimple_expr_code (stmt);
3249 tree op, vectype;
3250 gimple *def_stmt;
3251 enum vect_def_type dt;
3253 /* For comparisons and COND_EXPRs the type is chosen depending
3254 on the other comparison operand. */
3255 if (TREE_CODE_CLASS (code) == tcc_comparison)
3257 if (opnum)
3258 op = gimple_assign_rhs1 (stmt);
3259 else
3260 op = gimple_assign_rhs2 (stmt);
3262 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3263 &dt, &vectype))
3264 gcc_unreachable ();
3266 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3269 if (code == COND_EXPR)
3271 tree cond = gimple_assign_rhs1 (stmt);
3273 if (TREE_CODE (cond) == SSA_NAME)
3274 op = cond;
3275 else if (opnum)
3276 op = TREE_OPERAND (cond, 1);
3277 else
3278 op = TREE_OPERAND (cond, 0);
3280 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3281 &dt, &vectype))
3282 gcc_unreachable ();
3284 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3287 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3290 /* Build a variable-length vector in which the elements in ELTS are repeated
3291 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3292 RESULTS and add any new instructions to SEQ.
3294 The approach we use is:
3296 (1) Find a vector mode VM with integer elements of mode IM.
3298 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3299 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3300 from small vectors to IM.
3302 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3304 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3305 correct byte contents.
3307 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3309 We try to find the largest IM for which this sequence works, in order
3310 to cut down on the number of interleaves. */
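/* An illustrative example with hypothetical modes: to repeat 4 HImode
   elements { a, b, c, d }, can_duplicate_and_interleave_p might pick
   IM == SImode, giving NVECTORS == 2. Step (2) packs { a, b } and
   { c, d } into one SImode element each, step (3) duplicates each
   across a whole VM vector, and step (4) interleaves the two so that
   the result reads { a, b, c, d, a, b, c, d, ... } when viewed as
   HImode again in step (5).  */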
3312 void
3313 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3314 unsigned int nresults, vec<tree> &results)
3316 unsigned int nelts = elts.length ();
3317 tree element_type = TREE_TYPE (vector_type);
3319 /* (1) Find a vector mode VM with integer elements of mode IM. */
3320 unsigned int nvectors = 1;
3321 tree new_vector_type;
3322 tree permutes[2];
3323 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3324 &nvectors, &new_vector_type,
3325 permutes))
3326 gcc_unreachable ();
3328 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3329 unsigned int partial_nelts = nelts / nvectors;
3330 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3332 tree_vector_builder partial_elts;
3333 auto_vec<tree, 32> pieces (nvectors * 2);
3334 pieces.quick_grow (nvectors * 2);
3335 for (unsigned int i = 0; i < nvectors; ++i)
3337 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3338 ELTS' has mode IM. */
3339 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3340 for (unsigned int j = 0; j < partial_nelts; ++j)
3341 partial_elts.quick_push (elts[i * partial_nelts + j]);
3342 tree t = gimple_build_vector (seq, &partial_elts);
3343 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3344 TREE_TYPE (new_vector_type), t);
3346 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3347 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3350 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3351 correct byte contents.
3353 We need to repeat the following operation log2(nvectors) times:
3355 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3356 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3358 However, if each input repeats every N elements and the VF is
3359 a multiple of N * 2, the HI result is the same as the LO. */
3360 unsigned int in_start = 0;
3361 unsigned int out_start = nvectors;
3362 unsigned int hi_start = nvectors / 2;
3363 /* A bound on the number of outputs needed to produce NRESULTS results
3364 in the final iteration. */
3365 unsigned int noutputs_bound = nvectors * nresults;
3366 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3368 noutputs_bound /= 2;
3369 unsigned int limit = MIN (noutputs_bound, nvectors);
3370 for (unsigned int i = 0; i < limit; ++i)
3372 if ((i & 1) != 0
3373 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3374 2 * in_repeat))
3376 pieces[out_start + i] = pieces[out_start + i - 1];
3377 continue;
3380 tree output = make_ssa_name (new_vector_type);
3381 tree input1 = pieces[in_start + (i / 2)];
3382 tree input2 = pieces[in_start + (i / 2) + hi_start];
3383 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3384 input1, input2,
3385 permutes[i & 1]);
3386 gimple_seq_add_stmt (seq, stmt);
3387 pieces[out_start + i] = output;
3389 std::swap (in_start, out_start);
3392 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3393 results.reserve (nresults);
3394 for (unsigned int i = 0; i < nresults; ++i)
3395 if (i < nvectors)
3396 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3397 pieces[in_start + i]));
3398 else
3399 results.quick_push (results[i - nvectors]);
3403 /* For constant and loop invariant defs of SLP_NODE this function returns
3404 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3405 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3406 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3410 static void
3411 vect_get_constant_vectors (tree op, slp_tree slp_node,
3412 vec<tree> *vec_oprnds,
3413 unsigned int op_num, unsigned int number_of_vectors)
3415 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3416 gimple *stmt = stmts[0];
3417 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3418 unsigned HOST_WIDE_INT nunits;
3419 tree vec_cst;
3420 unsigned j, number_of_places_left_in_vector;
3421 tree vector_type;
3422 tree vop;
3423 int group_size = stmts.length ();
3424 unsigned int vec_num, i;
3425 unsigned number_of_copies = 1;
3426 vec<tree> voprnds;
3427 voprnds.create (number_of_vectors);
3428 bool constant_p, is_store;
3429 tree neutral_op = NULL;
3430 enum tree_code code = gimple_expr_code (stmt);
3431 gimple_seq ctor_seq = NULL;
3432 auto_vec<tree, 16> permute_results;
3434 /* Check if vector type is a boolean vector. */
3435 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3436 && vect_mask_constant_operand_p (stmt, op_num))
3437 vector_type
3438 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3439 else
3440 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3442 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3444 is_store = true;
3445 op = gimple_assign_rhs1 (stmt);
3447 else
3448 is_store = false;
3450 gcc_assert (op);
3452 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3453 created vectors. It is greater than 1 if unrolling is performed.
3455 For example, we have two scalar operands, s1 and s2 (e.g., group of
3456 strided accesses of size two), while NUNITS is four (i.e., four scalars
3457 of this type can be packed in a vector). The output vector will contain
3458 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3459 will be 2).
3461 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3462 containing the operands.
3464 For example, NUNITS is four as before, and the group size is 8
3465 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3466 {s5, s6, s7, s8}. */
3468 /* When using duplicate_and_interleave, we just need one element for
3469 each scalar statement. */
3470 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3471 nunits = group_size;
3473 number_of_copies = nunits * number_of_vectors / group_size;
3475 number_of_places_left_in_vector = nunits;
3476 constant_p = true;
3477 tree_vector_builder elts (vector_type, nunits, 1);
3478 elts.quick_grow (nunits);
3479 bool place_after_defs = false;
3480 for (j = 0; j < number_of_copies; j++)
3482 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3484 if (is_store)
3485 op = gimple_assign_rhs1 (stmt);
3486 else
3488 switch (code)
3490 case COND_EXPR:
3492 tree cond = gimple_assign_rhs1 (stmt);
3493 if (TREE_CODE (cond) == SSA_NAME)
3494 op = gimple_op (stmt, op_num + 1);
3495 else if (op_num == 0 || op_num == 1)
3496 op = TREE_OPERAND (cond, op_num);
3497 else
3499 if (op_num == 2)
3500 op = gimple_assign_rhs2 (stmt);
3501 else
3502 op = gimple_assign_rhs3 (stmt);
3505 break;
3507 case CALL_EXPR:
3508 op = gimple_call_arg (stmt, op_num);
3509 break;
3511 case LSHIFT_EXPR:
3512 case RSHIFT_EXPR:
3513 case LROTATE_EXPR:
3514 case RROTATE_EXPR:
3515 op = gimple_op (stmt, op_num + 1);
3516 /* Unlike the other binary operators, shifts/rotates have
3517 the shift count being int, instead of the same type as
3518 the lhs, so make sure the scalar is the right type if
3519 we are dealing with vectors of
3520 long long/long/short/char. */
3521 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3522 op = fold_convert (TREE_TYPE (vector_type), op);
3523 break;
3525 default:
3526 op = gimple_op (stmt, op_num + 1);
3527 break;
3531 /* Create 'vect_ = {op0,op1,...,opn}'. */
3532 number_of_places_left_in_vector--;
3533 tree orig_op = op;
3534 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3536 if (CONSTANT_CLASS_P (op))
3538 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3540 /* Can't use VIEW_CONVERT_EXPR for booleans because
3541 of possibly different sizes of scalar value and
3542 vector element. */
3543 if (integer_zerop (op))
3544 op = build_int_cst (TREE_TYPE (vector_type), 0);
3545 else if (integer_onep (op))
3546 op = build_all_ones_cst (TREE_TYPE (vector_type));
3547 else
3548 gcc_unreachable ();
3550 else
3551 op = fold_unary (VIEW_CONVERT_EXPR,
3552 TREE_TYPE (vector_type), op);
3553 gcc_assert (op && CONSTANT_CLASS_P (op));
3555 else
3557 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3558 gimple *init_stmt;
3559 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3561 tree true_val
3562 = build_all_ones_cst (TREE_TYPE (vector_type));
3563 tree false_val
3564 = build_zero_cst (TREE_TYPE (vector_type));
3565 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3566 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3567 op, true_val,
3568 false_val);
3570 else
3572 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3573 op);
3574 init_stmt
3575 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3576 op);
3578 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3579 op = new_temp;
3582 elts[number_of_places_left_in_vector] = op;
3583 if (!CONSTANT_CLASS_P (op))
3584 constant_p = false;
3585 if (TREE_CODE (orig_op) == SSA_NAME
3586 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3587 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3588 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3589 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3590 place_after_defs = true;
3592 if (number_of_places_left_in_vector == 0)
3594 if (constant_p
3595 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3596 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3597 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3598 else
3600 if (vec_oprnds->is_empty ())
3601 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3602 number_of_vectors,
3603 permute_results);
3604 vec_cst = permute_results[number_of_vectors - j - 1];
3606 tree init;
3607 gimple_stmt_iterator gsi;
3608 if (place_after_defs)
3610 gsi = gsi_for_stmt
3611 (vect_find_last_scalar_stmt_in_slp (slp_node));
3612 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3614 else
3615 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3616 if (ctor_seq != NULL)
3618 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3619 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3620 ctor_seq = NULL;
3622 voprnds.quick_push (init);
3623 place_after_defs = false;
3624 number_of_places_left_in_vector = nunits;
3625 constant_p = true;
3626 elts.new_vector (vector_type, nunits, 1);
3627 elts.quick_grow (nunits);
3632 /* Since the vectors are created in reverse order, we should invert
3633 them. */
3634 vec_num = voprnds.length ();
3635 for (j = vec_num; j != 0; j--)
3637 vop = voprnds[j - 1];
3638 vec_oprnds->quick_push (vop);
3641 voprnds.release ();
3643 /* If VF is greater than the unrolling factor needed for the SLP
3644 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3645 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3646 to replicate the vectors. */
3647 while (number_of_vectors > vec_oprnds->length ())
3649 tree neutral_vec = NULL;
3651 if (neutral_op)
3653 if (!neutral_vec)
3654 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3656 vec_oprnds->quick_push (neutral_vec);
3658 else
3660 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3661 vec_oprnds->quick_push (vop);
3667 /* Get vectorized definitions from SLP_NODE that contains corresponding
3668 vectorized def-stmts. */
3670 static void
3671 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3673 tree vec_oprnd;
3674 gimple *vec_def_stmt;
3675 unsigned int i;
3677 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3679 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3681 gcc_assert (vec_def_stmt);
3682 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3683 vec_oprnd = gimple_phi_result (vec_def_stmt);
3684 else
3685 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3686 vec_oprnds->quick_push (vec_oprnd);
3691 /* Get vectorized definitions for SLP_NODE.
3692 If the scalar definitions are loop invariants or constants, collect them and
3693 call vect_get_constant_vectors() to create vector stmts.
3694 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3695 must be stored in the corresponding child of SLP_NODE, and we call
3696 vect_get_slp_vect_defs () to retrieve them. */
3698 void
3699 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3700 vec<vec<tree> > *vec_oprnds)
3702 gimple *first_stmt;
3703 int number_of_vects = 0, i;
3704 unsigned int child_index = 0;
3705 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3706 slp_tree child = NULL;
3707 vec<tree> vec_defs;
3708 tree oprnd;
3709 bool vectorized_defs;
3711 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3712 FOR_EACH_VEC_ELT (ops, i, oprnd)
3714 /* For each operand we check if it has vectorized definitions in a child
3715 node or we need to create them (for invariants and constants). We
3716 check if the LHS of the first stmt of the next child matches OPRND.
3717 If it does, we found the correct child. Otherwise, we call
3718 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
3719 to check this child node for the next operand. */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
            {
              gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
              gimple *related
                = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
              tree first_def_op;

              if (gimple_code (first_def) == GIMPLE_PHI)
                first_def_op = gimple_phi_result (first_def);
              else
                first_def_op = gimple_get_lhs (first_def);
              if (operand_equal_p (oprnd, first_def_op, 0)
                  || (related
                      && operand_equal_p (oprnd,
                                          gimple_get_lhs (related), 0)))
                {
                  /* The number of vector defs is determined by the number of
                     vector statements in the node from which we get those
                     statements.  */
                  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
                  vectorized_defs = true;
                  child_index++;
                }
            }
          else
            child_index++;
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* The number of vector stmts was calculated according to the
                 LHS in vect_schedule_slp_instance (); fix it by replacing
                 the LHS with the RHS, if necessary.  See
                 vect_get_smallest_scalar_type () for details.  */
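              /* E.g., if the RHS elements are twice the size of the LHS
                 elements (8-byte vs. 4-byte units), twice as many vectors
                 are needed to hold the operand's scalars.  */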
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects);

      vec_oprnds->quick_push (vec_defs);
    }
}


/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              slp_instance slp_node_instance, bool analyze_only,
                              unsigned *n_perms)
{
  gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  unsigned int mask_element;
  machine_mode mode;
  unsigned HOST_WIDE_INT nunits, const_vf;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));

  mode = TYPE_MODE (vectype);

  /* At the moment, all permutations are represented using per-element
     indices, so we can't cope with variable vector lengths or
     vectorization factors.  */
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
      || !vf.is_constant (&const_vf))
    return false;

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
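  /* E.g., when permuting a vector of 32-bit floats, the mask elements are
     same-sized (32-bit) integers.  */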
  mask_element_type = lang_hooks.types.type_for_mode
    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  vec_perm_builder mask (nunits, nunits, 1);
  mask.quick_grow (nunits);
  vec_perm_indices indices;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
         i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE.  The number of masks for
     each NODE equals GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for a permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */
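  /* Concretely, in the loop below the scalar index for group position K in
     unrolled copy J is
       i = SLP_TREE_LOAD_PERMUTATION (node)[k]
           + j * STMT_VINFO_GROUP_SIZE (stmt_info);
     input vector i / nunits supplies the element, i % nunits selects it,
     and elements taken from the second input are offset by nunits.  */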

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  for (unsigned int j = 0; j < const_vf; j++)
    {
      for (int k = 0; k < group_size; k++)
        {
          unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
                            + j * STMT_VINFO_GROUP_SIZE (stmt_info));
          vec_index = i / nunits;
          mask_element = i % nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
              mask_element += nunits;
            }
          else
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "permutation requires at "
                                   "least three vectors ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    stmt, 0);
                }
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * nunits);
          if (mask_element != index)
            noop_p = false;
          mask[index++] = mask_element;

          if (index == nunits && !noop_p)
            {
              indices.new_vector (mask, 2, nunits);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        {
                          dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                        }
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  gcc_assert (analyze_only);
                  return false;
                }

              ++*n_perms;
            }

          if (index == nunits)
            {
              if (!analyze_only)
                {
                  tree mask_vec = NULL_TREE;

                  if (! noop_p)
                    mask_vec = vec_perm_indices_to_tree (mask_type, indices);

                  if (second_vec_index == -1)
                    second_vec_index = first_vec_index;

                  /* Generate the permute statement if necessary.  */
                  tree first_vec = dr_chain[first_vec_index];
                  tree second_vec = dr_chain[second_vec_index];
                  gimple *perm_stmt;
                  if (! noop_p)
                    {
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      perm_stmt = gimple_build_assign (perm_dest,
                                                       VEC_PERM_EXPR,
                                                       first_vec, second_vec,
                                                       mask_vec);
                      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
                    }
                  else
                    /* If mask was NULL_TREE generate the requested
                       identity transform.  */
                    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
                }

              index = 0;
              first_vec_index = -1;
              second_vec_index = -1;
              noop_p = true;
            }
        }
    }

  return true;
}
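
/* Maps the scalar stmts of an already-vectorized SLP node to that node,
   so that vect_schedule_slp_instance can reuse the vectorized stmts of
   nodes with the same scalar stmts instead of re-vectorizing them.  */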
typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            scalar_stmts_to_slp_tree_map_t *bst_map)
{
  gimple *stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return false;

  /* See if we have already vectorized the same set of stmts and reuse their
     vectorized stmts.  */
  slp_tree &leader
    = bst_map->get_or_insert (SLP_TREE_SCALAR_STMTS (node).copy ());
  if (leader)
    {
      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (leader));
      return false;
    }

  leader = node;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, bst_map);

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
    }

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      gimple *ostmt;
      vec_perm_builder mask (group_size, group_size, 1);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
        if (gimple_assign_rhs_code (ostmt) != code0)
          {
            mask.quick_push (1);
            ocode = gimple_assign_rhs_code (ostmt);
          }
        else
          mask.quick_push (0);
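
      /* MASK now holds, for each group position, 1 if the scalar stmt uses
         the second operation (OCODE) and 0 if it uses CODE0; below it
         selects whether each element is taken from the OCODE or the CODE0
         vectorized copy.  */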
      if (ocode != ERROR_MARK)
        {
          vec<gimple *> v0;
          vec<gimple *> v1;
          unsigned j;
          tree tmask = NULL_TREE;
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
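          /* V0 and V1 now hold the vectorized stmts of the whole group for
             CODE0 and OCODE respectively; build a constant permute mask
             that picks each element from V0 or V1 according to MASK.  */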
          tree meltype = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
          tree mvectype = get_same_sized_vectype (meltype, vectype);
          unsigned k = 0, l;
          for (j = 0; j < v0.length (); ++j)
            {
              /* Enforced by vect_build_slp_tree, which rejects variable-length
                 vectors for SLP_TREE_TWO_OPERATORS.  */
              unsigned int const_nunits = nunits.to_constant ();
              tree_vector_builder melts (mvectype, const_nunits, 1);
              for (l = 0; l < const_nunits; ++l)
                {
                  if (k >= group_size)
                    k = 0;
                  tree t = build_int_cst (meltype,
                                          mask[k++] * const_nunits + l);
                  melts.quick_push (t);
                }
              tmask = melts.build ();

              /* ??? Not all targets support a VEC_PERM_EXPR with a
                 constant mask that would translate to a vec_merge RTX
                 (with their vec_perm_const_ok).  We can either not
                 vectorize in that case or let veclower do its job.
                 Unfortunately that isn't too great and at least for
                 plus/minus we'd eventually like to match targets
                 vector addsub instructions.  */
              gimple *vstmt;
              vstmt = gimple_build_assign (make_ssa_name (vectype),
                                           VEC_PERM_EXPR,
                                           gimple_assign_lhs (v0[j]),
                                           gimple_assign_lhs (v1[j]), tmask);
              vect_finish_stmt_generation (stmt, vstmt, &si);
              SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
            }
          v0.release ();
          v1.release ();
          return false;
        }
    }
  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;

  return is_store;
}


/* Replace scalar calls from SLP node NODE with an assignment of zero to
   their lhs.  For loop vectorization this is done in vectorizable_call,
   but for SLP it needs to be deferred until the end of vect_schedule_slp,
   because multiple SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple *stmt, *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
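      /* Turn the call into LHS = 0.  The scalar result now comes from the
         vectorized code, but the definition has to remain for any leftover
         scalar uses until DCE removes it.  */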
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}


/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      scalar_stmts_to_slp_tree_map_t *bst_map
        = new scalar_stmts_to_slp_tree_map_t ();
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, bst_map);
      delete bst_map;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple *store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ??? Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ??? For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (root);
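
      /* The first GROUP_SIZE scalar stmts of ROOT are the group's scalar
         stores, now replaced by the vector stores; remove them.  The loop
         stops at the first stmt without a data reference, i.e., when the
         root is not a store group.  */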
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}