[AArch64] Fix -mlow-precision-div (PR 86838)
[official-gcc.git] / gcc / tree-vect-slp.c
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
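/* For example, in an interleaving chain s0 -> s1 -> s2 in which each later
   element has a DR_GROUP_GAP of 1, the places returned for s0, s1 and s2
   are 0, 1 and 2: the result accumulates the gaps of all elements that
   precede STMT_INFO in the chain.  */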
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
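/* A worked example: with a 16-byte vector size, duplicating COUNT == 2
   SImode elements gives ELT_BYTES == 8, so each pair of elements is
   treated as one DImode scalar and VECTOR_TYPE is V2DI.  INDICES1 then
   selects { 0, 2 } (the low halves of the two input vectors) and INDICES2
   selects { 1, 3 } (the high halves): the usual interleave-low and
   interleave-high permutes.  */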
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to a non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent of this node, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
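          /* Row 0 is the identity; row 1 (*SWAP == 1) exchanges the two
             operands of the comparison; row 2 (*SWAP == 2) exchanges the
             then and else arms (gimple operands 2 and 3).  */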
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !current_vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: invalid type of def "
                                   "for variable-length SLP ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }
  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: cannot swap operands of "
                               "shared stmt ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                stmt_info->stmt, 0);
            }
          return -1;
        }

      if (first_op_cond)
        {
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          unsigned int op = commutative_op + first_op_idx;
          swap_ssa_operands (stmt_info->stmt,
                             gimple_op_ptr (stmt_info->stmt, op),
                             gimple_op_ptr (stmt_info->stmt, op + 1));
        }
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "swapped operands to match def types in ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
        }
    }

  *swap = swapped;
  return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}
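/* For example, two calls to the same internal function such as IFN_FMA
   with compatible result and argument types can be grouped, as can two
   calls to the same fndecl with identical function types; a sinf call and
   a cosf call cannot, since their combined_fn codes differ.  */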
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type in ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                            stmt_info->stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: unrolling required "
                       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
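/* For instance, a basic-block group of four HImode stores cannot be
   vectorized with V8HI: filling all eight lanes would require unrolling,
   which only loop vectorization can do, so const_nunits (8) > group_size
   (4) is rejected above.  */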
/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
/* Verify whether the scalar stmts STMTS are isomorphic, whether they
   require data permutation, and whether they use supported operation
   types.  Return true if they can form an SLP node, otherwise return
   false and indicate in *MATCHES which stmts are not isomorphic to the
   first one.  If MATCHES[0] is false then this indicates that the
   comparison could not be carried out or the stmts will never be
   vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set
   SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the
   code of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can
   be swapped to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if ((gimple_call_internal_p (call_stmt)
               && (!vectorizable_internal_fn_p
                   (gimple_call_internal_fn (call_stmt))))
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt_info->stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i]->stmt, 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt_info->stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
static scalar_stmts_set_t *bst_fail;

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits, vec<slp_tree> *loads,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
                                        loads, matches, npermutes, tree_size,
                                        max_tree_size);
  /* When the SLP build fails for STMTS record this, otherwise the SLP
     build can be exponential in time when we allow constructing parts
     from scalars, see PR81723.  */
  if (! res)
    {
      vec <stmt_vec_info> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return the newly built SLP node.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (REDUC_GROUP_FIRST_ELEMENT (other_info)
                  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
                continue;
              if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
                return NULL;
            }
        }
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt_info = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child, false);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        &this_loads, matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_loads.truncate (old_nloads);
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Do so only if the number of not successful permutes was no more
             than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                  /* Verify if we can safely swap or if we committed to a
                     specific operand order already.
                     ??? Instead of modifying GIMPLE stmts here we could
                     record whether we want to swap operands in the SLP
                     node and temporarily do that when processing it
                     (or wrap operand accessors in a helper).  */
                  else if (swap[j] != 0
                           || STMT_VINFO_NUM_SLP_USES (stmt_info))
                    {
                      if (!swap_not_matching)
                        {
                          if (dump_enabled_p ())
                            {
                              dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                               vect_location,
                                               "Build SLP failed: cannot swap "
                                               "operands of shared stmt ");
                              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
                                                TDF_SLIM, stmts[j]->stmt, 0);
                            }
                          goto fail;
                        }
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            &this_loads, tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
              for (j = 0; j < group_size; ++j)
                if (matches[j] == !swap_not_matching)
                  {
                    gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
                    /* Avoid swapping operands twice.  */
                    if (gimple_plf (stmt, GF_PLF_1))
                      continue;
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                    gimple_set_plf (stmt, GF_PLF_1, true);
                  }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
                              == (matches[j] == !swap_not_matching));

              /* If we have all children of child built up from scalars then
                 just throw that away and build it up this node from scalars.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_loads.truncate (old_nloads);
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "Building parent vector operands from "
                                       "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt_info->stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (stmt_info) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  stmt_vec_info stmt_info;
  vec<stmt_vec_info> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    tmp_stmts[permutation[i]] = stmt_info;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
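/* Note the direction of the permutation: stmt I moves to slot
   PERMUTATION[I].  E.g. with PERMUTATION = { 1, 2, 0 }, old stmt 0 ends up
   in slot 1, stmt 1 in slot 2 and stmt 2 in slot 0.  */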
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
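/* For example, if every load node of the instance carries the permutation
   { 2, 3, 0, 1 }, each index is used exactly once, so the checks above
   pass; the whole instance is then rearranged by that permutation and the
   loads can be emitted in plain memory order.  */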
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
              group_info = DR_GROUP_FIRST_ELEMENT (group_info);
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of DR_GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "BB vectorization with gaps at the end of "
                                   "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
                             LOOP_VINFO_VECT_FACTOR
                             (STMT_VINFO_LOOP_VINFO (stmt_info)));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
                                          slp_instn, true, &n_perms))
      return false;

  return true;
}
/* Find the last scalar stmt in the SLP tree NODE; for the root of an
   instance of grouped stores this is the last store.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
                     group1_size, group2_size);

  return group2;
}
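/* For example, splitting a group of eight consecutive stores with
   GROUP1_SIZE == 4 yields two groups of four.  The second group's
   DR_GROUP_GAP grows by 4 so that unrolled copies of it skip over the
   first group's elements, and the first group's gap grows by 4 likewise.  */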
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
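/* E.g. with NUNITS == 4 and GROUP_SIZE == 6 the least common multiple is
   12, so the instance must be unrolled twice to fill whole vectors, while
   GROUP_SIZE == 8 gives a factor of 1 (a single group already fills two
   vectors exactly).  */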
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
                           stmt_vec_info stmt_info, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size;
  tree vectype, scalar_type = NULL_TREE;
  unsigned int i;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  vec<stmt_vec_info> scalar_stmts;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      scalar_type = TREE_TYPE (DR_REF (dr));
      vectype = get_vectype_for_scalar_type (scalar_type);
      group_size = DR_GROUP_SIZE (stmt_info);
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = REDUC_GROUP_SIZE (stmt_info);
    }
  else
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  stmt_vec_info next_info = stmt_info;
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
        {
          scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
          next_info = DR_GROUP_NEXT_ELEMENT (next_info);
        }
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Collect the reduction stmts and store them in
         SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
        {
          scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
          next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next_info); i++)
        scalar_stmts.safe_push (next_info);
    }

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  bst_fail = new scalar_stmts_set_t ();
  poly_uint64 max_nunits = nunits;
  node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
                              &max_nunits, &loads, matches, &npermutes,
                              NULL, max_tree_size);
  delete bst_fail;
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
        = calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
          && is_a <bb_vec_info> (vinfo))
        {
          unsigned HOST_WIDE_INT const_max_nunits;
          if (!max_nunits.is_constant (&const_max_nunits)
              || const_max_nunits > group_size)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: store group "
                                 "size not a multiple of the vector size "
                                 "in basic block SLP\n");
              vect_free_slp_tree (node, false);
              loads.release ();
              return false;
            }
          /* Fatal mismatch.  */
          matches[group_size / const_max_nunits * const_max_nunits] = false;
1984 vect_free_slp_tree (node, false);
1985 loads.release ();
1987 else
1989 /* Create a new SLP instance. */
1990 new_instance = XNEW (struct _slp_instance);
1991 SLP_INSTANCE_TREE (new_instance) = node;
1992 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1993 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1994 SLP_INSTANCE_LOADS (new_instance) = loads;
1996 /* Compute the load permutation. */
1997 slp_tree load_node;
1998 bool loads_permuted = false;
1999 FOR_EACH_VEC_ELT (loads, i, load_node)
2001 vec<unsigned> load_permutation;
2002 int j;
2003 stmt_vec_info load_info;
2004 bool this_load_permuted = false;
2005 load_permutation.create (group_size);
2006 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2007 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2008 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2010 int load_place = vect_get_place_in_interleaving_chain
2011 (load_info, first_stmt_info);
2012 gcc_assert (load_place != -1);
2013 if (load_place != j)
2014 this_load_permuted = true;
2015 load_permutation.safe_push (load_place);
2017 if (!this_load_permuted
2018 /* The load requires permutation when unrolling exposes
2019 a gap either because the group is larger than the SLP
2020 group-size or because there is a gap between the groups. */
2021 && (known_eq (unrolling_factor, 1U)
2022 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2023 && DR_GROUP_GAP (first_stmt_info) == 0)))
2025 load_permutation.release ();
2026 continue;
2028 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2029 loads_permuted = true;
2032 if (loads_permuted)
2034 if (!vect_supported_load_permutation_p (new_instance))
2036 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2039 "Build SLP failed: unsupported load "
2040 "permutation ");
2041 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2042 TDF_SLIM, stmt_info->stmt, 0);
2044 vect_free_slp_instance (new_instance, false);
2045 return false;
2049 /* If the loads and stores can be handled with load/store-lane
2050 instructions, do not generate this SLP instance. */
2051 if (is_a <loop_vec_info> (vinfo)
2052 && loads_permuted
2053 && dr && vect_store_lanes_supported (vectype, group_size, false))
2055 slp_tree load_node;
2056 FOR_EACH_VEC_ELT (loads, i, load_node)
2058 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2059 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2060 /* Use SLP for strided accesses (or if we can't load-lanes). */
2061 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2062 || ! vect_load_lanes_supported
2063 (STMT_VINFO_VECTYPE (stmt_vinfo),
2064 DR_GROUP_SIZE (stmt_vinfo), false))
2065 break;
2067 if (i == loads.length ())
2069 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2071 "Built SLP cancelled: can use "
2072 "load/store-lanes\n");
2073 vect_free_slp_instance (new_instance, false);
2074 return false;
2078 vinfo->slp_instances.safe_push (new_instance);
2080 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "Final SLP tree for instance:\n");
2084 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2087 return true;
2090 else
2092 /* Failed to SLP. */
2093 /* Free the allocated memory. */
2094 scalar_stmts.release ();
2095 loads.release ();
2098 /* For basic block SLP, try to break the group up into multiples of the
2099 vector size. */
2100 unsigned HOST_WIDE_INT const_nunits;
2101 if (is_a <bb_vec_info> (vinfo)
2102 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2103 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2104 && nunits.is_constant (&const_nunits))
2106 /* We consider breaking the group only on VF boundaries from the existing
2107 start. */
2108 for (i = 0; i < group_size; i++)
2109 if (!matches[i]) break;
2111 if (i >= const_nunits && i < group_size)
2113 /* Split into two groups at the first vector boundary before i. */
2114 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2115 unsigned group1_size = i & ~(const_nunits - 1);
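/* E.g. (editor's sketch): with CONST_NUNITS == 4 and the first
   mismatch at I == 6, GROUP1_SIZE becomes 6 & ~3 == 4, so the first
   four stores form one candidate group and the code below retries
   the remainder separately.  */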
2117 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2118 group1_size);
2119 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2120 max_tree_size);
2121 /* If the first non-match was in the middle of a vector,
2122 skip the rest of that vector. */
2123 if (group1_size < i)
2125 i = group1_size + const_nunits;
2126 if (i < group_size)
2127 rest = vect_split_slp_store_group (rest, const_nunits);
2129 if (i < group_size)
2130 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2131 return res;
2133 /* Even though the first vector did not all match, we might be able to SLP
2134 (some of) the remainder. FORNOW ignore this possibility. */
2137 return false;
2141 /* Check if there are stmts in the loop that can be vectorized using SLP.
2142 Build SLP trees of packed scalar stmts if SLP is possible. */
2144 bool
2145 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2147 unsigned int i;
2148 stmt_vec_info first_element;
2150 DUMP_VECT_SCOPE ("vect_analyze_slp");
2152 /* Find SLP sequences starting from groups of grouped stores. */
2153 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2154 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2156 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2158 if (loop_vinfo->reduction_chains.length () > 0)
2160 /* Find SLP sequences starting from reduction chains. */
2161 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2162 if (! vect_analyze_slp_instance (vinfo, first_element,
2163 max_tree_size))
2165 /* Dissolve reduction chain group. */
2166 stmt_vec_info vinfo = first_element;
2167 while (vinfo)
2169 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2170 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2171 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2172 vinfo = next;
2174 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2178 /* Find SLP sequences starting from groups of reductions. */
2179 if (loop_vinfo->reductions.length () > 1)
2180 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2181 max_tree_size);
2184 return true;
2188 /* For each possible SLP instance decide whether to SLP it and calculate the
2189 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2190 to SLP at least one instance. */
2192 bool
2193 vect_make_slp_decision (loop_vec_info loop_vinfo)
2195 unsigned int i;
2196 poly_uint64 unrolling_factor = 1;
2197 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2198 slp_instance instance;
2199 int decided_to_slp = 0;
2201 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2203 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2205 /* FORNOW: SLP if you can. */
2206 /* All unroll factors have the form current_vector_size * X for some
2207 rational X, so they must have a common multiple. */
2208 unrolling_factor
2209 = force_common_multiple (unrolling_factor,
2210 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2212 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2213 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2214 loop-based vectorization. Such stmts will be marked as HYBRID. */
2215 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2216 decided_to_slp++;
2219 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2221 if (decided_to_slp && dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "Decided to SLP %d instances. Unrolling factor ",
2225 decided_to_slp);
2226 dump_dec (MSG_NOTE, unrolling_factor);
2227 dump_printf (MSG_NOTE, "\n");
2230 return (decided_to_slp > 0);
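/* Editor's note (illustrative): if one instance has an unrolling
   factor of 2 and another of 3, force_common_multiple accumulates a
   loop unrolling factor of 6 -- the smallest factor at which every
   instance expands to a whole number of vector statements.  */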
2234 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2235 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2237 static void
2238 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2240 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2241 imm_use_iterator imm_iter;
2242 gimple *use_stmt;
2243 stmt_vec_info use_vinfo;
2244 slp_tree child;
2245 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2246 int j;
2248 /* Propagate hybrid down the SLP tree. */
2249 if (stype == hybrid)
2251 else if (HYBRID_SLP_STMT (stmt_vinfo))
2252 stype = hybrid;
2253 else
2255 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2256 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2257 /* If we get a pattern stmt here we have to use the LHS of the
2258 original stmt for immediate uses. */
2259 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2260 tree def;
2261 if (gimple_code (stmt) == GIMPLE_PHI)
2262 def = gimple_phi_result (stmt);
2263 else
2264 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2265 if (def)
2266 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2268 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2269 if (!use_vinfo)
2270 continue;
2271 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2272 if (!STMT_SLP_TYPE (use_vinfo)
2273 && (STMT_VINFO_RELEVANT (use_vinfo)
2274 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2275 && !(gimple_code (use_stmt) == GIMPLE_PHI
2276 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2278 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2281 "def in non-SLP stmt: ");
2282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2284 stype = hybrid;
2289 if (stype == hybrid
2290 && !HYBRID_SLP_STMT (stmt_vinfo))
2292 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2295 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_vinfo->stmt, 0);
2297 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2300 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2301 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2302 vect_detect_hybrid_slp_stmts (child, i, stype);
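/* Editor's sketch (illustrative, not from the sources) of the case
   detected above:

     _1 = b[i];            <-- load in an SLP instance
     a[i] = _1 + 1;        <-- pure-SLP store group
     sum_3 = sum_2 + _1;   <-- loop-vectorized reduction, not SLP

   the def _1 has a use in a relevant non-SLP stmt, so the load is
   marked hybrid: it is needed both by the SLP instance and by
   loop-based vectorization.  */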
2305 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2307 static tree
2308 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2310 walk_stmt_info *wi = (walk_stmt_info *)data;
2311 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2313 if (wi->is_lhs)
2314 return NULL_TREE;
2316 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2317 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2319 if (dump_enabled_p ())
2321 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2322 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt_info->stmt, 0);
2324 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2327 return NULL_TREE;
2330 static tree
2331 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2332 walk_stmt_info *wi)
2334 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2335 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2336 /* If the stmt is in a SLP instance then this isn't a reason
2337 to mark use definitions in other SLP instances as hybrid. */
2338 if (! STMT_SLP_TYPE (use_vinfo)
2339 && (STMT_VINFO_RELEVANT (use_vinfo)
2340 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2341 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2342 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2344 else
2345 *handled = true;
2346 return NULL_TREE;
2349 /* Find stmts that must be both vectorized and SLPed. */
2351 void
2352 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2354 unsigned int i;
2355 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2356 slp_instance instance;
2358 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2360 /* First walk all pattern stmts in the loop and mark the defs of their uses
2361 as hybrid, because immediate uses inside pattern stmts are not recorded. */
2362 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2364 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2365 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2366 gsi_next (&gsi))
2368 gimple *stmt = gsi_stmt (gsi);
2369 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2370 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2372 walk_stmt_info wi;
2373 memset (&wi, 0, sizeof (wi));
2374 wi.info = loop_vinfo;
2375 gimple_stmt_iterator gsi2
2376 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2377 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2378 vect_detect_hybrid_slp_1, &wi);
2379 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2380 vect_detect_hybrid_slp_2,
2381 vect_detect_hybrid_slp_1, &wi);
2386 /* Then walk the SLP instance trees marking stmts with uses in
2387 non-SLP stmts as hybrid, also propagating hybrid down the
2388 SLP tree, collecting the above info on-the-fly. */
2389 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2391 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2392 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2393 i, pure_slp);
2398 /* Initialize a bb_vec_info struct for the statements between
2399 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2401 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2402 gimple_stmt_iterator region_end_in,
2403 vec_info_shared *shared)
2404 : vec_info (vec_info::bb, init_cost (NULL), shared),
2405 bb (gsi_bb (region_begin_in)),
2406 region_begin (region_begin_in),
2407 region_end (region_end_in)
2409 gimple_stmt_iterator gsi;
2411 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2412 gsi_next (&gsi))
2414 gimple *stmt = gsi_stmt (gsi);
2415 gimple_set_uid (stmt, 0);
2416 add_stmt (stmt);
2419 bb->aux = this;
2423 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2424 stmts in the basic block. */
2426 _bb_vec_info::~_bb_vec_info ()
2428 for (gimple_stmt_iterator si = region_begin;
2429 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2430 /* Reset region marker. */
2431 gimple_set_uid (gsi_stmt (si), -1);
2433 bb->aux = NULL;
2436 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2437 given that its child nodes have already been processed, and that
2438 their def types currently match their SLP node's def type. */
2440 static bool
2441 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2442 slp_instance node_instance,
2443 stmt_vector_for_cost *cost_vec)
2445 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2446 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2448 /* For BB vectorization vector types are assigned here.
2449 Memory accesses already got their vector type assigned
2450 in vect_analyze_data_refs. */
2451 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2452 if (bb_vinfo
2453 && ! STMT_VINFO_DATA_REF (stmt_info))
2455 tree vectype, nunits_vectype;
2456 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2457 &nunits_vectype))
2458 /* We checked this when building the node. */
2459 gcc_unreachable ();
2460 if (vectype == boolean_type_node)
2462 vectype = vect_get_mask_type_for_stmt (stmt_info);
2463 if (!vectype)
2464 /* vect_get_mask_type_for_stmt has already explained the
2465 failure. */
2466 return false;
2469 stmt_vec_info sstmt_info;
2470 unsigned int i;
2471 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2472 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2475 /* Calculate the number of vector statements to be created for the
2476 scalar stmts in this node. For SLP reductions it is equal to the
2477 number of vector statements in the children (which has already been
2478 calculated by the recursive call). Otherwise it is the number of
2479 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2480 VF divided by the number of elements in a vector. */
2481 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2482 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2483 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2484 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2485 else
2487 poly_uint64 vf;
2488 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2489 vf = loop_vinfo->vectorization_factor;
2490 else
2491 vf = 1;
2492 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2493 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2494 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2495 = vect_get_num_vectors (vf * group_size, vectype);
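/* E.g. (editor's sketch): VF == 4, GROUP_SIZE == 2 and a V4SI
   vectype give 4 * 2 == 8 scalar results per vectorized iteration,
   i.e. 8 / 4 == 2 vector statements for this node.  */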
2498 bool dummy;
2499 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2502 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2503 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2505 Return true if the operations are supported. */
2507 static bool
2508 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2509 slp_instance node_instance,
2510 scalar_stmts_to_slp_tree_map_t *visited,
2511 scalar_stmts_to_slp_tree_map_t *lvisited,
2512 stmt_vector_for_cost *cost_vec)
2514 int i, j;
2515 slp_tree child;
2517 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2518 return true;
2520 /* If we already analyzed the exact same set of scalar stmts we're done.
2521 We share the generated vector stmts for those. */
2522 slp_tree *leader;
2523 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2524 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2526 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2527 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2528 return true;
2531 /* The SLP graph is acyclic, so not caching whether we failed or succeeded
2532 doesn't cause any issue, since we throw away the lvisited set
2533 when we fail. */
2534 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2536 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2537 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2538 visited, lvisited, cost_vec))
2539 return false;
2541 /* Push SLP node def-type to stmt operands. */
2542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2543 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2544 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2545 = SLP_TREE_DEF_TYPE (child);
2546 bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2547 cost_vec);
2548 /* Restore def-types. */
2549 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2550 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2551 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2552 = vect_internal_def;
2553 if (! res)
2554 return false;
2556 return true;
2560 /* Analyze statements in SLP instances of VINFO. Return true if the
2561 operations are supported. */
2563 bool
2564 vect_slp_analyze_operations (vec_info *vinfo)
2566 slp_instance instance;
2567 int i;
2569 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2571 scalar_stmts_to_slp_tree_map_t *visited
2572 = new scalar_stmts_to_slp_tree_map_t ();
2573 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2575 scalar_stmts_to_slp_tree_map_t lvisited;
2576 stmt_vector_for_cost cost_vec;
2577 cost_vec.create (2);
2578 if (!vect_slp_analyze_node_operations (vinfo,
2579 SLP_INSTANCE_TREE (instance),
2580 instance, visited, &lvisited,
2581 &cost_vec))
2583 slp_tree node = SLP_INSTANCE_TREE (instance);
2584 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "removing SLP instance operations starting from: ");
2587 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2588 vect_free_slp_instance (instance, false);
2589 vinfo->slp_instances.ordered_remove (i);
2590 cost_vec.release ();
2592 else
2594 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2595 x != lvisited.end(); ++x)
2596 visited->put ((*x).first.copy (), (*x).second);
2597 i++;
2599 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2600 cost_vec.release ();
2603 delete visited;
2605 return !vinfo->slp_instances.is_empty ();
2609 /* Compute the scalar cost of the SLP node NODE and its children
2610 and record it in COST_VEC. Do not account for defs that are marked
2611 in LIFE, and update LIFE according to the uses of NODE. */
2613 static void
2614 vect_bb_slp_scalar_cost (basic_block bb,
2615 slp_tree node, vec<bool, va_heap> *life,
2616 stmt_vector_for_cost *cost_vec)
2618 unsigned i;
2619 stmt_vec_info stmt_info;
2620 slp_tree child;
2622 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2624 gimple *stmt = stmt_info->stmt;
2625 vec_info *vinfo = stmt_info->vinfo;
2626 ssa_op_iter op_iter;
2627 def_operand_p def_p;
2629 if ((*life)[i])
2630 continue;
2632 /* If there is a non-vectorized use of the defs then the scalar
2633 stmt is kept live, in which case we do not account it or any
2634 required defs in the SLP children in the scalar cost. This
2635 way we make the vectorization more costly when compared to
2636 the scalar cost. */
2637 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2639 imm_use_iterator use_iter;
2640 gimple *use_stmt;
2641 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2642 if (!is_gimple_debug (use_stmt))
2644 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2645 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2647 (*life)[i] = true;
2648 BREAK_FROM_IMM_USE_STMT (use_iter);
2652 if ((*life)[i])
2653 continue;
2655 /* Count scalar stmts only once. */
2656 if (gimple_visited_p (stmt))
2657 continue;
2658 gimple_set_visited (stmt, true);
2660 vect_cost_for_stmt kind;
2661 if (STMT_VINFO_DATA_REF (stmt_info))
2663 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2664 kind = scalar_load;
2665 else
2666 kind = scalar_store;
2668 else
2669 kind = scalar_stmt;
2670 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2673 auto_vec<bool, 20> subtree_life;
2674 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2676 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2678 /* Do not directly pass LIFE to the recursive call, copy it to
2679 confine changes in the callee to the current child/subtree. */
2680 subtree_life.safe_splice (*life);
2681 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec);
2682 subtree_life.truncate (0);
2687 /* Check if vectorization of the basic block is profitable. */
2689 static bool
2690 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2692 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2693 slp_instance instance;
2694 int i;
2695 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2696 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2698 /* Calculate scalar cost. */
2699 stmt_vector_for_cost scalar_costs;
2700 scalar_costs.create (0);
2701 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2703 auto_vec<bool, 20> life;
2704 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2705 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2706 SLP_INSTANCE_TREE (instance),
2707 &life, &scalar_costs);
2709 void *target_cost_data = init_cost (NULL);
2710 add_stmt_costs (target_cost_data, &scalar_costs);
2711 scalar_costs.release ();
2712 unsigned dummy;
2713 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2714 destroy_cost_data (target_cost_data);
2716 /* Unset visited flag. */
2717 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2718 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2719 gimple_set_visited (gsi_stmt (gsi), false);
2721 /* Complete the target-specific cost calculation. */
2722 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2723 &vec_inside_cost, &vec_epilogue_cost);
2725 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2727 if (dump_enabled_p ())
2729 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2730 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2731 vec_inside_cost);
2732 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2733 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2734 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2737 /* Vectorization is profitable if its cost is no more than the cost of the
2738 scalar version. Note that we err on the vector side for equal cost because
2739 the cost estimate is otherwise quite pessimistic (constant uses are
2740 free on the scalar side but cost a load on the vector side for
2741 example). */
2742 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2743 return false;
2745 return true;
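/* Editor's note (illustrative): with VEC_PROLOGUE_COST == 1,
   VEC_INSIDE_COST == 10 and VEC_EPILOGUE_COST == 1 the vector cost
   is 12, so a scalar cost of 16 is profitable, a scalar cost of 12
   still is (ties go to the vector side), and a scalar cost of 11 is
   rejected by the test above.  */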
2748 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2749 if so, and sets FATAL to true if failure is independent of
2750 current_vector_size. */
2752 static bb_vec_info
2753 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2754 gimple_stmt_iterator region_end,
2755 vec<data_reference_p> datarefs, int n_stmts,
2756 bool &fatal, vec_info_shared *shared)
2758 bb_vec_info bb_vinfo;
2759 slp_instance instance;
2760 int i;
2761 poly_uint64 min_vf = 2;
2763 /* The first group of checks is independent of the vector size. */
2764 fatal = true;
2766 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2768 if (dump_enabled_p ())
2769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2770 "not vectorized: too many instructions in "
2771 "basic block.\n");
2772 free_data_refs (datarefs);
2773 return NULL;
2776 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2777 if (!bb_vinfo)
2778 return NULL;
2780 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2781 bb_vinfo->shared->save_datarefs ();
2783 /* Analyze the data references. */
2785 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2787 if (dump_enabled_p ())
2788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2789 "not vectorized: unhandled data-ref in basic "
2790 "block.\n");
2792 delete bb_vinfo;
2793 return NULL;
2796 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "not vectorized: not enough data-refs in "
2801 "basic block.\n");
2803 delete bb_vinfo;
2804 return NULL;
2807 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2809 if (dump_enabled_p ())
2810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2811 "not vectorized: unhandled data access in "
2812 "basic block.\n");
2814 delete bb_vinfo;
2815 return NULL;
2818 /* If there are no grouped stores in the region there is no need
2819 to continue with pattern recog as vect_analyze_slp will fail
2820 anyway. */
2821 if (bb_vinfo->grouped_stores.is_empty ())
2823 if (dump_enabled_p ())
2824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2825 "not vectorized: no grouped stores in "
2826 "basic block.\n");
2828 delete bb_vinfo;
2829 return NULL;
2832 /* While the rest of the analysis below depends on it in some way. */
2833 fatal = false;
2835 vect_pattern_recog (bb_vinfo);
2837 /* Check the SLP opportunities in the basic block, analyze and build SLP
2838 trees. */
2839 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2841 if (dump_enabled_p ())
2843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2844 "Failed to SLP the basic block.\n");
2845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846 "not vectorized: failed to find SLP opportunities "
2847 "in basic block.\n");
2850 delete bb_vinfo;
2851 return NULL;
2854 vect_record_base_alignments (bb_vinfo);
2856 /* Analyze and verify the alignment of data references and the
2857 dependence in the SLP instances. */
2858 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2860 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2861 || ! vect_slp_analyze_instance_dependence (instance))
2863 slp_tree node = SLP_INSTANCE_TREE (instance);
2864 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2865 dump_printf_loc (MSG_NOTE, vect_location,
2866 "removing SLP instance operations starting from: ");
2867 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2868 vect_free_slp_instance (instance, false);
2869 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2870 continue;
2873 /* Mark all the statements that we want to vectorize as pure SLP and
2874 relevant. */
2875 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2876 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2878 i++;
2880 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2882 delete bb_vinfo;
2883 return NULL;
2886 if (!vect_slp_analyze_operations (bb_vinfo))
2888 if (dump_enabled_p ())
2889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2890 "not vectorized: bad operation in basic block.\n");
2892 delete bb_vinfo;
2893 return NULL;
2896 /* Cost model: check if the vectorization is worthwhile. */
2897 if (!unlimited_cost_model (NULL)
2898 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2902 "not vectorized: vectorization is not "
2903 "profitable.\n");
2905 delete bb_vinfo;
2906 return NULL;
2909 if (dump_enabled_p ())
2910 dump_printf_loc (MSG_NOTE, vect_location,
2911 "Basic block will be vectorized using SLP\n");
2913 return bb_vinfo;
2917 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
2918 true if anything in the basic-block was vectorized. */
2920 bool
2921 vect_slp_bb (basic_block bb)
2923 bb_vec_info bb_vinfo;
2924 gimple_stmt_iterator gsi;
2925 bool any_vectorized = false;
2926 auto_vector_sizes vector_sizes;
2928 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2930 /* Autodetect the first vector size we try. */
2931 current_vector_size = 0;
2932 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2933 unsigned int next_size = 0;
2935 gsi = gsi_start_bb (bb);
2937 poly_uint64 autodetected_vector_size = 0;
2938 while (1)
2940 if (gsi_end_p (gsi))
2941 break;
2943 gimple_stmt_iterator region_begin = gsi;
2944 vec<data_reference_p> datarefs = vNULL;
2945 int insns = 0;
2947 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2949 gimple *stmt = gsi_stmt (gsi);
2950 if (is_gimple_debug (stmt))
2951 continue;
2952 insns++;
2954 if (gimple_location (stmt) != UNKNOWN_LOCATION)
2955 vect_location = stmt;
2957 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
2958 break;
2961 /* Skip leading unhandled stmts. */
2962 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
2964 gsi_next (&gsi);
2965 continue;
2968 gimple_stmt_iterator region_end = gsi;
2970 bool vectorized = false;
2971 bool fatal = false;
2972 vec_info_shared shared;
2973 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
2974 datarefs, insns, fatal, &shared);
2975 if (bb_vinfo
2976 && dbg_cnt (vect_slp))
2978 if (dump_enabled_p ())
2979 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
2981 bb_vinfo->shared->check_datarefs ();
2982 vect_schedule_slp (bb_vinfo);
2984 unsigned HOST_WIDE_INT bytes;
2985 if (current_vector_size.is_constant (&bytes))
2986 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2987 "basic block part vectorized using "
2988 HOST_WIDE_INT_PRINT_UNSIGNED " byte "
2989 "vectors\n", bytes);
2990 else
2991 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2992 "basic block part vectorized using variable "
2993 "length vectors\n");
2995 vectorized = true;
2997 delete bb_vinfo;
2999 any_vectorized |= vectorized;
3001 if (next_size == 0)
3002 autodetected_vector_size = current_vector_size;
3004 if (next_size < vector_sizes.length ()
3005 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3006 next_size += 1;
3008 if (vectorized
3009 || next_size == vector_sizes.length ()
3010 || known_eq (current_vector_size, 0U)
3011 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3012 vector sizes will fail do not bother iterating. */
3013 || fatal)
3015 if (gsi_end_p (region_end))
3016 break;
3018 /* Skip the unhandled stmt. */
3019 gsi_next (&gsi);
3021 /* And reset vector sizes. */
3022 current_vector_size = 0;
3023 next_size = 0;
3025 else
3027 /* Try the next biggest vector size. */
3028 current_vector_size = vector_sizes[next_size++];
3029 if (dump_enabled_p ())
3031 dump_printf_loc (MSG_NOTE, vect_location,
3032 "***** Re-trying analysis with "
3033 "vector size ");
3034 dump_dec (MSG_NOTE, current_vector_size);
3035 dump_printf (MSG_NOTE, "\n");
3038 /* Start over. */
3039 gsi = region_begin;
3043 return any_vectorized;
3047 /* Return true if the vector type of the boolean constant that is the OPNUM
3048 operand of statement STMT_VINFO is a boolean vector. */
3050 static bool
3051 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, int opnum)
3053 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3054 tree op, vectype;
3055 enum vect_def_type dt;
3057 /* For comparisons and COND_EXPRs the type is chosen depending
3058 on the other comparison operand. */
3059 if (TREE_CODE_CLASS (code) == tcc_comparison)
3061 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3062 if (opnum)
3063 op = gimple_assign_rhs1 (stmt);
3064 else
3065 op = gimple_assign_rhs2 (stmt);
3067 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3068 gcc_unreachable ();
3070 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3073 if (code == COND_EXPR)
3075 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3076 tree cond = gimple_assign_rhs1 (stmt);
3078 if (TREE_CODE (cond) == SSA_NAME)
3079 op = cond;
3080 else if (opnum)
3081 op = TREE_OPERAND (cond, 1);
3082 else
3083 op = TREE_OPERAND (cond, 0);
3085 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3086 gcc_unreachable ();
3088 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3091 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3094 /* Build a variable-length vector in which the elements in ELTS are repeated
3095 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3096 RESULTS and add any new instructions to SEQ.
3098 The approach we use is:
3100 (1) Find a vector mode VM with integer elements of mode IM.
3102 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3103 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3104 from small vectors to IM.
3106 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3108 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3109 correct byte contents.
3111 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3113 We try to find the largest IM for which this sequence works, in order
3114 to cut down on the number of interleaves. */
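/* Worked example (editor's sketch, assuming a 128-bit target vector):
   duplicating ELTS = { a, b, c, d } of 32-bit elements, with step (1)
   choosing VM == V2DI and IM == DI:

     (2) { a, b } and { c, d } are view-converted to DImode values
         d0 and d1;
     (3) each is duplicated:  p0 = { d0, d0 },  p1 = { d1, d1 };
     (4) one interleave gives  lo = { d0, d1 }, whose bytes are
         { a, b, c, d } (and hi == lo, since each input repeats every
         two DImode elements);
     (5) lo is view-converted back to the requested vector type.  */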
3116 void
3117 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3118 unsigned int nresults, vec<tree> &results)
3120 unsigned int nelts = elts.length ();
3121 tree element_type = TREE_TYPE (vector_type);
3123 /* (1) Find a vector mode VM with integer elements of mode IM. */
3124 unsigned int nvectors = 1;
3125 tree new_vector_type;
3126 tree permutes[2];
3127 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3128 &nvectors, &new_vector_type,
3129 permutes))
3130 gcc_unreachable ();
3132 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3133 unsigned int partial_nelts = nelts / nvectors;
3134 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3136 tree_vector_builder partial_elts;
3137 auto_vec<tree, 32> pieces (nvectors * 2);
3138 pieces.quick_grow (nvectors * 2);
3139 for (unsigned int i = 0; i < nvectors; ++i)
3141 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3142 ELTS' has mode IM. */
3143 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3144 for (unsigned int j = 0; j < partial_nelts; ++j)
3145 partial_elts.quick_push (elts[i * partial_nelts + j]);
3146 tree t = gimple_build_vector (seq, &partial_elts);
3147 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3148 TREE_TYPE (new_vector_type), t);
3150 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3151 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3154 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3155 correct byte contents.
3157 We need to repeat the following operation log2(nvectors) times:
3159 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3160 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3162 However, if each input repeats every N elements and the VF is
3163 a multiple of N * 2, the HI result is the same as the LO. */
3164 unsigned int in_start = 0;
3165 unsigned int out_start = nvectors;
3166 unsigned int hi_start = nvectors / 2;
3167 /* A bound on the number of outputs needed to produce NRESULTS results
3168 in the final iteration. */
3169 unsigned int noutputs_bound = nvectors * nresults;
3170 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3172 noutputs_bound /= 2;
3173 unsigned int limit = MIN (noutputs_bound, nvectors);
3174 for (unsigned int i = 0; i < limit; ++i)
3176 if ((i & 1) != 0
3177 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3178 2 * in_repeat))
3180 pieces[out_start + i] = pieces[out_start + i - 1];
3181 continue;
3184 tree output = make_ssa_name (new_vector_type);
3185 tree input1 = pieces[in_start + (i / 2)];
3186 tree input2 = pieces[in_start + (i / 2) + hi_start];
3187 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3188 input1, input2,
3189 permutes[i & 1]);
3190 gimple_seq_add_stmt (seq, stmt);
3191 pieces[out_start + i] = output;
3193 std::swap (in_start, out_start);
3196 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3197 results.reserve (nresults);
3198 for (unsigned int i = 0; i < nresults; ++i)
3199 if (i < nvectors)
3200 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3201 pieces[in_start + i]));
3202 else
3203 results.quick_push (results[i - nvectors]);
3207 /* For constant and loop invariant defs of SLP_NODE this function returns
3208 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3209 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3210 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3214 static void
3215 vect_get_constant_vectors (tree op, slp_tree slp_node,
3216 vec<tree> *vec_oprnds,
3217 unsigned int op_num, unsigned int number_of_vectors)
3219 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3220 stmt_vec_info stmt_vinfo = stmts[0];
3221 gimple *stmt = stmt_vinfo->stmt;
3222 unsigned HOST_WIDE_INT nunits;
3223 tree vec_cst;
3224 unsigned j, number_of_places_left_in_vector;
3225 tree vector_type;
3226 tree vop;
3227 int group_size = stmts.length ();
3228 unsigned int vec_num, i;
3229 unsigned number_of_copies = 1;
3230 vec<tree> voprnds;
3231 voprnds.create (number_of_vectors);
3232 bool constant_p, is_store;
3233 tree neutral_op = NULL;
3234 enum tree_code code = gimple_expr_code (stmt);
3235 gimple_seq ctor_seq = NULL;
3236 auto_vec<tree, 16> permute_results;
3238 /* Check if vector type is a boolean vector. */
3239 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3240 && vect_mask_constant_operand_p (stmt_vinfo, op_num))
3241 vector_type
3242 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3243 else
3244 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3246 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3248 is_store = true;
3249 op = gimple_assign_rhs1 (stmt);
3251 else
3252 is_store = false;
3254 gcc_assert (op);
3256 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3257 created vectors. It is greater than 1 if unrolling is performed.
3259 For example, we have two scalar operands, s1 and s2 (e.g., group of
3260 strided accesses of size two), while NUNITS is four (i.e., four scalars
3261 of this type can be packed in a vector). The output vector will contain
3262 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3263 will be 2).
3265 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3266 containing the operands.
3268 For example, NUNITS is four as before, and the group size is 8
3269 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3270 {s5, s6, s7, s8}. */
3272 /* When using duplicate_and_interleave, we just need one element for
3273 each scalar statement. */
3274 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3275 nunits = group_size;
3277 number_of_copies = nunits * number_of_vectors / group_size;
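/* E.g. (editor's sketch): NUNITS == 4, NUMBER_OF_VECTORS == 2 and
   GROUP_SIZE == 2 give 4 * 2 / 2 == 4 copies -- each of the two
   vectors created is {s1, s2, s1, s2}, as in the first example
   above.  */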
3279 number_of_places_left_in_vector = nunits;
3280 constant_p = true;
3281 tree_vector_builder elts (vector_type, nunits, 1);
3282 elts.quick_grow (nunits);
3283 bool place_after_defs = false;
3284 for (j = 0; j < number_of_copies; j++)
3286 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
3288 stmt = stmt_vinfo->stmt;
3289 if (is_store)
3290 op = gimple_assign_rhs1 (stmt);
3291 else
3293 switch (code)
3295 case COND_EXPR:
3297 tree cond = gimple_assign_rhs1 (stmt);
3298 if (TREE_CODE (cond) == SSA_NAME)
3299 op = gimple_op (stmt, op_num + 1);
3300 else if (op_num == 0 || op_num == 1)
3301 op = TREE_OPERAND (cond, op_num);
3302 else
3304 if (op_num == 2)
3305 op = gimple_assign_rhs2 (stmt);
3306 else
3307 op = gimple_assign_rhs3 (stmt);
3310 break;
3312 case CALL_EXPR:
3313 op = gimple_call_arg (stmt, op_num);
3314 break;
3316 case LSHIFT_EXPR:
3317 case RSHIFT_EXPR:
3318 case LROTATE_EXPR:
3319 case RROTATE_EXPR:
3320 op = gimple_op (stmt, op_num + 1);
3321 /* Unlike the other binary operators, shifts/rotates have
3322 the shift count being int, instead of the same type as
3323 the lhs, so make sure the scalar is the right type if
3324 we are dealing with vectors of
3325 long long/long/short/char. */
3326 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3327 op = fold_convert (TREE_TYPE (vector_type), op);
3328 break;
3330 default:
3331 op = gimple_op (stmt, op_num + 1);
3332 break;
3336 /* Create 'vect_ = {op0,op1,...,opn}'. */
3337 number_of_places_left_in_vector--;
3338 tree orig_op = op;
3339 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3341 if (CONSTANT_CLASS_P (op))
3343 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3345 /* Can't use VIEW_CONVERT_EXPR for booleans because
3346 of possibly different sizes of scalar value and
3347 vector element. */
3348 if (integer_zerop (op))
3349 op = build_int_cst (TREE_TYPE (vector_type), 0);
3350 else if (integer_onep (op))
3351 op = build_all_ones_cst (TREE_TYPE (vector_type));
3352 else
3353 gcc_unreachable ();
3355 else
3356 op = fold_unary (VIEW_CONVERT_EXPR,
3357 TREE_TYPE (vector_type), op);
3358 gcc_assert (op && CONSTANT_CLASS_P (op));
3360 else
3362 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3363 gimple *init_stmt;
3364 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3366 tree true_val
3367 = build_all_ones_cst (TREE_TYPE (vector_type));
3368 tree false_val
3369 = build_zero_cst (TREE_TYPE (vector_type));
3370 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3371 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3372 op, true_val,
3373 false_val);
3375 else
3377 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3378 op);
3379 init_stmt
3380 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3381 op);
3383 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3384 op = new_temp;
3387 elts[number_of_places_left_in_vector] = op;
3388 if (!CONSTANT_CLASS_P (op))
3389 constant_p = false;
3390 if (TREE_CODE (orig_op) == SSA_NAME
3391 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3392 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3393 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3394 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3395 place_after_defs = true;
3397 if (number_of_places_left_in_vector == 0)
3399 if (constant_p
3400 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3401 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3402 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3403 else
3405 if (vec_oprnds->is_empty ())
3406 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3407 number_of_vectors,
3408 permute_results);
3409 vec_cst = permute_results[number_of_vectors - j - 1];
3411 tree init;
3412 gimple_stmt_iterator gsi;
3413 if (place_after_defs)
3415 stmt_vec_info last_stmt_info
3416 = vect_find_last_scalar_stmt_in_slp (slp_node);
3417 gsi = gsi_for_stmt (last_stmt_info->stmt);
3418 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3419 &gsi);
3421 else
3422 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3423 NULL);
3424 if (ctor_seq != NULL)
3426 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3427 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3428 ctor_seq = NULL;
3430 voprnds.quick_push (init);
3431 place_after_defs = false;
3432 number_of_places_left_in_vector = nunits;
3433 constant_p = true;
3434 elts.new_vector (vector_type, nunits, 1);
3435 elts.quick_grow (nunits);
3440 /* Since the vectors were created in reverse order, reverse them
3441 here. */
3442 vec_num = voprnds.length ();
3443 for (j = vec_num; j != 0; j--)
3445 vop = voprnds[j - 1];
3446 vec_oprnds->quick_push (vop);
3449 voprnds.release ();
3451 /* If VF is greater than the unrolling factor needed for the SLP
3452 group of stmts, the number of vectors to be created is greater than
3453 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3454 to replicate the vectors. */
3455 while (number_of_vectors > vec_oprnds->length ())
3457 tree neutral_vec = NULL;
3459 if (neutral_op)
3461 if (!neutral_vec)
3462 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3464 vec_oprnds->quick_push (neutral_vec);
3466 else
3468 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3469 vec_oprnds->quick_push (vop);
3475 /* Get vectorized definitions from SLP_NODE, which contains the corresponding
3476 vectorized def-stmts. */
3478 static void
3479 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3481 tree vec_oprnd;
3482 stmt_vec_info vec_def_stmt_info;
3483 unsigned int i;
3485 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3487 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3489 gcc_assert (vec_def_stmt_info);
3490 if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
3491 vec_oprnd = gimple_phi_result (vec_def_phi);
3492 else
3493 vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
3494 vec_oprnds->quick_push (vec_oprnd);
3499 /* Get vectorized definitions for SLP_NODE.
3500 If the scalar definitions are loop invariants or constants, collect them and
3501 call vect_get_constant_vectors() to create vector stmts.
3502 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3503 must be stored in the corresponding child of SLP_NODE, and we call
3504 vect_get_slp_vect_defs () to retrieve them. */
3506 void
3507 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3508 vec<vec<tree> > *vec_oprnds)
3510 int number_of_vects = 0, i;
3511 unsigned int child_index = 0;
3512 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3513 slp_tree child = NULL;
3514 vec<tree> vec_defs;
3515 tree oprnd;
3516 bool vectorized_defs;
3518 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3519 FOR_EACH_VEC_ELT (ops, i, oprnd)
3521 /* For each operand we check if it has vectorized definitions in a child
3522 node or we need to create them (for invariants and constants). We
3523 check if the LHS of the first stmt of the next child matches OPRND.
3524 If it does, we found the correct child. Otherwise, we call
3525 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
3526 to check this child node again for the next operand. */
3527 vectorized_defs = false;
3528 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3530 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3532 /* We have to check both pattern and original def, if available. */
3533 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3535 stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
3536 stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
3537 tree first_def_op;
3539 if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
3540 first_def_op = gimple_phi_result (first_def);
3541 else
3542 first_def_op = gimple_get_lhs (first_def_info->stmt);
3543 if (operand_equal_p (oprnd, first_def_op, 0)
3544 || (related
3545 && operand_equal_p (oprnd,
3546 gimple_get_lhs (related->stmt), 0)))
3548 /* The number of vector defs is determined by the number of
3549 vector statements in the node from which we get those
3550 statements. */
3551 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3552 vectorized_defs = true;
3553 child_index++;
3556 else
3557 child_index++;
3560 if (!vectorized_defs)
3562 if (i == 0)
3564 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3565 /* The number of vector stmts was calculated according to LHS in
3566 vect_schedule_slp_instance (); fix it by replacing LHS with
3567 RHS, if necessary. See vect_get_smallest_scalar_type () for
3568 details. */
3569 vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
3570 &rhs_size_unit);
3571 if (rhs_size_unit != lhs_size_unit)
3573 number_of_vects *= rhs_size_unit;
3574 number_of_vects /= lhs_size_unit;
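/* E.g. (editor's sketch): for char stores computed from int operands,
   LHS_SIZE_UNIT == 1 and RHS_SIZE_UNIT == 4, so one V16QI result
   vector scales up to 1 * 4 / 1 == 4 four-element int vectors of
   operands.  */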
3579 /* Allocate memory for vectorized defs. */
3580 vec_defs = vNULL;
3581 vec_defs.create (number_of_vects);
3583 /* For reduction defs we call vect_get_constant_vectors (), since we are
3584 looking for initial loop invariant values. */
3585 if (vectorized_defs)
3586 /* The defs are already vectorized. */
3587 vect_get_slp_vect_defs (child, &vec_defs);
3588 else
3589 /* Build vectors from scalar defs. */
3590 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3591 number_of_vects);
3593 vec_oprnds->quick_push (vec_defs);
3597 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3598 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3599 permute statements for the SLP node NODE of the SLP instance
3600 SLP_NODE_INSTANCE. */
3602 bool
3603 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3604 gimple_stmt_iterator *gsi, poly_uint64 vf,
3605 slp_instance slp_node_instance, bool analyze_only,
3606 unsigned *n_perms)
3608 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3609 vec_info *vinfo = stmt_info->vinfo;
3610 tree mask_element_type = NULL_TREE, mask_type;
3611 int vec_index = 0;
3612 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3613 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3614 unsigned int mask_element;
3615 machine_mode mode;
3616 unsigned HOST_WIDE_INT nunits, const_vf;
3618 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3619 return false;
3621 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3623 mode = TYPE_MODE (vectype);
3625 /* At the moment, all permutations are represented using per-element
3626 indices, so we can't cope with variable vector lengths or
3627 vectorization factors. */
3628 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3629 || !vf.is_constant (&const_vf))
3630 return false;
3632 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3633 same size as the vector element being permuted. */
3634 mask_element_type = lang_hooks.types.type_for_mode
3635 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3636 mask_type = get_vectype_for_scalar_type (mask_element_type);
3637 vec_perm_builder mask (nunits, nunits, 1);
3638 mask.quick_grow (nunits);
3639 vec_perm_indices indices;
3641 /* Initialize the vect stmts of NODE to properly insert the generated
3642 stmts later. */
3643 if (! analyze_only)
3644 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3645 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3646 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3648 /* Generate permutation masks for every NODE. Number of masks for each NODE
3649 is equal to GROUP_SIZE.
3650 E.g., we have a group of three nodes with three loads from the same
3651 location in each node, and the vector size is 4. I.e., we have an
3652 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3653 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3654 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3657 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3658 The last mask is illegal since we assume two operands for a permute
3659 operation, and the mask element values can't be outside that range.
3660 Hence, the last mask must be converted into {2,5,5,5}.
3661 For the first two permutations we need the first and the second input
3662 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3663 we need the second and the third vectors: {b1,c1,a2,b2} and
3664 {c2,a3,b3,c3}. */
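/* Editor's note (illustrative): the index arithmetic below realises
   the a's of the example above.  With load permutation {0, 0, 0} and
   DR_GROUP_SIZE == 3, unroll iteration J contributes three copies of
   I == J * 3, giving the flat sequence 0,0,0 3,3,3 6,6,6 9,9,9;
   packed four at a time (NUNITS == 4) that is exactly
   {0,0,0,3} {3,3,6,6} {6,9,9,9}, where VEC_INDEX == I / 4 selects
   the two input vectors and MASK_ELEMENT == I % 4 (plus NUNITS for
   the second input) forms each mask.  */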
3666 int vect_stmts_counter = 0;
3667 unsigned int index = 0;
3668 int first_vec_index = -1;
3669 int second_vec_index = -1;
3670 bool noop_p = true;
3671 *n_perms = 0;
3673 for (unsigned int j = 0; j < const_vf; j++)
3675 for (int k = 0; k < group_size; k++)
3677 unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3678 + j * DR_GROUP_SIZE (stmt_info));
3679 vec_index = i / nunits;
3680 mask_element = i % nunits;
3681 if (vec_index == first_vec_index
3682 || first_vec_index == -1)
3684 first_vec_index = vec_index;
3686 else if (vec_index == second_vec_index
3687 || second_vec_index == -1)
3689 second_vec_index = vec_index;
3690 mask_element += nunits;
3692 else
3694 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "permutation requires at "
3698 "least three vectors ");
3699 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3700 stmt_info->stmt, 0);
3702 gcc_assert (analyze_only);
3703 return false;
3706 gcc_assert (mask_element < 2 * nunits);
3707 if (mask_element != index)
3708 noop_p = false;
3709 mask[index++] = mask_element;
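/* NOOP_P remains true only while every mask element equals its own
   position, i.e. the permutation built so far is an identity; identity
   vectors are reused directly rather than fed through a VEC_PERM_EXPR.  */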
3711 if (index == nunits && !noop_p)
3713 indices.new_vector (mask, 2, nunits);
3714 if (!can_vec_perm_const_p (mode, indices))
3716 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3719 vect_location,
3720 "unsupported vect permute { ");
3721 for (i = 0; i < nunits; ++i)
3723 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3724 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3726 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3728 gcc_assert (analyze_only);
3729 return false;
3732 ++*n_perms;
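/* Each non-identity permute is counted in *N_PERMS so that the caller
   can, for instance, account for the cost of the VEC_PERM_EXPRs that
   will be emitted.  */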
3735 if (index == nunits)
3737 if (!analyze_only)
3739 tree mask_vec = NULL_TREE;
3741 if (! noop_p)
3742 mask_vec = vec_perm_indices_to_tree (mask_type, indices);
3744 if (second_vec_index == -1)
3745 second_vec_index = first_vec_index;
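/* If every element came from a single vector, the mask never selects
   beyond NUNITS, so the same vector can safely feed both operands of
   the VEC_PERM_EXPR.  */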
3747 /* Generate the permute statement if necessary. */
3748 tree first_vec = dr_chain[first_vec_index];
3749 tree second_vec = dr_chain[second_vec_index];
3750 stmt_vec_info perm_stmt_info;
3751 if (! noop_p)
3753 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3754 tree perm_dest
3755 = vect_create_destination_var (gimple_assign_lhs (stmt),
3756 vectype);
3757 perm_dest = make_ssa_name (perm_dest);
3758 gassign *perm_stmt
3759 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3760 first_vec, second_vec,
3761 mask_vec);
3762 perm_stmt_info
3763 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3764 gsi);
3766 else
3767 /* If mask was NULL_TREE generate the requested
3768 identity transform. */
3769 perm_stmt_info = vinfo->lookup_def (first_vec);
3771 /* Store the vector statement in NODE. */
3772 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3773 = perm_stmt_info;
3776 index = 0;
3777 first_vec_index = -1;
3778 second_vec_index = -1;
3779 noop_p = true;
3784 return true;
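/* As an illustration (not text emitted by this file), a non-identity
   permute generated above appears in the gimple dump as something like

     vect_perm_1 = VEC_PERM_EXPR <first_vec, second_vec, { 2, 5, 5, 5 }>;

   with the constant mask vector built by vec_perm_indices_to_tree.  */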
3787 /* Vectorize SLP instance tree in postorder. */
3789 static void
3790 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3791 scalar_stmts_to_slp_tree_map_t *bst_map)
3793 gimple_stmt_iterator si;
3794 stmt_vec_info stmt_info;
3795 unsigned int group_size;
3796 tree vectype;
3797 int i, j;
3798 slp_tree child;
3800 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3801 return;
3803 /* See if we have already vectorized the same set of stmts and reuse their
3804 vectorized stmts. */
3805 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3807 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3808 return;
3811 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3812 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3813 vect_schedule_slp_instance (child, instance, bst_map);
3815 /* Push SLP node def-type to stmts. */
3816 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3817 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3819 stmt_vec_info child_stmt_info;
3820 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3821 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
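/* Pushing the SLP def-type lets vect_transform_stmt see operands of
   non-internal children as the external or constant defs they are in
   this SLP context; the original def-types are put back by the restore
   loop at the end of this function.  */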
3824 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3826 /* VECTYPE is the type of the destination. */
3827 vectype = STMT_VINFO_VECTYPE (stmt_info);
3828 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3829 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3831 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3832 if (!SLP_TREE_VEC_STMTS (node).exists ())
3833 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3835 if (dump_enabled_p ())
3837 dump_printf_loc (MSG_NOTE, vect_location,
3838 "------>vectorizing SLP node starting from: ");
3839 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
3842 /* Vectorized stmts go before the last scalar stmt which is where
3843 all uses are ready. */
3844 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3845 si = gsi_for_stmt (last_stmt_info->stmt);
3847 /* Mark the first element of the reduction chain as reduction to properly
3848 transform the node. In the analysis phase only the last element of the
3849 chain is marked as reduction. */
3850 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3851 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
3852 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3854 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3855 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3858 /* Handle two-operation SLP nodes by vectorizing the group with
3859 both operations and then performing a merge. */
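/* For example, a group {a0+b0, a1-b1, a2+b2, a3-b3} with four-element
   vectors gives CODE0 = PLUS_EXPR, OCODE = MINUS_EXPR and MASK =
   {0,1,0,1}.  The node is vectorized once as all-plus (V0) and once as
   all-minus (V1), and the merge mask built below becomes {0,5,2,7}:
   lane L takes v0[L] where MASK selects 0 and v1[L] (offset by NUNITS)
   where it selects 1.  */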
3860 if (SLP_TREE_TWO_OPERATORS (node))
3862 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3863 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3864 enum tree_code ocode = ERROR_MARK;
3865 stmt_vec_info ostmt_info;
3866 vec_perm_builder mask (group_size, group_size, 1);
3867 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3869 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3870 if (gimple_assign_rhs_code (ostmt) != code0)
3872 mask.quick_push (1);
3873 ocode = gimple_assign_rhs_code (ostmt);
3875 else
3876 mask.quick_push (0);
3878 if (ocode != ERROR_MARK)
3880 vec<stmt_vec_info> v0;
3881 vec<stmt_vec_info> v1;
3882 unsigned j;
3883 tree tmask = NULL_TREE;
3884 vect_transform_stmt (stmt_info, &si, node, instance);
3885 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3886 SLP_TREE_VEC_STMTS (node).truncate (0);
3887 gimple_assign_set_rhs_code (stmt, ocode);
3888 vect_transform_stmt (stmt_info, &si, node, instance);
3889 gimple_assign_set_rhs_code (stmt, code0);
3890 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3891 SLP_TREE_VEC_STMTS (node).truncate (0);
3892 tree meltype = build_nonstandard_integer_type
3893 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3894 tree mvectype = get_same_sized_vectype (meltype, vectype);
3895 unsigned k = 0, l;
3896 for (j = 0; j < v0.length (); ++j)
3898 /* Enforced by vect_build_slp_tree, which rejects variable-length
3899 vectors for SLP_TREE_TWO_OPERATORS. */
3900 unsigned int const_nunits = nunits.to_constant ();
3901 tree_vector_builder melts (mvectype, const_nunits, 1);
3902 for (l = 0; l < const_nunits; ++l)
3904 if (k >= group_size)
3905 k = 0;
3906 tree t = build_int_cst (meltype,
3907 mask[k++] * const_nunits + l);
3908 melts.quick_push (t);
3910 tmask = melts.build ();
3912 /* ??? Not all targets support a VEC_PERM_EXPR with a
3913 constant mask that would translate to a vec_merge RTX
3914 (with their vec_perm_const_ok). We can either not
3915 vectorize in that case or let veclower do its job.
3916 Unfortunately that isn't too great and at least for
3917 plus/minus we'd eventually like to match targets'
3918 vector addsub instructions. */
3919 gimple *vstmt;
3920 vstmt = gimple_build_assign (make_ssa_name (vectype),
3921 VEC_PERM_EXPR,
3922 gimple_assign_lhs (v0[j]->stmt),
3923 gimple_assign_lhs (v1[j]->stmt),
3924 tmask);
3925 SLP_TREE_VEC_STMTS (node).quick_push
3926 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
3928 v0.release ();
3929 v1.release ();
3930 return;
3933 vect_transform_stmt (stmt_info, &si, node, instance);
3935 /* Restore stmt def-types. */
3936 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3937 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3939 stmt_vec_info child_stmt_info;
3940 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3941 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
3945 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3946 For loop vectorization this is done in vectorizable_call, but for SLP
3947 it needs to be deferred until end of vect_schedule_slp, because multiple
3948 SLP instances may refer to the same scalar stmt. */
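/* E.g. a fully vectorized scalar call "x_1 = foo (a_2);" is rewritten
   below to "x_1 = 0;" (a zero of the lhs type); that dead assignment is
   trivial for later DCE to delete, whereas the original call might not
   be removable if foo is not known to be pure.  */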
3950 static void
3951 vect_remove_slp_scalar_calls (slp_tree node)
3953 gimple *new_stmt;
3954 gimple_stmt_iterator gsi;
3955 int i;
3956 slp_tree child;
3957 tree lhs;
3958 stmt_vec_info stmt_info;
3960 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3961 return;
3963 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3964 vect_remove_slp_scalar_calls (child);
3966 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3968 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
3969 if (!stmt || gimple_bb (stmt) == NULL)
3970 continue;
3971 if (is_pattern_stmt_p (stmt_info)
3972 || !PURE_SLP_STMT (stmt_info))
3973 continue;
3974 lhs = gimple_call_lhs (stmt);
3975 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3976 gsi = gsi_for_stmt (stmt);
3977 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
3978 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3982 /* Generate vector code for all SLP instances in the loop/basic block. */
3984 void
3985 vect_schedule_slp (vec_info *vinfo)
3987 vec<slp_instance> slp_instances;
3988 slp_instance instance;
3989 unsigned int i;
3991 scalar_stmts_to_slp_tree_map_t *bst_map
3992 = new scalar_stmts_to_slp_tree_map_t ();
3993 slp_instances = vinfo->slp_instances;
3994 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3996 /* Schedule the tree of INSTANCE. */
3997 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3998 instance, bst_map);
3999 if (dump_enabled_p ())
4000 dump_printf_loc (MSG_NOTE, vect_location,
4001 "vectorizing stmts using SLP.\n");
4003 delete bst_map;
4005 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4007 slp_tree root = SLP_INSTANCE_TREE (instance);
4008 stmt_vec_info store_info;
4009 unsigned int j;
4011 /* Remove scalar call stmts. Do not do this for basic-block
4012 vectorization as not all uses may be vectorized.
4013 ??? Why should this be necessary? DCE should be able to
4014 remove the stmts itself.
4015 ??? For BB vectorization we can as well remove scalar
4016 stmts starting from the SLP tree root if they have no
4017 uses. */
4018 if (is_a <loop_vec_info> (vinfo))
4019 vect_remove_slp_scalar_calls (root);
4021 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4022 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4024 if (!STMT_VINFO_DATA_REF (store_info))
4025 break;
4027 store_info = vect_orig_stmt (store_info);
4028 /* Free the attached stmt_vec_info and remove the stmt. */
4029 vinfo->remove_stmt (store_info);