[06/46] Add vec_info::add_stmt
gcc/tree-vect-slp.c
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      gimple *stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
	  STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
	}
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
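/* Since the DR_GROUP_GAP of each chain member counts its distance in
   scalars from the previous member, the accumulated result below is
   the scalar offset of STMT from FIRST_STMT.  */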
int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
	return result;
      next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
	result += DR_GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
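/* The loop below searches for a suitable vector type: it starts with
   elements of COUNT * GET_MODE_SIZE (ELT_MODE) bytes, so that each
   vector element holds one whole group, and repeatedly halves the
   element size (doubling the number of intermediate vectors) until it
   finds an integer element mode for which the two interleaving
   permutations are supported by the target.  */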
bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero *SWAP
   indicates that a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of the comparison needs
   to be inverted.  If any operands are swapped in this function, *SWAP is
   set to a non-zero value.

   If there was a fatal error, return -1; if the error could be corrected by
   swapping operands of the father node of this one, return 1; if everything
   is OK, return 0.  */
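/* Note for cond_exprs the comparison is embedded in gimple_assign_rhs1,
   so the operand walk below goes through an index map: *SWAP selects
   between the unswapped operand order, the order with the two comparison
   operands exchanged, and the order with the two arms exchanged.  */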
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<gimple *> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could end up with a different comparison code from the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't analyze def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
	 from the pattern.  Check that all the stmts of the node are in the
	 pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
	  && vect_stmt_in_region_p (vinfo, def_stmt)
	  && vinfo_for_stmt (def_stmt)
	  && is_pattern_stmt_p (vinfo_for_stmt (def_stmt)))
	{
	  pattern = true;
	  if (!first && !oprnd_info->first_pattern
	      /* Allow different pattern state for the defs of the
		 first stmt in reduction chains.  */
	      && (oprnd_info->first_dt != vect_reduction_def
		  || (!second && !oprnd_info->second_pattern)))
	    {
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return 1;
	    }

	  dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

	  if (dt == vect_unknown_def_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
	      return -1;
	    }

	  switch (gimple_code (def_stmt))
	    {
	    case GIMPLE_PHI:
	    case GIMPLE_ASSIGN:
	      break;

	    default:
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "unsupported defining stmt:\n");
	      return -1;
	    }
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: invalid type of def "
				   "for variable-length SLP ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
	 we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: cannot swap operands of "
			       "shared stmt ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  return -1;
	}

      if (first_op_cond)
	{
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
			   gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "swapped operands to match def types in ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }

  *swap = swapped;
  return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for VINFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
   as for vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type in ");
	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: unrolling required "
		       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */
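/* For example, with a group of alternating PLUS_EXPR/MINUS_EXPR stmts
   and an 8-element vector the mask built below is
   { 0, 9, 2, 11, 4, 13, 6, 15 }, selecting even elements from the
   first-operation vector and odd elements from the ALT_STMT_CODE
   vector.  */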
static bool
vect_two_operations_perm_ok_p (vec<gimple *> stmts, unsigned int group_size,
			       tree vectype, tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
	elt += count;
      sel.quick_push (elt);
    }

  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of the comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
					   &nunits_vectype)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if ((gimple_call_internal_p (call_stmt)
	       && (!vectorizable_internal_fn_p
		   (gimple_call_internal_fn (call_stmt))))
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    call_stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      if (vectype == boolean_type_node)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: shift of a"
				     " boolean.\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}

	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    first_stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      gimple *first_stmt = stmts[0];
	      if (!compatible_calls_p (as_a <gcall *> (first_stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
			  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
			}
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
	  || !vect_two_operations_perm_ok_p (stmts, group_size,
					     vectype, alt_stmt_code))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[i], 0);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      first_stmt, 0);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
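/* The traits hash a stmt vector by the gimple_uid of each stmt and
   compare element-wise, so stmt UIDs have to stay stable for as long
   as a set keyed this way is live.  */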
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
static scalar_stmts_set_t *bst_fail;

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size);
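/* Memoizing wrapper around vect_build_slp_tree_2.  Consult the set of
   stmt sets for which the SLP build is already known to fail before
   recursing, and record new failures afterwards (see the PR81723
   comment below).  */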
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<gimple *> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
					loads, matches, npermutes, tree_size,
					max_tree_size);
  /* When SLP build fails for stmts record this, otherwise SLP build
     can be exponential in time when we allow to construct parts from
     scalars, see PR81723.  */
  if (! res)
    {
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP tree; *MATCHES indicates which stmts, if any,
   failed to match the first one.  */
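/* The recursion terminates at PHI nodes and at grouped loads; for all
   other stmts the operand def-stmts are collected with
   vect_get_and_check_slp_defs and recursed on, possibly retrying with
   swapped operands of commutative stmts.  */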
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    if (stmt != stmts[0])
	      return NULL;
	}
      else
	{
	  /* Else def types have to match.  */
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
		  && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
		continue;
	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
		return NULL;
	    }
	}
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	continue;

      if (++this_tree_size > max_tree_size)
	{
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);
	  vect_free_oprnd_info (oprnds_info);
	  return NULL;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					&this_loads, matches, npermutes,
					&this_tree_size,
					max_tree_size)) != NULL)
	{
	  /* If we have all children of child built up from scalars then just
	     throw that away and build it up this node from scalars.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p
		    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_loads.truncate (old_nloads);
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild, false);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  dump_printf_loc (MSG_NOTE, vect_location,
				   "Building parent vector operands from "
				   "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Building vector operands from scalars\n");
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  /* Do so only if the number of unsuccessful permutes was
	     no more than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  gimple *stmt = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  if (!is_gimple_assign (stmt)
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		  /* Verify if we can safely swap or if we committed to a
		     specific operand order already.
		     ??? Instead of modifying GIMPLE stmts here we could
		     record whether we want to swap operands in the SLP
		     node and temporarily do that when processing it
		     (or wrap operand accessors in a helper).  */
		  else if (swap[j] != 0
			   || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
		    {
		      if (!swap_not_matching)
			{
			  if (dump_enabled_p ())
			    {
			      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					       vect_location,
					       "Build SLP failed: cannot swap "
					       "operands of shared stmt ");
			      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
						TDF_SLIM, stmts[j], 0);
			    }
			  goto fail;
			}
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		dump_printf (MSG_NOTE, "%d ", j);
	      }
	  dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    &this_loads, tem, npermutes,
					    &this_tree_size,
					    max_tree_size)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  gimple_set_plf (stmt, GF_PLF_1, false);
		}
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  if (matches[j] == !swap_not_matching)
		    {
		      /* Avoid swapping operands twice.  */
		      if (gimple_plf (stmt, GF_PLF_1))
			continue;
		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
					 gimple_assign_rhs2_ptr (stmt));
		      gimple_set_plf (stmt, GF_PLF_1, true);
		    }
		}
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  {
		    gimple *stmt = stmts[j];
		    gcc_assert (gimple_plf (stmt, GF_PLF_1)
				== (matches[j] == !swap_not_matching));
		  }

	      /* If we have all children of child built up from scalars then
		 just throw that away and build it up this node from scalars.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p
			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_loads.truncate (old_nloads);
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild, false);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      dump_printf_loc (MSG_NOTE, vect_location,
				       "Building parent vector operands from "
				       "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}
/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */
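/* For example, if every load node uses the same permutation { 2, 0, 3, 1 }
   and it covers the whole group without gaps or duplicates, the scalar
   stmts of all nodes can be reordered by that permutation and the load
   permutations dropped.  */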
static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
	return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	if (lidx != node->load_permutation[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
	      && DR_GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
	 no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    continue;
	  bool subchain_p = true;
	  next_load = NULL;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
	    {
	      if (j != 0
		  && (next_load != load
		      || DR_GROUP_GAP (vinfo_for_stmt (load)) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
	    }
	  if (subchain_p)
	    SLP_TREE_LOAD_PERMUTATION (node).release ();
	  else
	    {
	      stmt_vec_info group_info
		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
	      group_info = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (group_info));
	      unsigned HOST_WIDE_INT nunits;
	      unsigned k, maxk = 0;
	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
		if (k > maxk)
		  maxk = k;
	      /* In BB vectorization we may not actually use a loaded vector
		 accessing elements in excess of DR_GROUP_SIZE.  */
	      tree vectype = STMT_VINFO_VECTYPE (group_info);
	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
		  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "BB vectorization with gaps at the end of "
				   "a load is not supported\n");
		  return false;
		}

	      /* Verify the permutation can be generated.  */
	      vec<tree> tem;
	      unsigned n_perms;
	      if (!vect_transform_slp_perm_load (node, tem, NULL,
						 1, slp_instn, true, &n_perms))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported load permutation\n");
		  return false;
		}
	    }
	}
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
			     LOOP_VINFO_VECT_FACTOR
			     (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
					  slp_instn, true, &n_perms))
      return false;

  return true;
}
/* Find the last store in SLP INSTANCE.  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
	last = get_later_stmt (stmt, last);
    }

  return last;
}
/* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
   one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
   the first GROUP1_SIZE stmts, since stores are consecutive), the second
   containing the remainder.
   Return the first stmt in the second group.  */
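/* For example, splitting a group of 8 contiguous stores with no initial
   gap into 6 + 2 gives the second group a DR_GROUP_GAP of 6 (the stores
   of the first group it has to skip) and raises the first group's gap
   to 2.  */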
static gimple *
vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
{
  stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  gimple *stmt = first_stmt;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      gcc_assert (DR_GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
    }
  /* STMT is now the last element of the first group.  */
  gimple *group2 = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
  DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;

  DR_GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
  for (stmt = group2; stmt; stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
    {
      DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
      gcc_assert (DR_GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (vinfo_for_stmt (group2))
    = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
1915 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1916 statements and a vector of NUNITS elements. */
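/* A worked example with assumed values: NUNITS == 4 and GROUP_SIZE == 6
   give lcm (4, 6) / 6 == 2, i.e. two copies of the six-element group
   exactly fill three four-element vectors.  */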
1918 static poly_uint64
1919 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1921 return exact_div (common_multiple (nunits, group_size), group_size);
1924 /* Analyze an SLP instance starting from a group of grouped stores. Call
1925 vect_build_slp_tree to build a tree of packed stmts if possible.
1926 Return FALSE if it's impossible to SLP any stmt in the loop. */
1928 static bool
1929 vect_analyze_slp_instance (vec_info *vinfo,
1930 gimple *stmt, unsigned max_tree_size)
1932 slp_instance new_instance;
1933 slp_tree node;
1934 unsigned int group_size;
1935 tree vectype, scalar_type = NULL_TREE;
1936 gimple *next;
1937 unsigned int i;
1938 vec<slp_tree> loads;
1939 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1940 vec<gimple *> scalar_stmts;
1942 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1944 scalar_type = TREE_TYPE (DR_REF (dr));
1945 vectype = get_vectype_for_scalar_type (scalar_type);
1946 group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
1948 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1950 gcc_assert (is_a <loop_vec_info> (vinfo));
1951 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1952 group_size = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
1954 else
1956 gcc_assert (is_a <loop_vec_info> (vinfo));
1957 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1958 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1961 if (!vectype)
1963 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1966 "Build SLP failed: unsupported data-type ");
1967 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1968 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1971 return false;
1973 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1975 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1976 scalar_stmts.create (group_size);
1977 next = stmt;
1978 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1980 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1981 while (next)
1983 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1984 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1985 scalar_stmts.safe_push (
1986 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1987 else
1988 scalar_stmts.safe_push (next);
1989 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1992 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1994 /* Collect the reduction stmts and store them in
1995 SLP_TREE_SCALAR_STMTS. */
1996 while (next)
1998 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1999 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2000 scalar_stmts.safe_push (
2001 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2002 else
2003 scalar_stmts.safe_push (next);
2004 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2006 /* Mark the first element of the reduction chain as reduction to properly
2007 transform the node. In the reduction analysis phase only the last
2008 element of the chain is marked as reduction. */
2009 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2011 else
2013 /* Collect reduction statements. */
2014 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2015 for (i = 0; reductions.iterate (i, &next); i++)
2016 scalar_stmts.safe_push (next);
2019 loads.create (group_size);
2021 /* Build the tree for the SLP instance. */
2022 bool *matches = XALLOCAVEC (bool, group_size);
2023 unsigned npermutes = 0;
2024 bst_fail = new scalar_stmts_set_t ();
2025 poly_uint64 max_nunits = nunits;
2026 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2027 &max_nunits, &loads, matches, &npermutes,
2028 NULL, max_tree_size);
2029 delete bst_fail;
2030 if (node != NULL)
2032 /* Calculate the unrolling factor based on the smallest type. */
2033 poly_uint64 unrolling_factor
2034 = calculate_unrolling_factor (max_nunits, group_size);
2036 if (maybe_ne (unrolling_factor, 1U)
2037 && is_a <bb_vec_info> (vinfo))
2039 unsigned HOST_WIDE_INT const_max_nunits;
2040 if (!max_nunits.is_constant (&const_max_nunits)
2041 || const_max_nunits > group_size)
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2045 "Build SLP failed: store group "
2046 "size not a multiple of the vector size "
2047 "in basic block SLP\n");
2048 vect_free_slp_tree (node, false);
2049 loads.release ();
2050 return false;
2052 /* Fatal mismatch. */
2053 matches[group_size / const_max_nunits * const_max_nunits] = false;
2054 vect_free_slp_tree (node, false);
2055 loads.release ();
2057 else
2059 /* Create a new SLP instance. */
2060 new_instance = XNEW (struct _slp_instance);
2061 SLP_INSTANCE_TREE (new_instance) = node;
2062 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2063 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2064 SLP_INSTANCE_LOADS (new_instance) = loads;
2066 /* Compute the load permutation. */
2067 slp_tree load_node;
2068 bool loads_permuted = false;
2069 FOR_EACH_VEC_ELT (loads, i, load_node)
2071 vec<unsigned> load_permutation;
2072 int j;
2073 gimple *load, *first_stmt;
2074 bool this_load_permuted = false;
2075 load_permutation.create (group_size);
2076 first_stmt = DR_GROUP_FIRST_ELEMENT
2077 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2078 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2080 int load_place = vect_get_place_in_interleaving_chain
2081 (load, first_stmt);
2082 gcc_assert (load_place != -1);
2083 if (load_place != j)
2084 this_load_permuted = true;
2085 load_permutation.safe_push (load_place);
2087 if (!this_load_permuted
2088 /* The load requires permutation when unrolling exposes
2089 a gap either because the group is larger than the SLP
2090 group-size or because there is a gap between the groups. */
2091 && (known_eq (unrolling_factor, 1U)
2092 || (group_size == DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
2093 && DR_GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2095 load_permutation.release ();
2096 continue;
2098 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2099 loads_permuted = true;
2102 if (loads_permuted)
2104 if (!vect_supported_load_permutation_p (new_instance))
2106 if (dump_enabled_p ())
2108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2109 "Build SLP failed: unsupported load "
2110 "permutation ");
2111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2112 TDF_SLIM, stmt, 0);
2114 vect_free_slp_instance (new_instance, false);
2115 return false;
2119 /* If the loads and stores can be handled with load/store-lanes
2120 instructions do not generate this SLP instance. */
2121 if (is_a <loop_vec_info> (vinfo)
2122 && loads_permuted
2123 && dr && vect_store_lanes_supported (vectype, group_size, false))
2125 slp_tree load_node;
2126 FOR_EACH_VEC_ELT (loads, i, load_node)
2128 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT
2129 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2130 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2131 /* Use SLP for strided accesses (or if we
2132 can't load-lanes). */
2133 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2134 || ! vect_load_lanes_supported
2135 (STMT_VINFO_VECTYPE (stmt_vinfo),
2136 DR_GROUP_SIZE (stmt_vinfo), false))
2137 break;
2139 if (i == loads.length ())
2141 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143 "Built SLP cancelled: can use "
2144 "load/store-lanes\n");
2145 vect_free_slp_instance (new_instance, false);
2146 return false;
2150 vinfo->slp_instances.safe_push (new_instance);
2152 if (dump_enabled_p ())
2154 dump_printf_loc (MSG_NOTE, vect_location,
2155 "Final SLP tree for instance:\n");
2156 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2159 return true;
2162 else
2164 /* Failed to SLP. */
2165 /* Free the allocated memory. */
2166 scalar_stmts.release ();
2167 loads.release ();
2170 /* For basic block SLP, try to break the group up into multiples of the
2171 vector size. */
2172 unsigned HOST_WIDE_INT const_nunits;
2173 if (is_a <bb_vec_info> (vinfo)
2174 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2175 && DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2176 && nunits.is_constant (&const_nunits))
2178 /* We consider breaking the group only on VF boundaries from the existing
2179 start. */
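/* A sketch with assumed values: CONST_NUNITS == 4, GROUP_SIZE == 9 and
   a first mismatch at i == 6 split off {0..3} (6 & ~3 == 4), skip the
   vector {4..7} containing the mismatch and retry {8} on its own.  */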
2180 for (i = 0; i < group_size; i++)
2181 if (!matches[i]) break;
2183 if (i >= const_nunits && i < group_size)
2185 /* Split into two groups at the first vector boundary before i. */
2186 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2187 unsigned group1_size = i & ~(const_nunits - 1);
2189 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2190 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2191 /* If the first non-match was in the middle of a vector,
2192 skip the rest of that vector. */
2193 if (group1_size < i)
2195 i = group1_size + const_nunits;
2196 if (i < group_size)
2197 rest = vect_split_slp_store_group (rest, const_nunits);
2199 if (i < group_size)
2200 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2201 return res;
2203 /* Even though the first vector did not all match, we might be able to SLP
2204 (some) of the remainder. FORNOW ignore this possibility. */
2207 return false;
2211 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2212 trees of packed scalar stmts if SLP is possible. */
2214 bool
2215 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2217 unsigned int i;
2218 gimple *first_element;
2220 DUMP_VECT_SCOPE ("vect_analyze_slp");
2222 /* Find SLP sequences starting from groups of grouped stores. */
2223 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2224 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2226 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2228 if (loop_vinfo->reduction_chains.length () > 0)
2230 /* Find SLP sequences starting from reduction chains. */
2231 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2232 if (! vect_analyze_slp_instance (vinfo, first_element,
2233 max_tree_size))
2235 /* Dissolve reduction chain group. */
2236 gimple *next, *stmt = first_element;
2237 while (stmt)
2239 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2240 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2241 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2242 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2243 stmt = next;
2245 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2246 = vect_internal_def;
2250 /* Find SLP sequences starting from groups of reductions. */
2251 if (loop_vinfo->reductions.length () > 1)
2252 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2253 max_tree_size);
2256 return true;
2260 /* For each possible SLP instance decide whether to SLP it and calculate the overall
2261 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2262 least one instance. */
2264 bool
2265 vect_make_slp_decision (loop_vec_info loop_vinfo)
2267 unsigned int i;
2268 poly_uint64 unrolling_factor = 1;
2269 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2270 slp_instance instance;
2271 int decided_to_slp = 0;
2273 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2275 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2277 /* FORNOW: SLP if you can. */
2278 /* All unroll factors have the form current_vector_size * X for some
2279 rational X, so they must have a common multiple. */
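/* E.g. (factors assumed): instances with unrolling factors 2 and 4
   yield an overall loop unrolling factor of lcm (2, 4) == 4.  */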
2280 unrolling_factor
2281 = force_common_multiple (unrolling_factor,
2282 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2284 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2285 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2286 loop-based vectorization. Such stmts will be marked as HYBRID. */
2287 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2288 decided_to_slp++;
2291 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2293 if (decided_to_slp && dump_enabled_p ())
2295 dump_printf_loc (MSG_NOTE, vect_location,
2296 "Decided to SLP %d instances. Unrolling factor ",
2297 decided_to_slp);
2298 dump_dec (MSG_NOTE, unrolling_factor);
2299 dump_printf (MSG_NOTE, "\n");
2302 return (decided_to_slp > 0);
2306 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2307 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
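/* A sketch of the situation (statements assumed):

       t = b[i] * 2;     <-- in the SLP tree, feeds the store
       a[i] = t;         <-- SLP store group
       sum += t;         <-- non-SLP reduction also using t

   The multiplication is pure SLP but its def has a use in a non-SLP
   stmt, so it must be marked hybrid and also vectorized by the
   loop-based scheme.  */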
2309 static void
2310 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2312 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2313 imm_use_iterator imm_iter;
2314 gimple *use_stmt;
2315 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2316 slp_tree child;
2317 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2318 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2319 int j;
2321 /* Propagate hybrid down the SLP tree. */
2322 if (stype == hybrid)
2324 else if (HYBRID_SLP_STMT (stmt_vinfo))
2325 stype = hybrid;
2326 else
2328 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2329 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2330 /* If we get a pattern stmt here we have to use the LHS of the
2331 original stmt for immediate uses. */
2332 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2333 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2334 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2335 tree def;
2336 if (gimple_code (stmt) == GIMPLE_PHI)
2337 def = gimple_phi_result (stmt);
2338 else
2339 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2340 if (def)
2341 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2343 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2344 continue;
2345 use_vinfo = vinfo_for_stmt (use_stmt);
2346 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2347 && STMT_VINFO_RELATED_STMT (use_vinfo))
2348 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2349 if (!STMT_SLP_TYPE (use_vinfo)
2350 && (STMT_VINFO_RELEVANT (use_vinfo)
2351 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2352 && !(gimple_code (use_stmt) == GIMPLE_PHI
2353 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2355 if (dump_enabled_p ())
2357 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2358 "def in non-SLP stmt: ");
2359 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2361 stype = hybrid;
2366 if (stype == hybrid
2367 && !HYBRID_SLP_STMT (stmt_vinfo))
2369 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2372 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2374 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2377 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2378 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2379 vect_detect_hybrid_slp_stmts (child, i, stype);
2382 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2384 static tree
2385 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2387 walk_stmt_info *wi = (walk_stmt_info *)data;
2388 struct loop *loopp = (struct loop *)wi->info;
2390 if (wi->is_lhs)
2391 return NULL_TREE;
2393 if (TREE_CODE (*tp) == SSA_NAME
2394 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2396 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2397 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2398 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2400 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2403 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2405 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2409 return NULL_TREE;
2412 static tree
2413 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2414 walk_stmt_info *)
2416 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2417 /* If the stmt is in a SLP instance then this isn't a reason
2418 to mark use definitions in other SLP instances as hybrid. */
2419 if (! STMT_SLP_TYPE (use_vinfo)
2420 && (STMT_VINFO_RELEVANT (use_vinfo)
2421 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2422 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2423 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2425 else
2426 *handled = true;
2427 return NULL_TREE;
2430 /* Find stmts that must be both vectorized and SLPed. */
2432 void
2433 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2435 unsigned int i;
2436 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2437 slp_instance instance;
2439 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2441 /* First walk all pattern stmts in the loop and mark defs of uses as
2442 hybrid because immediate uses in them are not recorded. */
2443 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2445 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2446 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2447 gsi_next (&gsi))
2449 gimple *stmt = gsi_stmt (gsi);
2450 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2451 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2453 walk_stmt_info wi;
2454 memset (&wi, 0, sizeof (wi));
2455 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2456 gimple_stmt_iterator gsi2
2457 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2458 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2459 vect_detect_hybrid_slp_1, &wi);
2460 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2461 vect_detect_hybrid_slp_2,
2462 vect_detect_hybrid_slp_1, &wi);
2467 /* Then walk the SLP instance trees marking stmts with uses in
2468 non-SLP stmts as hybrid, also propagating hybrid down the
2469 SLP tree, collecting the above info on-the-fly. */
2470 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2472 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2473 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2474 i, pure_slp);
2479 /* Initialize a bb_vec_info struct for the statements between
2480 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2482 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2483 gimple_stmt_iterator region_end_in,
2484 vec_info_shared *shared)
2485 : vec_info (vec_info::bb, init_cost (NULL), shared),
2486 bb (gsi_bb (region_begin_in)),
2487 region_begin (region_begin_in),
2488 region_end (region_end_in)
2490 gimple_stmt_iterator gsi;
2492 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2493 gsi_next (&gsi))
2495 gimple *stmt = gsi_stmt (gsi);
2496 gimple_set_uid (stmt, 0);
2497 add_stmt (stmt);
2500 bb->aux = this;
2504 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2505 stmts in the basic block. */
2507 _bb_vec_info::~_bb_vec_info ()
2509 for (gimple_stmt_iterator si = region_begin;
2510 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2512 gimple *stmt = gsi_stmt (si);
2513 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2515 if (stmt_info)
2516 /* Free stmt_vec_info. */
2517 free_stmt_vec_info (stmt);
2519 /* Reset region marker. */
2520 gimple_set_uid (stmt, -1);
2523 bb->aux = NULL;
2526 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2527 given that its child nodes have already been processed, and that
2528 their def types currently match their SLP node's def type. */
2530 static bool
2531 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2532 slp_instance node_instance,
2533 stmt_vector_for_cost *cost_vec)
2535 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537 gcc_assert (stmt_info);
2538 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2540 /* For BB vectorization vector types are assigned here.
2541 Memory accesses already got their vector type assigned
2542 in vect_analyze_data_refs. */
2543 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2544 if (bb_vinfo
2545 && ! STMT_VINFO_DATA_REF (stmt_info))
2547 tree vectype, nunits_vectype;
2548 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2549 &nunits_vectype))
2550 /* We checked this when building the node. */
2551 gcc_unreachable ();
2552 if (vectype == boolean_type_node)
2554 vectype = vect_get_mask_type_for_stmt (stmt_info);
2555 if (!vectype)
2556 /* vect_get_mask_type_for_stmt has already explained the
2557 failure. */
2558 return false;
2561 gimple *sstmt;
2562 unsigned int i;
2563 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2564 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2567 /* Calculate the number of vector statements to be created for the
2568 scalar stmts in this node. For SLP reductions it is equal to the
2569 number of vector statements in the children (which has already been
2570 calculated by the recursive call). Otherwise it is the number of
2571 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2572 VF divided by the number of elements in a vector. */
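/* For instance (values assumed): a group of 4 stores with VF == 4 and
   8-element vectors needs 4 * 4 / 8 == 2 vector statements.  */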
2573 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2574 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2575 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2576 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2577 else
2579 poly_uint64 vf;
2580 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2581 vf = loop_vinfo->vectorization_factor;
2582 else
2583 vf = 1;
2584 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2585 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2586 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2587 = vect_get_num_vectors (vf * group_size, vectype);
2590 bool dummy;
2591 return vect_analyze_stmt (stmt, &dummy, node, node_instance, cost_vec);
2594 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2595 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2597 Return true if the operations are supported. */
2599 static bool
2600 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2601 slp_instance node_instance,
2602 scalar_stmts_to_slp_tree_map_t *visited,
2603 scalar_stmts_to_slp_tree_map_t *lvisited,
2604 stmt_vector_for_cost *cost_vec)
2606 int i, j;
2607 slp_tree child;
2609 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2610 return true;
2612 /* If we already analyzed the exact same set of scalar stmts we're done.
2613 We share the generated vector stmts for those. */
2614 slp_tree *leader;
2615 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2616 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2618 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2619 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2620 return true;
2623 /* The SLP graph is acyclic, so not caching whether we failed or succeeded
2624 does not cause any issue since we throw away the lvisited set
2625 when we fail. */
2626 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2628 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2629 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2630 visited, lvisited, cost_vec))
2631 return false;
2633 /* Push SLP node def-type to stmt operands. */
2634 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2635 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2636 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2637 = SLP_TREE_DEF_TYPE (child);
2638 bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2639 cost_vec);
2640 /* Restore def-types. */
2641 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2642 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2643 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2644 = vect_internal_def;
2645 if (! res)
2646 return false;
2648 return true;
2652 /* Analyze statements in SLP instances of VINFO. Return true if the
2653 operations are supported. */
2655 bool
2656 vect_slp_analyze_operations (vec_info *vinfo)
2658 slp_instance instance;
2659 int i;
2661 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2663 scalar_stmts_to_slp_tree_map_t *visited
2664 = new scalar_stmts_to_slp_tree_map_t ();
2665 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2667 scalar_stmts_to_slp_tree_map_t lvisited;
2668 stmt_vector_for_cost cost_vec;
2669 cost_vec.create (2);
2670 if (!vect_slp_analyze_node_operations (vinfo,
2671 SLP_INSTANCE_TREE (instance),
2672 instance, visited, &lvisited,
2673 &cost_vec))
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "removing SLP instance operations starting from: ");
2677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2678 SLP_TREE_SCALAR_STMTS
2679 (SLP_INSTANCE_TREE (instance))[0], 0);
2680 vect_free_slp_instance (instance, false);
2681 vinfo->slp_instances.ordered_remove (i);
2682 cost_vec.release ();
2684 else
2686 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2687 x != lvisited.end(); ++x)
2688 visited->put ((*x).first.copy (), (*x).second);
2689 i++;
2691 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2692 cost_vec.release ();
2695 delete visited;
2697 return !vinfo->slp_instances.is_empty ();
2701 /* Compute the scalar cost of the SLP node NODE and its children
2702 and record it in COST_VEC. Do not account defs that are marked in LIFE and
2703 update LIFE according to uses of NODE. */
2705 static void
2706 vect_bb_slp_scalar_cost (basic_block bb,
2707 slp_tree node, vec<bool, va_heap> *life,
2708 stmt_vector_for_cost *cost_vec)
2710 unsigned i;
2711 gimple *stmt;
2712 slp_tree child;
2714 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2716 ssa_op_iter op_iter;
2717 def_operand_p def_p;
2718 stmt_vec_info stmt_info;
2720 if ((*life)[i])
2721 continue;
2723 /* If there is a non-vectorized use of the defs then the scalar
2724 stmt is kept live in which case we do not account it or any
2725 required defs in the SLP children in the scalar cost. This
2726 way we make the vectorization more costly when compared to
2727 the scalar cost. */
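/* E.g. (use assumed): if t = b[0] + b[1] also feeds a stmt outside
   the vectorized region, t's definition stays live and is left out of
   the scalar cost, making vectorization look comparatively more
   expensive.  */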
2728 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2730 imm_use_iterator use_iter;
2731 gimple *use_stmt;
2732 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2733 if (!is_gimple_debug (use_stmt)
2734 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2735 use_stmt)
2736 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2738 (*life)[i] = true;
2739 BREAK_FROM_IMM_USE_STMT (use_iter);
2742 if ((*life)[i])
2743 continue;
2745 /* Count scalar stmts only once. */
2746 if (gimple_visited_p (stmt))
2747 continue;
2748 gimple_set_visited (stmt, true);
2750 stmt_info = vinfo_for_stmt (stmt);
2751 vect_cost_for_stmt kind;
2752 if (STMT_VINFO_DATA_REF (stmt_info))
2754 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2755 kind = scalar_load;
2756 else
2757 kind = scalar_store;
2759 else
2760 kind = scalar_stmt;
2761 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2764 auto_vec<bool, 20> subtree_life;
2765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2767 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2769 /* Do not directly pass LIFE to the recursive call, copy it to
2770 confine changes in the callee to the current child/subtree. */
2771 subtree_life.safe_splice (*life);
2772 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec);
2773 subtree_life.truncate (0);
2778 /* Check if vectorization of the basic block is profitable. */
2780 static bool
2781 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2783 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2784 slp_instance instance;
2785 int i;
2786 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2787 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2789 /* Calculate scalar cost. */
2790 stmt_vector_for_cost scalar_costs;
2791 scalar_costs.create (0);
2792 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2794 auto_vec<bool, 20> life;
2795 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2796 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2797 SLP_INSTANCE_TREE (instance),
2798 &life, &scalar_costs);
2800 void *target_cost_data = init_cost (NULL);
2801 add_stmt_costs (target_cost_data, &scalar_costs);
2802 scalar_costs.release ();
2803 unsigned dummy;
2804 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2805 destroy_cost_data (target_cost_data);
2807 /* Unset visited flag. */
2808 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2809 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2810 gimple_set_visited (gsi_stmt (gsi), false);
2812 /* Complete the target-specific cost calculation. */
2813 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2814 &vec_inside_cost, &vec_epilogue_cost);
2816 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2818 if (dump_enabled_p ())
2820 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2821 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2822 vec_inside_cost);
2823 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2824 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2825 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2828 /* Vectorization is profitable if its cost is no more than the cost of the
2829 scalar version. Note that we err on the vector side for equal cost because
2830 the cost estimate is otherwise quite pessimistic (constant uses are
2831 free on the scalar side but cost a load on the vector side for
2832 example). */
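/* Illustrative numbers: an inside cost of 8 plus prologue/epilogue
   costs of 3 is accepted against a scalar cost of 11 (equal costs
   favor the vector side) but rejected against a scalar cost of 10.  */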
2833 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2834 return false;
2836 return true;
2839 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2840 if so and sets fatal to true if failure is independent of
2841 current_vector_size. */
2843 static bb_vec_info
2844 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2845 gimple_stmt_iterator region_end,
2846 vec<data_reference_p> datarefs, int n_stmts,
2847 bool &fatal, vec_info_shared *shared)
2849 bb_vec_info bb_vinfo;
2850 slp_instance instance;
2851 int i;
2852 poly_uint64 min_vf = 2;
2854 /* The first group of checks is independent of the vector size. */
2855 fatal = true;
2857 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2859 if (dump_enabled_p ())
2860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2861 "not vectorized: too many instructions in "
2862 "basic block.\n");
2863 free_data_refs (datarefs);
2864 return NULL;
2867 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2868 if (!bb_vinfo)
2869 return NULL;
2871 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2872 bb_vinfo->shared->save_datarefs ();
2874 /* Analyze the data references. */
2876 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2878 if (dump_enabled_p ())
2879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2880 "not vectorized: unhandled data-ref in basic "
2881 "block.\n");
2883 delete bb_vinfo;
2884 return NULL;
2887 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "not vectorized: not enough data-refs in "
2892 "basic block.\n");
2894 delete bb_vinfo;
2895 return NULL;
2898 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2902 "not vectorized: unhandled data access in "
2903 "basic block.\n");
2905 delete bb_vinfo;
2906 return NULL;
2909 /* If there are no grouped stores in the region there is no need
2910 to continue with pattern recog as vect_analyze_slp will fail
2911 anyway. */
2912 if (bb_vinfo->grouped_stores.is_empty ())
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2916 "not vectorized: no grouped stores in "
2917 "basic block.\n");
2919 delete bb_vinfo;
2920 return NULL;
2923 /* The remaining checks below depend on the vector size in some way, so failures from here on are not fatal. */
2924 fatal = false;
2926 vect_pattern_recog (bb_vinfo);
2928 /* Check the SLP opportunities in the basic block, analyze and build SLP
2929 trees. */
2930 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2932 if (dump_enabled_p ())
2934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2935 "Failed to SLP the basic block.\n");
2936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937 "not vectorized: failed to find SLP opportunities "
2938 "in basic block.\n");
2941 delete bb_vinfo;
2942 return NULL;
2945 vect_record_base_alignments (bb_vinfo);
2947 /* Analyze and verify the alignment of data references and the
2948 dependence in the SLP instances. */
2949 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2951 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2952 || ! vect_slp_analyze_instance_dependence (instance))
2954 dump_printf_loc (MSG_NOTE, vect_location,
2955 "removing SLP instance operations starting from: ");
2956 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2957 SLP_TREE_SCALAR_STMTS
2958 (SLP_INSTANCE_TREE (instance))[0], 0);
2959 vect_free_slp_instance (instance, false);
2960 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2961 continue;
2964 /* Mark all the statements that we want to vectorize as pure SLP and
2965 relevant. */
2966 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2967 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2969 i++;
2971 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2973 delete bb_vinfo;
2974 return NULL;
2977 if (!vect_slp_analyze_operations (bb_vinfo))
2979 if (dump_enabled_p ())
2980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2981 "not vectorized: bad operation in basic block.\n");
2983 delete bb_vinfo;
2984 return NULL;
2987 /* Cost model: check if the vectorization is worthwhile. */
2988 if (!unlimited_cost_model (NULL)
2989 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2991 if (dump_enabled_p ())
2992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2993 "not vectorized: vectorization is not "
2994 "profitable.\n");
2996 delete bb_vinfo;
2997 return NULL;
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_NOTE, vect_location,
3002 "Basic block will be vectorized using SLP\n");
3004 return bb_vinfo;
3008 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3009 true if anything in the basic-block was vectorized. */
3011 bool
3012 vect_slp_bb (basic_block bb)
3014 bb_vec_info bb_vinfo;
3015 gimple_stmt_iterator gsi;
3016 bool any_vectorized = false;
3017 auto_vector_sizes vector_sizes;
3019 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3021 /* Autodetect first vector size we try. */
3022 current_vector_size = 0;
3023 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3024 unsigned int next_size = 0;
3026 gsi = gsi_start_bb (bb);
3028 poly_uint64 autodetected_vector_size = 0;
3029 while (1)
3031 if (gsi_end_p (gsi))
3032 break;
3034 gimple_stmt_iterator region_begin = gsi;
3035 vec<data_reference_p> datarefs = vNULL;
3036 int insns = 0;
3038 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3040 gimple *stmt = gsi_stmt (gsi);
3041 if (is_gimple_debug (stmt))
3042 continue;
3043 insns++;
3045 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3046 vect_location = stmt;
3048 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3049 break;
3052 /* Skip leading unhandled stmts. */
3053 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3055 gsi_next (&gsi);
3056 continue;
3059 gimple_stmt_iterator region_end = gsi;
3061 bool vectorized = false;
3062 bool fatal = false;
3063 vec_info_shared shared;
3064 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3065 datarefs, insns, fatal, &shared);
3066 if (bb_vinfo
3067 && dbg_cnt (vect_slp))
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3072 bb_vinfo->shared->check_datarefs ();
3073 vect_schedule_slp (bb_vinfo);
3075 unsigned HOST_WIDE_INT bytes;
3076 if (current_vector_size.is_constant (&bytes))
3077 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3078 "basic block part vectorized using "
3079 HOST_WIDE_INT_PRINT_UNSIGNED " byte "
3080 "vectors\n", bytes);
3081 else
3082 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3083 "basic block part vectorized using variable "
3084 "length vectors\n");
3086 vectorized = true;
3088 delete bb_vinfo;
3090 any_vectorized |= vectorized;
3092 if (next_size == 0)
3093 autodetected_vector_size = current_vector_size;
3095 if (next_size < vector_sizes.length ()
3096 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3097 next_size += 1;
3099 if (vectorized
3100 || next_size == vector_sizes.length ()
3101 || known_eq (current_vector_size, 0U)
3102 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3103 vector sizes will fail do not bother iterating. */
3104 || fatal)
3106 if (gsi_end_p (region_end))
3107 break;
3109 /* Skip the unhandled stmt. */
3110 gsi_next (&gsi);
3112 /* And reset vector sizes. */
3113 current_vector_size = 0;
3114 next_size = 0;
3116 else
3118 /* Try the next biggest vector size. */
3119 current_vector_size = vector_sizes[next_size++];
3120 if (dump_enabled_p ())
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "***** Re-trying analysis with "
3124 "vector size ");
3125 dump_dec (MSG_NOTE, current_vector_size);
3126 dump_printf (MSG_NOTE, "\n");
3129 /* Start over. */
3130 gsi = region_begin;
3134 return any_vectorized;
3138 /* Return true if the vector type of the boolean constant that is operand
3139 OPNUM of statement STMT is a boolean vector. */
3141 static bool
3142 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3144 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3145 enum tree_code code = gimple_expr_code (stmt);
3146 tree op, vectype;
3147 enum vect_def_type dt;
3149 /* For comparisons and COND_EXPRs the type is chosen depending
3150 on the other comparison operand. */
3151 if (TREE_CODE_CLASS (code) == tcc_comparison)
3153 if (opnum)
3154 op = gimple_assign_rhs1 (stmt);
3155 else
3156 op = gimple_assign_rhs2 (stmt);
3158 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3159 gcc_unreachable ();
3161 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3164 if (code == COND_EXPR)
3166 tree cond = gimple_assign_rhs1 (stmt);
3168 if (TREE_CODE (cond) == SSA_NAME)
3169 op = cond;
3170 else if (opnum)
3171 op = TREE_OPERAND (cond, 1);
3172 else
3173 op = TREE_OPERAND (cond, 0);
3175 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3176 gcc_unreachable ();
3178 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3181 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3184 /* Build a variable-length vector in which the elements in ELTS are repeated
3185 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3186 RESULTS and add any new instructions to SEQ.
3188 The approach we use is:
3190 (1) Find a vector mode VM with integer elements of mode IM.
3192 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3193 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3194 from small vectors to IM.
3196 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3198 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3199 correct byte contents.
3201 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3203 We try to find the largest IM for which this sequence works, in order
3204 to cut down on the number of interleaves. */
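/* A worked sketch (modes assumed): to fill variable-length vectors of
   32-bit elements with { a, b, c, d, a, b, c, d, ... }, pick IM ==
   DImode: view { a, b } and { c, d } as single DImode values, splat
   each into a vector, and interleave the two splats so every final
   vector carries ab cd ab cd ... in the right byte order, then
   VIEW_CONVERT back to the required type.  */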
3206 void
3207 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3208 unsigned int nresults, vec<tree> &results)
3210 unsigned int nelts = elts.length ();
3211 tree element_type = TREE_TYPE (vector_type);
3213 /* (1) Find a vector mode VM with integer elements of mode IM. */
3214 unsigned int nvectors = 1;
3215 tree new_vector_type;
3216 tree permutes[2];
3217 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3218 &nvectors, &new_vector_type,
3219 permutes))
3220 gcc_unreachable ();
3222 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3223 unsigned int partial_nelts = nelts / nvectors;
3224 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3226 tree_vector_builder partial_elts;
3227 auto_vec<tree, 32> pieces (nvectors * 2);
3228 pieces.quick_grow (nvectors * 2);
3229 for (unsigned int i = 0; i < nvectors; ++i)
3231 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3232 ELTS' has mode IM. */
3233 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3234 for (unsigned int j = 0; j < partial_nelts; ++j)
3235 partial_elts.quick_push (elts[i * partial_nelts + j]);
3236 tree t = gimple_build_vector (seq, &partial_elts);
3237 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3238 TREE_TYPE (new_vector_type), t);
3240 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3241 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3244 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3245 correct byte contents.
3247 We need to repeat the following operation log2(nvectors) times:
3249 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3250 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3252 However, if each input repeats every N elements and the VF is
3253 a multiple of N * 2, the HI result is the same as the LO. */
3254 unsigned int in_start = 0;
3255 unsigned int out_start = nvectors;
3256 unsigned int hi_start = nvectors / 2;
3257 /* A bound on the number of outputs needed to produce NRESULTS results
3258 in the final iteration. */
3259 unsigned int noutputs_bound = nvectors * nresults;
3260 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3262 noutputs_bound /= 2;
3263 unsigned int limit = MIN (noutputs_bound, nvectors);
3264 for (unsigned int i = 0; i < limit; ++i)
3266 if ((i & 1) != 0
3267 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3268 2 * in_repeat))
3270 pieces[out_start + i] = pieces[out_start + i - 1];
3271 continue;
3274 tree output = make_ssa_name (new_vector_type);
3275 tree input1 = pieces[in_start + (i / 2)];
3276 tree input2 = pieces[in_start + (i / 2) + hi_start];
3277 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3278 input1, input2,
3279 permutes[i & 1]);
3280 gimple_seq_add_stmt (seq, stmt);
3281 pieces[out_start + i] = output;
3283 std::swap (in_start, out_start);
3286 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3287 results.reserve (nresults);
3288 for (unsigned int i = 0; i < nresults; ++i)
3289 if (i < nvectors)
3290 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3291 pieces[in_start + i]));
3292 else
3293 results.quick_push (results[i - nvectors]);
3297 /* For constant and loop invariant defs of SLP_NODE this function returns
3298 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3299 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3300 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3304 static void
3305 vect_get_constant_vectors (tree op, slp_tree slp_node,
3306 vec<tree> *vec_oprnds,
3307 unsigned int op_num, unsigned int number_of_vectors)
3309 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3310 gimple *stmt = stmts[0];
3311 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3312 unsigned HOST_WIDE_INT nunits;
3313 tree vec_cst;
3314 unsigned j, number_of_places_left_in_vector;
3315 tree vector_type;
3316 tree vop;
3317 int group_size = stmts.length ();
3318 unsigned int vec_num, i;
3319 unsigned number_of_copies = 1;
3320 vec<tree> voprnds;
3321 voprnds.create (number_of_vectors);
3322 bool constant_p, is_store;
3323 tree neutral_op = NULL;
3324 enum tree_code code = gimple_expr_code (stmt);
3325 gimple_seq ctor_seq = NULL;
3326 auto_vec<tree, 16> permute_results;
3328 /* Check if vector type is a boolean vector. */
3329 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3330 && vect_mask_constant_operand_p (stmt, op_num))
3331 vector_type
3332 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3333 else
3334 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3336 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3338 is_store = true;
3339 op = gimple_assign_rhs1 (stmt);
3341 else
3342 is_store = false;
3344 gcc_assert (op);
3346 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3347 created vectors. It is greater than 1 if unrolling is performed.
3349 For example, we have two scalar operands, s1 and s2 (e.g., group of
3350 strided accesses of size two), while NUNITS is four (i.e., four scalars
3351 of this type can be packed in a vector). The output vector will contain
3352 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3353 will be 2).
3355 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3356 containing the operands.
3358 For example, NUNITS is four as before, and the group size is 8
3359 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3360 {s5, s6, s7, s8}. */
3362 /* When using duplicate_and_interleave, we just need one element for
3363 each scalar statement. */
3364 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3365 nunits = group_size;
3367 number_of_copies = nunits * number_of_vectors / group_size;
3369 number_of_places_left_in_vector = nunits;
3370 constant_p = true;
3371 tree_vector_builder elts (vector_type, nunits, 1);
3372 elts.quick_grow (nunits);
3373 bool place_after_defs = false;
3374 for (j = 0; j < number_of_copies; j++)
3376 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3378 if (is_store)
3379 op = gimple_assign_rhs1 (stmt);
3380 else
3382 switch (code)
3384 case COND_EXPR:
3386 tree cond = gimple_assign_rhs1 (stmt);
3387 if (TREE_CODE (cond) == SSA_NAME)
3388 op = gimple_op (stmt, op_num + 1);
3389 else if (op_num == 0 || op_num == 1)
3390 op = TREE_OPERAND (cond, op_num);
3391 else
3393 if (op_num == 2)
3394 op = gimple_assign_rhs2 (stmt);
3395 else
3396 op = gimple_assign_rhs3 (stmt);
3399 break;
3401 case CALL_EXPR:
3402 op = gimple_call_arg (stmt, op_num);
3403 break;
3405 case LSHIFT_EXPR:
3406 case RSHIFT_EXPR:
3407 case LROTATE_EXPR:
3408 case RROTATE_EXPR:
3409 op = gimple_op (stmt, op_num + 1);
3410 /* Unlike the other binary operators, shifts/rotates take
3411 a shift count of type int instead of the same type as
3412 the lhs, so make sure the scalar has the right type if
3413 we are dealing with vectors of
3414 long long/long/short/char. */
3415 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3416 op = fold_convert (TREE_TYPE (vector_type), op);
3417 break;
3419 default:
3420 op = gimple_op (stmt, op_num + 1);
3421 break;
3425 /* Create 'vect_ = {op0,op1,...,opn}'. */
3426 number_of_places_left_in_vector--;
3427 tree orig_op = op;
3428 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3430 if (CONSTANT_CLASS_P (op))
3432 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3434 /* Can't use VIEW_CONVERT_EXPR for booleans because
3435 of possibly different sizes of scalar value and
3436 vector element. */
3437 if (integer_zerop (op))
3438 op = build_int_cst (TREE_TYPE (vector_type), 0);
3439 else if (integer_onep (op))
3440 op = build_all_ones_cst (TREE_TYPE (vector_type));
3441 else
3442 gcc_unreachable ();
3444 else
3445 op = fold_unary (VIEW_CONVERT_EXPR,
3446 TREE_TYPE (vector_type), op);
3447 gcc_assert (op && CONSTANT_CLASS_P (op));
3449 else
3451 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3452 gimple *init_stmt;
3453 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3455 tree true_val
3456 = build_all_ones_cst (TREE_TYPE (vector_type));
3457 tree false_val
3458 = build_zero_cst (TREE_TYPE (vector_type));
3459 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3460 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3461 op, true_val,
3462 false_val);
3464 else
3466 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3467 op);
3468 init_stmt
3469 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3470 op);
3472 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3473 op = new_temp;
3476 elts[number_of_places_left_in_vector] = op;
3477 if (!CONSTANT_CLASS_P (op))
3478 constant_p = false;
3479 if (TREE_CODE (orig_op) == SSA_NAME
3480 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3481 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3482 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3483 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3484 place_after_defs = true;
3486 if (number_of_places_left_in_vector == 0)
3488 if (constant_p
3489 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3490 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3491 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3492 else
3494 if (vec_oprnds->is_empty ())
3495 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3496 number_of_vectors,
3497 permute_results);
3498 vec_cst = permute_results[number_of_vectors - j - 1];
3500 tree init;
3501 gimple_stmt_iterator gsi;
3502 if (place_after_defs)
3504 gsi = gsi_for_stmt
3505 (vect_find_last_scalar_stmt_in_slp (slp_node));
3506 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3508 else
3509 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3510 if (ctor_seq != NULL)
3512 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3513 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3514 ctor_seq = NULL;
3516 voprnds.quick_push (init);
3517 place_after_defs = false;
3518 number_of_places_left_in_vector = nunits;
3519 constant_p = true;
3520 elts.new_vector (vector_type, nunits, 1);
3521 elts.quick_grow (nunits);
3526 /* Since the vectors are created in the reverse order, we should invert
3527 them. */
3528 vec_num = voprnds.length ();
3529 for (j = vec_num; j != 0; j--)
3531 vop = voprnds[j - 1];
3532 vec_oprnds->quick_push (vop);
3535 voprnds.release ();
3537 /* If VF is greater than the unrolling factor needed for the SLP
3538 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3539 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3540 to replicate the vectors. */
3541 while (number_of_vectors > vec_oprnds->length ())
3543 tree neutral_vec = NULL;
3545 if (neutral_op)
3547 if (!neutral_vec)
3548 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3550 vec_oprnds->quick_push (neutral_vec);
3552 else
3554 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3555 vec_oprnds->quick_push (vop);
3561 /* Get vectorized definitions from SLP_NODE that contains corresponding
3562 vectorized def-stmts. */
3564 static void
3565 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3567 tree vec_oprnd;
3568 gimple *vec_def_stmt;
3569 unsigned int i;
3571 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3573 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3575 gcc_assert (vec_def_stmt);
3576 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3577 vec_oprnd = gimple_phi_result (vec_def_stmt);
3578 else
3579 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3580 vec_oprnds->quick_push (vec_oprnd);
3585 /* Get vectorized definitions for SLP_NODE.
3586 If the scalar definitions are loop invariants or constants, collect them and
3587 call vect_get_constant_vectors() to create vector stmts.
3588 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3589 must be stored in the corresponding child of SLP_NODE, and we call
3590 vect_get_slp_vect_defs () to retrieve them. */
3592 void
3593 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3594 vec<vec<tree> > *vec_oprnds)
3596 gimple *first_stmt;
3597 int number_of_vects = 0, i;
3598 unsigned int child_index = 0;
3599 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3600 slp_tree child = NULL;
3601 vec<tree> vec_defs;
3602 tree oprnd;
3603 bool vectorized_defs;
3605 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3606 FOR_EACH_VEC_ELT (ops, i, oprnd)
3608 /* For each operand we check if it has vectorized definitions in a child
3609 node or we need to create them (for invariants and constants). We
3610 check if the LHS of the first stmt of the next child matches OPRND.
3611 If it does, we found the correct child. Otherwise, we call
3612 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
3613 to check this child node again for the next operand. */
3614 vectorized_defs = false;
3615 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3617 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3619 /* We have to check both pattern and original def, if available. */
3620 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3622 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3623 gimple *related
3624 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3625 tree first_def_op;
3627 if (gimple_code (first_def) == GIMPLE_PHI)
3628 first_def_op = gimple_phi_result (first_def);
3629 else
3630 first_def_op = gimple_get_lhs (first_def);
3631 if (operand_equal_p (oprnd, first_def_op, 0)
3632 || (related
3633 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3635 /* The number of vector defs is determined by the number of
3636 vector statements in the node from which we get those
3637 statements. */
3638 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3639 vectorized_defs = true;
3640 child_index++;
3643 else
3644 child_index++;
3647 if (!vectorized_defs)
3649 if (i == 0)
3651 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3652 /* Number of vector stmts was calculated according to LHS in
3653 vect_schedule_slp_instance (), fix it by replacing LHS with
3654 RHS, if necessary. See vect_get_smallest_scalar_type () for
3655 details. */
3656 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3657 &rhs_size_unit);
3658 if (rhs_size_unit != lhs_size_unit)
3660 number_of_vects *= rhs_size_unit;
3661 number_of_vects /= lhs_size_unit;
3666 /* Allocate memory for vectorized defs. */
3667 vec_defs = vNULL;
3668 vec_defs.create (number_of_vects);
3670 /* For reduction defs we call vect_get_constant_vectors (), since we are
3671 looking for initial loop invariant values. */
3672 if (vectorized_defs)
3673 /* The defs are already vectorized. */
3674 vect_get_slp_vect_defs (child, &vec_defs);
3675 else
3676 /* Build vectors from scalar defs. */
3677 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3678 number_of_vects);
3680 vec_oprnds->quick_push (vec_defs);
3684 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3685 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3686 permute statements for the SLP node NODE of the SLP instance
3687 SLP_NODE_INSTANCE. */
3689 bool
3690 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3691 gimple_stmt_iterator *gsi, poly_uint64 vf,
3692 slp_instance slp_node_instance, bool analyze_only,
3693 unsigned *n_perms)
3695 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3696 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3697 tree mask_element_type = NULL_TREE, mask_type;
3698 int vec_index = 0;
3699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3700 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3701 unsigned int mask_element;
3702 machine_mode mode;
3703 unsigned HOST_WIDE_INT nunits, const_vf;
3705 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3706 return false;
3708 stmt_info = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info));
3710 mode = TYPE_MODE (vectype);
3712 /* At the moment, all permutations are represented using per-element
3713 indices, so we can't cope with variable vector lengths or
3714 vectorization factors. */
3715 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3716 || !vf.is_constant (&const_vf))
3717 return false;
3719 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3720 same size as the vector element being permuted. */
3721 mask_element_type = lang_hooks.types.type_for_mode
3722 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3723 mask_type = get_vectype_for_scalar_type (mask_element_type);
3724 vec_perm_builder mask (nunits, nunits, 1);
3725 mask.quick_grow (nunits);
3726 vec_perm_indices indices;
3728 /* Initialize the vect stmts of NODE to properly insert the generated
3729 stmts later. */
3730 if (! analyze_only)
3731 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3732 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3733 SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for a permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */
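  /* Worked instance of the index computation below (illustrative, not
     from the original sources), using the example above with
     DR_GROUP_SIZE == 3 and nunits == 4: for the a's (load permutation
     index 0), iteration j == 1 gives i = 0 + 1 * 3 == 3, so
     vec_index = 3 / 4 == 0 and mask_element = 3 % 4 == 3, i.e.
     element a1 of the first input vector {a0,b0,c0,a1}.  */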

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  for (unsigned int j = 0; j < const_vf; j++)
    {
      for (int k = 0; k < group_size; k++)
        {
          unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
                            + j * DR_GROUP_SIZE (stmt_info));
          vec_index = i / nunits;
          mask_element = i % nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
              mask_element += nunits;
            }
          else
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "permutation requires at "
                                   "least three vectors ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    stmt, 0);
                }
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * nunits);
          if (mask_element != index)
            noop_p = false;
          mask[index++] = mask_element;

          if (index == nunits && !noop_p)
            {
              indices.new_vector (mask, 2, nunits);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        {
                          dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                        }
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  gcc_assert (analyze_only);
                  return false;
                }

              ++*n_perms;
            }

          if (index == nunits)
            {
              if (!analyze_only)
                {
                  tree mask_vec = NULL_TREE;

                  if (! noop_p)
                    mask_vec = vec_perm_indices_to_tree (mask_type, indices);

                  if (second_vec_index == -1)
                    second_vec_index = first_vec_index;

                  /* Generate the permute statement if necessary.  */
                  tree first_vec = dr_chain[first_vec_index];
                  tree second_vec = dr_chain[second_vec_index];
                  gimple *perm_stmt;
                  if (! noop_p)
                    {
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      perm_stmt = gimple_build_assign (perm_dest,
                                                       VEC_PERM_EXPR,
                                                       first_vec, second_vec,
                                                       mask_vec);
                      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
                    }
                  else
                    /* If mask was NULL_TREE generate the requested
                       identity transform.  */
                    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
                }

              index = 0;
              first_vec_index = -1;
              second_vec_index = -1;
              noop_p = true;
            }
        }
    }

  return true;
}

/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            scalar_stmts_to_slp_tree_map_t *bst_map)
{
  gimple *stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return false;

  /* See if we have already vectorized the same set of stmts and reuse their
     vectorized stmts.  */
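  /* E.g. (illustrative note, not from the original sources): two SLP
     instances rooted at different store groups may share the subtree
     for a common group of loads; the subtree is code generated when
     first visited, and later visits just splice in the vector stmts
     recorded for the leader.  */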
  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
    {
      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
      return false;
    }

  bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, bst_map);

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  if (!SLP_TREE_VEC_STMTS (node).exists ())
    SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
    }

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
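  /* E.g. (illustrative): in the chain s_2 = a[0] + s_1; s_3 = a[1] + s_2;
     only the last statement carries the reduction mark after analysis;
     retagging the first element here lets the transform phase handle
     the whole chain as a reduction.  */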
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
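  /* Illustrative sketch (not from the original sources): for the group
     {a0 + b0, a1 - b1, a2 + b2, a3 - b3} with nunits == 4, CODE0 is
     PLUS_EXPR and OCODE is MINUS_EXPR.  The node is transformed once per
     operation, giving v0 = a + b and v1 = a - b, and the lanes are then
     merged with VEC_PERM_EXPR <v0, v1, {0, 5, 2, 7}>, taking the even
     lanes from v0 and the odd lanes from v1.  */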
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      gimple *ostmt;
      vec_perm_builder mask (group_size, group_size, 1);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
        if (gimple_assign_rhs_code (ostmt) != code0)
          {
            mask.quick_push (1);
            ocode = gimple_assign_rhs_code (ostmt);
          }
        else
          mask.quick_push (0);
      if (ocode != ERROR_MARK)
        {
          vec<gimple *> v0;
          vec<gimple *> v1;
          unsigned j;
          tree tmask = NULL_TREE;
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          tree meltype = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
          tree mvectype = get_same_sized_vectype (meltype, vectype);
          unsigned k = 0, l;
          for (j = 0; j < v0.length (); ++j)
            {
              /* Enforced by vect_build_slp_tree, which rejects variable-length
                 vectors for SLP_TREE_TWO_OPERATORS.  */
              unsigned int const_nunits = nunits.to_constant ();
              tree_vector_builder melts (mvectype, const_nunits, 1);
              for (l = 0; l < const_nunits; ++l)
                {
                  if (k >= group_size)
                    k = 0;
                  tree t = build_int_cst (meltype,
                                          mask[k++] * const_nunits + l);
                  melts.quick_push (t);
                }
              tmask = melts.build ();

              /* ??? Not all targets support a VEC_PERM_EXPR with a
                 constant mask that would translate to a vec_merge RTX
                 (with their vec_perm_const_ok).  We can either not
                 vectorize in that case or let veclower do its job.
                 Unfortunately that isn't too great and at least for
                 plus/minus we'd eventually like to match targets'
                 vector addsub instructions.  */
              gimple *vstmt;
              vstmt = gimple_build_assign (make_ssa_name (vectype),
                                           VEC_PERM_EXPR,
                                           gimple_assign_lhs (v0[j]),
                                           gimple_assign_lhs (v1[j]), tmask);
              vect_finish_stmt_generation (stmt, vstmt, &si);
              SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
            }
          v0.release ();
          v1.release ();
          return false;
        }
    }
  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;

  return is_store;
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */
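/* For instance (illustrative, not from the original sources): once
   x_1 = sqrtf (a_2) has been vectorized, its scalar result is unused,
   so the call is replaced by the dummy assignment x_1 = 0.0f; the value
   is irrelevant and the statement only keeps the SSA form valid until
   it is cleaned up.  */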

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple *stmt, *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();
  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, bst_map);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }
  delete bst_map;

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple *store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}