/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  gimple *stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    /* After transform some stmts are removed and thus their vinfo is gone.  */
    if (vinfo_for_stmt (stmt))
      {
	gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
	STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
      }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
	return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
	result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
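/* An illustrative note (worked from the loop above): the two permutes are
   the usual interleave-low/high (zip) pairs.  E.g. with NELTS == 4 the
   effective selectors are { 0, 4, 1, 5 } for INDICES1 and { 2, 6, 3, 7 }
   for INDICES2, interleaving the lanes of the two input vectors.  */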
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */

static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<gimple *> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
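	  /* maps[0] leaves the operands in place, maps[1] (*swap == 1)
	     exchanges the two comparison operands, and maps[2] (*swap == 2)
	     exchanges the two arms of the COND_EXPR, matching an inverted
	     comparison code.  */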
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't analyze def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
	 from the pattern.  Check that all the stmts of the node are in the
	 pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
	  && vect_stmt_in_region_p (vinfo, def_stmt)
	  && vinfo_for_stmt (def_stmt)
	  && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
	{
	  pattern = true;
	  if (!first && !oprnd_info->first_pattern
	      /* Allow different pattern state for the defs of the
		 first stmt in reduction chains.  */
	      && (oprnd_info->first_dt != vect_reduction_def
		  || (!second && !oprnd_info->second_pattern)))
	    {
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return 1;
	    }

	  def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
	  dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

	  if (dt == vect_unknown_def_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
	      return -1;
	    }

	  switch (gimple_code (def_stmt))
	    {
	    case GIMPLE_PHI:
	    case GIMPLE_ASSIGN:
	      break;

	    default:
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "unsupported defining stmt:\n");
	      return -1;
	    }
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: invalid type of def "
				   "for variable-length SLP ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
	 we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: cannot swap operands of "
			       "shared stmt ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  return -1;
	}

      if (first_op_cond)
	{
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
			   gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "swapped operands to match def types in ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }

  *swap = swapped;
  return 0;
}
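/* For example (illustrative), given the commutative group
   { x0 = a0 + b0;  x1 = b1 + a1; } where the B operands have a different
   def-type than the A operands, operand 0 of the second stmt mismatches:
   vect_get_and_check_slp_defs retries it with SWAPPED set and finally
   swaps b1/a1 in the GIMPLE IL so both stmts present their operands in
   the same order.  */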
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for VINFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
   as for vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type in ");
	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: unrolling required "
		       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
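/* For instance, a basic-block group of four ints cannot be given an
   eight-lane V8SI type: filling such a vector would require unrolling,
   which only loop vectorization can do, so the check above fails.  */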
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<gimple *> stmts, unsigned int group_size,
		       unsigned nops, poly_uint64 *max_nunits,
		       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if (gimple_call_internal_p (call_stmt)
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    call_stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    first_stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      gimple *first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
			  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
			}
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned HOST_WIDE_INT count;
      if (!nunits.is_constant (&count))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operations "
			     "not allowed with variable-length SLP.\n");
	  return false;
	}
      vec_perm_builder sel (count, count, 1);
      for (i = 0; i < count; ++i)
	{
	  unsigned int elt = i;
	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
	    elt += count;
	  sel.quick_push (elt);
	}
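      /* For a PLUS/MINUS pair with GROUP_SIZE == 2 and COUNT == 4 this
	 builds the selector { 0, 5, 2, 7 }: even elements come from the
	 vector computed with the first operation, odd elements from the
	 one computed with the alternate operation.  */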
      vec_perm_indices indices (sel, 2, count);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[i], 0);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      first_stmt, 0);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
static scalar_stmts_set_t *bst_fail;
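/* The hash is over the gimple UIDs of the scalar stmts, so two stmt
   vectors holding the same stmts in the same order compare equal; a
   previously failed combination is therefore rejected by the memoizing
   wrapper below no matter how it is reached.  */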
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<gimple *> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
					loads, matches, npermutes, tree_size,
					max_tree_size);
  /* When SLP build fails for stmts record this, otherwise SLP build
     can be exponential in time when we allow constructing parts from
     scalars, see PR81723.  */
  if (! res)
    {
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return the built SLP node; *MATCHES indicates which stmts matched
   the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    if (stmt != stmts[0])
	      return NULL;
	}
      else
	{
	  /* Else def types have to match.  */
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
		continue;
	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
		return NULL;
	    }
	}
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap,
			      stmts, group_size, nops,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }
  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	continue;

      if (++this_tree_size > max_tree_size)
	{
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child);
	  vect_free_oprnd_info (oprnds_info);
	  return NULL;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					&this_loads, matches, npermutes,
					&this_tree_size,
					max_tree_size)) != NULL)
	{
	  /* If all children of child were built up from scalars then just
	     throw that away and build this node up from scalars instead.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p
		    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_loads.truncate (old_nloads);
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  dump_printf_loc (MSG_NOTE, vect_location,
				   "Building parent vector operands from "
				   "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Building vector operands from scalars\n");
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  && commutative_tree_code (gimple_assign_rhs_code (stmt))
	  && ! two_operators
	  /* Do so only if the number of not successful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* Verify if we can safely swap or if we committed to a specific
	     operand order already.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j]
		&& (swap[j] != 0
		    || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmts[j]))))
	      {
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: cannot swap operands "
				     "of shared stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[j], 0);
		  }
		goto fail;
	      }

	  /* Swap mismatched definition stmts.  */
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		dump_printf (MSG_NOTE, "%d ", j);
	      }
	  dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ... */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    &this_loads, tem, npermutes,
					    &this_tree_size,
					    max_tree_size)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  gimple_set_plf (stmt, GF_PLF_1, false);
		}
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  if (!matches[j])
		    {
		      /* Avoid swapping operands twice.  */
		      if (gimple_plf (stmt, GF_PLF_1))
			continue;
		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
					 gimple_assign_rhs2_ptr (stmt));
		      gimple_set_plf (stmt, GF_PLF_1, true);
		    }
		}
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  {
		    gimple *stmt = stmts[j];
		    gcc_assert (gimple_plf (stmt, GF_PLF_1) == ! matches[j]);
		  }

	      /* If all children of child were built up from scalars then
		 just throw that away and build this node up from scalars
		 instead.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p
			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_loads.truncate (old_nloads);
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j,
					grandchild)
			vect_free_slp_tree (grandchild);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      dump_printf_loc (MSG_NOTE, vect_location,
				       "Building parent vector operands from "
				       "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}
/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
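/* Note stmt I moves to position PERMUTATION[I] above: for example, with
   PERMUTATION == { 2, 0, 1 } the old stmt 0 ends up in slot 2, stmt 1 in
   slot 0 and stmt 2 in slot 1.  */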
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
	return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	if (lidx != node->load_permutation[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;
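  /* E.g. for GROUP_SIZE == 4 the permutation { 1, 0, 3, 2 } passes both
     checks above, while { 0, 0, 2, 3 } (duplicate index) or { 0, 1, 2, 4 }
     (out of range, leaving a gap) does not.  */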
  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
	      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
	 no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    continue;
	  bool subchain_p = true;
	  next_load = NULL;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
	    {
	      if (j != 0
		  && (next_load != load
		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
	    }
	  if (subchain_p)
	    SLP_TREE_LOAD_PERMUTATION (node).release ();
	  else
	    {
	      stmt_vec_info group_info
		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
	      unsigned HOST_WIDE_INT nunits;
	      unsigned k, maxk = 0;
	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
		if (k > maxk)
		  maxk = k;
	      /* In BB vectorization we may not actually use a loaded vector
		 accessing elements in excess of GROUP_SIZE.  */
	      tree vectype = STMT_VINFO_VECTYPE (group_info);
	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
		  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "BB vectorization with gaps at the end of "
				   "a load is not supported\n");
		  return false;
		}

	      /* Verify the permutation can be generated.  */
	      vec<tree> tem;
	      unsigned n_perms;
	      if (!vect_transform_slp_perm_load (node, tem, NULL,
						 1, slp_instn, true, &n_perms))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported load permutation\n");
		  return false;
		}
	    }
	}
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
			     LOOP_VINFO_VECT_FACTOR
			     (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
					  slp_instn, true, &n_perms))
      return false;

  return true;
}
/* Find the last store in SLP INSTANCE.  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
	last = get_later_stmt (stmt, last);
    }

  return last;
}
/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
			 stmt_vector_for_cost *prologue_cost_vec,
			 stmt_vector_for_cost *body_cost_vec,
			 unsigned ncopies_for_cost,
			 scalar_stmts_set_t* visited)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* If we already costed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
    return;

  visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
			       body_cost_vec, ncopies_for_cost, visited);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
	= (STMT_VINFO_STRIDED_P (stmt_info)
	   ? VMAT_STRIDED_SLP
	   : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	vect_model_store_cost (stmt_info, ncopies_for_cost,
			       memory_access_type, VLS_STORE,
			       node, prologue_cost_vec, body_cost_vec);
      else
	{
	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    {
	      /* If the load is permuted then the alignment is determined by
		 the first group element not by the first scalar stmt DR.  */
	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
	      stmt_info = vinfo_for_stmt (stmt);
	      /* Record the cost for the permutation.  */
	      unsigned n_perms;
	      vect_transform_slp_perm_load (node, vNULL, NULL,
					    ncopies_for_cost, instance, true,
					    &n_perms);
	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
				stmt_info, 0, vect_body);
	      unsigned assumed_nunits
		= vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
	      /* And adjust the number of loads performed.  This handles
		 redundancies as well as loads that are later dead.  */
	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
	      bitmap_clear (perm);
	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
	      ncopies_for_cost = 0;
	      bool load_seen = false;
	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
		{
		  if (i % assumed_nunits == 0)
		    {
		      if (load_seen)
			ncopies_for_cost++;
		      load_seen = false;
		    }
		  if (bitmap_bit_p (perm, i))
		    load_seen = true;
		}
	      if (load_seen)
		ncopies_for_cost++;
	      gcc_assert (ncopies_for_cost
			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
			      + assumed_nunits - 1) / assumed_nunits);
	      poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
	      ncopies_for_cost *= estimated_poly_value (uf);
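	      /* For instance, with GROUP_SIZE == 8, ASSUMED_NUNITS == 4 and
		 a permutation using only elements 0..3, the second vector
		 load is dead and NCOPIES_FOR_COST drops to 1 (scaled by the
		 unrolling factor).  */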
1854 /* Record the cost for the vector loads. */
1855 vect_model_load_cost (stmt_info, ncopies_for_cost,
1856 memory_access_type, node, prologue_cost_vec,
1857 body_cost_vec);
1858 return;
1861 else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1863 /* ncopies_for_cost is the number of IVs we generate. */
1864 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1865 stmt_info, 0, vect_body);
1867 /* Prologue cost for the initial values and step vector. */
1868 record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1869 CONSTANT_CLASS_P
1870 (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1871 (stmt_info))
1872 ? vector_load : vec_construct,
1873 stmt_info, 0, vect_prologue);
1874 record_stmt_cost (prologue_cost_vec, 1,
1875 CONSTANT_CLASS_P
1876 (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1877 ? vector_load : vec_construct,
1878 stmt_info, 0, vect_prologue);
1880 /* ??? No easy way to get at the actual number of vector stmts
1881 to be geneated and thus the derived IVs. */
1883 else
1885 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1886 stmt_info, 0, vect_body);
1887 if (SLP_TREE_TWO_OPERATORS (node))
1889 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1890 stmt_info, 0, vect_body);
1891 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1892 stmt_info, 0, vect_body);
1896 /* Push SLP node def-type to stmts. */
1897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1898 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1899 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1900 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1902 /* Scan operands and account for prologue cost of constants/externals.
1903 ??? This over-estimates cost for multiple uses and should be
1904 re-engineered. */
1905 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1906 lhs = gimple_get_lhs (stmt);
1907 for (i = 0; i < gimple_num_ops (stmt); ++i)
1909 tree op = gimple_op (stmt, i);
1910 gimple *def_stmt;
1911 enum vect_def_type dt;
1912 if (!op || op == lhs)
1913 continue;
1914 if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
1915 && (dt == vect_constant_def || dt == vect_external_def))
1917 /* Without looking at the actual initializer a vector of
1918 constants can be implemented as a load from the constant pool.
1919 When all elements are the same we can use a splat. */
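/* For example (illustrative): an all-constant operand vector such as
   {1, 2, 3, 4} is costed below as a vector_load from the constant pool,
   while an external (invariant) operand vector costs a scalar_to_vec
   splat when all elements are equal and a vec_construct otherwise. */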
1920 tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
1921 unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
1922 unsigned num_vects_to_check;
1923 unsigned HOST_WIDE_INT const_nunits;
1924 unsigned nelt_limit;
1925 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
1926 && ! multiple_p (const_nunits, group_size))
1928 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
1929 nelt_limit = const_nunits;
1931 else
1933 /* If either the vector has variable length or the vectors
1934 are composed of repeated whole groups we only need to
1935 cost construction once. All vectors will be the same. */
1936 num_vects_to_check = 1;
1937 nelt_limit = group_size;
1939 tree elt = NULL_TREE;
1940 unsigned nelt = 0;
1941 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
1943 unsigned si = j % group_size;
1944 if (nelt == 0)
1945 elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
1946 /* ??? We're just tracking whether all operands of a single
1947 vector initializer are the same, ideally we'd check if
1948 we emitted the same one already. */
1949 else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
1950 elt = NULL_TREE;
1951 nelt++;
1952 if (nelt == nelt_limit)
1954 /* ??? We need to pass down stmt_info for a vector type
1955 even if it points to the wrong stmt. */
1956 record_stmt_cost (prologue_cost_vec, 1,
1957 dt == vect_external_def
1958 ? (elt ? scalar_to_vec : vec_construct)
1959 : vector_load,
1960 stmt_info, 0, vect_prologue);
1961 nelt = 0;
1967 /* Restore stmt def-types. */
1968 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1969 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1970 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1971 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
1974 /* Compute the cost for the SLP instance INSTANCE. */
1976 static void
1977 vect_analyze_slp_cost (slp_instance instance, void *data)
1979 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1980 unsigned ncopies_for_cost;
1981 stmt_info_for_cost *si;
1982 unsigned i;
1984 if (dump_enabled_p ())
1985 dump_printf_loc (MSG_NOTE, vect_location,
1986 "=== vect_analyze_slp_cost ===\n");
1988 /* Calculate the number of vector stmts to create based on the unrolling
1989 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1990 GROUP_SIZE / NUNITS otherwise). */
1991 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1992 slp_tree node = SLP_INSTANCE_TREE (instance);
1993 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1994 /* Get the estimated vectorization factor, which is always one for
1995 basic-block vectorization. */
1996 unsigned int assumed_vf;
1997 if (STMT_VINFO_LOOP_VINFO (stmt_info))
1998 assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
1999 else
2000 assumed_vf = 1;
2001 /* For reductions look at a reduction operand in case the reduction
2002 operation is widening like DOT_PROD or SAD. */
2003 tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
2004 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2006 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2007 switch (gimple_assign_rhs_code (stmt))
2009 case DOT_PROD_EXPR:
2010 case SAD_EXPR:
2011 vectype_for_cost = get_vectype_for_scalar_type
2012 (TREE_TYPE (gimple_assign_rhs1 (stmt)));
2013 break;
2014 default:;
2017 unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
2018 ncopies_for_cost = (least_common_multiple (assumed_nunits,
2019 group_size * assumed_vf)
2020 / assumed_nunits);
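/* Worked example (illustrative): with ASSUMED_NUNITS == 4,
   GROUP_SIZE == 3 and ASSUMED_VF == 2 this is
   least_common_multiple (4, 6) / 4 == 3, the smallest number of
   four-lane vectors that covers whole copies of the group. */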
2022 prologue_cost_vec.create (10);
2023 body_cost_vec.create (10);
2024 scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2025 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
2026 &prologue_cost_vec, &body_cost_vec,
2027 ncopies_for_cost, visited);
2028 delete visited;
2030 /* Record the prologue costs, which were delayed until we were
2031 sure that SLP was successful. */
2032 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
2034 struct _stmt_vec_info *stmt_info
2035 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2036 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2037 si->misalign, vect_prologue);
2040 /* Record the instance's instructions in the target cost model. */
2041 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2043 struct _stmt_vec_info *stmt_info
2044 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2045 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2046 si->misalign, vect_body);
2049 prologue_cost_vec.release ();
2050 body_cost_vec.release ();
2053 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2054 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2055 the first GROUP1_SIZE stmts, since stores are consecutive), the second
2056 containing the remainder.
2057 Return the first stmt in the second group. */
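/* Illustrative example: splitting a group of eight stores s0..s7 with
   GROUP1_SIZE == 4 yields the groups {s0..s3} and {s4..s7}; the
   GROUP_GAP of each new group is bumped by the size of the other one,
   so both still step over all eight scalars per iteration. */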
2059 static gimple *
2060 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2062 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2063 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2064 gcc_assert (group1_size > 0);
2065 int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2066 gcc_assert (group2_size > 0);
2067 GROUP_SIZE (first_vinfo) = group1_size;
2069 gimple *stmt = first_stmt;
2070 for (unsigned i = group1_size; i > 1; i--)
2072 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2073 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2075 /* STMT is now the last element of the first group. */
2076 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2077 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2079 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2080 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2082 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2083 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2086 /* For the second group, the GROUP_GAP is that before the original group,
2087 plus skipping over the first vector. */
2088 GROUP_GAP (vinfo_for_stmt (group2)) =
2089 GROUP_GAP (first_vinfo) + group1_size;
2091 /* GROUP_GAP of the first group now has to skip over the second group too. */
2092 GROUP_GAP (first_vinfo) += group2_size;
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2096 group1_size, group2_size);
2098 return group2;
2101 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2102 statements and a vector of NUNITS elements. */
2104 static poly_uint64
2105 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2107 return exact_div (common_multiple (nunits, group_size), group_size);
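/* Worked example (illustrative): for NUNITS == 4 and GROUP_SIZE == 6
   this is common_multiple (4, 6) / 6 == 2, i.e. two copies of the
   six-stmt group are needed to fill whole vectors (three of them). */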
2110 /* Analyze an SLP instance starting from a group of grouped stores. Call
2111 vect_build_slp_tree to build a tree of packed stmts if possible.
2112 Return FALSE if it's impossible to SLP any stmt in the loop. */
2114 static bool
2115 vect_analyze_slp_instance (vec_info *vinfo,
2116 gimple *stmt, unsigned max_tree_size)
2118 slp_instance new_instance;
2119 slp_tree node;
2120 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2121 tree vectype, scalar_type = NULL_TREE;
2122 gimple *next;
2123 unsigned int i;
2124 vec<slp_tree> loads;
2125 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2126 vec<gimple *> scalar_stmts;
2128 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2130 if (dr)
2132 scalar_type = TREE_TYPE (DR_REF (dr));
2133 vectype = get_vectype_for_scalar_type (scalar_type);
2135 else
2137 gcc_assert (is_a <loop_vec_info> (vinfo));
2138 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2141 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2143 else
2145 gcc_assert (is_a <loop_vec_info> (vinfo));
2146 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2147 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2150 if (!vectype)
2152 if (dump_enabled_p ())
2154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2155 "Build SLP failed: unsupported data-type ");
2156 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2157 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2160 return false;
2162 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2164 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2165 scalar_stmts.create (group_size);
2166 next = stmt;
2167 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2169 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2170 while (next)
2172 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2173 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2174 scalar_stmts.safe_push (
2175 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2176 else
2177 scalar_stmts.safe_push (next);
2178 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2180 /* Mark the first element of the reduction chain as reduction to properly
2181 transform the node. In the reduction analysis phase only the last
2182 element of the chain is marked as reduction. */
2183 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2184 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2186 else
2188 /* Collect reduction statements. */
2189 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2190 for (i = 0; reductions.iterate (i, &next); i++)
2191 scalar_stmts.safe_push (next);
2194 loads.create (group_size);
2196 /* Build the tree for the SLP instance. */
2197 bool *matches = XALLOCAVEC (bool, group_size);
2198 unsigned npermutes = 0;
2199 bst_fail = new scalar_stmts_set_t ();
2200 poly_uint64 max_nunits = nunits;
2201 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2202 &max_nunits, &loads, matches, &npermutes,
2203 NULL, max_tree_size);
2204 delete bst_fail;
2205 if (node != NULL)
2207 /* Calculate the unrolling factor based on the smallest type. */
2208 poly_uint64 unrolling_factor
2209 = calculate_unrolling_factor (max_nunits, group_size);
2211 if (maybe_ne (unrolling_factor, 1U)
2212 && is_a <bb_vec_info> (vinfo))
2214 unsigned HOST_WIDE_INT const_max_nunits;
2215 if (!max_nunits.is_constant (&const_max_nunits)
2216 || const_max_nunits > group_size)
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "Build SLP failed: store group "
2221 "size not a multiple of the vector size "
2222 "in basic block SLP\n");
2223 vect_free_slp_tree (node);
2224 loads.release ();
2225 return false;
2227 /* Fatal mismatch. */
2228 matches[group_size / const_max_nunits * const_max_nunits] = false;
2229 vect_free_slp_tree (node);
2230 loads.release ();
2232 else
2234 /* Create a new SLP instance. */
2235 new_instance = XNEW (struct _slp_instance);
2236 SLP_INSTANCE_TREE (new_instance) = node;
2237 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2238 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2239 SLP_INSTANCE_LOADS (new_instance) = loads;
2241 /* Compute the load permutation. */
2242 slp_tree load_node;
2243 bool loads_permuted = false;
2244 FOR_EACH_VEC_ELT (loads, i, load_node)
2246 vec<unsigned> load_permutation;
2247 int j;
2248 gimple *load, *first_stmt;
2249 bool this_load_permuted = false;
2250 load_permutation.create (group_size);
2251 first_stmt = GROUP_FIRST_ELEMENT
2252 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2253 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2255 int load_place = vect_get_place_in_interleaving_chain
2256 (load, first_stmt);
2257 gcc_assert (load_place != -1);
2258 if (load_place != j)
2259 this_load_permuted = true;
2260 load_permutation.safe_push (load_place);
2262 if (!this_load_permuted
2263 /* The load requires permutation when unrolling exposes
2264 a gap either because the group is larger than the SLP
2265 group-size or because there is a gap between the groups. */
2266 && (known_eq (unrolling_factor, 1U)
2267 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2268 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2270 load_permutation.release ();
2271 continue;
2273 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2274 loads_permuted = true;
2277 if (loads_permuted)
2279 if (!vect_supported_load_permutation_p (new_instance))
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2284 "Build SLP failed: unsupported load "
2285 "permutation ");
2286 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2287 TDF_SLIM, stmt, 0);
2289 vect_free_slp_instance (new_instance);
2290 return false;
2294 /* If the loads and stores can be handled with load/store-lane
2295 instructions do not generate this SLP instance. */
2296 if (is_a <loop_vec_info> (vinfo)
2297 && loads_permuted
2298 && dr && vect_store_lanes_supported (vectype, group_size, false))
2300 slp_tree load_node;
2301 FOR_EACH_VEC_ELT (loads, i, load_node)
2303 gimple *first_stmt = GROUP_FIRST_ELEMENT
2304 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2305 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2306 /* Use SLP for strided accesses (or if we
2307 can't load-lanes). */
2308 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2309 || ! vect_load_lanes_supported
2310 (STMT_VINFO_VECTYPE (stmt_vinfo),
2311 GROUP_SIZE (stmt_vinfo), false))
2312 break;
2314 if (i == loads.length ())
2316 if (dump_enabled_p ())
2317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318 "Built SLP cancelled: can use "
2319 "load/store-lanes\n");
2320 vect_free_slp_instance (new_instance);
2321 return false;
2325 vinfo->slp_instances.safe_push (new_instance);
2327 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "Final SLP tree for instance:\n");
2331 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2334 return true;
2337 else
2339 /* Failed to SLP. */
2340 /* Free the allocated memory. */
2341 scalar_stmts.release ();
2342 loads.release ();
2345 /* For basic block SLP, try to break the group up into multiples of the
2346 vector size. */
2347 unsigned HOST_WIDE_INT const_nunits;
2348 if (is_a <bb_vec_info> (vinfo)
2349 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2350 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2351 && nunits.is_constant (&const_nunits))
2353 /* We consider breaking the group only on VF boundaries from the existing
2354 start. */
2355 for (i = 0; i < group_size; i++)
2356 if (!matches[i]) break;
2358 if (i >= const_nunits && i < group_size)
2360 /* Split into two groups at the first vector boundary before i. */
2361 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2362 unsigned group1_size = i & ~(const_nunits - 1);
2364 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2365 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2366 /* If the first non-match was in the middle of a vector,
2367 skip the rest of that vector. */
2368 if (group1_size < i)
2370 i = group1_size + const_nunits;
2371 if (i < group_size)
2372 rest = vect_split_slp_store_group (rest, const_nunits);
2374 if (i < group_size)
2375 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2376 return res;
2378 /* Even though the first vector did not all match, we might be able to SLP
2379 (some) of the remainder. FORNOW ignore this possibility. */
2382 return false;
2386 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2387 trees of packed scalar stmts if SLP is possible. */
2389 bool
2390 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2392 unsigned int i;
2393 gimple *first_element;
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2398 /* Find SLP sequences starting from groups of grouped stores. */
2399 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2400 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2402 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2404 if (loop_vinfo->reduction_chains.length () > 0)
2406 /* Find SLP sequences starting from reduction chains. */
2407 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2408 if (! vect_analyze_slp_instance (vinfo, first_element,
2409 max_tree_size))
2411 /* Dissolve reduction chain group. */
2412 gimple *next, *stmt = first_element;
2413 while (stmt)
2415 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2416 next = GROUP_NEXT_ELEMENT (vinfo);
2417 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2418 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2419 stmt = next;
2421 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2422 = vect_internal_def;
2426 /* Find SLP sequences starting from groups of reductions. */
2427 if (loop_vinfo->reductions.length () > 1)
2428 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2429 max_tree_size);
2432 return true;
2436 /* For each possible SLP instance decide whether to SLP it and calculate the
2437 overall unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2438 least one instance. */
2440 bool
2441 vect_make_slp_decision (loop_vec_info loop_vinfo)
2443 unsigned int i;
2444 poly_uint64 unrolling_factor = 1;
2445 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2446 slp_instance instance;
2447 int decided_to_slp = 0;
2449 if (dump_enabled_p ())
2450 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2451 "\n");
2453 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2455 /* FORNOW: SLP if you can. */
2456 /* All unroll factors have the form current_vector_size * X for some
2457 rational X, so they must have a common multiple. */
2458 unrolling_factor
2459 = force_common_multiple (unrolling_factor,
2460 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2462 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2463 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2464 loop-based vectorization. Such stmts will be marked as HYBRID. */
2465 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2466 decided_to_slp++;
2469 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2471 if (decided_to_slp && dump_enabled_p ())
2473 dump_printf_loc (MSG_NOTE, vect_location,
2474 "Decided to SLP %d instances. Unrolling factor ",
2475 decided_to_slp);
2476 dump_dec (MSG_NOTE, unrolling_factor);
2477 dump_printf (MSG_NOTE, "\n");
2480 return (decided_to_slp > 0);
2484 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2485 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2487 static void
2488 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2490 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2491 imm_use_iterator imm_iter;
2492 gimple *use_stmt;
2493 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2494 slp_tree child;
2495 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2496 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2497 int j;
2499 /* Propagate hybrid down the SLP tree. */
2500 if (stype == hybrid)
2502 else if (HYBRID_SLP_STMT (stmt_vinfo))
2503 stype = hybrid;
2504 else
2506 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2507 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2508 /* If we get a pattern stmt here we have to use the LHS of the
2509 original stmt for immediate uses. */
2510 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2511 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2512 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2513 tree def;
2514 if (gimple_code (stmt) == GIMPLE_PHI)
2515 def = gimple_phi_result (stmt);
2516 else
2517 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2518 if (def)
2519 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2521 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2522 continue;
2523 use_vinfo = vinfo_for_stmt (use_stmt);
2524 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2525 && STMT_VINFO_RELATED_STMT (use_vinfo))
2526 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2527 if (!STMT_SLP_TYPE (use_vinfo)
2528 && (STMT_VINFO_RELEVANT (use_vinfo)
2529 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2530 && !(gimple_code (use_stmt) == GIMPLE_PHI
2531 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2533 if (dump_enabled_p ())
2535 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2536 "def in non-SLP stmt: ");
2537 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2539 stype = hybrid;
2544 if (stype == hybrid
2545 && !HYBRID_SLP_STMT (stmt_vinfo))
2547 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2550 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2552 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2555 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2556 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2557 vect_detect_hybrid_slp_stmts (child, i, stype);
2560 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2562 static tree
2563 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2565 walk_stmt_info *wi = (walk_stmt_info *)data;
2566 struct loop *loopp = (struct loop *)wi->info;
2568 if (wi->is_lhs)
2569 return NULL_TREE;
2571 if (TREE_CODE (*tp) == SSA_NAME
2572 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2574 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2575 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2576 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2578 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2583 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2587 return NULL_TREE;
2590 static tree
2591 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2592 walk_stmt_info *)
2594 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2595 /* If the stmt is in a SLP instance then this isn't a reason
2596 to mark use definitions in other SLP instances as hybrid. */
2597 if (! STMT_SLP_TYPE (use_vinfo)
2598 && (STMT_VINFO_RELEVANT (use_vinfo)
2599 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2600 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2601 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2603 else
2604 *handled = true;
2605 return NULL_TREE;
2608 /* Find stmts that must be both vectorized and SLPed. */
2610 void
2611 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2613 unsigned int i;
2614 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2615 slp_instance instance;
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2619 "\n");
2621 /* First walk all pattern stmts in the loop and mark defs of uses as
2622 hybrid because immediate uses in them are not recorded. */
2623 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2625 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2626 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2627 gsi_next (&gsi))
2629 gimple *stmt = gsi_stmt (gsi);
2630 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2631 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2633 walk_stmt_info wi;
2634 memset (&wi, 0, sizeof (wi));
2635 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2636 gimple_stmt_iterator gsi2
2637 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2638 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2639 vect_detect_hybrid_slp_1, &wi);
2640 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2641 vect_detect_hybrid_slp_2,
2642 vect_detect_hybrid_slp_1, &wi);
2647 /* Then walk the SLP instance trees marking stmts with uses in
2648 non-SLP stmts as hybrid, also propagating hybrid down the
2649 SLP tree, collecting the above info on-the-fly. */
2650 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2652 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2653 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2654 i, pure_slp);
2659 /* Initialize a bb_vec_info struct for the statements between
2660 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2662 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2663 gimple_stmt_iterator region_end_in)
2664 : vec_info (vec_info::bb, init_cost (NULL)),
2665 bb (gsi_bb (region_begin_in)),
2666 region_begin (region_begin_in),
2667 region_end (region_end_in)
2669 gimple_stmt_iterator gsi;
2671 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2672 gsi_next (&gsi))
2674 gimple *stmt = gsi_stmt (gsi);
2675 gimple_set_uid (stmt, 0);
2676 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2679 bb->aux = this;
2683 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2684 stmts in the basic block. */
2686 _bb_vec_info::~_bb_vec_info ()
2688 for (gimple_stmt_iterator si = region_begin;
2689 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2691 gimple *stmt = gsi_stmt (si);
2692 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2694 if (stmt_info)
2695 /* Free stmt_vec_info. */
2696 free_stmt_vec_info (stmt);
2698 /* Reset region marker. */
2699 gimple_set_uid (stmt, -1);
2702 bb->aux = NULL;
2706 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2707 the subtree. NODE_INSTANCE is the SLP instance that contains NODE, and VINFO contains NODE_INSTANCE.
2709 Return true if the operations are supported. */
2711 static bool
2712 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2713 slp_instance node_instance)
2715 bool dummy;
2716 int i, j;
2717 gimple *stmt;
2718 slp_tree child;
2720 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2721 return true;
2723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2724 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2725 return false;
2727 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2728 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2729 gcc_assert (stmt_info);
2730 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2732 /* For BB vectorization vector types are assigned here.
2733 Memory accesses already got their vector type assigned
2734 in vect_analyze_data_refs. */
2735 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2736 if (bb_vinfo
2737 && ! STMT_VINFO_DATA_REF (stmt_info))
2739 gcc_assert (PURE_SLP_STMT (stmt_info));
2741 tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2742 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "get vectype for scalar type: ");
2746 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2747 dump_printf (MSG_NOTE, "\n");
2750 tree vectype = get_vectype_for_scalar_type (scalar_type);
2751 if (!vectype)
2753 if (dump_enabled_p ())
2755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2756 "not SLPed: unsupported data-type ");
2757 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2758 scalar_type);
2759 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2761 return false;
2764 if (dump_enabled_p ())
2766 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
2767 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2768 dump_printf (MSG_NOTE, "\n");
2771 gimple *sstmt;
2772 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2773 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2776 /* Calculate the number of vector statements to be created for the
2777 scalar stmts in this node. For SLP reductions it is equal to the
2778 number of vector statements in the children (which has already been
2779 calculated by the recursive call). Otherwise it is the number of
2780 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2781 VF divided by the number of elements in a vector. */
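/* Worked example (illustrative): with GROUP_SIZE == 2, VF == 4 and a
   four-element vector type this is vect_get_num_vectors (8, vectype)
   == 2 vector statements for the node. */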
2782 if (GROUP_FIRST_ELEMENT (stmt_info)
2783 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2784 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2785 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2786 else
2788 poly_uint64 vf;
2789 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2790 vf = loop_vinfo->vectorization_factor;
2791 else
2792 vf = 1;
2793 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2795 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2796 = vect_get_num_vectors (vf * group_size, vectype);
2799 /* Push SLP node def-type to stmt operands. */
2800 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2801 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2802 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2803 = SLP_TREE_DEF_TYPE (child);
2804 bool res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2805 /* Restore def-types. */
2806 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2807 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2808 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2809 = vect_internal_def;
2810 if (! res)
2811 return false;
2813 return true;
2817 /* Analyze statements in SLP instances of VINFO. Return true if the
2818 operations are supported. */
2820 bool
2821 vect_slp_analyze_operations (vec_info *vinfo)
2823 slp_instance instance;
2824 int i;
2826 if (dump_enabled_p ())
2827 dump_printf_loc (MSG_NOTE, vect_location,
2828 "=== vect_slp_analyze_operations ===\n");
2830 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2832 if (!vect_slp_analyze_node_operations (vinfo,
2833 SLP_INSTANCE_TREE (instance),
2834 instance))
2836 dump_printf_loc (MSG_NOTE, vect_location,
2837 "removing SLP instance operations starting from: ");
2838 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2839 SLP_TREE_SCALAR_STMTS
2840 (SLP_INSTANCE_TREE (instance))[0], 0);
2841 vect_free_slp_instance (instance);
2842 vinfo->slp_instances.ordered_remove (i);
2844 else
2846 /* Compute the costs of the SLP instance. */
2847 vect_analyze_slp_cost (instance, vinfo->target_cost_data);
2848 i++;
2852 return !vinfo->slp_instances.is_empty ();
2856 /* Compute the scalar cost of the SLP node NODE and its children
2857 and return it. Do not account for defs that are marked in LIFE,
2858 and update LIFE according to the uses of NODE. */
2860 static unsigned
2861 vect_bb_slp_scalar_cost (basic_block bb,
2862 slp_tree node, vec<bool, va_heap> *life)
2864 unsigned scalar_cost = 0;
2865 unsigned i;
2866 gimple *stmt;
2867 slp_tree child;
2869 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2871 unsigned stmt_cost;
2872 ssa_op_iter op_iter;
2873 def_operand_p def_p;
2874 stmt_vec_info stmt_info;
2876 if ((*life)[i])
2877 continue;
2879 /* If there is a non-vectorized use of the defs then the scalar
2880 stmt is kept live, in which case we do not count it or any
2881 required defs in the SLP children in the scalar cost. This
2882 way we make the vectorization more costly when compared to
2883 the scalar cost. */
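/* E.g. (illustrative): if a def computed in the SLP tree is also used
   by a scalar stmt outside the region, the def must stay live in the
   scalar code; omitting its cost from the scalar total biases the
   comparison against vectorizing. */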
2884 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2886 imm_use_iterator use_iter;
2887 gimple *use_stmt;
2888 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2889 if (!is_gimple_debug (use_stmt)
2890 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2891 use_stmt)
2892 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2894 (*life)[i] = true;
2895 BREAK_FROM_IMM_USE_STMT (use_iter);
2898 if ((*life)[i])
2899 continue;
2901 /* Count scalar stmts only once. */
2902 if (gimple_visited_p (stmt))
2903 continue;
2904 gimple_set_visited (stmt, true);
2906 stmt_info = vinfo_for_stmt (stmt);
2907 if (STMT_VINFO_DATA_REF (stmt_info))
2909 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2910 stmt_cost = vect_get_stmt_cost (scalar_load);
2911 else
2912 stmt_cost = vect_get_stmt_cost (scalar_store);
2914 else
2915 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2917 scalar_cost += stmt_cost;
2920 auto_vec<bool, 20> subtree_life;
2921 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2923 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2925 /* Do not directly pass LIFE to the recursive call, copy it to
2926 confine changes in the callee to the current child/subtree. */
2927 subtree_life.safe_splice (*life);
2928 scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2929 subtree_life.truncate (0);
2933 return scalar_cost;
2936 /* Check if vectorization of the basic block is profitable. */
2938 static bool
2939 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2941 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2942 slp_instance instance;
2943 int i;
2944 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2945 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2947 /* Calculate scalar cost. */
2948 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2950 auto_vec<bool, 20> life;
2951 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2952 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2953 SLP_INSTANCE_TREE (instance),
2954 &life);
2957 /* Unset visited flag. */
2958 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2959 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2960 gimple_set_visited (gsi_stmt (gsi), false);
2962 /* Complete the target-specific cost calculation. */
2963 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2964 &vec_inside_cost, &vec_epilogue_cost);
2966 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2968 if (dump_enabled_p ())
2970 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2971 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2972 vec_inside_cost);
2973 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2974 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2975 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2978 /* Vectorization is profitable if its cost is no more than the cost of the
2979 scalar version. Note that we err on the vector side for equal cost because
2980 the cost estimate is otherwise quite pessimistic (constant uses are
2981 free on the scalar side but cost a load on the vector side for
2982 example). */
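/* Illustrative numbers: with vec_inside_cost == 10, vec_prologue_cost
   + vec_epilogue_cost == 2 and scalar_cost == 12, the test below is
   12 > 12, which is false, so the equal-cost block is vectorized. */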
2983 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2984 return false;
2986 return true;
2989 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2990 if so and sets fatal to true if failure is independent of
2991 current_vector_size. */
2993 static bb_vec_info
2994 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2995 gimple_stmt_iterator region_end,
2996 vec<data_reference_p> datarefs, int n_stmts,
2997 bool &fatal)
2999 bb_vec_info bb_vinfo;
3000 slp_instance instance;
3001 int i;
3002 poly_uint64 min_vf = 2;
3004 /* The first group of checks is independent of the vector size. */
3005 fatal = true;
3007 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3009 if (dump_enabled_p ())
3010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3011 "not vectorized: too many instructions in "
3012 "basic block.\n");
3013 free_data_refs (datarefs);
3014 return NULL;
3017 bb_vinfo = new _bb_vec_info (region_begin, region_end);
3018 if (!bb_vinfo)
3019 return NULL;
3021 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3023 /* Analyze the data references. */
3025 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "not vectorized: unhandled data-ref in basic "
3030 "block.\n");
3032 delete bb_vinfo;
3033 return NULL;
3036 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3038 if (dump_enabled_p ())
3039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040 "not vectorized: not enough data-refs in "
3041 "basic block.\n");
3043 delete bb_vinfo;
3044 return NULL;
3047 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3049 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3051 "not vectorized: unhandled data access in "
3052 "basic block.\n");
3054 delete bb_vinfo;
3055 return NULL;
3058 /* If there are no grouped stores in the region there is no need
3059 to continue with pattern recog as vect_analyze_slp will fail
3060 anyway. */
3061 if (bb_vinfo->grouped_stores.is_empty ())
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3065 "not vectorized: no grouped stores in "
3066 "basic block.\n");
3068 delete bb_vinfo;
3069 return NULL;
3072 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
3073 fatal = false;
3075 vect_pattern_recog (bb_vinfo);
3077 /* Check the SLP opportunities in the basic block, analyze and build SLP
3078 trees. */
3079 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3081 if (dump_enabled_p ())
3083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3084 "Failed to SLP the basic block.\n");
3085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086 "not vectorized: failed to find SLP opportunities "
3087 "in basic block.\n");
3090 delete bb_vinfo;
3091 return NULL;
3094 vect_record_base_alignments (bb_vinfo);
3096 /* Analyze and verify the alignment of data references and the
3097 dependence in the SLP instances. */
3098 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3100 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3101 || ! vect_slp_analyze_instance_dependence (instance))
3103 dump_printf_loc (MSG_NOTE, vect_location,
3104 "removing SLP instance operations starting from: ");
3105 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3106 SLP_TREE_SCALAR_STMTS
3107 (SLP_INSTANCE_TREE (instance))[0], 0);
3108 vect_free_slp_instance (instance);
3109 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3110 continue;
3113 /* Mark all the statements that we want to vectorize as pure SLP and
3114 relevant. */
3115 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3116 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3118 i++;
3120 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3122 delete bb_vinfo;
3123 return NULL;
3126 if (!vect_slp_analyze_operations (bb_vinfo))
3128 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "not vectorized: bad operation in basic block.\n");
3132 delete bb_vinfo;
3133 return NULL;
3136 /* Cost model: check if the vectorization is worthwhile. */
3137 if (!unlimited_cost_model (NULL)
3138 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3140 if (dump_enabled_p ())
3141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142 "not vectorized: vectorization is not "
3143 "profitable.\n");
3145 delete bb_vinfo;
3146 return NULL;
3149 if (dump_enabled_p ())
3150 dump_printf_loc (MSG_NOTE, vect_location,
3151 "Basic block will be vectorized using SLP\n");
3153 return bb_vinfo;
3157 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3158 true if anything in the basic-block was vectorized. */
3160 bool
3161 vect_slp_bb (basic_block bb)
3163 bb_vec_info bb_vinfo;
3164 gimple_stmt_iterator gsi;
3165 bool any_vectorized = false;
3166 auto_vector_sizes vector_sizes;
3168 if (dump_enabled_p ())
3169 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3171 /* Autodetect first vector size we try. */
3172 current_vector_size = 0;
3173 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3174 unsigned int next_size = 0;
3176 gsi = gsi_start_bb (bb);
3178 poly_uint64 autodetected_vector_size = 0;
3179 while (1)
3181 if (gsi_end_p (gsi))
3182 break;
3184 gimple_stmt_iterator region_begin = gsi;
3185 vec<data_reference_p> datarefs = vNULL;
3186 int insns = 0;
3188 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3190 gimple *stmt = gsi_stmt (gsi);
3191 if (is_gimple_debug (stmt))
3192 continue;
3193 insns++;
3195 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3196 vect_location = gimple_location (stmt);
3198 if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3199 break;
3202 /* Skip leading unhandled stmts. */
3203 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3205 gsi_next (&gsi);
3206 continue;
3209 gimple_stmt_iterator region_end = gsi;
3211 bool vectorized = false;
3212 bool fatal = false;
3213 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3214 datarefs, insns, fatal);
3215 if (bb_vinfo
3216 && dbg_cnt (vect_slp))
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3221 vect_schedule_slp (bb_vinfo);
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "basic block part vectorized\n");
3227 vectorized = true;
3229 delete bb_vinfo;
3231 any_vectorized |= vectorized;
3233 if (next_size == 0)
3234 autodetected_vector_size = current_vector_size;
3236 if (next_size < vector_sizes.length ()
3237 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3238 next_size += 1;
3240 if (vectorized
3241 || next_size == vector_sizes.length ()
3242 || known_eq (current_vector_size, 0U)
3243 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3244 vector sizes will fail do not bother iterating. */
3245 || fatal)
3247 if (gsi_end_p (region_end))
3248 break;
3250 /* Skip the unhandled stmt. */
3251 gsi_next (&gsi);
3253 /* And reset vector sizes. */
3254 current_vector_size = 0;
3255 next_size = 0;
3257 else
3259 /* Try the next biggest vector size. */
3260 current_vector_size = vector_sizes[next_size++];
3261 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_NOTE, vect_location,
3264 "***** Re-trying analysis with "
3265 "vector size ");
3266 dump_dec (MSG_NOTE, current_vector_size);
3267 dump_printf (MSG_NOTE, "\n");
3270 /* Start over. */
3271 gsi = region_begin;
3275 return any_vectorized;
3279 /* Return true if the vector type of the boolean constant that is operand
3280 OPNUM of statement STMT is a boolean vector. */
3282 static bool
3283 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3285 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3286 enum tree_code code = gimple_expr_code (stmt);
3287 tree op, vectype;
3288 gimple *def_stmt;
3289 enum vect_def_type dt;
3291 /* For comparisons and COND_EXPR the type is chosen depending
3292 on the other comparison operand. */
3293 if (TREE_CODE_CLASS (code) == tcc_comparison)
3295 if (opnum)
3296 op = gimple_assign_rhs1 (stmt);
3297 else
3298 op = gimple_assign_rhs2 (stmt);
3300 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3301 &dt, &vectype))
3302 gcc_unreachable ();
3304 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3307 if (code == COND_EXPR)
3309 tree cond = gimple_assign_rhs1 (stmt);
3311 if (TREE_CODE (cond) == SSA_NAME)
3312 op = cond;
3313 else if (opnum)
3314 op = TREE_OPERAND (cond, 1);
3315 else
3316 op = TREE_OPERAND (cond, 0);
3318 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3319 &dt, &vectype))
3320 gcc_unreachable ();
3322 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3325 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3328 /* Build a variable-length vector in which the elements in ELTS are repeated
3329 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3330 RESULTS and add any new instructions to SEQ.
3332 The approach we use is:
3334 (1) Find a vector mode VM with integer elements of mode IM.
3336 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3337 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3338 from small vectors to IM.
3340 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3342 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3343 correct byte contents.
3345 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3347 We try to find the largest IM for which this sequence works, in order
3348 to cut down on the number of interleaves. */
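/* Illustrative example: for ELTS == {a, b, c, d} of 32-bit elements
   and an IM of 64 bits, step (2) view-converts {a, b} and {c, d} to
   the 64-bit integers AB and CD, step (3) builds {AB, AB, ...} and
   {CD, CD, ...}, and step (4) interleaves them into {AB, CD, AB, CD,
   ...}, whose bytes are those of {a, b, c, d, a, b, c, d, ...}. */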
3350 void
3351 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3352 unsigned int nresults, vec<tree> &results)
3354 unsigned int nelts = elts.length ();
3355 tree element_type = TREE_TYPE (vector_type);
3357 /* (1) Find a vector mode VM with integer elements of mode IM. */
3358 unsigned int nvectors = 1;
3359 tree new_vector_type;
3360 tree permutes[2];
3361 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3362 &nvectors, &new_vector_type,
3363 permutes))
3364 gcc_unreachable ();
3366 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3367 unsigned int partial_nelts = nelts / nvectors;
3368 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3370 tree_vector_builder partial_elts;
3371 auto_vec<tree, 32> pieces (nvectors * 2);
3372 pieces.quick_grow (nvectors * 2);
3373 for (unsigned int i = 0; i < nvectors; ++i)
3375 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3376 ELTS' has mode IM. */
3377 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3378 for (unsigned int j = 0; j < partial_nelts; ++j)
3379 partial_elts.quick_push (elts[i * partial_nelts + j]);
3380 tree t = gimple_build_vector (seq, &partial_elts);
3381 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3382 TREE_TYPE (new_vector_type), t);
3384 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3385 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3388 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3389 correct byte contents.
3391 We need to repeat the following operation log2(nvectors) times:
3393 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3394 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3396 However, if each input repeats every N elements and the VF is
3397 a multiple of N * 2, the HI result is the same as the LO. */
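/* E.g. (illustrative): interleaving {AB, AB, AB, AB} with
   {CD, CD, CD, CD} gives LO == HI == {AB, CD, AB, CD}; each input
   repeats every element (N == 1) and the vector length is a multiple
   of 2, so only the LO permute needs to be emitted. */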
3398 unsigned int in_start = 0;
3399 unsigned int out_start = nvectors;
3400 unsigned int hi_start = nvectors / 2;
3401 /* A bound on the number of outputs needed to produce NRESULTS results
3402 in the final iteration. */
3403 unsigned int noutputs_bound = nvectors * nresults;
3404 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3406 noutputs_bound /= 2;
3407 unsigned int limit = MIN (noutputs_bound, nvectors);
3408 for (unsigned int i = 0; i < limit; ++i)
3410 if ((i & 1) != 0
3411 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3412 2 * in_repeat))
3414 pieces[out_start + i] = pieces[out_start + i - 1];
3415 continue;
3418 tree output = make_ssa_name (new_vector_type);
3419 tree input1 = pieces[in_start + (i / 2)];
3420 tree input2 = pieces[in_start + (i / 2) + hi_start];
3421 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3422 input1, input2,
3423 permutes[i & 1]);
3424 gimple_seq_add_stmt (seq, stmt);
3425 pieces[out_start + i] = output;
3427 std::swap (in_start, out_start);
3430 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3431 results.reserve (nresults);
3432 for (unsigned int i = 0; i < nresults; ++i)
3433 if (i < nvectors)
3434 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3435 pieces[in_start + i]));
3436 else
3437 results.quick_push (results[i - nvectors]);
3441 /* For constant and loop invariant defs of SLP_NODE this function returns
3442 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3443 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3444 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3448 static void
3449 vect_get_constant_vectors (tree op, slp_tree slp_node,
3450 vec<tree> *vec_oprnds,
3451 unsigned int op_num, unsigned int number_of_vectors)
3453 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3454 gimple *stmt = stmts[0];
3455 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3456 unsigned HOST_WIDE_INT nunits;
3457 tree vec_cst;
3458 unsigned j, number_of_places_left_in_vector;
3459 tree vector_type;
3460 tree vop;
3461 int group_size = stmts.length ();
3462 unsigned int vec_num, i;
3463 unsigned number_of_copies = 1;
3464 vec<tree> voprnds;
3465 voprnds.create (number_of_vectors);
3466 bool constant_p, is_store;
3467 tree neutral_op = NULL;
3468 enum tree_code code = gimple_expr_code (stmt);
3469 gimple_seq ctor_seq = NULL;
3470 auto_vec<tree, 16> permute_results;
3472 /* Check if vector type is a boolean vector. */
3473 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3474 && vect_mask_constant_operand_p (stmt, op_num))
3475 vector_type
3476 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3477 else
3478 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3480 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3482 is_store = true;
3483 op = gimple_assign_rhs1 (stmt);
3485 else
3486 is_store = false;
3488 gcc_assert (op);
3490 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3491 created vectors. It is greater than 1 if unrolling is performed.
3493 For example, we have two scalar operands, s1 and s2 (e.g., group of
3494 strided accesses of size two), while NUNITS is four (i.e., four scalars
3495 of this type can be packed in a vector). The output vector will contain
3496 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3497 will be 2).
3499 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3500 containing the operands.
3502 For example, NUNITS is four as before, and the group size is 8
3503 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3504 {s5, s6, s7, s8}. */
3506 /* When using duplicate_and_interleave, we just need one element for
3507 each scalar statement. */
3508 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3509 nunits = group_size;
3511 number_of_copies = nunits * number_of_vectors / group_size;
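/* Continuing the examples above (illustrative): for NUNITS == 4,
   GROUP_SIZE == 2 and one vector to create this gives two copies
   ({s1, s2, s1, s2}); for GROUP_SIZE == 8 and two vectors it gives
   one copy spread over {s1, ..., s4} and {s5, ..., s8}. */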
3513 number_of_places_left_in_vector = nunits;
3514 constant_p = true;
3515 tree_vector_builder elts (vector_type, nunits, 1);
3516 elts.quick_grow (nunits);
3517 bool place_after_defs = false;
3518 for (j = 0; j < number_of_copies; j++)
3520 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3522 if (is_store)
3523 op = gimple_assign_rhs1 (stmt);
3524 else
3526 switch (code)
3528 case COND_EXPR:
3530 tree cond = gimple_assign_rhs1 (stmt);
3531 if (TREE_CODE (cond) == SSA_NAME)
3532 op = gimple_op (stmt, op_num + 1);
3533 else if (op_num == 0 || op_num == 1)
3534 op = TREE_OPERAND (cond, op_num);
3535 else
3537 if (op_num == 2)
3538 op = gimple_assign_rhs2 (stmt);
3539 else
3540 op = gimple_assign_rhs3 (stmt);
3543 break;
3545 case CALL_EXPR:
3546 op = gimple_call_arg (stmt, op_num);
3547 break;
3549 case LSHIFT_EXPR:
3550 case RSHIFT_EXPR:
3551 case LROTATE_EXPR:
3552 case RROTATE_EXPR:
3553 op = gimple_op (stmt, op_num + 1);
3554 /* Unlike the other binary operators, shifts/rotates take a
3555 shift count of type int, instead of the same type as
3556 the lhs, so make sure the scalar is the right type if
3557 we are dealing with vectors of
3558 long long/long/short/char. */
3559 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3560 op = fold_convert (TREE_TYPE (vector_type), op);
3561 break;
3563 default:
3564 op = gimple_op (stmt, op_num + 1);
3565 break;
3569 /* Create 'vect_ = {op0,op1,...,opn}'. */
3570 number_of_places_left_in_vector--;
3571 tree orig_op = op;
3572 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3574 if (CONSTANT_CLASS_P (op))
3576 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3578 /* Can't use VIEW_CONVERT_EXPR for booleans because
3579 of possibly different sizes of scalar value and
3580 vector element. */
3581 if (integer_zerop (op))
3582 op = build_int_cst (TREE_TYPE (vector_type), 0);
3583 else if (integer_onep (op))
3584 op = build_all_ones_cst (TREE_TYPE (vector_type));
3585 else
3586 gcc_unreachable ();
3588 else
3589 op = fold_unary (VIEW_CONVERT_EXPR,
3590 TREE_TYPE (vector_type), op);
3591 gcc_assert (op && CONSTANT_CLASS_P (op));
3593 else
3595 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3596 gimple *init_stmt;
3597 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3599 tree true_val
3600 = build_all_ones_cst (TREE_TYPE (vector_type));
3601 tree false_val
3602 = build_zero_cst (TREE_TYPE (vector_type));
3603 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3604 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3605 op, true_val,
3606 false_val);
3608 else
3610 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3611 op);
3612 init_stmt
3613 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3614 op);
3616 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3617 op = new_temp;
3620 elts[number_of_places_left_in_vector] = op;
3621 if (!CONSTANT_CLASS_P (op))
3622 constant_p = false;
3623 if (TREE_CODE (orig_op) == SSA_NAME
3624 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3625 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3626 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3627 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3628 place_after_defs = true;
3630 if (number_of_places_left_in_vector == 0)
3632 if (constant_p
3633 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3634 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3635 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3636 else
3638 if (vec_oprnds->is_empty ())
3639 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3640 number_of_vectors,
3641 permute_results);
3642 vec_cst = permute_results[number_of_vectors - j - 1];
3644 tree init;
3645 gimple_stmt_iterator gsi;
3646 if (place_after_defs)
3648 gsi = gsi_for_stmt
3649 (vect_find_last_scalar_stmt_in_slp (slp_node));
3650 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3652 else
3653 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3654 if (ctor_seq != NULL)
3656 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3657 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3658 ctor_seq = NULL;
3660 voprnds.quick_push (init);
3661 place_after_defs = false;
3662 number_of_places_left_in_vector = nunits;
3663 constant_p = true;
3664 elts.new_vector (vector_type, nunits, 1);
3665 elts.quick_grow (nunits);
3670 /* Since the vectors are created in the reverse order, we should invert
3671 them. */
3672 vec_num = voprnds.length ();
3673 for (j = vec_num; j != 0; j--)
3675 vop = voprnds[j - 1];
3676 vec_oprnds->quick_push (vop);
3679 voprnds.release ();
3681 /* If VF is greater than the unrolling factor needed for the SLP
3682 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3683 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3684 to replicate the vectors. */
3685 while (number_of_vectors > vec_oprnds->length ())
3687 tree neutral_vec = NULL;
3689 if (neutral_op)
3691 if (!neutral_vec)
3692 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3694 vec_oprnds->quick_push (neutral_vec);
3696 else
3698 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3699 vec_oprnds->quick_push (vop);
3705 /* Get vectorized definitions from SLP_NODE that contains corresponding
3706 vectorized def-stmts. */
3708 static void
3709 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3711 tree vec_oprnd;
3712 gimple *vec_def_stmt;
3713 unsigned int i;
3715 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3717 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3719 gcc_assert (vec_def_stmt);
3720 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3721 vec_oprnd = gimple_phi_result (vec_def_stmt);
3722 else
3723 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3724 vec_oprnds->quick_push (vec_oprnd);


/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them
   and call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must already be vectorized and the vectorized
   stmts must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
                   vec<vec<tree> > *vec_oprnds)
{
  gimple *first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
         vect_get_constant_vectors () and do not advance CHILD_INDEX, in
         order to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
            {
              gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
              gimple *related
                = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
              tree first_def_op;

              if (gimple_code (first_def) == GIMPLE_PHI)
                first_def_op = gimple_phi_result (first_def);
              else
                first_def_op = gimple_get_lhs (first_def);
              if (operand_equal_p (oprnd, first_def_op, 0)
                  || (related
                      && operand_equal_p (oprnd,
                                          gimple_get_lhs (related), 0)))
                {
                  /* The number of vector defs is determined by the number of
                     vector statements in the node from which we get those
                     statements.  */
                  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
                  vectorized_defs = true;
                  child_index++;
                }
            }
          else
            child_index++;
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects);

      vec_oprnds->quick_push (vec_defs);
    }
}
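
/* Worked example for the size fix-up above (illustrative): for a narrowing
   statement such as  short_x = (short) int_y  the number of vector stmts
   was computed from the 2-byte LHS, but the 4-byte RHS operand fills fewer
   elements per vector and therefore needs more vectors:

     number_of_vects = number_of_vects * rhs_size_unit / lhs_size_unit,
     e.g.  2 * 4 / 2 == 4,

   so two result vectors of shorts consume four operand vectors of ints
   (assuming equal vector size in bytes).  */
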
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              slp_instance slp_node_instance, bool analyze_only,
                              unsigned *n_perms)
{
  gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  unsigned int mask_element;
  machine_mode mode;
  unsigned HOST_WIDE_INT nunits, const_vf;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));

  mode = TYPE_MODE (vectype);

  /* At the moment, all permutations are represented using per-element
     indices, so we can't cope with variable vector lengths or
     vectorization factors.  */
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
      || !vf.is_constant (&const_vf))
    return false;

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  vec_perm_builder mask (nunits, nunits, 1);
  mask.quick_grow (nunits);
  vec_perm_indices indices;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
         i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     for c's: c0c0c0c1 c1c1c2c2 c2c3c3c3

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for a permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  for (unsigned int j = 0; j < const_vf; j++)
    {
      for (int k = 0; k < group_size; k++)
        {
          unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
                            + j * STMT_VINFO_GROUP_SIZE (stmt_info));
          vec_index = i / nunits;
          mask_element = i % nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
              mask_element += nunits;
            }
          else
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "permutation requires at "
                                   "least three vectors ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    stmt, 0);
                }
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * nunits);
          if (mask_element != index)
            noop_p = false;
          mask[index++] = mask_element;

          if (index == nunits && !noop_p)
            {
              indices.new_vector (mask, 2, nunits);
              if (!can_vec_perm_const_p (mode, indices))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        {
                          dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                          dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                        }
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  gcc_assert (analyze_only);
                  return false;
                }

              ++*n_perms;
            }

          if (index == nunits)
            {
              if (!analyze_only)
                {
                  tree mask_vec = NULL_TREE;

                  if (! noop_p)
                    mask_vec = vec_perm_indices_to_tree (mask_type, indices);

                  if (second_vec_index == -1)
                    second_vec_index = first_vec_index;

                  /* Generate the permute statement if necessary.  */
                  tree first_vec = dr_chain[first_vec_index];
                  tree second_vec = dr_chain[second_vec_index];
                  gimple *perm_stmt;
                  if (! noop_p)
                    {
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      perm_stmt = gimple_build_assign (perm_dest,
                                                       VEC_PERM_EXPR,
                                                       first_vec, second_vec,
                                                       mask_vec);
                      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
                    }
                  else
                    /* If mask was NULL_TREE generate the requested
                       identity transform.  */
                    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
                }

              index = 0;
              first_vec_index = -1;
              second_vec_index = -1;
              noop_p = true;
            }
        }
    }

  return true;
}
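
/* Standalone sketch (not GCC code) of the mask computation above, using the
   a0b0c0a1b1c1... example from the comment: nunits == 4, an interleaving
   group of size 3, const_vf == 4 and a load permutation of {0} for the
   a-node.  It is simplified to the two-vector case; the real code also
   tracks SECOND_VEC_INDEX and rejects permutations that would need a third
   vector.  All names are invented for illustration:

     #include <stdio.h>

     int
     main (void)
     {
       const unsigned nunits = 4, group_size = 3, vf = 4, perm = 0;
       unsigned index = 0;
       int first_vec = -1;

       for (unsigned j = 0; j < vf; j++)
         for (unsigned k = 0; k < group_size; k++)
           {
             unsigned i = perm + j * group_size;
             int vec = i / nunits;
             unsigned elt = i % nunits;
             if (vec == first_vec || first_vec == -1)
               first_vec = vec;
             else
               elt += nunits;
             printf ("%u%s", elt, ++index % nunits ? "," : "\n");
             if (index % nunits == 0)
               first_vec = -1;
           }
       return 0;
     }

   This prints 0,0,0,3 then 3,3,6,6 then 2,5,5,5: exactly the masks derived
   in the comment, including the conversion of the illegal {6,9,9,9} into
   {2,5,5,5} once a fresh first vector is chosen for the last mask.  */
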
typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            scalar_stmts_to_slp_tree_map_t *bst_map)
{
  gimple *stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return false;

  /* See if we have already vectorized the same set of stmts and reuse their
     vectorized stmts.  */
  slp_tree &leader
    = bst_map->get_or_insert (SLP_TREE_SCALAR_STMTS (node).copy ());
  if (leader)
    {
      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (leader));
      return false;
    }

  leader = node;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, bst_map);

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
    }

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      gimple *ostmt;
      vec_perm_builder mask (group_size, group_size, 1);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
        if (gimple_assign_rhs_code (ostmt) != code0)
          {
            mask.quick_push (1);
            ocode = gimple_assign_rhs_code (ostmt);
          }
        else
          mask.quick_push (0);
      if (ocode != ERROR_MARK)
        {
          vec<gimple *> v0;
          vec<gimple *> v1;
          unsigned j;
          tree tmask = NULL_TREE;
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
          vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          tree meltype = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
          tree mvectype = get_same_sized_vectype (meltype, vectype);
          unsigned k = 0, l;
          for (j = 0; j < v0.length (); ++j)
            {
              /* Enforced by vect_build_slp_tree, which rejects variable-length
                 vectors for SLP_TREE_TWO_OPERATORS.  */
              unsigned int const_nunits = nunits.to_constant ();
              tree_vector_builder melts (mvectype, const_nunits, 1);
              for (l = 0; l < const_nunits; ++l)
                {
                  if (k >= group_size)
                    k = 0;
                  tree t = build_int_cst (meltype,
                                          mask[k++] * const_nunits + l);
                  melts.quick_push (t);
                }
              tmask = melts.build ();

              /* ??? Not all targets support a VEC_PERM_EXPR with a
                 constant mask that would translate to a vec_merge RTX
                 (with their vec_perm_const_ok).  We can either not
                 vectorize in that case or let veclower do its job.
                 Unfortunately that isn't too great and at least for
                 plus/minus we'd eventually like to match targets'
                 vector addsub instructions.  */
              gimple *vstmt;
              vstmt = gimple_build_assign (make_ssa_name (vectype),
                                           VEC_PERM_EXPR,
                                           gimple_assign_lhs (v0[j]),
                                           gimple_assign_lhs (v1[j]), tmask);
              vect_finish_stmt_generation (stmt, vstmt, &si);
              SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
            }
          v0.release ();
          v1.release ();
          return false;
        }
    }
  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;

  return is_store;
}
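
/* Worked example for the SLP_TREE_TWO_OPERATORS merge above (illustrative):
   for an addsub-style group { a0 + b0, a1 - b1 } with group_size == 2 and
   const_nunits == 4, the scalar selector mask is {0,1} and the permute
   elements come out as

     melts[l] = mask[k] * const_nunits + l
              = { 0*4+0, 1*4+1, 0*4+2, 1*4+3 }
              = { 0, 5, 2, 7 },

   i.e. the VEC_PERM_EXPR takes the even elements from the PLUS vectors in
   V0 and the odd elements from the MINUS vectors in V1.  */
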
/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until the end of vect_schedule_slp, because
   multiple SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple *stmt, *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}
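
/* For example (illustrative): after the replacement loop above, a scalar
   call whose result has been vectorized, such as

     x_1 = foo (a_2);

   becomes

     x_1 = 0;

   leaving a trivially dead assignment for later DCE instead of the call.  */
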
/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      scalar_stmts_to_slp_tree_map_t *bst_map
        = new scalar_stmts_to_slp_tree_map_t ();
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, bst_map);
      delete bst_map;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple *store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}