/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
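      /* The extra operand slot accounts for the comparison embedded in
         the COND_EXPR condition, which vect_get_and_check_slp_defs
         handles as two separate operands.  */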
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
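
/* For instance, for an interleaved group of loads from A[0], A[2] and
   A[3] (so the DR_GROUP_GAPs of the second and third elements are 2
   and 1), the function above returns 0, 2 and 3 respectively.  */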

/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
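
/* For example, with NELTS == 4 the loop above builds the two interleave
   masks { 0, 4, 1, 5 } and { 2, 6, 3, 7 }: applied to input vectors
   { a0, a1, a2, a3 } and { b0, b1, b2, b3 } they produce the low and
   high interleaves { a0, b0, a1, b1 } and { a2, b2, a3, b3 }.  */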

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];
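          /* E.g. *swap == 1 exchanges the two comparison operands
             (map {1, 0, 2, 3}) while *swap == 2 exchanges the then/else
             arms of the COND_EXPR (map {0, 1, 3, 2}).  */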
          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !current_vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: cannot swap operands of "
                             "shared stmt %G", stmt_info->stmt);
          return -1;
        }

      if (first_op_cond)
        {
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          unsigned int op = commutative_op + first_op_idx;
          swap_ssa_operands (stmt_info->stmt,
                             gimple_op_ptr (stmt_info->stmt, op),
                             gimple_op_ptr (stmt_info->stmt, op + 1));
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: unrolling required "
                       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
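
/* For example, a group { a + b, a - b, c + d, c - d } with a 4-element
   vector and ALT_STMT_CODE == MINUS_EXPR asks for the mask
   { 0, 5, 2, 7 }, picking the even lanes from the PLUS vector and the
   odd lanes from the MINUS vector.  */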

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if ((gimple_call_internal_p (call_stmt)
               && (!vectorizable_internal_fn_p
                   (gimple_call_internal_fn (call_stmt))))
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
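
/* For instance, a group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } passes
   the checks above with ALT_STMT_CODE == MINUS_EXPR: the mismatched
   stmts are accepted via the plus/minus exception and *TWO_OPERATORS
   is set, so both operations are computed and later blended with the
   permute verified by vect_two_operations_perm_ok_p.  */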

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
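
/* The map is keyed by the vector of scalar stmts (hashed via their
   gimple UIDs) and records the SLP node built for that exact stmt set,
   or NULL if building it failed; this lets vect_build_slp_tree re-use
   both successful and failed builds.  */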

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        (*leader)->refcnt++;
      return *leader;
    }
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
                                        matches, npermutes, tree_size,
                                        max_tree_size, bst_map);
  /* Keep a reference for the bst_map use.  */
  if (res)
    res->refcnt++;
  bst_map->put (stmts.copy (), res);
  return res;
}
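
/* Note the map ends up owning one reference to each node it records;
   vect_analyze_slp_instance drops those references again when it
   destroys the map.  */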

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return a new SLP tree built for the defs; MATCHES indicates which
   scalar stmts failed to match the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (REDUC_GROUP_FIRST_ELEMENT (other_info)
                  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
                continue;
              if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
                return NULL;
            }
        }
      else
        return NULL;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                             vect_location,
                             "Build SLP failed: SLP tree too large\n");
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child, false);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Do so only if the number of not successful permutes was
             no more than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                  /* Verify if we can safely swap or if we committed to a
                     specific operand order already.
                     ??? Instead of modifying GIMPLE stmts here we could
                     record whether we want to swap operands in the SLP
                     node and temporarily do that when processing it
                     (or wrap operand accessors in a helper).  */
                  else if (swap[j] != 0
                           || STMT_VINFO_NUM_SLP_USES (stmt_info))
                    {
                      if (!swap_not_matching)
                        {
                          if (dump_enabled_p ())
                            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                             vect_location,
                                             "Build SLP failed: cannot swap "
                                             "operands of shared stmt %G",
                                             stmts[j]->stmt);
                          goto fail;
                        }
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size, bst_map)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
              for (j = 0; j < group_size; ++j)
                if (matches[j] == !swap_not_matching)
                  {
                    gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
                    /* Avoid swapping operands twice.  */
                    if (gimple_plf (stmt, GF_PLF_1))
                      continue;
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                    gimple_set_plf (stmt, GF_PLF_1, true);
                  }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
                              == (matches[j] == !swap_not_matching));

              /* If we have all children of child built up from scalars then
                 just throw that away and build it up this node from
                 scalars.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j,
                                        grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "Building parent vector operands from "
                                       "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  dump_printf_loc (dump_kind, loc, "node%s %p\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "", node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    dump_printf_loc (dump_kind, loc, "\tstmt %d %G", i, stmt_info->stmt);
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (dump_kind, loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  vec<stmt_vec_info> tmp_stmts;
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    tmp_stmts[permutation[i]] = stmt_info;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
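
/* As an example, a load permutation { 2, 0, 1 } on a group of size 3
   passes the checks above (each index distinct and below the group
   size), so the scalar stmts of the whole instance get reordered and
   the loads can then be emitted in memory order.  */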

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
          && STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
              group_info = DR_GROUP_FIRST_ELEMENT (group_info);
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of DR_GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "BB vectorization with gaps at the end of "
                                   "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
                             LOOP_VINFO_VECT_FACTOR
                             (STMT_VINFO_LOOP_VINFO (stmt_info)));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
                                          slp_instn, true, &n_perms))
      return false;

  return true;
}

/* Find the last store in SLP INSTANCE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}

/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group
     too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
                     group1_size, group2_size);

  return group2;
}
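
/* E.g. splitting a group of 8 consecutive stores with GROUP1_SIZE == 4
   leaves two groups of 4 where the gap of each group skips the 4
   elements of the other, keeping the overall stride of 8 intact.  */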

/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
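
/* For example, two-element vectors and a group of three stmts give
   common_multiple (2, 3) == 6 and thus an unrolling factor of 2.  */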
1875 /* Analyze an SLP instance starting from a group of grouped stores. Call
1876 vect_build_slp_tree to build a tree of packed stmts if possible.
1877 Return FALSE if it's impossible to SLP any stmt in the loop. */
1879 static bool
1880 vect_analyze_slp_instance (vec_info *vinfo,
1881 stmt_vec_info stmt_info, unsigned max_tree_size)
1883 slp_instance new_instance;
1884 slp_tree node;
1885 unsigned int group_size;
1886 tree vectype, scalar_type = NULL_TREE;
1887 unsigned int i;
1888 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1889 vec<stmt_vec_info> scalar_stmts;
1891 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1893 scalar_type = TREE_TYPE (DR_REF (dr));
1894 vectype = get_vectype_for_scalar_type (scalar_type);
1895 group_size = DR_GROUP_SIZE (stmt_info);
1897 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1899 gcc_assert (is_a <loop_vec_info> (vinfo));
1900 vectype = STMT_VINFO_VECTYPE (stmt_info);
1901 group_size = REDUC_GROUP_SIZE (stmt_info);
1903 else
1905 gcc_assert (is_a <loop_vec_info> (vinfo));
1906 vectype = STMT_VINFO_VECTYPE (stmt_info);
1907 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1910 if (!vectype)
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "Build SLP failed: unsupported data-type %T\n",
1915 scalar_type);
1917 return false;
1919 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1921 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1922 scalar_stmts.create (group_size);
1923 stmt_vec_info next_info = stmt_info;
1924 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1926 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1927 while (next_info)
1929 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1930 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
1933 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1935 /* Collect the reduction stmts and store them in
1936 SLP_TREE_SCALAR_STMTS. */
1937 while (next_info)
1939 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1940 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
1942 /* Mark the first element of the reduction chain as reduction to properly
1943 transform the node. In the reduction analysis phase only the last
1944 element of the chain is marked as reduction. */
1945 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
1947 else
1949 /* Collect reduction statements. */
1950 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1951 for (i = 0; reductions.iterate (i, &next_info); i++)
1952 scalar_stmts.safe_push (next_info);
1955 /* Build the tree for the SLP instance. */
1956 bool *matches = XALLOCAVEC (bool, group_size);
1957 unsigned npermutes = 0;
1958 scalar_stmts_to_slp_tree_map_t *bst_map
1959 = new scalar_stmts_to_slp_tree_map_t ();
1960 poly_uint64 max_nunits = nunits;
1961 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1962 &max_nunits, matches, &npermutes,
1963 NULL, max_tree_size, bst_map);
1964 /* The map keeps a reference on each SLP node built; release those. */
1965 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1966 it != bst_map->end (); ++it)
1967 if ((*it).second)
1968 vect_free_slp_tree ((*it).second, false);
1969 delete bst_map;
1970 if (node != NULL)
1972 /* Calculate the unrolling factor based on the smallest type. */
1973 poly_uint64 unrolling_factor
1974 = calculate_unrolling_factor (max_nunits, group_size);
1976 if (maybe_ne (unrolling_factor, 1U)
1977 && is_a <bb_vec_info> (vinfo))
1979 unsigned HOST_WIDE_INT const_max_nunits;
1980 if (!max_nunits.is_constant (&const_max_nunits)
1981 || const_max_nunits > group_size)
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "Build SLP failed: store group "
1986 "size not a multiple of the vector size "
1987 "in basic block SLP\n");
1988 vect_free_slp_tree (node, false);
1989 return false;
1991 /* Fatal mismatch. */
1992 matches[group_size / const_max_nunits * const_max_nunits] = false;
1993 vect_free_slp_tree (node, false);
1995 else
1997 /* Create a new SLP instance. */
1998 new_instance = XNEW (struct _slp_instance);
1999 SLP_INSTANCE_TREE (new_instance) = node;
2000 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2001 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2002 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2003 vect_gather_slp_loads (new_instance, node);
2005 /* Compute the load permutation. */
2006 slp_tree load_node;
2007 bool loads_permuted = false;
2008 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2010 vec<unsigned> load_permutation;
2011 int j;
2012 stmt_vec_info load_info;
2013 bool this_load_permuted = false;
2014 load_permutation.create (group_size);
2015 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2016 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2017 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2019 int load_place = vect_get_place_in_interleaving_chain
2020 (load_info, first_stmt_info);
2021 gcc_assert (load_place != -1);
2022 if (load_place != j)
2023 this_load_permuted = true;
2024 load_permutation.safe_push (load_place);
2026 if (!this_load_permuted
2027 /* The load requires permutation when unrolling exposes
2028 a gap either because the group is larger than the SLP
2029 group-size or because there is a gap between the groups. */
2030 && (known_eq (unrolling_factor, 1U)
2031 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2032 && DR_GROUP_GAP (first_stmt_info) == 0)))
2034 load_permutation.release ();
2035 continue;
2037 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2038 loads_permuted = true;
2041 if (loads_permuted)
2043 if (!vect_supported_load_permutation_p (new_instance))
2045 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "Build SLP failed: unsupported load "
2048 "permutation %G", stmt_info->stmt);
2049 vect_free_slp_instance (new_instance, false);
2050 return false;
2054 /* If the loads and stores can be handled with load/store-lane
2055 instructions do not generate this SLP instance. */
2056 if (is_a <loop_vec_info> (vinfo)
2057 && loads_permuted
2058 && dr && vect_store_lanes_supported (vectype, group_size, false))
2060 slp_tree load_node;
2061 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2063 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2064 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2065 /* Use SLP for strided accesses (or if we can't use load-lanes). */
2066 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2067 || ! vect_load_lanes_supported
2068 (STMT_VINFO_VECTYPE (stmt_vinfo),
2069 DR_GROUP_SIZE (stmt_vinfo), false))
2070 break;
2072 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "Built SLP cancelled: can use "
2077 "load/store-lanes\n");
2078 vect_free_slp_instance (new_instance, false);
2079 return false;
2083 vinfo->slp_instances.safe_push (new_instance);
2085 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_NOTE, vect_location,
2088 "Final SLP tree for instance:\n");
2089 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2092 return true;
2095 else
2097 /* Failed to SLP. */
2098 /* Free the allocated memory. */
2099 scalar_stmts.release ();
2102 /* For basic block SLP, try to break the group up into multiples of the
2103 vector size. */
2104 unsigned HOST_WIDE_INT const_nunits;
2105 if (is_a <bb_vec_info> (vinfo)
2106 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2107 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2108 && nunits.is_constant (&const_nunits))
2110 /* We consider breaking the group only on VF boundaries from the existing
2111 start. */
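/* For example (illustrative): with CONST_NUNITS == 4, GROUP_SIZE == 12
   and the first mismatch at i == 6, GROUP1_SIZE becomes 4; stmts 0..3
   are retried as their own group, the vector containing the mismatch
   (stmts 4..7) is split off and skipped, and stmts 8..11 are retried
   as well.  */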
2112 for (i = 0; i < group_size; i++)
2113 if (!matches[i]) break;
2115 if (i >= const_nunits && i < group_size)
2117 /* Split into two groups at the first vector boundary before i. */
2118 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2119 unsigned group1_size = i & ~(const_nunits - 1);
2121 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2122 group1_size);
2123 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2124 max_tree_size);
2125 /* If the first non-match was in the middle of a vector,
2126 skip the rest of that vector. */
2127 if (group1_size < i)
2129 i = group1_size + const_nunits;
2130 if (i < group_size)
2131 rest = vect_split_slp_store_group (rest, const_nunits);
2133 if (i < group_size)
2134 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2135 return res;
2137 /* Even though the first vector did not all match, we might be able to SLP
2138 (some) of the remainder. FORNOW ignore this possibility. */
2141 return false;
2145 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2146 trees of packed scalar stmts if SLP is possible. */
2148 opt_result
2149 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2151 unsigned int i;
2152 stmt_vec_info first_element;
2154 DUMP_VECT_SCOPE ("vect_analyze_slp");
2156 /* Find SLP sequences starting from groups of grouped stores. */
2157 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2158 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2160 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2162 if (loop_vinfo->reduction_chains.length () > 0)
2164 /* Find SLP sequences starting from reduction chains. */
2165 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2166 if (! vect_analyze_slp_instance (vinfo, first_element,
2167 max_tree_size))
2169 /* Dissolve reduction chain group. */
2170 stmt_vec_info vinfo = first_element;
2171 while (vinfo)
2173 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2174 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2175 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2176 vinfo = next;
2178 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2182 /* Find SLP sequences starting from groups of reductions. */
2183 if (loop_vinfo->reductions.length () > 1)
2184 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2185 max_tree_size);
2188 return opt_result::success ();
2192 /* For each possible SLP instance decide whether to SLP it and calculate the
2193 overall unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2194 least one instance. */
2196 bool
2197 vect_make_slp_decision (loop_vec_info loop_vinfo)
2199 unsigned int i;
2200 poly_uint64 unrolling_factor = 1;
2201 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2202 slp_instance instance;
2203 int decided_to_slp = 0;
2205 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2207 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2209 /* FORNOW: SLP if you can. */
2210 /* All unroll factors have the form current_vector_size * X for some
2211 rational X, so they must have a common multiple. */
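/* For example (illustrative): instances with unrolling factors 2 and 3
   force an overall unrolling factor of 6.  */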
2212 unrolling_factor
2213 = force_common_multiple (unrolling_factor,
2214 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2216 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2217 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2218 loop-based vectorization. Such stmts will be marked as HYBRID. */
2219 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2220 decided_to_slp++;
2223 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2225 if (decided_to_slp && dump_enabled_p ())
2227 dump_printf_loc (MSG_NOTE, vect_location,
2228 "Decided to SLP %d instances. Unrolling factor ",
2229 decided_to_slp);
2230 dump_dec (MSG_NOTE, unrolling_factor);
2231 dump_printf (MSG_NOTE, "\n");
2234 return (decided_to_slp > 0);
2238 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2239 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2241 static void
2242 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2243 hash_map<slp_tree, unsigned> &visited)
2245 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2246 imm_use_iterator imm_iter;
2247 gimple *use_stmt;
2248 stmt_vec_info use_vinfo;
2249 slp_tree child;
2250 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2251 int j;
2253 /* We need to union stype over the incoming graph edges but we still
2254 want to limit recursion to stay O(N+E). */
2255 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
2257 /* Propagate hybrid down the SLP tree. */
2258 if (stype == hybrid)
2260 else if (HYBRID_SLP_STMT (stmt_vinfo))
2261 stype = hybrid;
2262 else if (!only_edge)
2264 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2265 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2266 /* If we get a pattern stmt here we have to use the LHS of the
2267 original stmt for immediate uses. */
2268 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2269 tree def;
2270 if (gimple_code (stmt) == GIMPLE_PHI)
2271 def = gimple_phi_result (stmt);
2272 else
2273 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2274 if (def)
2275 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2277 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2278 if (!use_vinfo)
2279 continue;
2280 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2281 if (!STMT_SLP_TYPE (use_vinfo)
2282 && (STMT_VINFO_RELEVANT (use_vinfo)
2283 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2284 && !(gimple_code (use_stmt) == GIMPLE_PHI
2285 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2289 "def in non-SLP stmt: %G", use_stmt);
2290 stype = hybrid;
2295 if (stype == hybrid
2296 && !HYBRID_SLP_STMT (stmt_vinfo))
2298 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2300 stmt_vinfo->stmt);
2301 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2304 if (!only_edge)
2305 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2306 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2307 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2310 static void
2311 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2313 hash_map<slp_tree, unsigned> visited;
2314 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2317 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2319 static tree
2320 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2322 walk_stmt_info *wi = (walk_stmt_info *)data;
2323 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2325 if (wi->is_lhs)
2326 return NULL_TREE;
2328 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2329 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2333 def_stmt_info->stmt);
2334 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2337 return NULL_TREE;
2340 static tree
2341 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2342 walk_stmt_info *wi)
2344 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2345 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2346 /* If the stmt is in a SLP instance then this isn't a reason
2347 to mark use definitions in other SLP instances as hybrid. */
2348 if (! STMT_SLP_TYPE (use_vinfo)
2349 && (STMT_VINFO_RELEVANT (use_vinfo)
2350 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2351 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2352 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2354 else
2355 *handled = true;
2356 return NULL_TREE;
2359 /* Find stmts that must be both vectorized and SLPed. */
2361 void
2362 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2364 unsigned int i;
2365 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2366 slp_instance instance;
2368 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2370 /* First walk all pattern stmts in the loop and mark defs of their uses as
2371 hybrid, because immediate uses in pattern stmts are not recorded. */
2372 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2374 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2375 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2376 gsi_next (&gsi))
2378 gimple *stmt = gsi_stmt (gsi);
2379 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2380 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2382 walk_stmt_info wi;
2383 memset (&wi, 0, sizeof (wi));
2384 wi.info = loop_vinfo;
2385 gimple_stmt_iterator gsi2
2386 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2387 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2388 vect_detect_hybrid_slp_1, &wi);
2389 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2390 vect_detect_hybrid_slp_2,
2391 vect_detect_hybrid_slp_1, &wi);
2396 /* Then walk the SLP instance trees marking stmts with uses in
2397 non-SLP stmts as hybrid, also propagating hybrid down the
2398 SLP tree, collecting the above info on-the-fly. */
2399 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2401 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2402 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2403 i, pure_slp);
2408 /* Initialize a bb_vec_info struct for the statements between
2409 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2411 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2412 gimple_stmt_iterator region_end_in,
2413 vec_info_shared *shared)
2414 : vec_info (vec_info::bb, init_cost (NULL), shared),
2415 bb (gsi_bb (region_begin_in)),
2416 region_begin (region_begin_in),
2417 region_end (region_end_in)
2419 gimple_stmt_iterator gsi;
2421 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2422 gsi_next (&gsi))
2424 gimple *stmt = gsi_stmt (gsi);
2425 gimple_set_uid (stmt, 0);
2426 add_stmt (stmt);
2429 bb->aux = this;
2433 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2434 stmts in the basic block. */
2436 _bb_vec_info::~_bb_vec_info ()
2438 for (gimple_stmt_iterator si = region_begin;
2439 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2440 /* Reset region marker. */
2441 gimple_set_uid (gsi_stmt (si), -1);
2443 bb->aux = NULL;
2446 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2447 given that child nodes have already been processed, and that
2448 their def types currently match their SLP node's def type. */
2450 static bool
2451 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2452 slp_instance node_instance,
2453 stmt_vector_for_cost *cost_vec)
2455 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2456 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2458 /* For BB vectorization vector types are assigned here.
2459 Memory accesses already got their vector type assigned
2460 in vect_analyze_data_refs. */
2461 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2462 if (bb_vinfo
2463 && ! STMT_VINFO_DATA_REF (stmt_info))
2465 tree vectype, nunits_vectype;
2466 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2467 &nunits_vectype))
2468 /* We checked this when building the node. */
2469 gcc_unreachable ();
2470 if (vectype == boolean_type_node)
2472 vectype = vect_get_mask_type_for_stmt (stmt_info);
2473 if (!vectype)
2474 /* vect_get_mask_type_for_stmt has already explained the
2475 failure. */
2476 return false;
2479 stmt_vec_info sstmt_info;
2480 unsigned int i;
2481 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2482 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2485 /* Calculate the number of vector statements to be created for the
2486 scalar stmts in this node. For SLP reductions it is equal to the
2487 number of vector statements in the children (which has already been
2488 calculated by the recursive call). Otherwise it is the number of
2489 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2490 VF divided by the number of elements in a vector. */
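/* For example (illustrative): a group of 4 int stores with VF == 8 and
   8-lane vectors needs (8 * 4) / 8 == 4 vector statements.  */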
2491 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2492 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2493 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2494 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2495 else
2497 poly_uint64 vf;
2498 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2499 vf = loop_vinfo->vectorization_factor;
2500 else
2501 vf = 1;
2502 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2503 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2504 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2505 = vect_get_num_vectors (vf * group_size, vectype);
2508 bool dummy;
2509 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2512 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2513 the subtree. NODE_INSTANCE is the SLP instance that contains NODE, and VINFO contains NODE_INSTANCE.
2515 Return true if the operations are supported. */
2517 static bool
2518 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2519 slp_instance node_instance,
2520 scalar_stmts_to_slp_tree_map_t *visited,
2521 scalar_stmts_to_slp_tree_map_t *lvisited,
2522 stmt_vector_for_cost *cost_vec)
2524 int i, j;
2525 slp_tree child;
2527 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2528 return true;
2530 /* If we already analyzed the exact same set of scalar stmts we're done.
2531 We share the generated vector stmts for those. */
2532 slp_tree *leader;
2533 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2534 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2536 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2537 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2538 return true;
2541 /* The SLP graph is acyclic, so not caching whether we failed or succeeded
2542 doesn't result in any issue since we throw away the lvisited set
2543 when we fail. */
2544 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2546 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2547 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2548 visited, lvisited, cost_vec))
2549 return false;
2551 /* Push SLP node def-type to stmt operands. */
2552 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2553 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2554 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2555 = SLP_TREE_DEF_TYPE (child);
2556 bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2557 cost_vec);
2558 /* Restore def-types. */
2559 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2560 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2561 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2562 = vect_internal_def;
2563 if (! res)
2564 return false;
2566 return true;
2570 /* Analyze statements in SLP instances of VINFO. Return true if the
2571 operations are supported. */
2573 bool
2574 vect_slp_analyze_operations (vec_info *vinfo)
2576 slp_instance instance;
2577 int i;
2579 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2581 scalar_stmts_to_slp_tree_map_t *visited
2582 = new scalar_stmts_to_slp_tree_map_t ();
2583 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2585 scalar_stmts_to_slp_tree_map_t lvisited;
2586 stmt_vector_for_cost cost_vec;
2587 cost_vec.create (2);
2588 if (!vect_slp_analyze_node_operations (vinfo,
2589 SLP_INSTANCE_TREE (instance),
2590 instance, visited, &lvisited,
2591 &cost_vec))
2593 slp_tree node = SLP_INSTANCE_TREE (instance);
2594 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2595 dump_printf_loc (MSG_NOTE, vect_location,
2596 "removing SLP instance operations starting from: %G",
2597 stmt_info->stmt);
2598 vect_free_slp_instance (instance, false);
2599 vinfo->slp_instances.ordered_remove (i);
2600 cost_vec.release ();
2602 else
2604 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2605 x != lvisited.end(); ++x)
2606 visited->put ((*x).first.copy (), (*x).second);
2607 i++;
2609 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2610 cost_vec.release ();
2613 delete visited;
2615 return !vinfo->slp_instances.is_empty ();
2619 /* Compute the scalar cost of the SLP node NODE and its children
2620 and record it in COST_VEC. Do not account defs that are marked in LIFE and
2621 update LIFE according to uses of NODE. */
2623 static void
2624 vect_bb_slp_scalar_cost (basic_block bb,
2625 slp_tree node, vec<bool, va_heap> *life,
2626 stmt_vector_for_cost *cost_vec,
2627 hash_set<slp_tree> &visited)
2629 unsigned i;
2630 stmt_vec_info stmt_info;
2631 slp_tree child;
2633 if (visited.add (node))
2634 return;
2636 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2638 gimple *stmt = stmt_info->stmt;
2639 vec_info *vinfo = stmt_info->vinfo;
2640 ssa_op_iter op_iter;
2641 def_operand_p def_p;
2643 if ((*life)[i])
2644 continue;
2646 /* If there is a non-vectorized use of the defs then the scalar
2647 stmt is kept live in which case we do not account it or any
2648 required defs in the SLP children in the scalar cost. This
2649 way we make the vectorization more costly when compared to
2650 the scalar cost. */
2651 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2653 imm_use_iterator use_iter;
2654 gimple *use_stmt;
2655 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2656 if (!is_gimple_debug (use_stmt))
2658 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2659 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2661 (*life)[i] = true;
2662 BREAK_FROM_IMM_USE_STMT (use_iter);
2666 if ((*life)[i])
2667 continue;
2669 /* Count scalar stmts only once. */
2670 if (gimple_visited_p (stmt))
2671 continue;
2672 gimple_set_visited (stmt, true);
2674 vect_cost_for_stmt kind;
2675 if (STMT_VINFO_DATA_REF (stmt_info))
2677 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2678 kind = scalar_load;
2679 else
2680 kind = scalar_store;
2682 else
2683 kind = scalar_stmt;
2684 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2687 auto_vec<bool, 20> subtree_life;
2688 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2690 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2692 /* Do not directly pass LIFE to the recursive call; copy it to
2693 confine changes in the callee to the current child/subtree. */
2694 subtree_life.safe_splice (*life);
2695 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2696 visited);
2697 subtree_life.truncate (0);
2702 static void
2703 vect_bb_slp_scalar_cost (basic_block bb,
2704 slp_tree node, vec<bool, va_heap> *life,
2705 stmt_vector_for_cost *cost_vec)
2707 hash_set<slp_tree> visited;
2708 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2711 /* Check if vectorization of the basic block is profitable. */
2713 static bool
2714 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2716 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2717 slp_instance instance;
2718 int i;
2719 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2720 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2722 /* Calculate scalar cost. */
2723 stmt_vector_for_cost scalar_costs;
2724 scalar_costs.create (0);
2725 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2727 auto_vec<bool, 20> life;
2728 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2729 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2730 SLP_INSTANCE_TREE (instance),
2731 &life, &scalar_costs);
2733 void *target_cost_data = init_cost (NULL);
2734 add_stmt_costs (target_cost_data, &scalar_costs);
2735 scalar_costs.release ();
2736 unsigned dummy;
2737 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2738 destroy_cost_data (target_cost_data);
2740 /* Unset visited flag. */
2741 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2742 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2743 gimple_set_visited (gsi_stmt (gsi), false);
2745 /* Complete the target-specific cost calculation. */
2746 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2747 &vec_inside_cost, &vec_epilogue_cost);
2749 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2751 if (dump_enabled_p ())
2753 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2754 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2755 vec_inside_cost);
2756 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2757 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2758 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2761 /* Vectorization is profitable if its cost is not more than the cost of the
2762 scalar version. Note that we err on the vector side for equal cost because
2763 the cost estimate is otherwise quite pessimistic (constant uses are
2764 free on the scalar side but cost a load on the vector side for
2765 example). */
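/* For example (illustrative): an inside cost of 4 plus an outside cost
   of 2 against a scalar cost of 6 fails the "6 > 6" test below, so we
   vectorize, erring on the vector side for the tie.  */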
2766 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2767 return false;
2769 return true;
2772 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2773 if so and sets fatal to true if failure is independent of
2774 current_vector_size. */
2776 static bb_vec_info
2777 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2778 gimple_stmt_iterator region_end,
2779 vec<data_reference_p> datarefs, int n_stmts,
2780 bool &fatal, vec_info_shared *shared)
2782 bb_vec_info bb_vinfo;
2783 slp_instance instance;
2784 int i;
2785 poly_uint64 min_vf = 2;
2787 /* The first group of checks is independent of the vector size. */
2788 fatal = true;
2790 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2792 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2794 "not vectorized: too many instructions in "
2795 "basic block.\n");
2796 free_data_refs (datarefs);
2797 return NULL;
2800 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2801 if (!bb_vinfo)
2802 return NULL;
2804 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2805 bb_vinfo->shared->save_datarefs ();
2807 /* Analyze the data references. */
2809 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2811 if (dump_enabled_p ())
2812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2813 "not vectorized: unhandled data-ref in basic "
2814 "block.\n");
2816 delete bb_vinfo;
2817 return NULL;
2820 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2822 if (dump_enabled_p ())
2823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2824 "not vectorized: not enough data-refs in "
2825 "basic block.\n");
2827 delete bb_vinfo;
2828 return NULL;
2831 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2833 if (dump_enabled_p ())
2834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2835 "not vectorized: unhandled data access in "
2836 "basic block.\n");
2838 delete bb_vinfo;
2839 return NULL;
2842 /* If there are no grouped stores in the region there is no need
2843 to continue with pattern recog as vect_analyze_slp will fail
2844 anyway. */
2845 if (bb_vinfo->grouped_stores.is_empty ())
2847 if (dump_enabled_p ())
2848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2849 "not vectorized: no grouped stores in "
2850 "basic block.\n");
2852 delete bb_vinfo;
2853 return NULL;
2856 /* The rest of the analysis below depends on the vector size in some way, so failure is no longer fatal. */
2857 fatal = false;
2859 vect_pattern_recog (bb_vinfo);
2861 /* Check the SLP opportunities in the basic block, analyze and build SLP
2862 trees. */
2863 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2865 if (dump_enabled_p ())
2867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868 "Failed to SLP the basic block.\n");
2869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2870 "not vectorized: failed to find SLP opportunities "
2871 "in basic block.\n");
2874 delete bb_vinfo;
2875 return NULL;
2878 vect_record_base_alignments (bb_vinfo);
2880 /* Analyze and verify the alignment of data references and the
2881 dependence in the SLP instances. */
2882 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2884 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2885 || ! vect_slp_analyze_instance_dependence (instance))
2887 slp_tree node = SLP_INSTANCE_TREE (instance);
2888 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2889 dump_printf_loc (MSG_NOTE, vect_location,
2890 "removing SLP instance operations starting from: %G",
2891 stmt_info->stmt);
2892 vect_free_slp_instance (instance, false);
2893 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2894 continue;
2897 /* Mark all the statements that we want to vectorize as pure SLP and
2898 relevant. */
2899 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2900 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2902 i++;
2904 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2906 delete bb_vinfo;
2907 return NULL;
2910 if (!vect_slp_analyze_operations (bb_vinfo))
2912 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 "not vectorized: bad operation in basic block.\n");
2916 delete bb_vinfo;
2917 return NULL;
2920 /* Cost model: check if the vectorization is worthwhile. */
2921 if (!unlimited_cost_model (NULL)
2922 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "not vectorized: vectorization is not "
2927 "profitable.\n");
2929 delete bb_vinfo;
2930 return NULL;
2933 if (dump_enabled_p ())
2934 dump_printf_loc (MSG_NOTE, vect_location,
2935 "Basic block will be vectorized using SLP\n");
2937 return bb_vinfo;
2941 /* Main entry for the BB vectorizer. Analyze and transform BB, and return
2942 true if anything in the basic block was vectorized. */
2944 bool
2945 vect_slp_bb (basic_block bb)
2947 bb_vec_info bb_vinfo;
2948 gimple_stmt_iterator gsi;
2949 bool any_vectorized = false;
2950 auto_vector_sizes vector_sizes;
2952 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2954 /* Autodetect the first vector size we try. */
2955 current_vector_size = 0;
2956 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2957 unsigned int next_size = 0;
2959 gsi = gsi_start_bb (bb);
2961 poly_uint64 autodetected_vector_size = 0;
2962 while (1)
2964 if (gsi_end_p (gsi))
2965 break;
2967 gimple_stmt_iterator region_begin = gsi;
2968 vec<data_reference_p> datarefs = vNULL;
2969 int insns = 0;
2971 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2973 gimple *stmt = gsi_stmt (gsi);
2974 if (is_gimple_debug (stmt))
2975 continue;
2976 insns++;
2978 if (gimple_location (stmt) != UNKNOWN_LOCATION)
2979 vect_location = stmt;
2981 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
2982 break;
2985 /* Skip leading unhandled stmts. */
2986 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
2988 gsi_next (&gsi);
2989 continue;
2992 gimple_stmt_iterator region_end = gsi;
2994 bool vectorized = false;
2995 bool fatal = false;
2996 vec_info_shared shared;
2997 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
2998 datarefs, insns, fatal, &shared);
2999 if (bb_vinfo
3000 && dbg_cnt (vect_slp))
3002 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3005 bb_vinfo->shared->check_datarefs ();
3006 vect_schedule_slp (bb_vinfo);
3008 unsigned HOST_WIDE_INT bytes;
3009 if (current_vector_size.is_constant (&bytes))
3010 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3011 "basic block part vectorized using %wu byte "
3012 "vectors\n", bytes);
3013 else
3014 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3015 "basic block part vectorized using variable "
3016 "length vectors\n");
3018 vectorized = true;
3020 delete bb_vinfo;
3022 any_vectorized |= vectorized;
3024 if (next_size == 0)
3025 autodetected_vector_size = current_vector_size;
3027 if (next_size < vector_sizes.length ()
3028 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3029 next_size += 1;
3031 if (vectorized
3032 || next_size == vector_sizes.length ()
3033 || known_eq (current_vector_size, 0U)
3034 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3035 vector sizes will fail do not bother iterating. */
3036 || fatal)
3038 if (gsi_end_p (region_end))
3039 break;
3041 /* Skip the unhandled stmt. */
3042 gsi_next (&gsi);
3044 /* And reset vector sizes. */
3045 current_vector_size = 0;
3046 next_size = 0;
3048 else
3050 /* Try the next biggest vector size. */
3051 current_vector_size = vector_sizes[next_size++];
3052 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE, vect_location,
3055 "***** Re-trying analysis with "
3056 "vector size ");
3057 dump_dec (MSG_NOTE, current_vector_size);
3058 dump_printf (MSG_NOTE, "\n");
3061 /* Start over. */
3062 gsi = region_begin;
3066 return any_vectorized;
3070 /* Return true if the vector type of the boolean constant that is the OPNUM
3071 operand of statement STMT_VINFO is a boolean vector. */
3073 static bool
3074 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, int opnum)
3076 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3077 tree op, vectype;
3078 enum vect_def_type dt;
3080 /* For comparisons and COND_EXPRs the type is chosen depending
3081 on the other comparison operand. */
3082 if (TREE_CODE_CLASS (code) == tcc_comparison)
3084 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3085 if (opnum)
3086 op = gimple_assign_rhs1 (stmt);
3087 else
3088 op = gimple_assign_rhs2 (stmt);
3090 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3091 gcc_unreachable ();
3093 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3096 if (code == COND_EXPR)
3098 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3099 tree cond = gimple_assign_rhs1 (stmt);
3101 if (TREE_CODE (cond) == SSA_NAME)
3102 op = cond;
3103 else if (opnum)
3104 op = TREE_OPERAND (cond, 1);
3105 else
3106 op = TREE_OPERAND (cond, 0);
3108 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3109 gcc_unreachable ();
3111 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3114 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3117 /* Build a variable-length vector in which the elements in ELTS are repeated
3118 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3119 RESULTS and add any new instructions to SEQ.
3121 The approach we use is:
3123 (1) Find a vector mode VM with integer elements of mode IM.
3125 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3126 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3127 from small vectors to IM.
3129 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3131 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3132 correct byte contents.
3134 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3136 We try to find the largest IM for which this sequence works, in order
3137 to cut down on the number of interleaves. */
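/* A sketch of the idea (illustrative, the mode choice depends on the
   target): for four 16-bit elements {a,b,c,d}, IM could be SImode with
   NVECTORS == 2. Step (2) packs {a,b} and {c,d} into two SImode values
   AB and CD, step (3) duplicates them into {AB,AB,...} and {CD,CD,...},
   and one interleave in step (4) yields {AB,CD,AB,CD,...}, whose
   elements read a,b,c,d,a,b,c,d,... as required.  */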
3139 void
3140 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3141 unsigned int nresults, vec<tree> &results)
3143 unsigned int nelts = elts.length ();
3144 tree element_type = TREE_TYPE (vector_type);
3146 /* (1) Find a vector mode VM with integer elements of mode IM. */
3147 unsigned int nvectors = 1;
3148 tree new_vector_type;
3149 tree permutes[2];
3150 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3151 &nvectors, &new_vector_type,
3152 permutes))
3153 gcc_unreachable ();
3155 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3156 unsigned int partial_nelts = nelts / nvectors;
3157 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3159 tree_vector_builder partial_elts;
3160 auto_vec<tree, 32> pieces (nvectors * 2);
3161 pieces.quick_grow (nvectors * 2);
3162 for (unsigned int i = 0; i < nvectors; ++i)
3164 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3165 ELTS' has mode IM. */
3166 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3167 for (unsigned int j = 0; j < partial_nelts; ++j)
3168 partial_elts.quick_push (elts[i * partial_nelts + j]);
3169 tree t = gimple_build_vector (seq, &partial_elts);
3170 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3171 TREE_TYPE (new_vector_type), t);
3173 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3174 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3177 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3178 correct byte contents.
3180 We need to repeat the following operation log2(nvectors) times:
3182 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3183 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3185 However, if each input repeats every N elements and the VF is
3186 a multiple of N * 2, the HI result is the same as the LO. */
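/* For example (illustrative): with NVECTORS == 4 the loop below runs
   twice: the first pass interleaves in[0] with in[2] and in[1] with
   in[3] using the lo and hi permutes, and the second pass repeats the
   process on those four outputs.  */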
3187 unsigned int in_start = 0;
3188 unsigned int out_start = nvectors;
3189 unsigned int hi_start = nvectors / 2;
3190 /* A bound on the number of outputs needed to produce NRESULTS results
3191 in the final iteration. */
3192 unsigned int noutputs_bound = nvectors * nresults;
3193 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3195 noutputs_bound /= 2;
3196 unsigned int limit = MIN (noutputs_bound, nvectors);
3197 for (unsigned int i = 0; i < limit; ++i)
3199 if ((i & 1) != 0
3200 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3201 2 * in_repeat))
3203 pieces[out_start + i] = pieces[out_start + i - 1];
3204 continue;
3207 tree output = make_ssa_name (new_vector_type);
3208 tree input1 = pieces[in_start + (i / 2)];
3209 tree input2 = pieces[in_start + (i / 2) + hi_start];
3210 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3211 input1, input2,
3212 permutes[i & 1]);
3213 gimple_seq_add_stmt (seq, stmt);
3214 pieces[out_start + i] = output;
3216 std::swap (in_start, out_start);
3219 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3220 results.reserve (nresults);
3221 for (unsigned int i = 0; i < nresults; ++i)
3222 if (i < nvectors)
3223 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3224 pieces[in_start + i]));
3225 else
3226 results.quick_push (results[i - nvectors]);
3230 /* For constant and loop invariant defs of SLP_NODE this function returns
3231 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3232 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3233 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3237 static void
3238 vect_get_constant_vectors (tree op, slp_tree slp_node,
3239 vec<tree> *vec_oprnds,
3240 unsigned int op_num, unsigned int number_of_vectors)
3242 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3243 stmt_vec_info stmt_vinfo = stmts[0];
3244 gimple *stmt = stmt_vinfo->stmt;
3245 unsigned HOST_WIDE_INT nunits;
3246 tree vec_cst;
3247 unsigned j, number_of_places_left_in_vector;
3248 tree vector_type;
3249 tree vop;
3250 int group_size = stmts.length ();
3251 unsigned int vec_num, i;
3252 unsigned number_of_copies = 1;
3253 vec<tree> voprnds;
3254 voprnds.create (number_of_vectors);
3255 bool constant_p, is_store;
3256 tree neutral_op = NULL;
3257 enum tree_code code = gimple_expr_code (stmt);
3258 gimple_seq ctor_seq = NULL;
3259 auto_vec<tree, 16> permute_results;
3261 /* Check if vector type is a boolean vector. */
3262 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3263 && vect_mask_constant_operand_p (stmt_vinfo, op_num))
3264 vector_type
3265 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3266 else
3267 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3269 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3271 is_store = true;
3272 op = gimple_assign_rhs1 (stmt);
3274 else
3275 is_store = false;
3277 gcc_assert (op);
3279 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3280 created vectors. It is greater than 1 if unrolling is performed.
3282 For example, we have two scalar operands, s1 and s2 (e.g., group of
3283 strided accesses of size two), while NUNITS is four (i.e., four scalars
3284 of this type can be packed in a vector). The output vector will contain
3285 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3286 will be 2).
3288 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3289 containing the operands.
3291 For example, NUNITS is four as before, and the group size is 8
3292 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3293 {s5, s6, s7, s8}. */
3295 /* When using duplicate_and_interleave, we just need one element for
3296 each scalar statement. */
3297 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3298 nunits = group_size;
3300 number_of_copies = nunits * number_of_vectors / group_size;
3302 number_of_places_left_in_vector = nunits;
3303 constant_p = true;
3304 tree_vector_builder elts (vector_type, nunits, 1);
3305 elts.quick_grow (nunits);
3306 bool place_after_defs = false;
3307 for (j = 0; j < number_of_copies; j++)
3309 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
3311 stmt = stmt_vinfo->stmt;
3312 if (is_store)
3313 op = gimple_assign_rhs1 (stmt);
3314 else
3316 switch (code)
3318 case COND_EXPR:
3320 tree cond = gimple_assign_rhs1 (stmt);
3321 if (TREE_CODE (cond) == SSA_NAME)
3322 op = gimple_op (stmt, op_num + 1);
3323 else if (op_num == 0 || op_num == 1)
3324 op = TREE_OPERAND (cond, op_num);
3325 else
3327 if (op_num == 2)
3328 op = gimple_assign_rhs2 (stmt);
3329 else
3330 op = gimple_assign_rhs3 (stmt);
3333 break;
3335 case CALL_EXPR:
3336 op = gimple_call_arg (stmt, op_num);
3337 break;
3339 case LSHIFT_EXPR:
3340 case RSHIFT_EXPR:
3341 case LROTATE_EXPR:
3342 case RROTATE_EXPR:
3343 op = gimple_op (stmt, op_num + 1);
3344 /* Unlike the other binary operators, shifts/rotates have
3345 the shift count being int, instead of the same type as
3346 the lhs, so make sure the scalar is the right type if
3347 we are dealing with vectors of
3348 long long/long/short/char. */
3349 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3350 op = fold_convert (TREE_TYPE (vector_type), op);
3351 break;
3353 default:
3354 op = gimple_op (stmt, op_num + 1);
3355 break;
3359 /* Create 'vect_ = {op0,op1,...,opn}'. */
3360 number_of_places_left_in_vector--;
3361 tree orig_op = op;
3362 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3364 if (CONSTANT_CLASS_P (op))
3366 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3368 /* Can't use VIEW_CONVERT_EXPR for booleans because
3369 of possibly different sizes of scalar value and
3370 vector element. */
3371 if (integer_zerop (op))
3372 op = build_int_cst (TREE_TYPE (vector_type), 0);
3373 else if (integer_onep (op))
3374 op = build_all_ones_cst (TREE_TYPE (vector_type));
3375 else
3376 gcc_unreachable ();
3378 else
3379 op = fold_unary (VIEW_CONVERT_EXPR,
3380 TREE_TYPE (vector_type), op);
3381 gcc_assert (op && CONSTANT_CLASS_P (op));
3383 else
3385 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3386 gimple *init_stmt;
3387 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3389 tree true_val
3390 = build_all_ones_cst (TREE_TYPE (vector_type));
3391 tree false_val
3392 = build_zero_cst (TREE_TYPE (vector_type));
3393 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3394 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3395 op, true_val,
3396 false_val);
3398 else
3400 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3401 op);
3402 init_stmt
3403 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3404 op);
3406 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3407 op = new_temp;
3410 elts[number_of_places_left_in_vector] = op;
3411 if (!CONSTANT_CLASS_P (op))
3412 constant_p = false;
3413 if (TREE_CODE (orig_op) == SSA_NAME
3414 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3415 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3416 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3417 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3418 place_after_defs = true;
3420 if (number_of_places_left_in_vector == 0)
3422 if (constant_p
3423 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3424 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3425 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3426 else
3428 if (vec_oprnds->is_empty ())
3429 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3430 number_of_vectors,
3431 permute_results);
3432 vec_cst = permute_results[number_of_vectors - j - 1];
3434 tree init;
3435 gimple_stmt_iterator gsi;
3436 if (place_after_defs)
3438 stmt_vec_info last_stmt_info
3439 = vect_find_last_scalar_stmt_in_slp (slp_node);
3440 gsi = gsi_for_stmt (last_stmt_info->stmt);
3441 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3442 &gsi);
3444 else
3445 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3446 NULL);
3447 if (ctor_seq != NULL)
3449 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3450 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3451 ctor_seq = NULL;
3453 voprnds.quick_push (init);
3454 place_after_defs = false;
3455 number_of_places_left_in_vector = nunits;
3456 constant_p = true;
3457 elts.new_vector (vector_type, nunits, 1);
3458 elts.quick_grow (nunits);
3463 /* Since the vectors are created in the reverse order, we should reverse
3464 them. */
3465 vec_num = voprnds.length ();
3466 for (j = vec_num; j != 0; j--)
3468 vop = voprnds[j - 1];
3469 vec_oprnds->quick_push (vop);
3472 voprnds.release ();
3474 /* If the VF is greater than the unrolling factor needed for the SLP
3475 group of stmts, the number of vectors to create (NUMBER_OF_VECTORS) is
3476 greater than NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and
3477 we have to replicate the vectors. */
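/* For example (illustrative): if two vectors {v0, v1} were created but
   NUMBER_OF_VECTORS is 4 and there is no neutral op, the result becomes
   {v0, v1, v0, v1}.  */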
3478 while (number_of_vectors > vec_oprnds->length ())
3480 tree neutral_vec = NULL;
3482 if (neutral_op)
3484 if (!neutral_vec)
3485 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3487 vec_oprnds->quick_push (neutral_vec);
3489 else
3491 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3492 vec_oprnds->quick_push (vop);
3498 /* Get the vectorized definitions from SLP_NODE, which contains the
3499 corresponding vectorized def-stmts. */
3501 static void
3502 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3504 tree vec_oprnd;
3505 stmt_vec_info vec_def_stmt_info;
3506 unsigned int i;
3508 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3510 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3512 gcc_assert (vec_def_stmt_info);
3513 if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
3514 vec_oprnd = gimple_phi_result (vec_def_phi);
3515 else
3516 vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
3517 vec_oprnds->quick_push (vec_oprnd);
3522 /* Get vectorized definitions for SLP_NODE.
3523 If the scalar definitions are loop invariants or constants, collect them and
3524 call vect_get_constant_vectors() to create vector stmts.
3525 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3526 must be stored in the corresponding child of SLP_NODE, and we call
3527 vect_get_slp_vect_defs () to retrieve them. */
3529 void
3530 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3531 vec<vec<tree> > *vec_oprnds)
3533 int number_of_vects = 0, i;
3534 unsigned int child_index = 0;
3535 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3536 slp_tree child = NULL;
3537 vec<tree> vec_defs;
3538 tree oprnd;
3539 bool vectorized_defs;
3541 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3542 FOR_EACH_VEC_ELT (ops, i, oprnd)
3544 /* For each operand we check if it has vectorized definitions in a child
3545 node or we need to create them (for invariants and constants). We
3546 check if the LHS of the first stmt of the next child matches OPRND.
3547 If it does, we found the correct child. Otherwise, we call
3548 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
3549 this child node is checked again for the next operand. */
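/* For example (illustrative): for a_i = b_i + c_i, where the b_i defs
   form the first child and c_i is a loop invariant, operand 0 matches
   that child's first LHS and reuses its vector defs, while operand 1
   falls through to vect_get_constant_vectors () without advancing
   CHILD_INDEX.  */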
3550 vectorized_defs = false;
3551 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3553 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3555 /* We have to check both pattern and original def, if available. */
3556 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3558 stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
3559 stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
3560 tree first_def_op;
3562 if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
3563 first_def_op = gimple_phi_result (first_def);
3564 else
3565 first_def_op = gimple_get_lhs (first_def_info->stmt);
3566 if (operand_equal_p (oprnd, first_def_op, 0)
3567 || (related
3568 && operand_equal_p (oprnd,
3569 gimple_get_lhs (related->stmt), 0)))
3571 /* The number of vector defs is determined by the number of
3572 vector statements in the node from which we get those
3573 statements. */
3574 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3575 vectorized_defs = true;
3576 child_index++;
3579 else
3580 child_index++;
3583 if (!vectorized_defs)
3585 if (i == 0)
3587 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3588 /* The number of vector stmts was calculated according to the LHS in
3589 vect_schedule_slp_instance (); fix it by replacing the LHS with the
3590 RHS, if necessary. See vect_get_smallest_scalar_type () for
3591 details. */
3592 vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
3593 &rhs_size_unit);
3594 if (rhs_size_unit != lhs_size_unit)
3596 number_of_vects *= rhs_size_unit;
3597 number_of_vects /= lhs_size_unit;
3602 /* Allocate memory for vectorized defs. */
3603 vec_defs = vNULL;
3604 vec_defs.create (number_of_vects);
3606 /* For reduction defs we call vect_get_constant_vectors (), since we are
3607 looking for initial loop invariant values. */
3608 if (vectorized_defs)
3609 /* The defs are already vectorized. */
3610 vect_get_slp_vect_defs (child, &vec_defs);
3611 else
3612 /* Build vectors from scalar defs. */
3613 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3614 number_of_vects);
3616 vec_oprnds->quick_push (vec_defs);
3620 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3621 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3622 permute statements for the SLP node NODE of the SLP instance
3623 SLP_NODE_INSTANCE. */
3625 bool
3626 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3627 gimple_stmt_iterator *gsi, poly_uint64 vf,
3628 slp_instance slp_node_instance, bool analyze_only,
3629 unsigned *n_perms)
3631 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3632 vec_info *vinfo = stmt_info->vinfo;
3633 int vec_index = 0;
3634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3635 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3636 unsigned int mask_element;
3637 machine_mode mode;
3639 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3640 return false;
3642 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3644 mode = TYPE_MODE (vectype);
3645 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3647 /* Initialize the vect stmts of NODE to properly insert the generated
3648 stmts later. */
3649 if (! analyze_only)
3650 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3651 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3652 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3654 /* Generate permutation masks for every NODE. Number of masks for each NODE
3655 is equal to GROUP_SIZE.
3656 E.g., we have a group of three nodes with three loads from the same
3657 location in each node, and the vector size is 4. I.e., we have an
3658 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3659 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3660 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3663 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3664 The last mask is illegal since we assume two operands for permute
3665 operation, and the mask element values can't be outside that range.
3666 Hence, the last mask must be converted into {2,5,5,5}.
3667 For the first two permutations we need the first and the second input
3668 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3669 we need the second and the third vectors: {b1,c1,a2,b2} and
3670 {c2,a3,b3,c3}. */
3672 int vect_stmts_counter = 0;
3673 unsigned int index = 0;
3674 int first_vec_index = -1;
3675 int second_vec_index = -1;
3676 bool noop_p = true;
3677 *n_perms = 0;
3679 vec_perm_builder mask;
3680 unsigned int nelts_to_build;
3681 unsigned int nvectors_per_build;
3682 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3683 && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
         (a) all permutes can use the same mask; and
         (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
          || !vf.is_constant (&const_vf))
        return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
    }
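
  /* E.g., for a group of two loads permuted as {1,0} and nunits == 8, a
     vector holds four whole copies of the group, so repeating_p is true:
     the single mask {1,0,3,2,5,4,7,6} serves every vector statement and
     only needs one input vector.  */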

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                        + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      if (repeating_p)
        {
          first_vec_index = 0;
          mask_element = i;
        }
      else
        {
          /* Enforced before the loop when !repeating_p.  */
          unsigned int const_nunits = nunits.to_constant ();
          vec_index = i / const_nunits;
          mask_element = i % const_nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
              mask_element += const_nunits;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "permutation requires at "
                                 "least three vectors %G",
                                 stmt_info->stmt);
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * const_nunits);
        }

      if (mask_element != index)
        noop_p = false;
      mask[index++] = mask_element;
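
      /* A mask that stays the identity {0,1,...,count-1} means the loads
         are already in the required order and no VEC_PERM_EXPR has to be
         emitted for this group of vectors.  */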
      if (index == count && !noop_p)
        {
          indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
          if (!can_vec_perm_const_p (mode, indices))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported vect permute { ");
                  for (i = 0; i < count; ++i)
                    {
                      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
                  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
              gcc_assert (analyze_only);
              return false;
            }

          ++*n_perms;
        }
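
      /* Only masks that really permute something were counted above;
         callers use *n_perms to account the cost of the generated
         VEC_PERM_EXPRs, and identity masks generate no statement.  */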

      if (index == count)
        {
          if (!analyze_only)
            {
              tree mask_vec = NULL_TREE;

              if (! noop_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

              if (second_vec_index == -1)
                second_vec_index = first_vec_index;

              for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
                {
                  /* Generate the permute statement if necessary.  */
                  tree first_vec = dr_chain[first_vec_index + ri];
                  tree second_vec = dr_chain[second_vec_index + ri];
                  stmt_vec_info perm_stmt_info;
                  if (! noop_p)
                    {
                      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      gassign *perm_stmt
                        = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                               first_vec, second_vec,
                                               mask_vec);
                      perm_stmt_info
                        = vect_finish_stmt_generation (stmt_info, perm_stmt,
                                                       gsi);
                    }
                  else
                    /* If mask was NULL_TREE generate the requested
                       identity transform.  */
                    perm_stmt_info = vinfo->lookup_def (first_vec);

                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
                    = perm_stmt_info;
                }
            }

          index = 0;
          first_vec_index = -1;
          second_vec_index = -1;
          noop_p = true;
        }
    }

  return true;
}

/* Vectorize SLP instance tree in postorder.  */

static void
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            scalar_stmts_to_slp_tree_map_t *bst_map)
{
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  /* See if we have already vectorized the node in the graph of the
     SLP instance.  */
  if (SLP_TREE_VEC_STMTS (node).exists ())
    return;

  /* See if we have already vectorized the same set of stmts and reuse their
     vectorized stmts across instances.  */
  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
    {
      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
      return;
    }

  bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, bst_map);
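
  /* BST_MAP is keyed on the set of scalar stmts, so nodes built for the
     same stmts by different SLP instances share one set of vectorized
     stmts instead of being code generated twice.  */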

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      {
        stmt_vec_info child_stmt_info;
        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
          STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
      }
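
  /* The stmt transform routines look at STMT_VINFO_DEF_TYPE of an
     operand's defining stmt, which for the SLP graph may differ from what
     is recorded on the scalar stmt; the original def-types are restored
     at the end of this function.  */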

  stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "------>vectorizing SLP node starting from: %G",
                     stmt_info->stmt);

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
  si = gsi_for_stmt (last_stmt_info->stmt);

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      vec_perm_builder mask (group_size, group_size, 1);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
        {
          gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
          if (gimple_assign_rhs_code (ostmt) != code0)
            {
              mask.quick_push (1);
              ocode = gimple_assign_rhs_code (ostmt);
            }
          else
            mask.quick_push (0);
        }
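
      /* For the classic addsub case a group {a0 + b0, a1 - b1} gives
         code0 == PLUS_EXPR, ocode == MINUS_EXPR and mask {0,1}: lane 0
         is taken from the PLUS copy of the node, lane 1 from the MINUS
         copy.  */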
      if (ocode != ERROR_MARK)
        {
          vec<stmt_vec_info> v0;
          vec<stmt_vec_info> v1;
          unsigned j;
          tree tmask = NULL_TREE;
          vect_transform_stmt (stmt_info, &si, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
          vect_transform_stmt (stmt_info, &si, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          tree meltype = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
          tree mvectype = get_same_sized_vectype (meltype, vectype);
          unsigned k = 0, l;
          for (j = 0; j < v0.length (); ++j)
            {
              /* Enforced by vect_build_slp_tree, which rejects variable-length
                 vectors for SLP_TREE_TWO_OPERATORS.  */
              unsigned int const_nunits = nunits.to_constant ();
              tree_vector_builder melts (mvectype, const_nunits, 1);
              for (l = 0; l < const_nunits; ++l)
                {
                  if (k >= group_size)
                    k = 0;
                  tree t = build_int_cst (meltype,
                                          mask[k++] * const_nunits + l);
                  melts.quick_push (t);
                }
              tmask = melts.build ();
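
              /* Continuing the addsub example with const_nunits == 4,
                 mask {0,1} expands to the merge mask {0,5,2,7}: even
                 lanes come from the PLUS vector v0[j], odd lanes from
                 the MINUS vector v1[j].  */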

              /* ??? Not all targets support a VEC_PERM_EXPR with a
                 constant mask that would translate to a vec_merge RTX
                 (with their vec_perm_const_ok).  We can either not
                 vectorize in that case or let veclower do its job.
                 Unfortunately that isn't too great and at least for
                 plus/minus we'd eventually like to match targets
                 vector addsub instructions.  */
              gimple *vstmt;
              vstmt = gimple_build_assign (make_ssa_name (vectype),
                                           VEC_PERM_EXPR,
                                           gimple_assign_lhs (v0[j]->stmt),
                                           gimple_assign_lhs (v1[j]->stmt),
                                           tmask);
              SLP_TREE_VEC_STMTS (node).quick_push
                (vect_finish_stmt_generation (stmt_info, vstmt, &si));
            }
          v0.release ();
          v1.release ();
          return;
        }
    }
  vect_transform_stmt (stmt_info, &si, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      {
        stmt_vec_info child_stmt_info;
        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
          STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
      }
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
        continue;
      if (is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
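
      /* The scalar call is dead once its uses have been vectorized;
         replacing it with LHS = 0 keeps the lhs defined for any remaining
         scalar uses while making the stmt trivial to remove later (see
         also the ??? notes in vect_schedule_slp below).  */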
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (node, visited);
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();
  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                  instance, bst_map);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }
  delete bst_map;

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ??? Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ??? For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (store_info))
            break;

          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);