/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;
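  /* Determine how many operand slots the node needs: call arguments
     for calls and RHS operands for assignments; a COND_EXPR gets one
     extra slot because its embedded comparison contributes two
     operands, and a PHI contributes none.  */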
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (vinfo->vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
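              /* Build the two interleaving masks: INDICES1 interleaves
                 the low halves of the two input vectors, INDICES2 the
                 high halves.  Two interleaved patterns of three elements
                 each are enough to describe the masks even when NELTS
                 is variable.  */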
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A nonzero *SWAP
   indicates that a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of the comparison needs
   to be inverted.  If any operands were swapped in this function, *SWAP is
   set to a nonzero value.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the father node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
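          /* Row 0 leaves the operands in place, row 1 swaps the two
             comparison operands, and row 2 swaps the two arms of the
             COND_EXPR (used when the comparison code is inverted).  */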
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             end in the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && !STMT_VINFO_DATA_REF (stmt_info)
                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                    && def_stmt_info
                    && !STMT_VINFO_DATA_REF (def_stmt_info)
                    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                        == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type)
              || (!STMT_VINFO_DATA_REF (stmt_info)
                  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  && ((!def_stmt_info
                       || STMT_VINFO_DATA_REF (def_stmt_info)
                       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                           != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                      != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !vinfo->vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_external_def:
          /* Make sure to demote the overall operand to external.  */
          oprnd_info->first_dt = vect_external_def;
          /* Fallthru.  */
        case vect_constant_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_internal_def:
        case vect_reduction_def:
          if (oprnd_info->first_dt == vect_reduction_def
              && !STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && !STMT_VINFO_DATA_REF (def_stmt_info)
              && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              /* For a SLP reduction chain we want to duplicate the
                 reduction to each of the chain members.  That gets
                 us a sane SLP graph (still the stmts are not 100%
                 correct wrt the initial values).  */
              gcc_assert (!first);
              oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
              oprnd_info->ops.quick_push (oprnd_info->ops[0]);
              break;
            }
          /* Fallthru.  */
        case vect_induction_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          /* If there are already uses of this stmt in a SLP instance then
             we've committed to the operand order and can't swap it.  */
          if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: cannot swap operands of "
                                 "shared stmt %G", stmt_info->stmt);
              return -1;
            }

          /* To get rid of this swapping we have to move the stmt code
             to the SLP tree as well (and gather it here per stmt).  */
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          /* Commutative ops need not reflect swapping, ops are in
             the SLP tree.  */
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;
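  /* Build a blend mask: lane I selects from the first vector when the
     scalar stmt for that lane performs the first operation, and from
     the second vector (indices offset by COUNT) when it performs
     ALT_STMT_CODE.  */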
  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates that the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note that a COND_EXPR is possibly isomorphic to another one after
   swapping its operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and
   isomorphic to the first stmt by swapping the two operands of the
   comparison; set SWAP[i] to 2 if stmt I is isomorphic to the first stmt
   by inverting the code of the comparison.  Take A1 >= B1 ? X1 : Y1 as
   an example: it can be swapped to (B1 <= A1 ? X1 : Y1) or inverted to
   (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                        (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
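          /* ALT_STMT_CODE now records the first mismatching operation;
             a two-operator node is only possible if every remaining
             stmt matches either FIRST_STMT_CODE or ALT_STMT_CODE.  */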
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G",
                                 stmt);

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G",
                                     first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_map used to record failed and successful SLP builds
   for a stmt set.  Note we never remove apart from at destruction time
   so we do not need a special value for deleted that differs from
   empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
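/* Hash on the stmt UIDs, which stay stable throughout SLP analysis,
   so a given stmt set always hashes to the same value.  */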
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size,
                                        bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
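  /* Record the result in the cache even when the build failed
     (RES == NULL) so that repeated attempts on the same stmt set
     fail fast instead of rebuilding.  */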
  bst_map->put (stmts.copy (), res);
  return res;
}
/* Recursively build an SLP tree for the stmt group STMTS.  Fail (and
   return NULL) if the def-stmts are not isomorphic, require data
   permutation, or are of unsupported types of operation.  On failure
   MATCHES indicates which stmts in the group mismatched the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype,
                                   max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          return node;
        }
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
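      /* A result of -1 is a fatal failure for the whole group and is
         recorded in MATCHES[0]; a result of 1 fails only stmt I, which
         the operand-swapping retry below may still fix up.  */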
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* COND_EXPRs have one operand slot too many here when the
             condition is an SSA name.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (is_a <bb_vec_info> (vinfo)
              && !SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Only do so if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If we have all children of child built up from scalars then
                 just throw that away and build it up this node from
                 scalars.  */
              if (is_a <bb_vec_info> (vinfo)
                  && !SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see
                     the scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j,
                                        grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands "
                                         "from scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
                   estimated_poly_value (node->max_nunits));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        dump_printf (metadata, "%T%s ", op,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);
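  /* PERMUTATION maps current positions to new positions: element I of
     the node moves to slot PERMUTATION[I].  */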
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
        return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment
     complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
              group_info = DR_GROUP_FIRST_ELEMENT (group_info);
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of DR_GROUP_SIZE.  */
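              /* DR_GROUP_SIZE & ~(nunits - 1) rounds the group size down
                 to a whole number of vectors; an index at or beyond that
                 boundary lands in a final vector that would read past the
                 end of the group.  */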
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "BB vectorization with gaps at the end "
                                     "of a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                     vect_location,
                                     "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
                             LOOP_VINFO_VECT_FACTOR
                               (STMT_VINFO_LOOP_VINFO (stmt_info)));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
                                          slp_instn, true, &n_perms))
      return false;

  return true;
}
1897 /* Find the last scalar statement in SLP tree NODE. */
1899 stmt_vec_info
1900 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1902 stmt_vec_info last = NULL;
1903 stmt_vec_info stmt_vinfo;
1905 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1907 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1908 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1911 return last;
1914 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1915 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1916 (also containing the first GROUP1_SIZE stmts, since stores are
1917 consecutive), the second containing the remainder.
1918 Return the first stmt in the second group. */
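/* A worked example with assumed numbers: splitting a group of 8 stores
   with GROUP1_SIZE == 4 leaves DR_GROUP_SIZE 4 on both halves.  The
   second group's DR_GROUP_GAP becomes the original gap plus 4 (it must
   skip the first half's elements), and the first group's DR_GROUP_GAP
   then grows by 4 as well (it must skip the second half on the next
   iteration).  */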
1920 static stmt_vec_info
1921 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1923 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1924 gcc_assert (group1_size > 0);
1925 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1926 gcc_assert (group2_size > 0);
1927 DR_GROUP_SIZE (first_vinfo) = group1_size;
1929 stmt_vec_info stmt_info = first_vinfo;
1930 for (unsigned i = group1_size; i > 1; i--)
1932 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1933 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1935 /* STMT is now the last element of the first group. */
1936 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1937 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1939 DR_GROUP_SIZE (group2) = group2_size;
1940 for (stmt_info = group2; stmt_info;
1941 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1943 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1944 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1947 /* For the second group, the DR_GROUP_GAP is that before the original group,
1948 plus skipping over the elements of the first group. */
1949 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1951 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1952 DR_GROUP_GAP (first_vinfo) += group2_size;
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1956 group1_size, group2_size);
1958 return group2;
1961 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1962 statements and a vector of NUNITS elements. */
1964 static poly_uint64
1965 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1967 return exact_div (common_multiple (nunits, group_size), group_size);
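/* E.g., with NUNITS == 4 and GROUP_SIZE == 6 (assumed numbers) the least
   common multiple is 12, so the unrolling factor is 12 / 6 == 2: two
   copies of the group fill exactly three vectors.  The same arithmetic
   as a standalone plain-C sketch (gcd via Euclid's algorithm):

     static unsigned lcm (unsigned a, unsigned b)
     {
       unsigned g = a, h = b;
       while (h) { unsigned t = g % h; g = h; h = t; }
       return a / g * b;
     }

   so the factor is lcm (nunits, group_size) / group_size.  */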
1970 /* Analyze an SLP instance starting from a group of grouped stores. Call
1971 vect_build_slp_tree to build a tree of packed stmts if possible.
1972 Return FALSE if it's impossible to SLP any stmt in the loop. */
1974 static bool
1975 vect_analyze_slp_instance (vec_info *vinfo,
1976 stmt_vec_info stmt_info, unsigned max_tree_size)
1978 slp_instance new_instance;
1979 slp_tree node;
1980 unsigned int group_size;
1981 tree vectype, scalar_type = NULL_TREE;
1982 unsigned int i;
1983 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1984 vec<stmt_vec_info> scalar_stmts;
1986 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1988 scalar_type = TREE_TYPE (DR_REF (dr));
1989 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1990 group_size = DR_GROUP_SIZE (stmt_info);
1992 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1994 gcc_assert (is_a <loop_vec_info> (vinfo));
1995 vectype = STMT_VINFO_VECTYPE (stmt_info);
1996 group_size = REDUC_GROUP_SIZE (stmt_info);
1998 else
2000 gcc_assert (is_a <loop_vec_info> (vinfo));
2001 vectype = STMT_VINFO_VECTYPE (stmt_info);
2002 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2005 if (!vectype)
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "Build SLP failed: unsupported data-type %T\n",
2010 scalar_type);
2012 return false;
2014 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2016 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2017 scalar_stmts.create (group_size);
2018 stmt_vec_info next_info = stmt_info;
2019 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2021 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2022 while (next_info)
2024 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2025 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2028 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2030 /* Collect the reduction stmts and store them in
2031 SLP_TREE_SCALAR_STMTS. */
2032 while (next_info)
2034 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2035 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2037 /* Mark the first element of the reduction chain as reduction to properly
2038 transform the node. In the reduction analysis phase only the last
2039 element of the chain is marked as reduction. */
2040 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
2041 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2042 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2044 else
2046 /* Collect reduction statements. */
2047 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2048 for (i = 0; reductions.iterate (i, &next_info); i++)
2049 scalar_stmts.safe_push (next_info);
2052 /* Build the tree for the SLP instance. */
2053 bool *matches = XALLOCAVEC (bool, group_size);
2054 unsigned npermutes = 0;
2055 scalar_stmts_to_slp_tree_map_t *bst_map
2056 = new scalar_stmts_to_slp_tree_map_t ();
2057 poly_uint64 max_nunits = nunits;
2058 unsigned tree_size = 0;
2059 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2060 &max_nunits, matches, &npermutes,
2061 &tree_size, bst_map);
2062 /* The map keeps a reference on SLP nodes built, release that. */
2063 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2064 it != bst_map->end (); ++it)
2065 if ((*it).second)
2066 vect_free_slp_tree ((*it).second, false);
2067 delete bst_map;
2068 if (node != NULL)
2070 /* Calculate the unrolling factor based on the smallest type. */
2071 poly_uint64 unrolling_factor
2072 = calculate_unrolling_factor (max_nunits, group_size);
2074 if (maybe_ne (unrolling_factor, 1U)
2075 && is_a <bb_vec_info> (vinfo))
2077 unsigned HOST_WIDE_INT const_max_nunits;
2078 if (!max_nunits.is_constant (&const_max_nunits)
2079 || const_max_nunits > group_size)
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2083 "Build SLP failed: store group "
2084 "size not a multiple of the vector size "
2085 "in basic block SLP\n");
2086 vect_free_slp_tree (node, false);
2087 return false;
2089 /* Fatal mismatch. */
2090 matches[group_size / const_max_nunits * const_max_nunits] = false;
2091 vect_free_slp_tree (node, false);
2093 else
2095 /* Create a new SLP instance. */
2096 new_instance = XNEW (class _slp_instance);
2097 SLP_INSTANCE_TREE (new_instance) = node;
2098 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2099 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2100 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2101 vect_gather_slp_loads (new_instance, node);
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "SLP size %u vs. limit %u.\n",
2105 tree_size, max_tree_size);
2107 /* Compute the load permutation. */
2108 slp_tree load_node;
2109 bool loads_permuted = false;
2110 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2112 vec<unsigned> load_permutation;
2113 int j;
2114 stmt_vec_info load_info;
2115 bool this_load_permuted = false;
2116 load_permutation.create (group_size);
2117 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2118 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2119 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2121 int load_place = vect_get_place_in_interleaving_chain
2122 (load_info, first_stmt_info);
2123 gcc_assert (load_place != -1);
2124 if (load_place != j)
2125 this_load_permuted = true;
2126 load_permutation.safe_push (load_place);
2128 if (!this_load_permuted
2129 /* The load requires permutation when unrolling exposes
2130 a gap either because the group is larger than the SLP
2131 group-size or because there is a gap between the groups. */
2132 && (known_eq (unrolling_factor, 1U)
2133 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2134 && DR_GROUP_GAP (first_stmt_info) == 0)))
2136 load_permutation.release ();
2137 continue;
2139 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2140 loads_permuted = true;
2143 if (loads_permuted)
2145 if (!vect_supported_load_permutation_p (new_instance))
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "Build SLP failed: unsupported load "
2150 "permutation %G", stmt_info->stmt);
2151 vect_free_slp_instance (new_instance, false);
2152 return false;
2156 /* If the loads and stores can be handled with load/store-lanes
2157 instructions, do not generate this SLP instance. */
2158 if (is_a <loop_vec_info> (vinfo)
2159 && loads_permuted
2160 && dr && vect_store_lanes_supported (vectype, group_size, false))
2162 slp_tree load_node;
2163 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2165 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2166 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2167 /* Use SLP for strided accesses (or if we can't load-lanes). */
2168 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2169 || ! vect_load_lanes_supported
2170 (STMT_VINFO_VECTYPE (stmt_vinfo),
2171 DR_GROUP_SIZE (stmt_vinfo), false))
2172 break;
2174 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "Built SLP cancelled: can use "
2179 "load/store-lanes\n");
2180 vect_free_slp_instance (new_instance, false);
2181 return false;
2185 vinfo->slp_instances.safe_push (new_instance);
2187 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "Final SLP tree for instance:\n");
2191 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2194 return true;
2197 else
2199 /* Failed to SLP. */
2200 /* Free the allocated memory. */
2201 scalar_stmts.release ();
2204 /* For basic block SLP, try to break the group up into multiples of the
2205 vector size. */
2206 unsigned HOST_WIDE_INT const_nunits;
2207 if (is_a <bb_vec_info> (vinfo)
2208 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2209 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2210 && nunits.is_constant (&const_nunits))
2212 /* We consider breaking the group only on VF boundaries from the existing
2213 start. */
2214 for (i = 0; i < group_size; i++)
2215 if (!matches[i]) break;
2217 if (i >= const_nunits && i < group_size)
2219 /* Split into two groups at the first vector boundary before i. */
2220 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2221 unsigned group1_size = i & ~(const_nunits - 1);
2223 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2224 group1_size);
2225 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2226 max_tree_size);
2227 /* If the first non-match was in the middle of a vector,
2228 skip the rest of that vector. */
2229 if (group1_size < i)
2231 i = group1_size + const_nunits;
2232 if (i < group_size)
2233 rest = vect_split_slp_store_group (rest, const_nunits);
2235 if (i < group_size)
2236 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2237 return res;
2239 /* Even though the first vector did not all match, we might be able to SLP
2240 (some) of the remainder. FORNOW ignore this possibility. */
2243 return false;
2247 /* Check if there are stmts in the loop that can be vectorized using SLP.
2248 Build SLP trees of packed scalar stmts if SLP is possible. */
2250 opt_result
2251 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2253 unsigned int i;
2254 stmt_vec_info first_element;
2256 DUMP_VECT_SCOPE ("vect_analyze_slp");
2258 /* Find SLP sequences starting from groups of grouped stores. */
2259 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2260 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2262 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2264 if (loop_vinfo->reduction_chains.length () > 0)
2266 /* Find SLP sequences starting from reduction chains. */
2267 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2268 if (! vect_analyze_slp_instance (vinfo, first_element,
2269 max_tree_size))
2271 /* Dissolve reduction chain group. */
2272 stmt_vec_info vinfo = first_element;
2273 stmt_vec_info last = NULL;
2274 while (vinfo)
2276 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2277 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2278 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2279 last = vinfo;
2280 vinfo = next;
2282 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2283 /* It can still be vectorized as part of an SLP reduction. */
2284 loop_vinfo->reductions.safe_push (last);
2288 /* Find SLP sequences starting from groups of reductions. */
2289 if (loop_vinfo->reductions.length () > 1)
2290 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2291 max_tree_size);
2294 return opt_result::success ();
2298 /* For each possible SLP instance decide whether to SLP it and calculate overall
2299 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2300 least one instance. */
2302 bool
2303 vect_make_slp_decision (loop_vec_info loop_vinfo)
2305 unsigned int i;
2306 poly_uint64 unrolling_factor = 1;
2307 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2308 slp_instance instance;
2309 int decided_to_slp = 0;
2311 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2313 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2315 /* FORNOW: SLP if you can. */
2316 /* All unroll factors have the form vinfo->vector_size * X for some
2317 rational X, so they must have a common multiple. */
2318 unrolling_factor
2319 = force_common_multiple (unrolling_factor,
2320 SLP_INSTANCE_UNROLLING_FACTOR (instance));
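/* E.g. (assumed factors): two instances with unrolling factors 2 and 3
   force an overall unrolling factor of 6.  */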
2322 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2323 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2324 loop-based vectorization. Such stmts will be marked as HYBRID. */
2325 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2326 decided_to_slp++;
2329 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2331 if (decided_to_slp && dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "Decided to SLP %d instances. Unrolling factor ",
2335 decided_to_slp);
2336 dump_dec (MSG_NOTE, unrolling_factor);
2337 dump_printf (MSG_NOTE, "\n");
2340 return (decided_to_slp > 0);
2344 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2345 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2347 static void
2348 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2349 hash_map<slp_tree, unsigned> &visited)
2351 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2352 imm_use_iterator imm_iter;
2353 gimple *use_stmt;
2354 stmt_vec_info use_vinfo;
2355 slp_tree child;
2356 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2357 int j;
2359 /* We need to union stype over the incoming graph edges but we still
2360 want to limit recursion to stay O(N+E). */
2361 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
2363 /* Propagate hybrid down the SLP tree. */
2364 if (stype == hybrid)
2366 else if (HYBRID_SLP_STMT (stmt_vinfo))
2367 stype = hybrid;
2368 else if (!only_edge)
2370 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2371 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2372 /* If we get a pattern stmt here we have to use the LHS of the
2373 original stmt for immediate uses. */
2374 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2375 tree def;
2376 if (gimple_code (stmt) == GIMPLE_PHI)
2377 def = gimple_phi_result (stmt);
2378 else
2379 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2380 if (def)
2381 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2383 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2384 if (!use_vinfo)
2385 continue;
2386 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2387 if (!STMT_SLP_TYPE (use_vinfo)
2388 && (STMT_VINFO_RELEVANT (use_vinfo)
2389 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2390 && !(gimple_code (use_stmt) == GIMPLE_PHI
2391 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2395 "def in non-SLP stmt: %G", use_stmt);
2396 stype = hybrid;
2401 if (stype == hybrid
2402 && !HYBRID_SLP_STMT (stmt_vinfo))
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2406 stmt_vinfo->stmt);
2407 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2410 if (!only_edge)
2411 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2412 if (SLP_TREE_DEF_TYPE (child) != vect_external_def
2413 && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
2414 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2417 static void
2418 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2420 hash_map<slp_tree, unsigned> visited;
2421 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2424 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2426 static tree
2427 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2429 walk_stmt_info *wi = (walk_stmt_info *)data;
2430 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2432 if (wi->is_lhs)
2433 return NULL_TREE;
2435 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2436 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2438 if (dump_enabled_p ())
2439 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2440 def_stmt_info->stmt);
2441 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2444 return NULL_TREE;
2447 static tree
2448 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2449 walk_stmt_info *wi)
2451 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2452 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2453 /* If the stmt is in a SLP instance then this isn't a reason
2454 to mark use definitions in other SLP instances as hybrid. */
2455 if (! STMT_SLP_TYPE (use_vinfo)
2456 && (STMT_VINFO_RELEVANT (use_vinfo)
2457 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2458 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2459 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2461 else
2462 *handled = true;
2463 return NULL_TREE;
2466 /* Find stmts that must be both vectorized and SLPed. */
2468 void
2469 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2471 unsigned int i;
2472 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2473 slp_instance instance;
2475 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2477 /* First walk all pattern stmts in the loop and mark the defs of their uses
2478 as hybrid, because immediate uses in pattern stmts are not recorded. */
2479 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2481 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2482 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2483 gsi_next (&gsi))
2485 gimple *stmt = gsi_stmt (gsi);
2486 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2487 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2489 walk_stmt_info wi;
2490 memset (&wi, 0, sizeof (wi));
2491 wi.info = loop_vinfo;
2492 gimple_stmt_iterator gsi2
2493 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2494 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2495 vect_detect_hybrid_slp_1, &wi);
2496 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2497 vect_detect_hybrid_slp_2,
2498 vect_detect_hybrid_slp_1, &wi);
2503 /* Then walk the SLP instance trees marking stmts with uses in
2504 non-SLP stmts as hybrid, also propagating hybrid down the
2505 SLP tree, collecting the above info on-the-fly. */
2506 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2508 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2509 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2510 i, pure_slp);
2515 /* Initialize a bb_vec_info struct for the statements between
2516 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2518 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2519 gimple_stmt_iterator region_end_in,
2520 vec_info_shared *shared)
2521 : vec_info (vec_info::bb, init_cost (NULL), shared),
2522 bb (gsi_bb (region_begin_in)),
2523 region_begin (region_begin_in),
2524 region_end (region_end_in)
2526 gimple_stmt_iterator gsi;
2528 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2529 gsi_next (&gsi))
2531 gimple *stmt = gsi_stmt (gsi);
2532 gimple_set_uid (stmt, 0);
2533 add_stmt (stmt);
2536 bb->aux = this;
2540 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2541 stmts in the basic block. */
2543 _bb_vec_info::~_bb_vec_info ()
2545 for (gimple_stmt_iterator si = region_begin;
2546 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2547 /* Reset region marker. */
2548 gimple_set_uid (gsi_stmt (si), -1);
2550 bb->aux = NULL;
2553 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2554 given that child nodes have already been processed, and that
2555 their def types currently match their SLP node's def type. */
2557 static bool
2558 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2559 slp_instance node_instance,
2560 stmt_vector_for_cost *cost_vec)
2562 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2563 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2565 /* For BB vectorization vector types are assigned here.
2566 Memory accesses already got their vector type assigned
2567 in vect_analyze_data_refs. */
2568 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2569 if (bb_vinfo
2570 && ! STMT_VINFO_DATA_REF (stmt_info))
2572 tree vectype, nunits_vectype;
2573 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2574 &nunits_vectype))
2575 /* We checked this when building the node. */
2576 gcc_unreachable ();
2577 if (vectype == boolean_type_node)
2579 vectype = vect_get_mask_type_for_stmt (stmt_info);
2580 if (!vectype)
2581 /* vect_get_mask_type_for_stmt has already explained the
2582 failure. */
2583 return false;
2586 stmt_vec_info sstmt_info;
2587 unsigned int i;
2588 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2589 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2592 /* Calculate the number of vector statements to be created for the
2593 scalar stmts in this node. For SLP reductions it is equal to the
2594 number of vector statements in the children (which has already been
2595 calculated by the recursive call). Otherwise it is the number of
2596 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2597 VF divided by the number of elements in a vector. */
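/* E.g. (assumed numbers): a group of two scalar stmts with VF == 4 and
   four-element vectors needs 2 * 4 / 4 == 2 vector statements.  */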
2598 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2599 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2601 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2602 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2604 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2605 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2606 break;
2609 else
2611 poly_uint64 vf;
2612 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2613 vf = loop_vinfo->vectorization_factor;
2614 else
2615 vf = 1;
2616 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2617 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2618 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2619 = vect_get_num_vectors (vf * group_size, vectype);
2622 bool dummy;
2623 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2626 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2627 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2629 Return true if the operations are supported. */
2631 static bool
2632 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2633 slp_instance node_instance,
2634 scalar_stmts_to_slp_tree_map_t *visited,
2635 scalar_stmts_to_slp_tree_map_t *lvisited,
2636 stmt_vector_for_cost *cost_vec)
2638 int i, j;
2639 slp_tree child;
2641 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2642 return true;
2644 /* If we already analyzed the exact same set of scalar stmts we're done.
2645 We share the generated vector stmts for those. */
2646 slp_tree *leader;
2647 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2648 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2650 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2651 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2652 return true;
2655 /* The SLP graph is acyclic so not caching whether we failed or succeeded
2656 doesn't result in any issue since we throw away the lvisited set
2657 when we fail. */
2658 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2660 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2661 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2662 visited, lvisited, cost_vec))
2663 return false;
2665 /* ??? We have to catch late the case where the same first scalar stmt
2666 appears in multiple SLP children with different def types, and fail.
2667 Remember the original def types first, since SLP_TREE_DEF_TYPE doesn't
2668 necessarily match them when it is vect_internal_def. */
2669 auto_vec<vect_def_type, 4> dt;
2670 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2671 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2672 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2673 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2675 /* Push SLP node def-type to stmt operands. */
2676 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2677 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2678 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2679 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2680 = SLP_TREE_DEF_TYPE (child);
2682 /* Check everything worked out. */
2683 bool res = true;
2684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2685 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2687 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2689 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2690 != SLP_TREE_DEF_TYPE (child))
2691 res = false;
2693 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2694 != dt[j])
2695 res = false;
2697 if (!res && dump_enabled_p ())
2698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2699 "not vectorized: same operand with different "
2700 "def type in stmt.\n");
2702 if (res)
2703 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2704 cost_vec);
2706 /* Restore def-types. */
2707 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2708 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2709 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2711 return res;
2715 /* Analyze statements in SLP instances of VINFO. Return true if the
2716 operations are supported. */
2718 bool
2719 vect_slp_analyze_operations (vec_info *vinfo)
2721 slp_instance instance;
2722 int i;
2724 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2726 scalar_stmts_to_slp_tree_map_t *visited
2727 = new scalar_stmts_to_slp_tree_map_t ();
2728 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2730 scalar_stmts_to_slp_tree_map_t lvisited;
2731 stmt_vector_for_cost cost_vec;
2732 cost_vec.create (2);
2733 if (!vect_slp_analyze_node_operations (vinfo,
2734 SLP_INSTANCE_TREE (instance),
2735 instance, visited, &lvisited,
2736 &cost_vec))
2738 slp_tree node = SLP_INSTANCE_TREE (instance);
2739 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_NOTE, vect_location,
2742 "removing SLP instance operations starting from: %G",
2743 stmt_info->stmt);
2744 vect_free_slp_instance (instance, false);
2745 vinfo->slp_instances.ordered_remove (i);
2746 cost_vec.release ();
2748 else
2750 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2751 x != lvisited.end(); ++x)
2752 visited->put ((*x).first.copy (), (*x).second);
2753 i++;
2755 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2756 cost_vec.release ();
2759 delete visited;
2761 return !vinfo->slp_instances.is_empty ();
2765 /* Compute the scalar cost of the SLP node NODE and its children
2766 and record it in COST_VEC. Do not account defs that are marked
2767 in LIFE, and update LIFE according to uses of NODE. */
2769 static void
2770 vect_bb_slp_scalar_cost (basic_block bb,
2771 slp_tree node, vec<bool, va_heap> *life,
2772 stmt_vector_for_cost *cost_vec,
2773 hash_set<slp_tree> &visited)
2775 unsigned i;
2776 stmt_vec_info stmt_info;
2777 slp_tree child;
2779 if (visited.add (node))
2780 return;
2782 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2784 gimple *stmt = stmt_info->stmt;
2785 vec_info *vinfo = stmt_info->vinfo;
2786 ssa_op_iter op_iter;
2787 def_operand_p def_p;
2789 if ((*life)[i])
2790 continue;
2792 /* If there is a non-vectorized use of the defs then the scalar
2793 stmt is kept live, in which case we do not account it or any
2794 required defs in the SLP children in the scalar cost. This
2795 way we make the vectorization more costly when compared to
2796 the scalar cost. */
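/* For instance, if _1 = a + b is in the SLP tree but _1 also has a
   scalar use outside of it (an assumed situation), the addition must be
   kept in scalar form anyway, so counting it as a saved scalar cost
   would unfairly favor vectorization.  */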
2797 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2799 imm_use_iterator use_iter;
2800 gimple *use_stmt;
2801 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2802 if (!is_gimple_debug (use_stmt))
2804 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2805 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2807 (*life)[i] = true;
2808 BREAK_FROM_IMM_USE_STMT (use_iter);
2812 if ((*life)[i])
2813 continue;
2815 /* Count scalar stmts only once. */
2816 if (gimple_visited_p (stmt))
2817 continue;
2818 gimple_set_visited (stmt, true);
2820 vect_cost_for_stmt kind;
2821 if (STMT_VINFO_DATA_REF (stmt_info))
2823 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2824 kind = scalar_load;
2825 else
2826 kind = scalar_store;
2828 else
2829 kind = scalar_stmt;
2830 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2833 auto_vec<bool, 20> subtree_life;
2834 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2836 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2838 /* Do not directly pass LIFE to the recursive call, copy it to
2839 confine changes in the callee to the current child/subtree. */
2840 subtree_life.safe_splice (*life);
2841 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2842 visited);
2843 subtree_life.truncate (0);
2848 static void
2849 vect_bb_slp_scalar_cost (basic_block bb,
2850 slp_tree node, vec<bool, va_heap> *life,
2851 stmt_vector_for_cost *cost_vec)
2853 hash_set<slp_tree> visited;
2854 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2857 /* Check if vectorization of the basic block is profitable. */
2859 static bool
2860 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2862 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2863 slp_instance instance;
2864 int i;
2865 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2866 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2868 /* Calculate scalar cost. */
2869 stmt_vector_for_cost scalar_costs;
2870 scalar_costs.create (0);
2871 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2873 auto_vec<bool, 20> life;
2874 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2875 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2876 SLP_INSTANCE_TREE (instance),
2877 &life, &scalar_costs);
2879 void *target_cost_data = init_cost (NULL);
2880 add_stmt_costs (target_cost_data, &scalar_costs);
2881 scalar_costs.release ();
2882 unsigned dummy;
2883 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2884 destroy_cost_data (target_cost_data);
2886 /* Unset visited flag. */
2887 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2888 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2889 gimple_set_visited (gsi_stmt (gsi), false);
2891 /* Complete the target-specific cost calculation. */
2892 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2893 &vec_inside_cost, &vec_epilogue_cost);
2895 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2897 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2900 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2901 vec_inside_cost);
2902 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2903 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2904 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2907 /* Vectorization is profitable if its cost is no more than the cost of
2908 the scalar version. Note that we err on the vector side for equal
2909 cost because the cost estimate is otherwise quite pessimistic
2910 (constant uses are free on the scalar side but cost a load on the
2911 vector side, for example). */
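/* E.g. (assumed costs): with a vector prologue cost of 2, an inside
   cost of 8 and a scalar cost of 10 we do vectorize, since 2 + 8 is
   not greater than 10.  */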
2912 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2913 return false;
2915 return true;
2918 /* Check if the region described by BB_VINFO can be vectorized, returning
2919 true if so. When returning false, set FATAL to true if the same failure
2920 would prevent vectorization at other vector sizes, false if it is still
2921 worth trying other sizes. N_STMTS is the number of statements in the
2922 region. */
2924 static bool
2925 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
2927 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2929 slp_instance instance;
2930 int i;
2931 poly_uint64 min_vf = 2;
2933 /* The first group of checks is independent of the vector size. */
2934 fatal = true;
2936 /* Analyze the data references. */
2938 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2942 "not vectorized: unhandled data-ref in basic "
2943 "block.\n");
2944 return false;
2947 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2951 "not vectorized: not enough data-refs in "
2952 "basic block.\n");
2953 return false;
2956 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2960 "not vectorized: unhandled data access in "
2961 "basic block.\n");
2962 return false;
2965 /* If there are no grouped stores in the region there is no need
2966 to continue with pattern recog as vect_analyze_slp will fail
2967 anyway. */
2968 if (bb_vinfo->grouped_stores.is_empty ())
2970 if (dump_enabled_p ())
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "not vectorized: no grouped stores in "
2973 "basic block.\n");
2974 return false;
2977 /* The rest of the analysis below depends on the vector size in some way. */
2978 fatal = false;
2980 vect_pattern_recog (bb_vinfo);
2982 /* Check the SLP opportunities in the basic block, analyze and build SLP
2983 trees. */
2984 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2986 if (dump_enabled_p ())
2988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2989 "Failed to SLP the basic block.\n");
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "not vectorized: failed to find SLP opportunities "
2992 "in basic block.\n");
2994 return false;
2997 vect_record_base_alignments (bb_vinfo);
2999 /* Analyze and verify the alignment of data references and the
3000 dependence in the SLP instances. */
3001 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3003 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3004 || ! vect_slp_analyze_instance_dependence (instance))
3006 slp_tree node = SLP_INSTANCE_TREE (instance);
3007 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "removing SLP instance operations starting from: %G",
3011 stmt_info->stmt);
3012 vect_free_slp_instance (instance, false);
3013 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3014 continue;
3017 /* Mark all the statements that we want to vectorize as pure SLP and
3018 relevant. */
3019 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3020 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3022 i++;
3024 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3025 return false;
3027 if (!vect_slp_analyze_operations (bb_vinfo))
3029 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3031 "not vectorized: bad operation in basic block.\n");
3032 return false;
3035 /* Cost model: check if the vectorization is worthwhile. */
3036 if (!unlimited_cost_model (NULL)
3037 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3039 if (dump_enabled_p ())
3040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3041 "not vectorized: vectorization is not "
3042 "profitable.\n");
3043 return false;
3046 if (dump_enabled_p ())
3047 dump_printf_loc (MSG_NOTE, vect_location,
3048 "Basic block will be vectorized using SLP\n");
3049 return true;
3052 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3053 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3054 on success. The region has N_STMTS statements and has the datarefs
3055 given by DATAREFS. */
3057 static bool
3058 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3059 gimple_stmt_iterator region_end,
3060 vec<data_reference_p> datarefs,
3061 unsigned int n_stmts)
3063 bb_vec_info bb_vinfo;
3064 auto_vector_sizes vector_sizes;
3066 /* Autodetect first vector size we try. */
3067 poly_uint64 next_vector_size = 0;
3068 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
3069 unsigned int next_size = 0;
3071 vec_info_shared shared;
3073 poly_uint64 autodetected_vector_size = 0;
3074 while (1)
3076 bool vectorized = false;
3077 bool fatal = false;
3078 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3080 bool first_time_p = shared.datarefs.is_empty ();
3081 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3082 if (first_time_p)
3083 bb_vinfo->shared->save_datarefs ();
3084 else
3085 bb_vinfo->shared->check_datarefs ();
3086 bb_vinfo->vector_size = next_vector_size;
3088 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3089 && dbg_cnt (vect_slp))
3091 if (dump_enabled_p ())
3092 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3094 bb_vinfo->shared->check_datarefs ();
3095 vect_schedule_slp (bb_vinfo);
3097 unsigned HOST_WIDE_INT bytes;
3098 if (dump_enabled_p ())
3100 if (bb_vinfo->vector_size.is_constant (&bytes))
3101 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3102 "basic block part vectorized using %wu byte "
3103 "vectors\n", bytes);
3104 else
3105 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3106 "basic block part vectorized using variable "
3107 "length vectors\n");
3110 vectorized = true;
3113 if (next_size == 0)
3114 autodetected_vector_size = bb_vinfo->vector_size;
3116 delete bb_vinfo;
3118 if (next_size < vector_sizes.length ()
3119 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3120 next_size += 1;
3122 if (vectorized
3123 || next_size == vector_sizes.length ()
3124 || known_eq (autodetected_vector_size, 0U)
3125 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3126 vector sizes will fail do not bother iterating. */
3127 || fatal)
3128 return vectorized;
3130 /* Try the next biggest vector size. */
3131 next_vector_size = vector_sizes[next_size++];
3132 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "***** Re-trying analysis with "
3136 "vector size ");
3137 dump_dec (MSG_NOTE, next_vector_size);
3138 dump_printf (MSG_NOTE, "\n");
3143 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3144 true if anything in the basic-block was vectorized. */
3146 bool
3147 vect_slp_bb (basic_block bb)
3149 gimple_stmt_iterator gsi;
3150 bool any_vectorized = false;
3152 gsi = gsi_start_bb (bb);
3153 while (!gsi_end_p (gsi))
3155 gimple_stmt_iterator region_begin = gsi;
3156 vec<data_reference_p> datarefs = vNULL;
3157 int insns = 0;
3159 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3161 gimple *stmt = gsi_stmt (gsi);
3162 if (is_gimple_debug (stmt))
3163 continue;
3164 insns++;
3166 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3167 vect_location = stmt;
3169 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3170 break;
3173 /* Skip leading unhandled stmts. */
3174 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3176 gsi_next (&gsi);
3177 continue;
3180 gimple_stmt_iterator region_end = gsi;
3182 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3186 "not vectorized: too many instructions in "
3187 "basic block.\n");
3189 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3190 any_vectorized = true;
3192 if (gsi_end_p (region_end))
3193 break;
3195 /* Skip the unhandled stmt. */
3196 gsi_next (&gsi);
3199 return any_vectorized;
3203 /* Return true if the vector type chosen for a constant operand of STMT_VINFO should be a boolean vector. */
3205 static bool
3206 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3208 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3209 tree op, vectype;
3210 enum vect_def_type dt;
3212 /* For comparisons and COND_EXPR the type is chosen depending
3213 on the other, non-constant comparison operand. */
3214 if (TREE_CODE_CLASS (code) == tcc_comparison)
3216 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3217 op = gimple_assign_rhs1 (stmt);
3219 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3220 gcc_unreachable ();
3222 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3225 if (code == COND_EXPR)
3227 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3228 tree cond = gimple_assign_rhs1 (stmt);
3230 if (TREE_CODE (cond) == SSA_NAME)
3231 op = cond;
3232 else
3233 op = TREE_OPERAND (cond, 0);
3235 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3236 gcc_unreachable ();
3238 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3241 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3244 /* Build a variable-length vector in which the elements in ELTS are repeated
3245 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3246 RESULTS and add any new instructions to SEQ.
3248 The approach we use is:
3250 (1) Find a vector mode VM with integer elements of mode IM.
3252 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3253 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3254 from small vectors to IM.
3256 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3258 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3259 correct byte contents.
3261 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3263 We try to find the largest IM for which this sequence works, in order
3264 to cut down on the number of interleaves. */
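/* A sketch with assumed target choices: for ELTS = {a,b,c,d} of type
   int, the target might pick IM == DImode, giving NVECTORS == 2 and
   ELTS' == {ab, cd} (each a VIEW_CONVERT_EXPR of a two-int vector).
   Both are duplicated into VM vectors {ab,ab,...} and {cd,cd,...},
   one interleaving VEC_PERM_EXPR produces {ab,cd,ab,cd,...}, and the
   final VIEW_CONVERT_EXPR yields {a,b,c,d,a,b,c,d,...} in the
   requested type.  */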
3266 void
3267 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3268 vec<tree> elts, unsigned int nresults,
3269 vec<tree> &results)
3271 unsigned int nelts = elts.length ();
3272 tree element_type = TREE_TYPE (vector_type);
3274 /* (1) Find a vector mode VM with integer elements of mode IM. */
3275 unsigned int nvectors = 1;
3276 tree new_vector_type;
3277 tree permutes[2];
3278 if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type),
3279 &nvectors, &new_vector_type,
3280 permutes))
3281 gcc_unreachable ();
3283 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3284 unsigned int partial_nelts = nelts / nvectors;
3285 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3287 tree_vector_builder partial_elts;
3288 auto_vec<tree, 32> pieces (nvectors * 2);
3289 pieces.quick_grow (nvectors * 2);
3290 for (unsigned int i = 0; i < nvectors; ++i)
3292 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3293 ELTS' has mode IM. */
3294 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3295 for (unsigned int j = 0; j < partial_nelts; ++j)
3296 partial_elts.quick_push (elts[i * partial_nelts + j]);
3297 tree t = gimple_build_vector (seq, &partial_elts);
3298 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3299 TREE_TYPE (new_vector_type), t);
3301 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3302 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3305 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3306 correct byte contents.
3308 We need to repeat the following operation log2(nvectors) times:
3310 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3311 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3313 However, if each input repeats every N elements and the VF is
3314 a multiple of N * 2, the HI result is the same as the LO. */
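/* E.g. (assumed NVECTORS == 4, inputs A, B, C, D): HI_START == 2, so
   the first round computes
     out[0] = lo-permute (A, C)   out[1] = hi-permute (A, C)
     out[2] = lo-permute (B, D)   out[3] = hi-permute (B, D)
   and the second round interleaves those outputs again.  */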
3315 unsigned int in_start = 0;
3316 unsigned int out_start = nvectors;
3317 unsigned int hi_start = nvectors / 2;
3318 /* A bound on the number of outputs needed to produce NRESULTS results
3319 in the final iteration. */
3320 unsigned int noutputs_bound = nvectors * nresults;
3321 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3323 noutputs_bound /= 2;
3324 unsigned int limit = MIN (noutputs_bound, nvectors);
3325 for (unsigned int i = 0; i < limit; ++i)
3327 if ((i & 1) != 0
3328 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3329 2 * in_repeat))
3331 pieces[out_start + i] = pieces[out_start + i - 1];
3332 continue;
3335 tree output = make_ssa_name (new_vector_type);
3336 tree input1 = pieces[in_start + (i / 2)];
3337 tree input2 = pieces[in_start + (i / 2) + hi_start];
3338 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3339 input1, input2,
3340 permutes[i & 1]);
3341 gimple_seq_add_stmt (seq, stmt);
3342 pieces[out_start + i] = output;
3344 std::swap (in_start, out_start);
3347 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3348 results.reserve (nresults);
3349 for (unsigned int i = 0; i < nresults; ++i)
3350 if (i < nvectors)
3351 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3352 pieces[in_start + i]));
3353 else
3354 results.quick_push (results[i - nvectors]);
3358 /* For constant and loop invariant defs of SLP_NODE this function returns
3359 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3360 OP_NODE determines the node for the operand containing the scalar
3361 operands. */
3363 static void
3364 vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
3365 vec<tree> *vec_oprnds)
3367 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3368 vec_info *vinfo = stmt_vinfo->vinfo;
3369 unsigned HOST_WIDE_INT nunits;
3370 tree vec_cst;
3371 unsigned j, number_of_places_left_in_vector;
3372 tree vector_type;
3373 tree vop;
3374 int group_size = op_node->ops.length ();
3375 unsigned int vec_num, i;
3376 unsigned number_of_copies = 1;
3377 bool constant_p;
3378 tree neutral_op = NULL;
3379 gimple_seq ctor_seq = NULL;
3380 auto_vec<tree, 16> permute_results;
3382 /* ??? SLP analysis should compute the vector type for the
3383 constant / invariant and store it in the SLP node. */
3384 tree op = op_node->ops[0];
3385 /* Check if vector type is a boolean vector. */
3386 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3387 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3388 && vect_mask_constant_operand_p (stmt_vinfo))
3389 vector_type
3390 = build_same_sized_truth_vector_type (stmt_vectype);
3391 else
3392 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op));
3394 unsigned int number_of_vectors
3395 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3396 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3397 vector_type);
3398 vec_oprnds->create (number_of_vectors);
3399 auto_vec<tree> voprnds (number_of_vectors);
3401 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3402 created vectors. It is greater than 1 if unrolling is performed.
3404 For example, we have two scalar operands, s1 and s2 (e.g., group of
3405 strided accesses of size two), while NUNITS is four (i.e., four scalars
3406 of this type can be packed in a vector). The output vector will contain
3407 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3408 will be 2).
3410 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3411 containing the operands.
3413 For example, NUNITS is four as before, and the group size is 8
3414 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3415 {s5, s6, s7, s8}. */
3417 /* When using duplicate_and_interleave, we just need one element for
3418 each scalar statement. */
3419 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3420 nunits = group_size;
3422 number_of_copies = nunits * number_of_vectors / group_size;
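/* E.g. (assumed numbers): GROUP_SIZE == 2, NUNITS == 4 and a single
   vector to create give 4 * 1 / 2 == 2 copies, i.e. the vector
   {s1, s2, s1, s2} described above.  */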
3424 number_of_places_left_in_vector = nunits;
3425 constant_p = true;
3426 tree_vector_builder elts (vector_type, nunits, 1);
3427 elts.quick_grow (nunits);
3428 bool place_after_defs = false;
3429 for (j = 0; j < number_of_copies; j++)
3431 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3433 /* Create 'vect_ = {op0,op1,...,opn}'. */
3434 number_of_places_left_in_vector--;
3435 tree orig_op = op;
3436 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3438 if (CONSTANT_CLASS_P (op))
3440 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3442 /* Can't use VIEW_CONVERT_EXPR for booleans because
3443 of possibly different sizes of scalar value and
3444 vector element. */
3445 if (integer_zerop (op))
3446 op = build_int_cst (TREE_TYPE (vector_type), 0);
3447 else if (integer_onep (op))
3448 op = build_all_ones_cst (TREE_TYPE (vector_type));
3449 else
3450 gcc_unreachable ();
3452 else
3453 op = fold_unary (VIEW_CONVERT_EXPR,
3454 TREE_TYPE (vector_type), op);
3455 gcc_assert (op && CONSTANT_CLASS_P (op));
3457 else
3459 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3460 gimple *init_stmt;
3461 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3463 tree true_val
3464 = build_all_ones_cst (TREE_TYPE (vector_type));
3465 tree false_val
3466 = build_zero_cst (TREE_TYPE (vector_type));
3467 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3468 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3469 op, true_val,
3470 false_val);
3472 else
3474 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3475 op);
3476 init_stmt
3477 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3478 op);
3480 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3481 op = new_temp;
3484 elts[number_of_places_left_in_vector] = op;
3485 if (!CONSTANT_CLASS_P (op))
3486 constant_p = false;
3487 if (TREE_CODE (orig_op) == SSA_NAME
3488 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3489 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3490 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3491 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3492 place_after_defs = true;
3494 if (number_of_places_left_in_vector == 0)
3496 if (constant_p
3497 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3498 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3499 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3500 else
3502 if (permute_results.is_empty ())
3503 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3504 elts, number_of_vectors,
3505 permute_results);
3506 vec_cst = permute_results[number_of_vectors - j - 1];
3508 tree init;
3509 gimple_stmt_iterator gsi;
3510 if (place_after_defs)
3512 stmt_vec_info last_stmt_info
3513 = vect_find_last_scalar_stmt_in_slp (slp_node);
3514 gsi = gsi_for_stmt (last_stmt_info->stmt);
3515 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3516 &gsi);
3518 else
3519 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3520 NULL);
3521 if (ctor_seq != NULL)
3523 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3524 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3525 ctor_seq = NULL;
3527 voprnds.quick_push (init);
3528 place_after_defs = false;
3529 number_of_places_left_in_vector = nunits;
3530 constant_p = true;
3531 elts.new_vector (vector_type, nunits, 1);
3532 elts.quick_grow (nunits);
3537 /* Since the vectors are created in reverse order, we reverse their
3538 order here. */
3539 vec_num = voprnds.length ();
3540 for (j = vec_num; j != 0; j--)
3542 vop = voprnds[j - 1];
3543 vec_oprnds->quick_push (vop);
3546 /* In case that VF is greater than the unrolling factor needed for the SLP
3547 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3548 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3549 to replicate the vectors. */
3550 while (number_of_vectors > vec_oprnds->length ())
3552 tree neutral_vec = NULL;
3554 if (neutral_op)
3556 if (!neutral_vec)
3557 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3559 vec_oprnds->quick_push (neutral_vec);
3561 else
3563 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3564 vec_oprnds->quick_push (vop);
3570 /* Get vectorized definitions from SLP_NODE that contains corresponding
3571 vectorized def-stmts. */
3573 static void
3574 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3576 stmt_vec_info vec_def_stmt_info;
3577 unsigned int i;
3579 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3581 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3582 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3586 /* Get N vectorized definitions for SLP_NODE.
3587 If the scalar definitions are loop invariants or constants, collect them and
3588 call vect_get_constant_vectors() to create vector stmts.
3589 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3590 must be stored in the corresponding child of SLP_NODE, and we call
3591 vect_get_slp_vect_defs () to retrieve them. */
3593 void
3594 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3596 if (n == -1U)
3597 n = SLP_TREE_CHILDREN (slp_node).length ();
3599 for (unsigned i = 0; i < n; ++i)
3601 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3603 vec<tree> vec_defs = vNULL;
3605 /* For each operand we check if it has vectorized definitions in a child
3606 node or we need to create them (for invariants and constants). */
3607 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3609 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3610 vect_get_slp_vect_defs (child, &vec_defs);
3612 else
3613 vect_get_constant_vectors (child, slp_node, &vec_defs);
3615 vec_oprnds->quick_push (vec_defs);
3619 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3620 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3621 permute statements for the SLP node NODE of the SLP instance
3622 SLP_NODE_INSTANCE. */
3624 bool
3625 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3626 gimple_stmt_iterator *gsi, poly_uint64 vf,
3627 slp_instance slp_node_instance, bool analyze_only,
3628 unsigned *n_perms)
3630 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3631 vec_info *vinfo = stmt_info->vinfo;
3632 int vec_index = 0;
3633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3634 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3635 unsigned int mask_element;
3636 machine_mode mode;
3638 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3639 return false;
3641 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3643 mode = TYPE_MODE (vectype);
3644 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3646 /* Initialize the vect stmts of NODE to properly insert the generated
3647 stmts later. */
3648 if (! analyze_only)
3649 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3650 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3651 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3653 /* Generate permutation masks for every NODE. Number of masks for each NODE
3654 is equal to GROUP_SIZE.
3655 E.g., we have a group of three nodes with three loads from the same
3656 location in each node, and the vector size is 4. I.e., we have a
3657 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3658 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3659 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3660 for c's: c0c0c0c1 c1c1c2c2 c2c3c3c3
3662 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3663 The last mask is illegal since we assume two operands for a permute
3664 operation, and the mask element values can't be outside that range.
3665 Hence, the last mask must be converted into {2,5,5,5}.
3666 For the first two permutations we need the first and the second input
3667 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3668 we need the second and the third vectors: {b1,c1,a2,b2} and
3669 {c2,a3,b3,c3}. */
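/* Editor's aside, continuing the example above: the loop below forms
   i = iter_num * DR_GROUP_SIZE + LOAD_PERMUTATION[stmt_num], which
   for the a's yields i = 0,0,0,3,3,3,6,6,6,9,9,9.  Splitting each i
   into vec_index = i / nunits and mask_element = i % nunits (plus
   nunits when the element comes from the second input) reproduces
   exactly the masks quoted above, with {6,9,9,9} becoming {2,5,5,5}
   relative to the second and third input vectors.  */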
3671 int vect_stmts_counter = 0;
3672 unsigned int index = 0;
3673 int first_vec_index = -1;
3674 int second_vec_index = -1;
3675 bool noop_p = true;
3676 *n_perms = 0;
3678 vec_perm_builder mask;
3679 unsigned int nelts_to_build;
3680 unsigned int nvectors_per_build;
3681 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3682 && multiple_p (nunits, group_size));
3683 if (repeating_p)
3685 /* A single vector contains a whole number of copies of the node, so:
3686 (a) all permutes can use the same mask; and
3687 (b) the permutes only need a single vector input. */
3688 mask.new_vector (nunits, group_size, 3);
3689 nelts_to_build = mask.encoded_nelts ();
3690 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3692 else
3694 /* We need to construct a separate mask for each vector statement. */
3695 unsigned HOST_WIDE_INT const_nunits, const_vf;
3696 if (!nunits.is_constant (&const_nunits)
3697 || !vf.is_constant (&const_vf))
3698 return false;
3699 mask.new_vector (const_nunits, const_nunits, 1);
3700 nelts_to_build = const_vf * group_size;
3701 nvectors_per_build = 1;
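/* Editor's aside with illustrative numbers: for group_size == 2 and
   nunits == 8, repeating_p holds and a single two-pattern encoded
   mask serves all vector statements; for nunits == 4, vf == 4 and
   group_size == 3 we instead build vf * group_size == 12 mask
   elements, completing and emitting one mask per generated vector
   statement.  */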
3704 unsigned int count = mask.encoded_nelts ();
3705 mask.quick_grow (count);
3706 vec_perm_indices indices;
3708 for (unsigned int j = 0; j < nelts_to_build; j++)
3710 unsigned int iter_num = j / group_size;
3711 unsigned int stmt_num = j % group_size;
3712 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3713 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3714 if (repeating_p)
3716 first_vec_index = 0;
3717 mask_element = i;
3719 else
3721 /* Enforced before the loop when !repeating_p. */
3722 unsigned int const_nunits = nunits.to_constant ();
3723 vec_index = i / const_nunits;
3724 mask_element = i % const_nunits;
3725 if (vec_index == first_vec_index
3726 || first_vec_index == -1)
3728 first_vec_index = vec_index;
3730 else if (vec_index == second_vec_index
3731 || second_vec_index == -1)
3733 second_vec_index = vec_index;
3734 mask_element += const_nunits;
3736 else
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3740 "permutation requires at "
3741 "least three vectors %G",
3742 stmt_info->stmt);
3743 gcc_assert (analyze_only);
3744 return false;
3747 gcc_assert (mask_element < 2 * const_nunits);
3750 if (mask_element != index)
3751 noop_p = false;
3752 mask[index++] = mask_element;
3754 if (index == count && !noop_p)
3756 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3757 if (!can_vec_perm_const_p (mode, indices))
3759 if (dump_enabled_p ())
3761 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3762 vect_location,
3763 "unsupported vect permute { ");
3764 for (i = 0; i < count; ++i)
3766 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3767 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3769 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3771 gcc_assert (analyze_only);
3772 return false;
3775 ++*n_perms;
3778 if (index == count)
3780 if (!analyze_only)
3782 tree mask_vec = NULL_TREE;
3784 if (! noop_p)
3785 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3787 if (second_vec_index == -1)
3788 second_vec_index = first_vec_index;
3790 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3792 /* Generate the permute statement if necessary. */
3793 tree first_vec = dr_chain[first_vec_index + ri];
3794 tree second_vec = dr_chain[second_vec_index + ri];
3795 stmt_vec_info perm_stmt_info;
3796 if (! noop_p)
3798 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3799 tree perm_dest
3800 = vect_create_destination_var (gimple_assign_lhs (stmt),
3801 vectype);
3802 perm_dest = make_ssa_name (perm_dest);
3803 gassign *perm_stmt
3804 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3805 first_vec, second_vec,
3806 mask_vec);
3807 perm_stmt_info
3808 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3809 gsi);
3811 else
3812 /* If mask was NULL_TREE generate the requested
3813 identity transform. */
3814 perm_stmt_info = vinfo->lookup_def (first_vec);
3816 /* Store the vector statement in NODE. */
3817 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3818 = perm_stmt_info;
3822 index = 0;
3823 first_vec_index = -1;
3824 second_vec_index = -1;
3825 noop_p = true;
3829 return true;
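/* Editor's aside -- a self-contained C model of the index split used
   in the loop above (hypothetical names, not GCC API).  It returns
   the adjusted mask element, or -1 when the permutation would need a
   third distinct input vector.  */

static int
split_perm_index (unsigned i, unsigned nunits,
                  int *first_vec, int *second_vec)
{
  int vec_index = (int) (i / nunits);
  int mask_element = (int) (i % nunits);
  if (*first_vec == -1 || vec_index == *first_vec)
    *first_vec = vec_index;
  else if (*second_vec == -1 || vec_index == *second_vec)
    {
      *second_vec = vec_index;
      mask_element += (int) nunits;	/* Element of the second input.  */
    }
  else
    return -1;		/* Would require at least three vectors.  */
  return mask_element;
}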
3832 /* Vectorize SLP instance tree in postorder. */
3834 static void
3835 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3836 scalar_stmts_to_slp_tree_map_t *bst_map)
3838 gimple_stmt_iterator si;
3839 stmt_vec_info stmt_info;
3840 unsigned int group_size;
3841 tree vectype;
3842 int i, j;
3843 slp_tree child;
3845 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3846 return;
3848 /* See if we have already vectorized the node in the graph of the
3849 SLP instance. */
3850 if (SLP_TREE_VEC_STMTS (node).exists ())
3851 return;
3853 /* See if we have already vectorized the same set of stmts and reuse their
3854 vectorized stmts across instances. */
3855 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3857 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3858 return;
3861 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
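/* Editor's note: the get/put pair above is what makes vector stmts
   shareable across instances -- e.g. when a second instance reaches a
   node with the identical scalar-stmt vector {S1, S2, S3, S4}, the
   bst_map lookup finds this node as leader and splices in its
   SLP_TREE_VEC_STMTS instead of emitting them again.  */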
3862 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3863 vect_schedule_slp_instance (child, instance, bst_map);
3865 /* Push SLP node def-type to stmts. */
3866 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3867 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3869 stmt_vec_info child_stmt_info;
3870 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3871 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3874 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3876 /* VECTYPE is the type of the destination. */
3877 vectype = STMT_VINFO_VECTYPE (stmt_info);
3878 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3879 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3881 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3882 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3884 if (dump_enabled_p ())
3885 dump_printf_loc (MSG_NOTE, vect_location,
3886 "------>vectorizing SLP node starting from: %G",
3887 stmt_info->stmt);
3889 /* Vectorized stmts go before the last scalar stmt, which is where
3890 all uses are ready.  */
3891 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3892 si = gsi_for_stmt (last_stmt_info->stmt);
3894 /* Handle two-operation SLP nodes by vectorizing the group with
3895 both operations and then performing a merge. */
3896 if (SLP_TREE_TWO_OPERATORS (node))
3898 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3899 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3900 enum tree_code ocode = ERROR_MARK;
3901 stmt_vec_info ostmt_info;
3902 vec_perm_builder mask (group_size, group_size, 1);
3903 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3905 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3906 if (gimple_assign_rhs_code (ostmt) != code0)
3908 mask.quick_push (1);
3909 ocode = gimple_assign_rhs_code (ostmt);
3911 else
3912 mask.quick_push (0);
3914 if (ocode != ERROR_MARK)
3916 vec<stmt_vec_info> v0;
3917 vec<stmt_vec_info> v1;
3918 unsigned j;
3919 tree tmask = NULL_TREE;
3920 vect_transform_stmt (stmt_info, &si, node, instance);
3921 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3922 SLP_TREE_VEC_STMTS (node).truncate (0);
3923 gimple_assign_set_rhs_code (stmt, ocode);
3924 vect_transform_stmt (stmt_info, &si, node, instance);
3925 gimple_assign_set_rhs_code (stmt, code0);
3926 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3927 SLP_TREE_VEC_STMTS (node).truncate (0);
3928 tree meltype = build_nonstandard_integer_type
3929 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3930 tree mvectype = get_same_sized_vectype (meltype, vectype);
3931 unsigned k = 0, l;
3932 for (j = 0; j < v0.length (); ++j)
3934 /* Enforced by vect_build_slp_tree, which rejects variable-length
3935 vectors for SLP_TREE_TWO_OPERATORS. */
3936 unsigned int const_nunits = nunits.to_constant ();
3937 tree_vector_builder melts (mvectype, const_nunits, 1);
3938 for (l = 0; l < const_nunits; ++l)
3940 if (k >= group_size)
3941 k = 0;
3942 tree t = build_int_cst (meltype,
3943 mask[k++] * const_nunits + l);
3944 melts.quick_push (t);
3946 tmask = melts.build ();
3948 /* ??? Not all targets support a VEC_PERM_EXPR with a
3949 constant mask that would translate to a vec_merge RTX
3950 (with their vec_perm_const_ok). We can either not
3951 vectorize in that case or let veclower do its job.
3952 Unfortunately that isn't too great and at least for
3953 plus/minus we'd eventually like to match targets
3954 vector addsub instructions. */
3955 gimple *vstmt;
3956 vstmt = gimple_build_assign (make_ssa_name (vectype),
3957 VEC_PERM_EXPR,
3958 gimple_assign_lhs (v0[j]->stmt),
3959 gimple_assign_lhs (v1[j]->stmt),
3960 tmask);
3961 SLP_TREE_VEC_STMTS (node).quick_push
3962 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
3964 v0.release ();
3965 v1.release ();
3966 return;
3969 vect_transform_stmt (stmt_info, &si, node, instance);
3971 /* Restore stmt def-types. */
3972 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3973 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3975 stmt_vec_info child_stmt_info;
3976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3977 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
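/* Editor's aside -- a self-contained C model of the merge-mask
   construction in the two-operator path above (hypothetical names,
   not GCC API).  For group_size == 2, const_nunits == 4 and
   mask == {0, 1} it yields melts == {0, 5, 2, 7}: lanes 0 and 2 come
   from the code0 (e.g. PLUS) vector, lanes 1 and 3 from the ocode
   (e.g. MINUS) vector -- the classic addsub selection.  */

static void
build_merge_mask (const unsigned char *mask, unsigned group_size,
                  unsigned const_nunits, unsigned *melts)
{
  unsigned k = 0;
  for (unsigned l = 0; l < const_nunits; ++l)
    {
      if (k >= group_size)
        k = 0;
      melts[l] = mask[k++] * const_nunits + l;
    }
}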
3981 /* Replace scalar calls from SLP node NODE with assignments of zero to
3982 their lhs.  For loop vectorization this is done in vectorizable_call,
3983 but for SLP it needs to be deferred until the end of vect_schedule_slp,
3984 because multiple SLP instances may refer to the same scalar stmt.  */
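/* Editor's note: e.g. a group member x_1 = foo (a_2) whose result has
   been vectorized is rewritten below to x_1 = 0; that assignment is
   trivially dead once no scalar uses of x_1 remain, whereas the call
   itself might survive DCE if foo is not known to be pure.  */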
3986 static void
3987 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
3989 gimple *new_stmt;
3990 gimple_stmt_iterator gsi;
3991 int i;
3992 slp_tree child;
3993 tree lhs;
3994 stmt_vec_info stmt_info;
3996 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3997 return;
3999 if (visited.add (node))
4000 return;
4002 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4003 vect_remove_slp_scalar_calls (child, visited);
4005 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4007 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4008 if (!stmt || gimple_bb (stmt) == NULL)
4009 continue;
4010 if (is_pattern_stmt_p (stmt_info)
4011 || !PURE_SLP_STMT (stmt_info))
4012 continue;
4013 lhs = gimple_call_lhs (stmt);
4014 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4015 gsi = gsi_for_stmt (stmt);
4016 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4017 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4021 static void
4022 vect_remove_slp_scalar_calls (slp_tree node)
4024 hash_set<slp_tree> visited;
4025 vect_remove_slp_scalar_calls (node, visited);
4028 /* Generate vector code for all SLP instances in the loop/basic block. */
4030 void
4031 vect_schedule_slp (vec_info *vinfo)
4033 vec<slp_instance> slp_instances;
4034 slp_instance instance;
4035 unsigned int i;
4037 scalar_stmts_to_slp_tree_map_t *bst_map
4038 = new scalar_stmts_to_slp_tree_map_t ();
4039 slp_instances = vinfo->slp_instances;
4040 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4042 /* Schedule the tree of INSTANCE. */
4043 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4044 instance, bst_map);
4045 if (dump_enabled_p ())
4046 dump_printf_loc (MSG_NOTE, vect_location,
4047 "vectorizing stmts using SLP.\n");
4049 delete bst_map;
4051 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4053 slp_tree root = SLP_INSTANCE_TREE (instance);
4054 stmt_vec_info store_info;
4055 unsigned int j;
4057 /* Remove scalar call stmts. Do not do this for basic-block
4058 vectorization as not all uses may be vectorized.
4059 ??? Why should this be necessary? DCE should be able to
4060 remove the stmts itself.
4061 ??? For BB vectorization we can as well remove scalar
4062 stmts starting from the SLP tree root if they have no
4063 uses. */
4064 if (is_a <loop_vec_info> (vinfo))
4065 vect_remove_slp_scalar_calls (root);
4067 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4068 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4070 if (!STMT_VINFO_DATA_REF (store_info))
4071 break;
4073 store_info = vect_orig_stmt (store_info);
4074 /* Free the attached stmt_vec_info and remove the stmt. */
4075 vinfo->remove_stmt (store_info);