/* gcc/tree-vect-slp.c  */
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;
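  /* Number of RHS operands of the representative stmt: calls contribute
     their argument count; assignments gimple_num_ops minus one (all
     operands except the lhs), with COND_EXPR getting one extra slot
     since its embedded comparison carries two operands of its own;
     PHIs contribute none here.  */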
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode)))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
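              /* Only the three leading elements of each pattern are
                 encoded; the builder extrapolates the rest.  For example,
                 with nelts == 8 the masks expand to
                 sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 } and
                 sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the low and
                 high halves of an interleave of the two input vectors.  */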
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to a
   non-zero value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];
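          /* maps[1] reads the two operands of the embedded comparison in
             swapped order, while maps[2] instead swaps the then/else arms,
             which pairs with an inverted comparison code (*SWAP == 2).  */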
          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          /* For the swapping logic below force vect_reduction_def
             for the reduction op in a SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             end in the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && !STMT_VINFO_DATA_REF (stmt_info)
                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                    && def_stmt_info
                    && !STMT_VINFO_DATA_REF (def_stmt_info)
                    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                        == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type)
              || (!STMT_VINFO_DATA_REF (stmt_info)
                  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  && ((!def_stmt_info
                       || STMT_VINFO_DATA_REF (def_stmt_info)
                       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                           != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                      != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_external_def:
          /* Make sure to demote the overall operand to external.  */
          oprnd_info->first_dt = vect_external_def;
          /* Fallthru.  */
        case vect_constant_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_internal_def:
        case vect_reduction_def:
          if (oprnd_info->first_dt == vect_reduction_def
              && !STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && !STMT_VINFO_DATA_REF (def_stmt_info)
              && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              /* For a SLP reduction chain we want to duplicate the
                 reduction to each of the chain members.  That gets
                 us a sane SLP graph (still the stmts are not 100%
                 correct wrt the initial values).  */
              gcc_assert (!first);
              oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
              oprnd_info->ops.quick_push (oprnd_info->ops[0]);
              break;
            }
          /* Fallthru.  */
        case vect_induction_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
         used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
        if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
            || is_pattern_stmt_p (member_info))
          break;

      if (!member_info)
        {
          for (member_info = first_info; member_info;
               member_info = DR_GROUP_NEXT_ELEMENT (member_info))
            STMT_VINFO_VECTYPE (member_info) = vectype;
          return true;
        }
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
           && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: incompatible vector"
                       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
                       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
                       "    new vector type: %T\n", vectype);
    }
  return false;
}
/* Try to infer and assign a vector type to all the statements in STMTS.
   Used only for BB vectorization.  */

static bool
vect_update_all_shared_vectypes (vec_info *vinfo, vec<stmt_vec_info> stmts)
{
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, stmts[0], &vectype,
                                       &nunits_vectype, stmts.length ()))
    return false;

  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (!vect_update_shared_vectype (stmt_info, vectype))
      return false;

  return true;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
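  /* E.g. for the group { a+b, a-b, a+b, a-b } with count == 4 this builds
     the selector { 0, 5, 2, 7 }: even elements come from the vector of
     first-operation results, odd elements from the vector of
     ALT_STMT_CODE results (offset by COUNT).  */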
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
                                           &nunits_vectype, group_size)
          || (nunits_vectype
              && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
          && !vect_update_shared_vectype (stmt_info, vectype))
        continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                          (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (call_stmt
                   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds)
            {
              tree other_op1 = (call_stmt
                                ? gimple_call_arg (call_stmt, 1)
                                : gimple_assign_rhs2 (stmt));
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G",
                                 stmt);

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (!vect_two_operations_perm_ok_p (stmts, group_size,
                                          vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G",
                                     first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
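/* Hash a stmt sequence by the uids of its stmts rather than by pointer
   values; two sequences compare equal only if they contain the same
   stmt_vec_infos in the same order.  */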
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);
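/* Memoizing wrapper around vect_build_slp_tree_2.  BST_MAP records, for
   every stmt sequence already visited, either the SLP tree built for it
   (re-used here with its refcount bumped) or NULL for a failed build, so
   a sub-sequence that occurs multiple times is analyzed only once.  */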
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size,
                                        bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}
/* Recursively build an SLP tree starting from the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation.  Otherwise
   return the built SLP node.  On failure MATCHES records which stmts
   failed to match the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
                                   max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }
  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;
  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          /* And compute the load permutation.  Whether it is actually
             a permutation depends on the unrolling factor which is
             decided later.  */
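          /* E.g. loads of a[1], a[0], a[3], a[2] from a group starting
             at a[0] record the load permutation { 1, 0, 3, 2 }.  */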
          vec<unsigned> load_permutation;
          int j;
          stmt_vec_info load_info;
          load_permutation.create (group_size);
          stmt_vec_info first_stmt_info
            = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              int load_place = vect_get_place_in_interleaving_chain
                (load_info, first_stmt_info);
              gcc_assert (load_place != -1);
              load_permutation.safe_push (load_place);
            }
          SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
          return node;
        }
    }
  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }
  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* COND_EXPRs have one operand too many here if the condition
             is an SSA name.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of a non-unary child built up from
             scalars then just throw that away and build it up this node
             from scalars.  */
          if (is_a <bb_vec_info> (vinfo)
              && SLP_TREE_CHILDREN (child).length () > 1
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
              if (!grandchild
                  && vect_update_all_shared_vectypes (vinfo,
                                                      oprnd_info->def_stmts))
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }
      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern
          && vect_update_all_shared_vectypes (vinfo, oprnd_info->def_stmts))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }
      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of not successful permutes was no
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
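          /* First see whether all mismatched stmts can have their operands
             swapped; if one of them is not commutative, fall back to
             swapping the operands of the matched stmts instead, and give
             up if those are not all commutative either.  */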
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If we have all children of a non-unary child built up from
                 scalars then just throw that away and build it up this node
                 from scalars.  */
              if (is_a <bb_vec_info> (vinfo)
                  && SLP_TREE_CHILDREN (child).length () > 1
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see
                     the scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
                  if (!grandchild
                      && (vect_update_all_shared_vectypes
                            (vinfo, oprnd_info->def_stmts)))
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j,
                                        grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands "
                                         "from scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }
fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i, j;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc,
                   "node%s %p (max_nunits=%u, refcnt=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
                   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        dump_printf (metadata, "%T%s ", op,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
        dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}
static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Copy the SLP subtree rooted at NODE.  */

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = XNEW (_slp_tree);
  slp_tree copy = copy_ref;
  memcpy (copy, node, sizeof (_slp_tree));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
  copy->refcnt = 0;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      /* ??? Computation nodes are isomorphic and need no rearrangement.
         This is a quick hack to cover those where rearrangement breaks
         semantics because only the first stmt is guaranteed to have the
         correct operation code due to others being swapped or inverted.  */
      stmt_vec_info first = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (first->stmt)
          && gimple_assign_rhs_code (first->stmt) == COND_EXPR)
        return;
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */
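/* For example, if a reduction sums a[0..3] but the loads appear in the
   SLP node in the order { a[2], a[3], a[0], a[1] }, the scalar stmts of
   all nodes can themselves be reordered by that permutation, after which
   the loads are in order and no runtime permutation is needed.  */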
1810 static bool
1811 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1813 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1814 unsigned int i, j;
1815 unsigned int lidx;
1816 slp_tree node, load;
1818 /* Compare all the permutation sequences to the first one. We know
1819 that at least one load is permuted. */
1820 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1821 if (!node->load_permutation.exists ())
1822 return false;
1823 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1825 if (!load->load_permutation.exists ())
1826 return false;
1827 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1828 if (lidx != node->load_permutation[j])
1829 return false;
1832 /* Check that the loads in the first sequence are different and there
1833 are no gaps between them. */
1834 auto_sbitmap load_index (group_size);
1835 bitmap_clear (load_index);
1836 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1838 if (lidx >= group_size)
1839 return false;
1840 if (bitmap_bit_p (load_index, lidx))
1841 return false;
1843 bitmap_set_bit (load_index, lidx);
1845 for (i = 0; i < group_size; i++)
1846 if (!bitmap_bit_p (load_index, i))
1847 return false;
1849 /* This permutation is valid for reduction. Since the order of the
1850 statements in the nodes is not important unless they are memory
1851 accesses, we can rearrange the statements in all the nodes
1852 according to the order of the loads. */
1854 /* We have to unshare the SLP tree we modify. */
1855 hash_map<slp_tree, slp_tree> map;
1856 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1857 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1858 unshared->refcnt++;
1859 SLP_INSTANCE_TREE (slp_instn) = unshared;
1860 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1861 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1862 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1864 /* Do the actual re-arrangement. */
1865 hash_set<slp_tree> visited;
1866 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1867 node->load_permutation, visited);
1869 /* We are done, no actual permutations need to be generated. */
1870 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1871 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1873 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1874 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1875 /* But we have to keep those permutations that are required for
1876 handling gaps. */
1877 if (known_eq (unrolling_factor, 1U)
1878 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1879 && DR_GROUP_GAP (first_stmt_info) == 0))
1880 SLP_TREE_LOAD_PERMUTATION (node).release ();
1881 else
1882 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1883 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1886 return true;
1889 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1891 static void
1892 vect_gather_slp_loads (slp_instance inst, slp_tree node,
1893 hash_set<slp_tree> &visited)
1895 if (visited.add (node))
1896 return;
1898 if (SLP_TREE_CHILDREN (node).length () == 0)
1900 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1901 return;
1902 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1903 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1904 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1905 SLP_INSTANCE_LOADS (inst).safe_push (node);
1907 else
1909 unsigned i;
1910 slp_tree child;
1911 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1912 vect_gather_slp_loads (inst, child, visited);
1916 static void
1917 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1919 hash_set<slp_tree> visited;
1920 vect_gather_slp_loads (inst, node, visited);
1923 /* Check if the required load permutations in the SLP instance
1924 SLP_INSTN are supported. */
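/* As a hypothetical example for the BB case handled below: a load node
   reading a[1], a[2] out of an interleaving group a[0..3] forms a
   subchain (consecutive elements, gap 1) and needs no permutation, so
   its load permutation is released; a node reading a[3], a[1] does not,
   and its permutation has to be verified via
   vect_transform_slp_perm_load instead.  */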
1926 static bool
1927 vect_supported_load_permutation_p (vec_info *vinfo, slp_instance slp_instn)
1929 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1930 unsigned int i, j, k, next;
1931 slp_tree node;
1933 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1936 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1937 if (node->load_permutation.exists ())
1938 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1939 dump_printf (MSG_NOTE, "%d ", next);
1940 else
1941 for (k = 0; k < group_size; ++k)
1942 dump_printf (MSG_NOTE, "%d ", k);
1943 dump_printf (MSG_NOTE, "\n");
1946 /* In case of reduction every load permutation is allowed, since the order
1947 of the reduction statements is not important (as opposed to the case of
1948 grouped stores). The only condition we need to check is that all the
1949 load nodes are of the same size and have the same permutation (and then
1950 rearrange all the nodes of the SLP instance according to this
1951 permutation). */
1953 /* Check that all the load nodes are of the same size. */
1954 /* ??? Can't we assert this? */
1955 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1956 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1957 return false;
1959 node = SLP_INSTANCE_TREE (slp_instn);
1960 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1962 /* Reduction (there are no data-refs in the root).
1963 In a reduction chain the order of the loads is not important. */
1964 if (!STMT_VINFO_DATA_REF (stmt_info)
1965 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1966 vect_attempt_slp_rearrange_stmts (slp_instn);
1968 /* In basic block vectorization we allow any subchain of an interleaving
1969 chain.
1970 FORNOW: not supported in loop SLP because of realignment complications. */
1971 if (is_a <bb_vec_info> (vinfo))
1973 /* Check whether the loads in an instance form a subchain and thus
1974 no permutation is necessary. */
1975 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1977 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1978 continue;
1979 bool subchain_p = true;
1980 stmt_vec_info next_load_info = NULL;
1981 stmt_vec_info load_info;
1982 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1984 if (j != 0
1985 && (next_load_info != load_info
1986 || DR_GROUP_GAP (load_info) != 1))
1988 subchain_p = false;
1989 break;
1991 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1993 if (subchain_p)
1994 SLP_TREE_LOAD_PERMUTATION (node).release ();
1995 else
1997 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1998 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1999 unsigned HOST_WIDE_INT nunits;
2000 unsigned k, maxk = 0;
2001 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
2002 if (k > maxk)
2003 maxk = k;
2004 /* In BB vectorization we may not actually use a loaded vector
2005 accessing elements in excess of DR_GROUP_SIZE. */
2006 tree vectype = STMT_VINFO_VECTYPE (group_info);
2007 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
2008 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2012 "BB vectorization with gaps at the end of "
2013 "a load is not supported\n");
2014 return false;
2017 /* Verify the permutation can be generated. */
2018 vec<tree> tem;
2019 unsigned n_perms;
2020 if (!vect_transform_slp_perm_load (vinfo, node, tem, NULL,
2021 1, slp_instn, true, &n_perms))
2023 if (dump_enabled_p ())
2024 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2025 vect_location,
2026 "unsupported load permutation\n");
2027 return false;
2031 return true;
2034 /* For loop vectorization verify we can generate the permutation. Be
2035 conservative about the vectorization factor: there are permutations
2036 that start to use three vector inputs only from a specific factor
2037 onwards, and the vectorization factor is not yet final.
2038 ??? The SLP instance unrolling factor might not be the maximum one. */
2039 unsigned n_perms;
2040 poly_uint64 test_vf
2041 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
2042 LOOP_VINFO_VECT_FACTOR
2043 (as_a <loop_vec_info> (vinfo)));
2044 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
2045 if (node->load_permutation.exists ()
2046 && !vect_transform_slp_perm_load (vinfo, node, vNULL, NULL, test_vf,
2047 slp_instn, true, &n_perms))
2048 return false;
2050 return true;
2054 /* Find the last scalar statement in NODE, in program order. */
2056 stmt_vec_info
2057 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2059 stmt_vec_info last = NULL;
2060 stmt_vec_info stmt_vinfo;
2062 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2064 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2065 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2068 return last;
2071 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2072 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2073 (also containing the first GROUP1_SIZE stmts, since stores are
2074 consecutive), the second containing the remainder.
2075 Return the first stmt in the second group. */
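/* Worked example (illustrative): splitting a group of 6 consecutive
   stores s0..s5 at GROUP1_SIZE == 4 yields

     group 1: s0 s1 s2 s3   DR_GROUP_SIZE 4, DR_GROUP_GAP grows by 2
     group 2: s4 s5         DR_GROUP_SIZE 2, DR_GROUP_GAP = old gap + 4

   so each group still describes the correct distance over the elements
   it no longer contains.  */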
2077 static stmt_vec_info
2078 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2080 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2081 gcc_assert (group1_size > 0);
2082 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2083 gcc_assert (group2_size > 0);
2084 DR_GROUP_SIZE (first_vinfo) = group1_size;
2086 stmt_vec_info stmt_info = first_vinfo;
2087 for (unsigned i = group1_size; i > 1; i--)
2089 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2090 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2092 /* STMT is now the last element of the first group. */
2093 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2094 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2096 DR_GROUP_SIZE (group2) = group2_size;
2097 for (stmt_info = group2; stmt_info;
2098 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2100 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2101 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2104 /* For the second group, the DR_GROUP_GAP is that before the original group,
2105 plus skipping over the first vector. */
2106 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2108 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2109 DR_GROUP_GAP (first_vinfo) += group2_size;
2111 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2113 group1_size, group2_size);
2115 return group2;
2118 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2119 statements and a vector of NUNITS elements. */
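/* For instance, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12, so the instance must be unrolled 12 / 6 == 2 times to fill whole
   vectors; with GROUP_SIZE == 8 the factor is 8 / 8 == 1.  */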
2121 static poly_uint64
2122 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2124 return exact_div (common_multiple (nunits, group_size), group_size);
2127 /* Analyze an SLP instance starting from a group of grouped stores. Call
2128 vect_build_slp_tree to build a tree of packed stmts if possible.
2129 Return FALSE if it's impossible to SLP any stmt in the loop. */
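/* Three kinds of seed statements are accepted below: the first store of
   a grouped access, the first statement of a reduction chain (or a group
   of reductions), and a vector CONSTRUCTOR such as the hypothetical

     x_1 = {a_2, b_3, c_4, d_5};

   whose elements become the scalar stmts of the SLP root.  */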
2131 static bool
2132 vect_analyze_slp_instance (vec_info *vinfo,
2133 scalar_stmts_to_slp_tree_map_t *bst_map,
2134 stmt_vec_info stmt_info, unsigned max_tree_size)
2136 slp_instance new_instance;
2137 slp_tree node;
2138 unsigned int group_size;
2139 tree vectype, scalar_type = NULL_TREE;
2140 unsigned int i;
2141 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2142 vec<stmt_vec_info> scalar_stmts;
2143 bool constructor = false;
2145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2147 scalar_type = TREE_TYPE (DR_REF (dr));
2148 group_size = DR_GROUP_SIZE (stmt_info);
2149 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2151 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2153 gcc_assert (is_a <loop_vec_info> (vinfo));
2154 vectype = STMT_VINFO_VECTYPE (stmt_info);
2155 group_size = REDUC_GROUP_SIZE (stmt_info);
2157 else if (is_gimple_assign (stmt_info->stmt)
2158 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2160 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2161 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2162 constructor = true;
2164 else
2166 gcc_assert (is_a <loop_vec_info> (vinfo));
2167 vectype = STMT_VINFO_VECTYPE (stmt_info);
2168 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2171 if (!vectype)
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2175 "Build SLP failed: unsupported data-type %T\n",
2176 scalar_type);
2178 return false;
2180 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2182 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2183 scalar_stmts.create (group_size);
2184 stmt_vec_info next_info = stmt_info;
2185 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2187 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2188 while (next_info)
2190 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2191 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2194 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2196 /* Collect the reduction stmts and store them in
2197 SLP_TREE_SCALAR_STMTS. */
2198 while (next_info)
2200 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2201 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2203 /* Mark the first element of the reduction chain as reduction to properly
2204 transform the node. In the reduction analysis phase only the last
2205 element of the chain is marked as reduction. */
2206 STMT_VINFO_DEF_TYPE (stmt_info)
2207 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2208 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2209 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2211 else if (constructor)
2213 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2214 tree val;
2215 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2217 if (TREE_CODE (val) == SSA_NAME)
2219 gimple* def = SSA_NAME_DEF_STMT (val);
2220 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2221 /* Value is defined in another basic block. */
2222 if (!def_info)
2223 return false;
2224 def_info = vect_stmt_to_vectorize (def_info);
2225 scalar_stmts.safe_push (def_info);
2227 else
2228 return false;
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_NOTE, vect_location,
2232 "Analyzing vectorizable constructor: %G\n",
2233 stmt_info->stmt);
2235 else
2237 /* Collect reduction statements. */
2238 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2239 for (i = 0; reductions.iterate (i, &next_info); i++)
2240 scalar_stmts.safe_push (next_info);
2243 /* Build the tree for the SLP instance. */
2244 bool *matches = XALLOCAVEC (bool, group_size);
2245 unsigned npermutes = 0;
2246 poly_uint64 max_nunits = nunits;
2247 unsigned tree_size = 0;
2248 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2249 &max_nunits, matches, &npermutes,
2250 &tree_size, bst_map);
2251 if (node != NULL)
2253 /* Calculate the unrolling factor based on the smallest type. */
2254 poly_uint64 unrolling_factor
2255 = calculate_unrolling_factor (max_nunits, group_size);
2257 if (maybe_ne (unrolling_factor, 1U)
2258 && is_a <bb_vec_info> (vinfo))
2260 unsigned HOST_WIDE_INT const_max_nunits;
2261 if (!max_nunits.is_constant (&const_max_nunits)
2262 || const_max_nunits > group_size)
2264 if (dump_enabled_p ())
2265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2266 "Build SLP failed: store group "
2267 "size not a multiple of the vector size "
2268 "in basic block SLP\n");
2269 vect_free_slp_tree (node, false);
2270 return false;
2272 /* Fatal mismatch. */
2273 matches[group_size / const_max_nunits * const_max_nunits] = false;
2274 vect_free_slp_tree (node, false);
2276 else
2278 /* Create a new SLP instance. */
2279 new_instance = XNEW (class _slp_instance);
2280 SLP_INSTANCE_TREE (new_instance) = node;
2281 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2282 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2283 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2284 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2286 vect_gather_slp_loads (new_instance, node);
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_NOTE, vect_location,
2289 "SLP size %u vs. limit %u.\n",
2290 tree_size, max_tree_size);
2292 /* Compute the load permutation. */
2293 slp_tree load_node;
2294 bool loads_permuted = false;
2295 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2297 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2298 continue;
2299 unsigned j;
2300 stmt_vec_info load_info;
2301 bool this_load_permuted = false;
2302 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2303 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2304 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2305 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2307 this_load_permuted = true;
2308 break;
2310 if (!this_load_permuted
2311 /* The load requires permutation when unrolling exposes
2312 a gap either because the group is larger than the SLP
2313 group-size or because there is a gap between the groups. */
2314 && (known_eq (unrolling_factor, 1U)
2315 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2316 && DR_GROUP_GAP (first_stmt_info) == 0)))
2318 SLP_TREE_LOAD_PERMUTATION (load_node).release ();
2319 continue;
2321 loads_permuted = true;
2324 if (loads_permuted)
2326 if (!vect_supported_load_permutation_p (vinfo, new_instance))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "Build SLP failed: unsupported load "
2331 "permutation %G", stmt_info->stmt);
2332 vect_free_slp_instance (new_instance, false);
2333 return false;
2337 /* If the loads and stores can be handled with load/store-lanes
2338 instructions, do not generate this SLP instance. */
2339 if (is_a <loop_vec_info> (vinfo)
2340 && loads_permuted
2341 && dr && vect_store_lanes_supported (vectype, group_size, false))
2343 slp_tree load_node;
2344 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2346 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2347 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2348 /* Use SLP for strided accesses (or if we can't load-lanes). */
2349 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2350 || ! vect_load_lanes_supported
2351 (STMT_VINFO_VECTYPE (stmt_vinfo),
2352 DR_GROUP_SIZE (stmt_vinfo), false))
2353 break;
2355 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "Built SLP cancelled: can use "
2360 "load/store-lanes\n");
2361 vect_free_slp_instance (new_instance, false);
2362 return false;
2366 /* If this is a reduction chain with a conversion in front
2367 amend the SLP tree with a node for that. */
2368 if (!dr
2369 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2370 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2372 /* Get at the conversion stmt - we know it's the single use
2373 of the last stmt of the reduction chain. */
2374 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2375 use_operand_p use_p;
2376 gimple *use_stmt;
2377 bool r = single_imm_use (gimple_assign_lhs (tem),
2378 &use_p, &use_stmt);
2379 gcc_assert (r);
2380 next_info = vinfo->lookup_stmt (use_stmt);
2381 next_info = vect_stmt_to_vectorize (next_info);
2382 scalar_stmts = vNULL;
2383 scalar_stmts.create (group_size);
2384 for (unsigned i = 0; i < group_size; ++i)
2385 scalar_stmts.quick_push (next_info);
2386 slp_tree conv = vect_create_new_slp_node (scalar_stmts);
2387 SLP_TREE_CHILDREN (conv).quick_push (node);
2388 SLP_INSTANCE_TREE (new_instance) = conv;
2389 /* We also have to fake this conversion stmt as SLP reduction
2390 group so we don't have to mess with too much code
2391 elsewhere. */
2392 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2393 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2396 vinfo->slp_instances.safe_push (new_instance);
2398 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE, vect_location,
2401 "Final SLP tree for instance:\n");
2402 vect_print_slp_tree (MSG_NOTE, vect_location,
2403 SLP_INSTANCE_TREE (new_instance));
2406 return true;
2409 else
2411 /* Failed to SLP. */
2412 /* Free the allocated memory. */
2413 scalar_stmts.release ();
2416 /* For basic block SLP, try to break the group up into multiples of the
2417 vector size. */
2418 unsigned HOST_WIDE_INT const_nunits;
2419 if (is_a <bb_vec_info> (vinfo)
2420 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2421 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2422 && nunits.is_constant (&const_nunits))
2424 /* We consider breaking the group only on VF boundaries from the existing
2425 start. */
2426 for (i = 0; i < group_size; i++)
2427 if (!matches[i]) break;
2429 if (i >= const_nunits && i < group_size)
2431 /* Split into two groups at the first vector boundary before i. */
2432 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2433 unsigned group1_size = i & ~(const_nunits - 1);
2435 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2436 group1_size);
2437 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2438 max_tree_size);
2439 /* If the first non-match was in the middle of a vector,
2440 skip the rest of that vector. */
2441 if (group1_size < i)
2443 i = group1_size + const_nunits;
2444 if (i < group_size)
2445 rest = vect_split_slp_store_group (rest, const_nunits);
2447 if (i < group_size)
2448 res |= vect_analyze_slp_instance (vinfo, bst_map,
2449 rest, max_tree_size);
2450 return res;
2452 /* Even though the first vector did not all match, we might be able to SLP
2453 (some) of the remainder. FORNOW ignore this possibility. */
2456 return false;
2460 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2461 trees of packed scalar stmts if SLP is possible. */
2463 opt_result
2464 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2466 unsigned int i;
2467 stmt_vec_info first_element;
2469 DUMP_VECT_SCOPE ("vect_analyze_slp");
2471 scalar_stmts_to_slp_tree_map_t *bst_map
2472 = new scalar_stmts_to_slp_tree_map_t ();
2474 /* Find SLP sequences starting from groups of grouped stores. */
2475 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2476 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2478 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2480 if (loop_vinfo->reduction_chains.length () > 0)
2482 /* Find SLP sequences starting from reduction chains. */
2483 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2484 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2485 max_tree_size))
2487 /* Dissolve reduction chain group. */
2488 stmt_vec_info vinfo = first_element;
2489 stmt_vec_info last = NULL;
2490 while (vinfo)
2492 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2493 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2494 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2495 last = vinfo;
2496 vinfo = next;
2498 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2499 /* It can be still vectorized as part of an SLP reduction. */
2500 loop_vinfo->reductions.safe_push (last);
2504 /* Find SLP sequences starting from groups of reductions. */
2505 if (loop_vinfo->reductions.length () > 1)
2506 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2507 max_tree_size);
2510 /* The map keeps a reference on SLP nodes built, release that. */
2511 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2512 it != bst_map->end (); ++it)
2513 if ((*it).second)
2514 vect_free_slp_tree ((*it).second, false);
2515 delete bst_map;
2517 return opt_result::success ();
2521 /* For each possible SLP instance decide whether to SLP it and calculate overall
2522 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2523 least one instance. */
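/* As an illustration, two instances with unrolling factors 2 and 3
   force a loop unrolling factor of force_common_multiple (2, 3) == 6
   for the whole loop.  */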
2525 bool
2526 vect_make_slp_decision (loop_vec_info loop_vinfo)
2528 unsigned int i;
2529 poly_uint64 unrolling_factor = 1;
2530 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2531 slp_instance instance;
2532 int decided_to_slp = 0;
2534 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2536 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2538 /* FORNOW: SLP if you can. */
2539 /* All unroll factors have the form:
2541 GET_MODE_SIZE (vinfo->vector_mode) * X
2543 for some rational X, so they must have a common multiple. */
2544 unrolling_factor
2545 = force_common_multiple (unrolling_factor,
2546 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2548 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2549 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2550 loop-based vectorization. Such stmts will be marked as HYBRID. */
2551 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2552 decided_to_slp++;
2555 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2557 if (decided_to_slp && dump_enabled_p ())
2559 dump_printf_loc (MSG_NOTE, vect_location,
2560 "Decided to SLP %d instances. Unrolling factor ",
2561 decided_to_slp);
2562 dump_dec (MSG_NOTE, unrolling_factor);
2563 dump_printf (MSG_NOTE, "\n");
2566 return (decided_to_slp > 0);
2569 /* Private data for vect_detect_hybrid_slp. */
2570 struct vdhs_data
2572 loop_vec_info loop_vinfo;
2573 vec<stmt_vec_info> *worklist;
2576 /* Walker for walk_gimple_op. */
2578 static tree
2579 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2581 walk_stmt_info *wi = (walk_stmt_info *)data;
2582 vdhs_data *dat = (vdhs_data *)wi->info;
2584 if (wi->is_lhs)
2585 return NULL_TREE;
2587 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2588 if (!def_stmt_info)
2589 return NULL_TREE;
2590 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2591 if (PURE_SLP_STMT (def_stmt_info))
2593 if (dump_enabled_p ())
2594 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2595 def_stmt_info->stmt);
2596 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2597 dat->worklist->safe_push (def_stmt_info);
2600 return NULL_TREE;
2603 /* Find stmts that must be both vectorized and SLPed. */
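/* Sketch of the idea with hypothetical stmts: if a pure-SLP statement
   also feeds a statement that is only loop-vectorized,

     a_1 = b_2 + c_3;   <-- pure SLP
     d_4 = a_1 * x_5;   <-- loop-vect only

   then a_1's definition must be vectorized the non-SLP way as well and
   is re-marked as hybrid by the walker above.  */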
2605 void
2606 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2608 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2610 /* All stmts participating in SLP are marked pure_slp, all other
2611 stmts are loop_vect.
2612 First collect all loop_vect stmts into a worklist. */
2613 auto_vec<stmt_vec_info> worklist;
2614 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2616 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2617 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2618 gsi_next (&gsi))
2620 gphi *phi = gsi.phi ();
2621 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2622 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2623 worklist.safe_push (stmt_info);
2625 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2626 gsi_next (&gsi))
2628 gimple *stmt = gsi_stmt (gsi);
2629 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2630 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2632 for (gimple_stmt_iterator gsi2
2633 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2634 !gsi_end_p (gsi2); gsi_next (&gsi2))
2636 stmt_vec_info patt_info
2637 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2638 if (!STMT_SLP_TYPE (patt_info)
2639 && STMT_VINFO_RELEVANT (patt_info))
2640 worklist.safe_push (patt_info);
2642 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2644 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2645 worklist.safe_push (stmt_info);
2649 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2650 mark any SLP vectorized stmt as hybrid.
2651 ??? We're visiting def stmts N times (once for each non-SLP and
2652 once for each hybrid-SLP use). */
2653 walk_stmt_info wi;
2654 vdhs_data dat;
2655 dat.worklist = &worklist;
2656 dat.loop_vinfo = loop_vinfo;
2657 memset (&wi, 0, sizeof (wi));
2658 wi.info = (void *)&dat;
2659 while (!worklist.is_empty ())
2661 stmt_vec_info stmt_info = worklist.pop ();
2662 /* Since SSA operands are not set up for pattern stmts we need
2663 to use walk_gimple_op. */
2664 wi.is_lhs = 0;
2665 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2670 /* Initialize a bb_vec_info struct for the statements between
2671 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2673 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2674 gimple_stmt_iterator region_end_in,
2675 vec_info_shared *shared)
2676 : vec_info (vec_info::bb, init_cost (NULL), shared),
2677 bb (gsi_bb (region_begin_in)),
2678 region_begin (region_begin_in),
2679 region_end (region_end_in)
2681 gimple_stmt_iterator gsi;
2683 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2684 gsi_next (&gsi))
2686 gimple *stmt = gsi_stmt (gsi);
2687 gimple_set_uid (stmt, 0);
2688 add_stmt (stmt);
2691 bb->aux = this;
2695 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2696 stmts in the basic block. */
2698 _bb_vec_info::~_bb_vec_info ()
2700 for (gimple_stmt_iterator si = region_begin;
2701 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2702 /* Reset region marker. */
2703 gimple_set_uid (gsi_stmt (si), -1);
2705 bb->aux = NULL;
2708 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2709 given that child nodes have already been processed, and that
2710 their def types currently match their SLP node's def type. */
2712 static bool
2713 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2714 slp_instance node_instance,
2715 stmt_vector_for_cost *cost_vec)
2717 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2718 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2720 /* Calculate the number of vector statements to be created for the
2721 scalar stmts in this node. For SLP reductions it is equal to the
2722 number of vector statements in the children (which has already been
2723 calculated by the recursive call). Otherwise it is the number of
2724 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2725 VF divided by the number of elements in a vector. */
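/* E.g. (illustrative) with DR_GROUP_SIZE == 4, VF == 8 and a vector
   type holding 8 elements, 4 * 8 / 8 == 4 vector statements are
   needed for this node.  */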
2726 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2727 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2729 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2730 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2732 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2733 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2734 break;
2737 else
2739 poly_uint64 vf;
2740 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2741 vf = loop_vinfo->vectorization_factor;
2742 else
2743 vf = 1;
2744 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2745 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2746 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2747 = vect_get_num_vectors (vf * group_size, vectype);
2750 bool dummy;
2751 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2752 node, node_instance, cost_vec);
2755 /* Try to build NODE from scalars, returning true on success.
2756 NODE_INSTANCE is the SLP instance that contains NODE. */
2758 static bool
2759 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2760 slp_instance node_instance)
2762 stmt_vec_info stmt_info;
2763 unsigned int i;
2765 if (!is_a <bb_vec_info> (vinfo)
2766 || node == SLP_INSTANCE_TREE (node_instance)
2767 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2768 return false;
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "Building vector operands from scalars instead\n");
2774 /* Don't remove and free the child nodes here, since they could be
2775 referenced by other structures. The analysis and scheduling phases
2776 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2777 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2778 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2779 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2780 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2782 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2783 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2785 return true;
2788 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2789 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2791 Return true if the operations are supported. */
2793 static bool
2794 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2795 slp_instance node_instance,
2796 hash_set<slp_tree> &visited,
2797 hash_set<slp_tree> &lvisited,
2798 stmt_vector_for_cost *cost_vec)
2800 int i, j;
2801 slp_tree child;
2803 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2804 return true;
2806 /* If we already analyzed the exact same set of scalar stmts we're done.
2807 We share the generated vector stmts for those.
2808 The SLP graph is acyclic, so not caching whether we failed or
2809 succeeded causes no issue: we simply throw away the lvisited set
2810 when we fail. */
2811 if (visited.contains (node)
2812 || lvisited.add (node))
2813 return true;
2815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2816 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2817 visited, lvisited, cost_vec))
2818 return false;
2820 /* ??? We have to catch the case late where the same first scalar stmt
2821 appears in multiple SLP children with a different def type, and fail.
2822 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2823 necessarily match it when that is vect_internal_def. */
2824 auto_vec<vect_def_type, 4> dt;
2825 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2826 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2827 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2828 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2830 /* Push SLP node def-type to stmt operands. */
2831 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2832 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2833 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2834 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2835 = SLP_TREE_DEF_TYPE (child);
2837 /* Check everything worked out. */
2838 bool res = true;
2839 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2840 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2842 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2844 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2845 != SLP_TREE_DEF_TYPE (child))
2846 res = false;
2848 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2849 != dt[j])
2850 res = false;
2852 if (!res && dump_enabled_p ())
2853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2854 "not vectorized: same operand with different "
2855 "def type in stmt.\n");
2857 if (res)
2858 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2859 cost_vec);
2861 /* Restore def-types. */
2862 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2863 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2864 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2866 /* If this node can't be vectorized, try pruning the tree here rather
2867 than felling the whole thing. */
2868 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2869 res = true;
2871 return res;
2875 /* Analyze statements in SLP instances of VINFO. Return true if the
2876 operations are supported. */
2878 bool
2879 vect_slp_analyze_operations (vec_info *vinfo)
2881 slp_instance instance;
2882 int i;
2884 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2886 hash_set<slp_tree> visited;
2887 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2889 hash_set<slp_tree> lvisited;
2890 stmt_vector_for_cost cost_vec;
2891 cost_vec.create (2);
2892 if (!vect_slp_analyze_node_operations (vinfo,
2893 SLP_INSTANCE_TREE (instance),
2894 instance, visited, lvisited,
2895 &cost_vec)
2896 /* Instances with a root stmt require vectorized defs for the
2897 SLP tree root. */
2898 || (SLP_INSTANCE_ROOT_STMT (instance)
2899 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2900 != vect_internal_def)))
2902 slp_tree node = SLP_INSTANCE_TREE (instance);
2903 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2904 if (dump_enabled_p ())
2905 dump_printf_loc (MSG_NOTE, vect_location,
2906 "removing SLP instance operations starting from: %G",
2907 stmt_info->stmt);
2908 vect_free_slp_instance (instance, false);
2909 vinfo->slp_instances.ordered_remove (i);
2910 cost_vec.release ();
2912 else
2914 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2915 x != lvisited.end(); ++x)
2916 visited.add (*x);
2917 i++;
2919 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
2920 cost_vec.release ();
2924 return !vinfo->slp_instances.is_empty ();
2928 /* Compute the scalar cost of the SLP node NODE and its children
2929 and record it in COST_VEC. Do not account defs that are marked in
2930 LIFE and update LIFE according to uses of NODE. */
2932 static void
2933 vect_bb_slp_scalar_cost (vec_info *vinfo, basic_block bb,
2934 slp_tree node, vec<bool, va_heap> *life,
2935 stmt_vector_for_cost *cost_vec,
2936 hash_set<slp_tree> &visited)
2938 unsigned i;
2939 stmt_vec_info stmt_info;
2940 slp_tree child;
2942 if (visited.add (node))
2943 return;
2945 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2947 gimple *stmt = stmt_info->stmt;
2948 ssa_op_iter op_iter;
2949 def_operand_p def_p;
2951 if ((*life)[i])
2952 continue;
2954 /* If there is a non-vectorized use of the defs then the scalar
2955 stmt is kept live in which case we do not account it or any
2956 required defs in the SLP children in the scalar cost. This
2957 way we make the vectorization more costly when compared to
2958 the scalar cost. */
2959 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2961 imm_use_iterator use_iter;
2962 gimple *use_stmt;
2963 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2964 if (!is_gimple_debug (use_stmt))
2966 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2967 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2969 (*life)[i] = true;
2970 BREAK_FROM_IMM_USE_STMT (use_iter);
2974 if ((*life)[i])
2975 continue;
2977 /* Count scalar stmts only once. */
2978 if (gimple_visited_p (stmt))
2979 continue;
2980 gimple_set_visited (stmt, true);
2982 vect_cost_for_stmt kind;
2983 if (STMT_VINFO_DATA_REF (stmt_info))
2985 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2986 kind = scalar_load;
2987 else
2988 kind = scalar_store;
2990 else if (vect_nop_conversion_p (stmt_info))
2991 continue;
2992 else
2993 kind = scalar_stmt;
2994 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2997 auto_vec<bool, 20> subtree_life;
2998 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3000 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3002 /* Do not directly pass LIFE to the recursive call, copy it to
3003 confine changes in the callee to the current child/subtree. */
3004 subtree_life.safe_splice (*life);
3005 vect_bb_slp_scalar_cost (vinfo, bb, child, &subtree_life, cost_vec,
3006 visited);
3007 subtree_life.truncate (0);
3012 /* Check if vectorization of the basic block is profitable. */
3014 static bool
3015 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3017 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3018 slp_instance instance;
3019 int i;
3020 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3021 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3023 /* Calculate scalar cost. */
3024 stmt_vector_for_cost scalar_costs;
3025 scalar_costs.create (0);
3026 hash_set<slp_tree> visited;
3027 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3029 auto_vec<bool, 20> life;
3030 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3031 vect_bb_slp_scalar_cost (bb_vinfo, BB_VINFO_BB (bb_vinfo),
3032 SLP_INSTANCE_TREE (instance),
3033 &life, &scalar_costs, visited);
3035 void *target_cost_data = init_cost (NULL);
3036 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3037 scalar_costs.release ();
3038 unsigned dummy;
3039 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3040 destroy_cost_data (target_cost_data);
3042 /* Unset visited flag. */
3043 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3044 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3045 gimple_set_visited (gsi_stmt (gsi), false);
3047 /* Complete the target-specific cost calculation. */
3048 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3049 &vec_inside_cost, &vec_epilogue_cost);
3051 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3053 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3056 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3057 vec_inside_cost);
3058 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3059 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3060 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3063 /* Vectorization is profitable if its cost is no more than the cost of
3064 the scalar version. Note that we err on the vector side for equal
3065 cost because the cost estimate is otherwise quite pessimistic
3066 (constant uses are free on the scalar side but cost a load on the
3067 vector side, for example). */
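/* Numeric sketch: with vec_inside_cost == 6, vec_prologue_cost == 1,
   vec_epilogue_cost == 1 and scalar_cost == 8, the test 6 + 2 > 8 is
   false and the block is considered profitable to vectorize.  */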
3068 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3069 return false;
3071 return true;
3074 /* Find any vectorizable constructors and add them to the grouped_store
3075 array. */
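/* Only complete, non-uniform constructors of scalar elements qualify,
   e.g. the hypothetical

     vec_1 = {a_2, b_3, c_4, d_5};   four SSA names for a V4SI: ok
     vec_2 = {a_2, a_2, a_2, a_2};   uniform: skipped (uniform_vector_p)

   where the element count has to match TYPE_VECTOR_SUBPARTS of the
   constructed type.  */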
3077 static void
3078 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3080 gimple_stmt_iterator gsi;
3082 for (gsi = bb_vinfo->region_begin;
3083 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3085 gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
3086 if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
3087 continue;
3089 tree rhs = gimple_assign_rhs1 (stmt);
3090 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3091 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3092 CONSTRUCTOR_NELTS (rhs))
3093 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3094 || uniform_vector_p (rhs))
3095 continue;
3097 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3098 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3102 /* Check if the region described by BB_VINFO can be vectorized, returning
3103 true if so. When returning false, set FATAL to true if the same failure
3104 would prevent vectorization at other vector sizes, false if it is still
3105 worth trying other sizes. N_STMTS is the number of statements in the
3106 region. */
3108 static bool
3109 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3111 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3113 slp_instance instance;
3114 int i;
3115 poly_uint64 min_vf = 2;
3117 /* The first group of checks is independent of the vector size. */
3118 fatal = true;
3120 /* Analyze the data references. */
3122 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3124 if (dump_enabled_p ())
3125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3126 "not vectorized: unhandled data-ref in basic "
3127 "block.\n");
3128 return false;
3131 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3135 "not vectorized: not enough data-refs in "
3136 "basic block.\n");
3137 return false;
3140 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 "not vectorized: unhandled data access in "
3145 "basic block.\n");
3146 return false;
3149 vect_slp_check_for_constructors (bb_vinfo);
3151 /* If there are no grouped stores in the region there is no need
3152 to continue with pattern recog as vect_analyze_slp will fail
3153 anyway. */
3154 if (bb_vinfo->grouped_stores.is_empty ())
3156 if (dump_enabled_p ())
3157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3158 "not vectorized: no grouped stores in "
3159 "basic block.\n");
3160 return false;
3163 /* The rest of the analysis below depends on the vector size in some way. */
3164 fatal = false;
3166 vect_pattern_recog (bb_vinfo);
3168 /* Check the SLP opportunities in the basic block, analyze and build SLP
3169 trees. */
3170 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3172 if (dump_enabled_p ())
3174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3175 "Failed to SLP the basic block.\n");
3176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3177 "not vectorized: failed to find SLP opportunities "
3178 "in basic block.\n");
3180 return false;
3183 vect_record_base_alignments (bb_vinfo);
3185 /* Analyze and verify the alignment of data references and the
3186 dependence in the SLP instances. */
3187 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3189 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3190 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3192 slp_tree node = SLP_INSTANCE_TREE (instance);
3193 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3194 if (dump_enabled_p ())
3195 dump_printf_loc (MSG_NOTE, vect_location,
3196 "removing SLP instance operations starting from: %G",
3197 stmt_info->stmt);
3198 vect_free_slp_instance (instance, false);
3199 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3200 continue;
3203 /* Mark all the statements that we want to vectorize as pure SLP and
3204 relevant. */
3205 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3206 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3207 if (SLP_INSTANCE_ROOT_STMT (instance))
3208 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3210 i++;
3212 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3213 return false;
3215 if (!vect_slp_analyze_operations (bb_vinfo))
3217 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3219 "not vectorized: bad operation in basic block.\n");
3220 return false;
3223 /* Cost model: check if the vectorization is worthwhile. */
3224 if (!unlimited_cost_model (NULL)
3225 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3227 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229 "not vectorized: vectorization is not "
3230 "profitable.\n");
3231 return false;
3234 if (dump_enabled_p ())
3235 dump_printf_loc (MSG_NOTE, vect_location,
3236 "Basic block will be vectorized using SLP\n");
3237 return true;
3240 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3241 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3242 on success. The region has N_STMTS statements and has the datarefs
3243 given by DATAREFS. */
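/* Hypothetical run on a target offering, say, V16QI and V8QI: the first
   iteration autodetects a mode, and on analysis failure the loop below
   retries with the next vector mode from the target's list unless the
   failure was fatal or the next mode would provably give the same
   result.  */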
3245 static bool
3246 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3247 gimple_stmt_iterator region_end,
3248 vec<data_reference_p> datarefs,
3249 unsigned int n_stmts)
3251 bb_vec_info bb_vinfo;
3252 auto_vector_modes vector_modes;
3254 /* Autodetect first vector size we try. */
3255 machine_mode next_vector_mode = VOIDmode;
3256 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3257 unsigned int mode_i = 0;
3259 vec_info_shared shared;
3261 machine_mode autodetected_vector_mode = VOIDmode;
3262 while (1)
3264 bool vectorized = false;
3265 bool fatal = false;
3266 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3268 bool first_time_p = shared.datarefs.is_empty ();
3269 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3270 if (first_time_p)
3271 bb_vinfo->shared->save_datarefs ();
3272 else
3273 bb_vinfo->shared->check_datarefs ();
3274 bb_vinfo->vector_mode = next_vector_mode;
3276 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3277 && dbg_cnt (vect_slp))
3279 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "***** Analysis succeeded with vector mode"
3283 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3284 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3287 bb_vinfo->shared->check_datarefs ();
3288 vect_schedule_slp (bb_vinfo);
3290 unsigned HOST_WIDE_INT bytes;
3291 if (dump_enabled_p ())
3293 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3294 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3295 "basic block part vectorized using %wu byte "
3296 "vectors\n", bytes);
3297 else
3298 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3299 "basic block part vectorized using variable "
3300 "length vectors\n");
3303 vectorized = true;
3305 else
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_NOTE, vect_location,
3309 "***** Analysis failed with vector mode %s\n",
3310 GET_MODE_NAME (bb_vinfo->vector_mode));
3313 if (mode_i == 0)
3314 autodetected_vector_mode = bb_vinfo->vector_mode;
3316 if (!fatal)
3317 while (mode_i < vector_modes.length ()
3318 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_NOTE, vect_location,
3322 "***** The result for vector mode %s would"
3323 " be the same\n",
3324 GET_MODE_NAME (vector_modes[mode_i]));
3325 mode_i += 1;
3328 delete bb_vinfo;
3330 if (mode_i < vector_modes.length ()
3331 && VECTOR_MODE_P (autodetected_vector_mode)
3332 && (related_vector_mode (vector_modes[mode_i],
3333 GET_MODE_INNER (autodetected_vector_mode))
3334 == autodetected_vector_mode)
3335 && (related_vector_mode (autodetected_vector_mode,
3336 GET_MODE_INNER (vector_modes[mode_i]))
3337 == vector_modes[mode_i]))
3339 if (dump_enabled_p ())
3340 dump_printf_loc (MSG_NOTE, vect_location,
3341 "***** Skipping vector mode %s, which would"
3342 " repeat the analysis for %s\n",
3343 GET_MODE_NAME (vector_modes[mode_i]),
3344 GET_MODE_NAME (autodetected_vector_mode));
3345 mode_i += 1;
3348 if (vectorized
3349 || mode_i == vector_modes.length ()
3350 || autodetected_vector_mode == VOIDmode
3351 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3352 vector sizes will fail do not bother iterating. */
3353 || fatal)
3354 return vectorized;
3356 /* Try the next biggest vector size. */
3357 next_vector_mode = vector_modes[mode_i++];
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "***** Re-trying analysis with vector mode %s\n",
3361 GET_MODE_NAME (next_vector_mode));
3365 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
3366 true if anything in the basic block was vectorized. */
3368 bool
3369 vect_slp_bb (basic_block bb)
3371 gimple_stmt_iterator gsi;
3372 bool any_vectorized = false;
3374 gsi = gsi_start_bb (bb);
3375 while (!gsi_end_p (gsi))
3377 gimple_stmt_iterator region_begin = gsi;
3378 vec<data_reference_p> datarefs = vNULL;
3379 int insns = 0;
3381 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3383 gimple *stmt = gsi_stmt (gsi);
3384 if (is_gimple_debug (stmt))
3385 continue;
3386 insns++;
3388 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3389 vect_location = stmt;
3391 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3392 break;
3395 /* Skip leading unhandled stmts. */
3396 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3398 gsi_next (&gsi);
3399 continue;
3402 gimple_stmt_iterator region_end = gsi;
3404 if (insns > param_slp_max_insns_in_bb)
3406 if (dump_enabled_p ())
3407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3408 "not vectorized: too many instructions in "
3409 "basic block.\n");
3411 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3412 any_vectorized = true;
3414 if (gsi_end_p (region_end))
3415 break;
3417 /* Skip the unhandled stmt. */
3418 gsi_next (&gsi);
3421 return any_vectorized;
3425 /* Return true if the vector type of constant operand OP_NUM of STMT_VINFO is a boolean vector. */
3427 static bool
3428 vect_mask_constant_operand_p (vec_info *vinfo,
3429 stmt_vec_info stmt_vinfo, unsigned op_num)
3431 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3432 tree op, vectype;
3433 enum vect_def_type dt;
3435 /* For comparisons and COND_EXPR the type is chosen depending
3436 on the other, non-constant comparison operand. */
3437 if (TREE_CODE_CLASS (code) == tcc_comparison)
3439 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3440 op = gimple_assign_rhs1 (stmt);
3442 if (!vect_is_simple_use (op, vinfo, &dt, &vectype))
3443 gcc_unreachable ();
3445 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3448 if (code == COND_EXPR)
3450 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3451 tree cond = gimple_assign_rhs1 (stmt);
3453 if (TREE_CODE (cond) == SSA_NAME)
3455 if (op_num > 0)
3456 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3457 op = cond;
3459 else
3461 if (op_num > 1)
3462 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3463 op = TREE_OPERAND (cond, 0);
3466 if (!vect_is_simple_use (op, vinfo, &dt, &vectype))
3467 gcc_unreachable ();
3469 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3472 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3475 /* Build a variable-length vector in which the elements in ELTS are repeated
3476 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3477 RESULTS and add any new instructions to SEQ.
3479 The approach we use is:
3481 (1) Find a vector mode VM with integer elements of mode IM.
3483 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3484 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3485 from small vectors to IM.
3487 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3489 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3490 correct byte contents.
3492 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3494 We try to find the largest IM for which this sequence works, in order
3495 to cut down on the number of interleaves. */
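/* Worked sketch (illustrative): to build a variable-length VNx4SI
   {a, b, c, d, a, b, c, d, ...}, one can view the pairs E0 = (a, b) and
   E1 = (c, d) as DImode integers, duplicate each into a VNx2DI, and
   interleave:

     {E0, E0, ...}, {E1, E1, ...}  ->  {E0, E1, E0, E1, ...}

   which, view-converted back to VNx4SI, repeats a, b, c, d as
   required.  */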
3497 void
3498 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3499 vec<tree> elts, unsigned int nresults,
3500 vec<tree> &results)
3502 unsigned int nelts = elts.length ();
3503 tree element_type = TREE_TYPE (vector_type);
3505 /* (1) Find a vector mode VM with integer elements of mode IM. */
3506 unsigned int nvectors = 1;
3507 tree new_vector_type;
3508 tree permutes[2];
3509 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3510 &nvectors, &new_vector_type,
3511 permutes))
3512 gcc_unreachable ();
3514 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3515 unsigned int partial_nelts = nelts / nvectors;
3516 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3518 tree_vector_builder partial_elts;
3519 auto_vec<tree, 32> pieces (nvectors * 2);
3520 pieces.quick_grow (nvectors * 2);
3521 for (unsigned int i = 0; i < nvectors; ++i)
3523 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3524 ELTS' has mode IM. */
3525 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3526 for (unsigned int j = 0; j < partial_nelts; ++j)
3527 partial_elts.quick_push (elts[i * partial_nelts + j]);
3528 tree t = gimple_build_vector (seq, &partial_elts);
3529 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3530 TREE_TYPE (new_vector_type), t);
3532 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3533 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3536 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3537 correct byte contents.
3539 We need to repeat the following operation log2(nvectors) times:
3541 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3542 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3544 However, if each input repeats every N elements and the VF is
3545 a multiple of N * 2, the HI result is the same as the LO. */
3546 unsigned int in_start = 0;
3547 unsigned int out_start = nvectors;
3548 unsigned int hi_start = nvectors / 2;
3549 /* A bound on the number of outputs needed to produce NRESULTS results
3550 in the final iteration. */
3551 unsigned int noutputs_bound = nvectors * nresults;
3552 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3554 noutputs_bound /= 2;
3555 unsigned int limit = MIN (noutputs_bound, nvectors);
3556 for (unsigned int i = 0; i < limit; ++i)
3558 if ((i & 1) != 0
3559 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3560 2 * in_repeat))
3562 pieces[out_start + i] = pieces[out_start + i - 1];
3563 continue;
3566 tree output = make_ssa_name (new_vector_type);
3567 tree input1 = pieces[in_start + (i / 2)];
3568 tree input2 = pieces[in_start + (i / 2) + hi_start];
3569 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3570 input1, input2,
3571 permutes[i & 1]);
3572 gimple_seq_add_stmt (seq, stmt);
3573 pieces[out_start + i] = output;
3575 std::swap (in_start, out_start);
3578 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3579 results.reserve (nresults);
3580 for (unsigned int i = 0; i < nresults; ++i)
3581 if (i < nvectors)
3582 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3583 pieces[in_start + i]));
3584 else
3585 results.quick_push (results[i - nvectors]);
3589 /* For constant and loop invariant defs of SLP_NODE this function returns
3590 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3591 OP_NUM selects the child node containing the scalar
3592 operands. */
3594 static void
3595 vect_get_constant_vectors (vec_info *vinfo,
3596 slp_tree slp_node, unsigned op_num,
3597 vec<tree> *vec_oprnds)
3599 slp_tree op_node = SLP_TREE_CHILDREN (slp_node)[op_num];
3600 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3601 unsigned HOST_WIDE_INT nunits;
3602 tree vec_cst;
3603 unsigned j, number_of_places_left_in_vector;
3604 tree vector_type;
3605 tree vop;
3606 int group_size = op_node->ops.length ();
3607 unsigned int vec_num, i;
3608 unsigned number_of_copies = 1;
3609 bool constant_p;
3610 tree neutral_op = NULL;
3611 gimple_seq ctor_seq = NULL;
3612 auto_vec<tree, 16> permute_results;
3614 /* ??? SLP analysis should compute the vector type for the
3615 constant / invariant and store it in the SLP node. */
3616 tree op = op_node->ops[0];
3617 /* Check if vector type is a boolean vector. */
3618 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3619 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3620 && vect_mask_constant_operand_p (vinfo, stmt_vinfo, op_num))
3621 vector_type = truth_type_for (stmt_vectype);
3622 else
3623 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), op_node);
3625 /* ??? For lane-reducing ops we should also have the required number
3626 of vector stmts initialized rather than second-guessing here. */
3627 unsigned int number_of_vectors;
3628 if (is_gimple_assign (stmt_vinfo->stmt)
3629 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR
3630 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR
3631 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR))
3632 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3633 else
3634 number_of_vectors
3635 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3636 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3637 vector_type);
3638 vec_oprnds->create (number_of_vectors);
3639 auto_vec<tree> voprnds (number_of_vectors);
3641 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3642 created vectors. It is greater than 1 if unrolling is performed.
3644 For example, we have two scalar operands, s1 and s2 (e.g., group of
3645 strided accesses of size two), while NUNITS is four (i.e., four scalars
3646 of this type can be packed in a vector). The output vector will contain
3647 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3648 will be 2).
3650 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3651 containing the operands.
3653 For example, NUNITS is four as before, and the group size is 8
3654 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3655 {s5, s6, s7, s8}. */
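/* For illustration (added example, not in the original source): with the
   numbers above, GROUP_SIZE == 2, NUNITS == 4 and one vector to fill give
   NUMBER_OF_COPIES = 4 * 1 / 2 = 2, i.e. {s1, s2, s1, s2}; in the
   GROUP_SIZE == 8 case, two vectors give 4 * 2 / 8 = 1 copy of each
   scalar.  */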
3657 /* When using duplicate_and_interleave, we just need one element for
3658 each scalar statement. */
3659 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3660 nunits = group_size;
3662 number_of_copies = nunits * number_of_vectors / group_size;
3664 number_of_places_left_in_vector = nunits;
3665 constant_p = true;
3666 tree_vector_builder elts (vector_type, nunits, 1);
3667 elts.quick_grow (nunits);
3668 bool place_after_defs = false;
3669 for (j = 0; j < number_of_copies; j++)
3670 {
3671 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3672 {
3673 /* Create 'vect_ = {op0,op1,...,opn}'. */
3674 number_of_places_left_in_vector--;
3675 tree orig_op = op;
3676 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3677 {
3678 if (CONSTANT_CLASS_P (op))
3679 {
3680 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3681 {
3682 /* Can't use VIEW_CONVERT_EXPR for booleans because
3683 of possibly different sizes of scalar value and
3684 vector element. */
3685 if (integer_zerop (op))
3686 op = build_int_cst (TREE_TYPE (vector_type), 0);
3687 else if (integer_onep (op))
3688 op = build_all_ones_cst (TREE_TYPE (vector_type));
3689 else
3690 gcc_unreachable ();
3691 }
3692 else
3693 op = fold_unary (VIEW_CONVERT_EXPR,
3694 TREE_TYPE (vector_type), op);
3695 gcc_assert (op && CONSTANT_CLASS_P (op));
3696 }
3697 else
3698 {
3699 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3700 gimple *init_stmt;
3701 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3702 {
3703 tree true_val
3704 = build_all_ones_cst (TREE_TYPE (vector_type));
3705 tree false_val
3706 = build_zero_cst (TREE_TYPE (vector_type));
3707 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3708 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3709 op, true_val,
3710 false_val);
3711 }
3712 else
3713 {
3714 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3715 op);
3716 init_stmt
3717 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3718 op);
3719 }
3720 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3721 op = new_temp;
3722 }
3723 }
3724 elts[number_of_places_left_in_vector] = op;
3725 if (!CONSTANT_CLASS_P (op))
3726 constant_p = false;
3727 if (TREE_CODE (orig_op) == SSA_NAME
3728 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3729 && is_a <bb_vec_info> (vinfo)
3730 && (as_a <bb_vec_info> (vinfo)->bb
3731 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3732 place_after_defs = true;
3734 if (number_of_places_left_in_vector == 0)
3735 {
3736 if (constant_p
3737 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3738 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3739 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3740 else
3741 {
3742 if (permute_results.is_empty ())
3743 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3744 elts, number_of_vectors,
3745 permute_results);
3746 vec_cst = permute_results[number_of_vectors - j - 1];
3747 }
3748 tree init;
3749 gimple_stmt_iterator gsi;
3750 if (place_after_defs)
3751 {
3752 stmt_vec_info last_stmt_info
3753 = vect_find_last_scalar_stmt_in_slp (slp_node);
3754 gsi = gsi_for_stmt (last_stmt_info->stmt);
3755 init = vect_init_vector (vinfo, stmt_vinfo, vec_cst,
3756 vector_type, &gsi);
3757 }
3758 else
3759 init = vect_init_vector (vinfo, stmt_vinfo, vec_cst,
3760 vector_type, NULL);
3761 if (ctor_seq != NULL)
3762 {
3763 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3764 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3765 ctor_seq = NULL;
3766 }
3767 voprnds.quick_push (init);
3768 place_after_defs = false;
3769 number_of_places_left_in_vector = nunits;
3770 constant_p = true;
3771 elts.new_vector (vector_type, nunits, 1);
3772 elts.quick_grow (nunits);
3773 }
3774 }
3775 }
3777 /* Since the vectors are created in the reverse order, we should invert
3778 them. */
3779 vec_num = voprnds.length ();
3780 for (j = vec_num; j != 0; j--)
3781 {
3782 vop = voprnds[j - 1];
3783 vec_oprnds->quick_push (vop);
3784 }
3786 /* In case the VF is greater than the unrolling factor needed for the SLP
3787 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3788 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3789 to replicate the vectors. */
3790 while (number_of_vectors > vec_oprnds->length ())
3791 {
3792 tree neutral_vec = NULL;
3794 if (neutral_op)
3795 {
3796 if (!neutral_vec)
3797 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3799 vec_oprnds->quick_push (neutral_vec);
3800 }
3801 else
3802 {
3803 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3804 vec_oprnds->quick_push (vop);
3805 }
3806 }
3807 }
3810 /* Get vectorized definitions from SLP_NODE that contains corresponding
3811 vectorized def-stmts. */
3813 static void
3814 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3815 {
3816 stmt_vec_info vec_def_stmt_info;
3817 unsigned int i;
3819 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3821 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3822 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3823 }
3826 /* Get N vectorized definitions for SLP_NODE.
3827 If the scalar definitions are loop invariants or constants, collect them and
3828 call vect_get_constant_vectors() to create vector stmts.
3829 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3830 must be stored in the corresponding child of SLP_NODE, and we call
3831 vect_get_slp_vect_defs () to retrieve them. */
3833 void
3834 vect_get_slp_defs (vec_info *vinfo,
3835 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3836 {
3837 if (n == -1U)
3838 n = SLP_TREE_CHILDREN (slp_node).length ();
3840 for (unsigned i = 0; i < n; ++i)
3841 {
3842 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3844 vec<tree> vec_defs = vNULL;
3846 /* For each operand we check if it has vectorized definitions in a child
3847 node or we need to create them (for invariants and constants). */
3848 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3849 {
3850 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3851 vect_get_slp_vect_defs (child, &vec_defs);
3852 }
3853 else
3854 vect_get_constant_vectors (vinfo, slp_node, i, &vec_defs);
3856 vec_oprnds->quick_push (vec_defs);
3857 }
3858 }
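/* Usage sketch (an illustration added here; the caller shown is
   hypothetical, not from this file): a transform routine vectorizing a
   two-operand statement would collect the defs per operand roughly as

     vec<vec<tree> > vec_defs = vNULL;
     vec_defs.create (2);
     vect_get_slp_defs (vinfo, slp_node, &vec_defs, 2);
     vec<tree> vec_oprnds0 = vec_defs[0];
     vec<tree> vec_oprnds1 = vec_defs[1];

   with one entry in each inner vector per vector statement of the
   corresponding child.  */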
3860 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3861 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3862 permute statements for the SLP node NODE of the SLP instance
3863 SLP_NODE_INSTANCE. */
3865 bool
3866 vect_transform_slp_perm_load (vec_info *vinfo,
3867 slp_tree node, vec<tree> dr_chain,
3868 gimple_stmt_iterator *gsi, poly_uint64 vf,
3869 slp_instance slp_node_instance, bool analyze_only,
3870 unsigned *n_perms)
3871 {
3872 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3873 int vec_index = 0;
3874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3875 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3876 unsigned int mask_element;
3877 machine_mode mode;
3879 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3880 return false;
3882 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3884 mode = TYPE_MODE (vectype);
3885 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3887 /* Initialize the vect stmts of NODE to properly insert the generated
3888 stmts later. */
3889 if (! analyze_only)
3890 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3891 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3892 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3894 /* Generate permutation masks for every NODE. Number of masks for each NODE
3895 is equal to GROUP_SIZE.
3896 E.g., we have a group of three nodes with three loads from the same
3897 location in each node, and the vector size is 4. I.e., we have an
3898 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3899 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3900 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3901 ...
3903 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3904 The last mask is illegal since we assume two operands for a permute
3905 operation, and the mask element values can't be outside that range.
3906 Hence, the last mask must be converted into {2,5,5,5}.
3907 For the first two permutations we need the first and the second input
3908 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3909 we need the second and the third vectors: {b1,c1,a2,b2} and
3910 {c2,a3,b3,c3}. */
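/* Worked example for the index arithmetic below (illustration only, not
   in the original source): with CONST_NUNITS == 4, the scalar index i = 9
   from the comment above lives in vector i / 4 = 2 at lane i % 4 = 1.
   When vector 2 becomes the second permute input, the lane is biased by
   CONST_NUNITS, giving mask element 1 + 4 = 5 -- which is how {6,9,9,9}
   turns into {2,5,5,5}.  */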
3912 int vect_stmts_counter = 0;
3913 unsigned int index = 0;
3914 int first_vec_index = -1;
3915 int second_vec_index = -1;
3916 bool noop_p = true;
3917 *n_perms = 0;
3919 vec_perm_builder mask;
3920 unsigned int nelts_to_build;
3921 unsigned int nvectors_per_build;
3922 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3923 && multiple_p (nunits, group_size));
3924 if (repeating_p)
3925 {
3926 /* A single vector contains a whole number of copies of the node, so:
3927 (a) all permutes can use the same mask; and
3928 (b) the permutes only need a single vector input. */
3929 mask.new_vector (nunits, group_size, 3);
3930 nelts_to_build = mask.encoded_nelts ();
3931 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3932 }
3933 else
3934 {
3935 /* We need to construct a separate mask for each vector statement. */
3936 unsigned HOST_WIDE_INT const_nunits, const_vf;
3937 if (!nunits.is_constant (&const_nunits)
3938 || !vf.is_constant (&const_vf))
3939 return false;
3940 mask.new_vector (const_nunits, const_nunits, 1);
3941 nelts_to_build = const_vf * group_size;
3942 nvectors_per_build = 1;
3943 }
3945 unsigned int count = mask.encoded_nelts ();
3946 mask.quick_grow (count);
3947 vec_perm_indices indices;
3949 for (unsigned int j = 0; j < nelts_to_build; j++)
3950 {
3951 unsigned int iter_num = j / group_size;
3952 unsigned int stmt_num = j % group_size;
3953 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3954 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3955 if (repeating_p)
3956 {
3957 first_vec_index = 0;
3958 mask_element = i;
3959 }
3960 else
3961 {
3962 /* Enforced before the loop when !repeating_p. */
3963 unsigned int const_nunits = nunits.to_constant ();
3964 vec_index = i / const_nunits;
3965 mask_element = i % const_nunits;
3966 if (vec_index == first_vec_index
3967 || first_vec_index == -1)
3968 {
3969 first_vec_index = vec_index;
3970 }
3971 else if (vec_index == second_vec_index
3972 || second_vec_index == -1)
3973 {
3974 second_vec_index = vec_index;
3975 mask_element += const_nunits;
3976 }
3977 else
3978 {
3979 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3981 "permutation requires at "
3982 "least three vectors %G",
3983 stmt_info->stmt);
3984 gcc_assert (analyze_only);
3985 return false;
3986 }
3988 gcc_assert (mask_element < 2 * const_nunits);
3989 }
3991 if (mask_element != index)
3992 noop_p = false;
3993 mask[index++] = mask_element;
3995 if (index == count && !noop_p)
3996 {
3997 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3998 if (!can_vec_perm_const_p (mode, indices))
3999 {
4000 if (dump_enabled_p ())
4001 {
4002 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4003 vect_location,
4004 "unsupported vect permute { ");
4005 for (i = 0; i < count; ++i)
4006 {
4007 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4008 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4009 }
4010 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4011 }
4012 gcc_assert (analyze_only);
4013 return false;
4014 }
4016 ++*n_perms;
4017 }
4019 if (index == count)
4020 {
4021 if (!analyze_only)
4022 {
4023 tree mask_vec = NULL_TREE;
4025 if (! noop_p)
4026 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4028 if (second_vec_index == -1)
4029 second_vec_index = first_vec_index;
4031 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4032 {
4033 /* Generate the permute statement if necessary. */
4034 tree first_vec = dr_chain[first_vec_index + ri];
4035 tree second_vec = dr_chain[second_vec_index + ri];
4036 stmt_vec_info perm_stmt_info;
4037 if (! noop_p)
4038 {
4039 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4040 tree perm_dest
4041 = vect_create_destination_var (gimple_assign_lhs (stmt),
4042 vectype);
4043 perm_dest = make_ssa_name (perm_dest);
4044 gassign *perm_stmt
4045 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4046 first_vec, second_vec,
4047 mask_vec);
4048 perm_stmt_info
4049 = vect_finish_stmt_generation (vinfo,
4050 stmt_info, perm_stmt,
4051 gsi);
4052 }
4053 else
4054 /* If mask was NULL_TREE generate the requested
4055 identity transform. */
4056 perm_stmt_info = vinfo->lookup_def (first_vec);
4058 /* Store the vector statement in NODE. */
4059 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4060 = perm_stmt_info;
4061 }
4062 }
4064 index = 0;
4065 first_vec_index = -1;
4066 second_vec_index = -1;
4067 noop_p = true;
4068 }
4069 }
4071 return true;
4072 }
4074 /* Vectorize SLP instance tree in postorder. */
4076 static void
4077 vect_schedule_slp_instance (vec_info *vinfo,
4078 slp_tree node, slp_instance instance)
4079 {
4080 gimple_stmt_iterator si;
4081 stmt_vec_info stmt_info;
4082 unsigned int group_size;
4083 tree vectype;
4084 int i, j;
4085 slp_tree child;
4087 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4088 return;
4090 /* See if we have already vectorized the node in the graph of the
4091 SLP instance. */
4092 if (SLP_TREE_VEC_STMTS (node).exists ())
4093 return;
4095 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4096 vect_schedule_slp_instance (vinfo, child, instance);
4098 /* Push SLP node def-type to stmts. */
4099 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4100 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4101 {
4102 stmt_vec_info child_stmt_info;
4103 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4104 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4105 }
4107 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4109 /* VECTYPE is the type of the destination. */
4110 vectype = STMT_VINFO_VECTYPE (stmt_info);
4111 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4112 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4114 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4115 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4117 if (dump_enabled_p ())
4118 dump_printf_loc (MSG_NOTE, vect_location,
4119 "------>vectorizing SLP node starting from: %G",
4120 stmt_info->stmt);
4122 /* Vectorized stmts go before the last scalar stmt which is where
4123 all uses are ready. */
4124 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4125 si = gsi_for_stmt (last_stmt_info->stmt);
4127 /* Handle two-operation SLP nodes by vectorizing the group with
4128 both operations and then performing a merge. */
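/* Worked example (an illustration added here, not in the original
   source): for a group {+, -, +, -} with CONST_NUNITS == 4, MASK is
   {0, 1, 0, 1}, so the element selectors built below are
   mask[k] * 4 + l = {0, 5, 2, 7}: lanes 0 and 2 come from the "+"
   vector V0, lanes 1 and 3 from the "-" vector V1, i.e. the merge an
   addsub instruction would perform.  */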
4129 bool done_p = false;
4130 if (SLP_TREE_TWO_OPERATORS (node))
4131 {
4132 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4133 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4134 enum tree_code ocode = ERROR_MARK;
4135 stmt_vec_info ostmt_info;
4136 vec_perm_builder mask (group_size, group_size, 1);
4137 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4138 {
4139 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4140 if (gimple_assign_rhs_code (ostmt) != code0)
4141 {
4142 mask.quick_push (1);
4143 ocode = gimple_assign_rhs_code (ostmt);
4144 }
4145 else
4146 mask.quick_push (0);
4147 }
4148 if (ocode != ERROR_MARK)
4149 {
4150 vec<stmt_vec_info> v0;
4151 vec<stmt_vec_info> v1;
4152 unsigned j;
4153 tree tmask = NULL_TREE;
4154 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4155 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4156 SLP_TREE_VEC_STMTS (node).truncate (0);
4157 gimple_assign_set_rhs_code (stmt, ocode);
4158 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4159 gimple_assign_set_rhs_code (stmt, code0);
4160 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4161 SLP_TREE_VEC_STMTS (node).truncate (0);
4162 tree meltype = build_nonstandard_integer_type
4163 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4164 tree mvectype = get_same_sized_vectype (meltype, vectype);
4165 unsigned k = 0, l;
4166 for (j = 0; j < v0.length (); ++j)
4167 {
4168 /* Enforced by vect_build_slp_tree, which rejects variable-length
4169 vectors for SLP_TREE_TWO_OPERATORS. */
4170 unsigned int const_nunits = nunits.to_constant ();
4171 tree_vector_builder melts (mvectype, const_nunits, 1);
4172 for (l = 0; l < const_nunits; ++l)
4173 {
4174 if (k >= group_size)
4175 k = 0;
4176 tree t = build_int_cst (meltype,
4177 mask[k++] * const_nunits + l);
4178 melts.quick_push (t);
4179 }
4180 tmask = melts.build ();
4182 /* ??? Not all targets support a VEC_PERM_EXPR with a
4183 constant mask that would translate to a vec_merge RTX
4184 (with their vec_perm_const_ok). We can either not
4185 vectorize in that case or let veclower do its job.
4186 Unfortunately that isn't too great and at least for
4187 plus/minus we'd eventually like to match targets
4188 vector addsub instructions. */
4189 gimple *vstmt;
4190 vstmt = gimple_build_assign (make_ssa_name (vectype),
4191 VEC_PERM_EXPR,
4192 gimple_assign_lhs (v0[j]->stmt),
4193 gimple_assign_lhs (v1[j]->stmt),
4194 tmask);
4195 SLP_TREE_VEC_STMTS (node).quick_push
4196 (vect_finish_stmt_generation (vinfo, stmt_info, vstmt, &si));
4197 }
4198 v0.release ();
4199 v1.release ();
4200 done_p = true;
4201 }
4202 }
4203 if (!done_p)
4204 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4206 /* Restore stmt def-types. */
4207 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4208 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4209 {
4210 stmt_vec_info child_stmt_info;
4211 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4212 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4213 }
4214 }
4216 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4217 For loop vectorization this is done in vectorizable_call, but for SLP
4218 it needs to be deferred until end of vect_schedule_slp, because multiple
4219 SLP instances may refer to the same scalar stmt. */
4221 static void
4222 vect_remove_slp_scalar_calls (vec_info *vinfo,
4223 slp_tree node, hash_set<slp_tree> &visited)
4224 {
4225 gimple *new_stmt;
4226 gimple_stmt_iterator gsi;
4227 int i;
4228 slp_tree child;
4229 tree lhs;
4230 stmt_vec_info stmt_info;
4232 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4233 return;
4235 if (visited.add (node))
4236 return;
4238 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4239 vect_remove_slp_scalar_calls (vinfo, child, visited);
4241 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4242 {
4243 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4244 if (!stmt || gimple_bb (stmt) == NULL)
4245 continue;
4246 if (is_pattern_stmt_p (stmt_info)
4247 || !PURE_SLP_STMT (stmt_info))
4248 continue;
4249 lhs = gimple_call_lhs (stmt);
4250 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4251 gsi = gsi_for_stmt (stmt);
4252 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4253 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4254 }
4255 }
4257 static void
4258 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4259 {
4260 hash_set<slp_tree> visited;
4261 vect_remove_slp_scalar_calls (vinfo, node, visited);
4262 }
4264 /* Vectorize the instance root. */
4266 void
4267 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4268 {
4269 gassign *rstmt = NULL;
4271 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4272 {
4273 stmt_vec_info child_stmt_info;
4274 int j;
4276 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4277 {
4278 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4279 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4280 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4281 TREE_TYPE (vect_lhs)))
4282 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4283 vect_lhs);
4284 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4285 break;
4286 }
4287 }
4288 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4289 {
4290 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4291 stmt_vec_info child_stmt_info;
4292 int j;
4293 vec<constructor_elt, va_gc> *v;
4294 vec_alloc (v, nelts);
4296 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4297 {
4298 CONSTRUCTOR_APPEND_ELT (v,
4299 NULL_TREE,
4300 gimple_get_lhs (child_stmt_info->stmt));
4301 }
4302 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4303 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4304 tree r_constructor = build_constructor (rtype, v);
4305 rstmt = gimple_build_assign (lhs, r_constructor);
4306 }
4308 gcc_assert (rstmt);
4310 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4311 gsi_replace (&rgsi, rstmt, true);
4312 }
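/* For illustration (added example, not from the original source): for a
   BB-SLP root like x_1 = {a_1, b_1, c_1, d_1} vectorized into two
   half-width vector statements vect_0 and vect_1, the code above
   replaces the root with x_1 = {vect_0, vect_1}, a CONSTRUCTOR of the
   two vector defs; with a single vector statement it is a plain copy,
   via a VIEW_CONVERT_EXPR if the types differ.  */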
4314 /* Generate vector code for all SLP instances in the loop/basic block. */
4316 void
4317 vect_schedule_slp (vec_info *vinfo)
4318 {
4319 vec<slp_instance> slp_instances;
4320 slp_instance instance;
4321 unsigned int i;
4323 slp_instances = vinfo->slp_instances;
4324 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4325 {
4326 slp_tree node = SLP_INSTANCE_TREE (instance);
4327 /* Schedule the tree of INSTANCE. */
4328 vect_schedule_slp_instance (vinfo, node, instance);
4330 if (SLP_INSTANCE_ROOT_STMT (instance))
4331 vectorize_slp_instance_root_stmt (node, instance);
4333 if (dump_enabled_p ())
4334 dump_printf_loc (MSG_NOTE, vect_location,
4335 "vectorizing stmts using SLP.\n");
4338 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4340 slp_tree root = SLP_INSTANCE_TREE (instance);
4341 stmt_vec_info store_info;
4342 unsigned int j;
4344 /* Remove scalar call stmts. Do not do this for basic-block
4345 vectorization as not all uses may be vectorized.
4346 ??? Why should this be necessary? DCE should be able to
4347 remove the stmts itself.
4348 ??? For BB vectorization we can as well remove scalar
4349 stmts starting from the SLP tree root if they have no
4350 uses. */
4351 if (is_a <loop_vec_info> (vinfo))
4352 vect_remove_slp_scalar_calls (vinfo, root);
4354 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4355 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4356 {
4357 if (!STMT_VINFO_DATA_REF (store_info))
4358 break;
4360 if (SLP_INSTANCE_ROOT_STMT (instance))
4361 continue;
4363 store_info = vect_orig_stmt (store_info);
4364 /* Free the attached stmt_vec_info and remove the stmt. */
4365 vinfo->remove_stmt (store_info);
4366 }
4367 }
4368 }