/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}
/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  delete node;
}
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (root_stmt)
    return root_stmt->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
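
/* Illustrative example (not from the sources): for the two-lane group

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];

   two _slp_oprnd_info instances are created, one collecting the first
   operands of both lanes ({ b[0], b[1] }) and one collecting the second
   operands ({ c[0], c[1] }), together with their def-stmts and the
   def-type of the first lane.  */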
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
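
/* For instance (illustrative): in a contiguous chain over a[0], a[1],
   a[2] each non-leading element has DR_GROUP_GAP 1, so the returned
   place equals the element's index in the chain; if a[1] is not part
   of the group, a[2]'s DR_GROUP_GAP is 2 and its place is still 2.  */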
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
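
/* As a concrete illustration (not lifted from the sources): for a
   4-element vector mode the two selectors built above are
   { 0, 4, 1, 5 } and { 2, 6, 3, 7 }, i.e. the classic interleave-low
   and interleave-high permutes of two input vectors.  */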
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates that a swap is required for cond_expr stmts: SWAP is 1 if STMT
   is a cond and the operands of the comparison need to be swapped; SWAP is 2
   if STMT is a cond and the code of the comparison needs to be inverted.
   The function itself may additionally swap the operands of commutative
   stmts to make the group match.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
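
/* For example (illustrative): if the first stmt is A1 >= B1 ? X1 : Y1,
   a lane containing B2 <= A2 ? X2 : Y2 is handled with SWAP == 1 (the
   comparison operands are collected in swapped order) and a lane
   containing A2 < B2 ? Y2 : X2 with SWAP == 2 (the condition arms are
   collected in swapped order), as encoded by the maps[] table in the
   function below.  */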
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
	 could end up with a different comparison code from the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (swap != 0);
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in an SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;
  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* For commutative ops we should see whether we can make
		 the other operand match.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* Verify that the scalar stmts STMTS are isomorphic, do not require a
   data permutation, and are of supported operation types.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
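
/* A note on *TWO_OPERATORS (illustrative example, not from the
   sources): a group mixing PLUS_EXPR and MINUS_EXPR such as

     x0 = a0 + b0;
     x1 = a1 - b1;

   is still accepted; *TWO_OPERATORS is set and the caller emits both a
   vector add and a vector subtract and blends the lanes with a
   VEC_PERM_EXPR (see the two-operator handling in
   vect_build_slp_tree_2).  */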
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, statements
	 that may throw, or statements with volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}
      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: no optab.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (TREE_CODE (vec) != SSA_NAME
		  || !types_compatible_p (vectype, TREE_TYPE (vec)))
		{
		  if (is_a <bb_vec_info> (vinfo) && i != 0)
		    continue;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}
      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G",
				 stmt);

	      /* FORNOW: Non-grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
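
/* The map above memoizes both successful and failed SLP builds, keyed
   on the vector of scalar stmts: a shared subgraph (for example a
   computation feeding two different stores) is built only once and
   re-used with its reference count bumped, and a combination that
   already failed is not retried (see vect_build_slp_tree below).  */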
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size,
					bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation, or are of unsupported types of operation.  Otherwise
   return the built SLP tree; *MATCHES indicates which stmts matched
   the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
    nops = gimple_phi_num_args (phi);
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						  group_size);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, nops);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }
  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;
  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
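	  /* For instance (illustrative): an SLP group loading
	     { a[1], a[0] } from the interleaving chain starting at a[0]
	     gets the load permutation { 1, 0 }, while contiguous
	     in-order loads get the identity permutation
	     { 0, 1, ... }.  */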
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }
  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }
  swap = NULL;
  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* A COND_EXPR has one operand too many if the condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}
      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was not
	     more than a cutoff, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }
	  /* We do not undo the swapping here since it might still be
	     the better order for the second operand in case we build
	     the first one from scalars below.  */
	  ++*npermutes;
	}
fail:
      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  /* But if there's a leading vector sized set of matching stmts
	     fail here so we can split the group.  This matches the condition
	     vect_analyze_slp_instance uses.  */
	  /* ??? We might want to split here and combine the results to
	     support multiple vector sizes better.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      break;
	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Building vector operands from scalars\n");
	      this_tree_size++;
	      child = vect_create_new_slp_node (oprnd_info->ops);
	      children.safe_push (child);
	      oprnd_info->ops = vNULL;
	      oprnd_info->def_stmts = vNULL;
	      continue;
	    }
	}

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }
  vect_free_oprnd_info (oprnds_info);

  /* If all children of this node are built up from uniform scalars, or if
     building the node requires more than one possibly expensive vector
     construction, just throw the node away so it is built up from scalars
     instead.  The exception is the SLP node for the vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
	{
	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    all_uniform_p = false;
	  else if (!vect_slp_tree_uniform_p (child))
	    {
	      all_uniform_p = false;
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		n_vector_builds++;
	    }
	}
      if (all_uniform_p || n_vector_builds > 1)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }
  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
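      /* Illustrative example (not from the sources): for the group
	 { x0 = a0 + b0, x1 = a1 - b1 } the node ONE computes the vector
	 addition, TWO the vector subtraction, and the parent
	 VEC_PERM_EXPR node selects lane 0 from ONE and lane 1 from TWO
	 via the (operand, lane) pairs pushed below.  */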
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push
		(std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push
	      (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }
  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}
DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
1873 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
1875 static void
1876 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1877 hash_set<slp_tree> &visited)
1879 if (visited.add (node))
1880 return;
1882 if (SLP_TREE_CHILDREN (node).length () == 0)
1884 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1885 return;
1886 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1887 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1888 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1889 loads.safe_push (node);
1891 else
1893 unsigned i;
1894 slp_tree child;
1895 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1896 vect_gather_slp_loads (loads, child, visited);
1900 static void
1901 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1903 hash_set<slp_tree> visited;
1904 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1908 /* Find the last scalar stmt in SLP tree NODE. */
1910 stmt_vec_info
1911 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1913 stmt_vec_info last = NULL;
1914 stmt_vec_info stmt_vinfo;
1916 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1918 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1919 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1922 return last;
1925 /* Find the first scalar stmt in NODE. */
1927 stmt_vec_info
1928 vect_find_first_scalar_stmt_in_slp (slp_tree node)
1930 stmt_vec_info first = NULL;
1931 stmt_vec_info stmt_vinfo;
1933 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1935 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1936 if (!first
1937 || get_later_stmt (stmt_vinfo, first) == first)
1938 first = stmt_vinfo;
1941 return first;
1944 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1945 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1946 (also containing the first GROUP1_SIZE stmts, since stores are
1947 consecutive), the second containing the remainder.
1948 Return the first stmt in the second group. */
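/* As an illustrative example of the gap bookkeeping below: splitting a
 gap-less group of eight consecutive stores s0...s7 with GROUP1_SIZE == 6
 yields a first group { s0...s5 } whose DR_GROUP_GAP becomes 2 (it has to
 skip over the second group) and a second group { s6, s7 } whose
 DR_GROUP_GAP becomes 6 (it has to skip over the first group).  */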
1950 static stmt_vec_info
1951 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1953 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1954 gcc_assert (group1_size > 0);
1955 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1956 gcc_assert (group2_size > 0);
1957 DR_GROUP_SIZE (first_vinfo) = group1_size;
1959 stmt_vec_info stmt_info = first_vinfo;
1960 for (unsigned i = group1_size; i > 1; i--)
1962 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1963 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1965 /* STMT_INFO is now the last element of the first group. */
1966 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1967 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1969 DR_GROUP_SIZE (group2) = group2_size;
1970 for (stmt_info = group2; stmt_info;
1971 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1973 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1974 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1977 /* For the second group, the DR_GROUP_GAP is that before the original group,
1978 plus skipping over the first group. */
1979 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1981 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1982 DR_GROUP_GAP (first_vinfo) += group2_size;
1984 if (dump_enabled_p ())
1985 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1986 group1_size, group2_size);
1988 return group2;
1991 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1992 statements and a vector of NUNITS elements. */
1994 static poly_uint64
1995 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1997 return exact_div (common_multiple (nunits, group_size), group_size);
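/* For example (illustrative numbers only): with NUNITS == 4 and
 GROUP_SIZE == 6 the common multiple is 12, so the unrolling factor is
 12 / 6 == 2; with GROUP_SIZE == 8 it is 8 / 8 == 1.  */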
2000 /* Analyze an SLP instance starting from a group of grouped stores. Call
2001 vect_build_slp_tree to build a tree of packed stmts if possible.
2002 Return FALSE if it's impossible to SLP any stmt in the loop. */
2004 static bool
2005 vect_analyze_slp_instance (vec_info *vinfo,
2006 scalar_stmts_to_slp_tree_map_t *bst_map,
2007 stmt_vec_info stmt_info, unsigned max_tree_size)
2009 slp_instance new_instance;
2010 slp_tree node;
2011 unsigned int group_size;
2012 unsigned int i;
2013 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2014 vec<stmt_vec_info> scalar_stmts;
2015 bool constructor = false;
2017 if (is_a <bb_vec_info> (vinfo))
2018 vect_location = stmt_info->stmt;
2019 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2021 group_size = DR_GROUP_SIZE (stmt_info);
2023 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2025 gcc_assert (is_a <loop_vec_info> (vinfo));
2026 group_size = REDUC_GROUP_SIZE (stmt_info);
2028 else if (is_gimple_assign (stmt_info->stmt)
2029 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2031 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2032 constructor = true;
2034 else
2036 gcc_assert (is_a <loop_vec_info> (vinfo));
2037 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2040 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2041 scalar_stmts.create (group_size);
2042 stmt_vec_info next_info = stmt_info;
2043 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2045 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2046 while (next_info)
2048 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2049 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2052 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2054 /* Collect the reduction stmts and store them in
2055 SLP_TREE_SCALAR_STMTS. */
2056 while (next_info)
2058 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2059 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2061 /* Mark the first element of the reduction chain as reduction to properly
2062 transform the node. In the reduction analysis phase only the last
2063 element of the chain is marked as reduction. */
2064 STMT_VINFO_DEF_TYPE (stmt_info)
2065 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2066 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2067 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2069 else if (constructor)
2071 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2072 tree val;
2073 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2075 if (TREE_CODE (val) == SSA_NAME)
2077 gimple* def = SSA_NAME_DEF_STMT (val);
2078 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2079 /* Value is defined in another basic block. */
2080 if (!def_info)
2081 return false;
2082 def_info = vect_stmt_to_vectorize (def_info);
2083 scalar_stmts.safe_push (def_info);
2085 else
2086 return false;
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE, vect_location,
2090 "Analyzing vectorizable constructor: %G\n",
2091 stmt_info->stmt);
2093 else
2095 /* Collect reduction statements. */
2096 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2097 for (i = 0; reductions.iterate (i, &next_info); i++)
2098 scalar_stmts.safe_push (next_info);
2101 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "Starting SLP discovery for\n");
2105 for (i = 0; i < scalar_stmts.length (); ++i)
2106 dump_printf_loc (MSG_NOTE, vect_location,
2107 " %G", scalar_stmts[i]->stmt);
2110 /* Build the tree for the SLP instance. */
2111 bool *matches = XALLOCAVEC (bool, group_size);
2112 unsigned npermutes = 0;
2113 poly_uint64 max_nunits = 1;
2114 unsigned tree_size = 0;
2115 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2116 &max_nunits, matches, &npermutes,
2117 &tree_size, bst_map);
2118 if (node != NULL)
2120 /* Calculate the unrolling factor based on the smallest type. */
2121 poly_uint64 unrolling_factor
2122 = calculate_unrolling_factor (max_nunits, group_size);
2124 if (maybe_ne (unrolling_factor, 1U)
2125 && is_a <bb_vec_info> (vinfo))
2127 unsigned HOST_WIDE_INT const_max_nunits;
2128 if (!max_nunits.is_constant (&const_max_nunits)
2129 || const_max_nunits > group_size)
2131 if (dump_enabled_p ())
2132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133 "Build SLP failed: store group "
2134 "size not a multiple of the vector size "
2135 "in basic block SLP\n");
2136 vect_free_slp_tree (node);
2137 return false;
2139 /* Fatal mismatch. */
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_NOTE, vect_location,
2142 "SLP discovery succeeded but node needs "
2143 "splitting\n");
2144 memset (matches, true, group_size);
2145 matches[group_size / const_max_nunits * const_max_nunits] = false;
2146 vect_free_slp_tree (node);
2148 else
2150 /* Create a new SLP instance. */
2151 new_instance = XNEW (class _slp_instance);
2152 SLP_INSTANCE_TREE (new_instance) = node;
2153 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2154 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2155 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2156 new_instance->reduc_phis = NULL;
2157 new_instance->cost_vec = vNULL;
2158 new_instance->subgraph_entries = vNULL;
2160 vect_gather_slp_loads (new_instance, node);
2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_NOTE, vect_location,
2163 "SLP size %u vs. limit %u.\n",
2164 tree_size, max_tree_size);
2166 /* Check whether any load is possibly permuted. */
2167 slp_tree load_node;
2168 bool loads_permuted = false;
2169 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2171 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2172 continue;
2173 unsigned j;
2174 stmt_vec_info load_info;
2175 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2176 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2178 loads_permuted = true;
2179 break;
2183 /* If the loads and stores can be handled with load/store-lane
2184 instructions, do not generate this SLP instance. */
2185 if (is_a <loop_vec_info> (vinfo)
2186 && loads_permuted
2187 && dr
2188 && vect_store_lanes_supported
2189 (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false))
2191 slp_tree load_node;
2192 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2194 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2195 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2196 /* Use SLP for strided accesses (or if we can't load-lanes). */
2197 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2198 || ! vect_load_lanes_supported
2199 (STMT_VINFO_VECTYPE (stmt_vinfo),
2200 DR_GROUP_SIZE (stmt_vinfo), false))
2201 break;
2203 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2205 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 "Built SLP cancelled: can use "
2208 "load/store-lanes\n");
2209 vect_free_slp_instance (new_instance);
2210 return false;
2214 /* If this is a reduction chain with a conversion in front
2215 amend the SLP tree with a node for that. */
2216 if (!dr
2217 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2218 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2220 /* Get at the conversion stmt - we know it's the single use
2221 of the last stmt of the reduction chain. */
2222 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2223 use_operand_p use_p;
2224 gimple *use_stmt;
2225 bool r = single_imm_use (gimple_assign_lhs (tem),
2226 &use_p, &use_stmt);
2227 gcc_assert (r);
2228 next_info = vinfo->lookup_stmt (use_stmt);
2229 next_info = vect_stmt_to_vectorize (next_info);
2230 scalar_stmts = vNULL;
2231 scalar_stmts.create (group_size);
2232 for (unsigned i = 0; i < group_size; ++i)
2233 scalar_stmts.quick_push (next_info);
2234 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2235 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2236 SLP_TREE_CHILDREN (conv).quick_push (node);
2237 SLP_INSTANCE_TREE (new_instance) = conv;
2238 /* We also have to fake this conversion stmt as SLP reduction
2239 group so we don't have to mess with too much code
2240 elsewhere. */
2241 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2242 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2245 vinfo->slp_instances.safe_push (new_instance);
2247 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2248 the number of scalar stmts in the root in a few places.
2249 Verify that assumption holds. */
2250 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2251 .length () == group_size);
2253 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "Final SLP tree for instance:\n");
2257 vect_print_slp_graph (MSG_NOTE, vect_location,
2258 SLP_INSTANCE_TREE (new_instance));
2261 return true;
2264 else
2266 /* Failed to SLP. */
2267 /* Free the allocated memory. */
2268 scalar_stmts.release ();
2271 /* Try to break the group up into pieces. */
2272 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2273 && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
2275 for (i = 0; i < group_size; i++)
2276 if (!matches[i])
2277 break;
2279 /* For basic block SLP, try to break the group up into multiples of
2280 a vector size. */
2281 if (is_a <bb_vec_info> (vinfo)
2282 && (i > 1 && i < group_size))
2284 tree scalar_type
2285 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2286 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2287 least_bit_hwi (i));
2288 unsigned HOST_WIDE_INT const_nunits;
2289 if (vectype
2290 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2292 /* Split into two groups at the first vector boundary. */
2293 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2294 unsigned group1_size = i & ~(const_nunits - 1);
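/* E.g. a first mismatch at i == 5 with const_nunits == 4 gives
 group1_size == 5 & ~3 == 4.  */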
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "Splitting SLP group at stmt %u\n", i);
2299 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2300 group1_size);
2301 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2302 max_tree_size);
2303 /* If the first non-match was in the middle of a vector,
2304 skip the rest of that vector. Do not bother to re-analyze
2305 single stmt groups. */
2306 if (group1_size < i)
2308 i = group1_size + const_nunits;
2309 if (i + 1 < group_size)
2310 rest = vect_split_slp_store_group (rest, const_nunits);
2312 if (i + 1 < group_size)
2313 res |= vect_analyze_slp_instance (vinfo, bst_map,
2314 rest, max_tree_size);
2315 return res;
2319 /* For loop vectorization split into arbitrary pieces of size > 1. */
2320 if (is_a <loop_vec_info> (vinfo)
2321 && (i > 1 && i < group_size))
2323 unsigned group1_size = i;
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_NOTE, vect_location,
2327 "Splitting SLP group at stmt %u\n", i);
2329 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2330 group1_size);
2331 /* Loop vectorization cannot handle gaps in stores, make sure
2332 the split group appears as strided. */
2333 STMT_VINFO_STRIDED_P (rest) = 1;
2334 DR_GROUP_GAP (rest) = 0;
2335 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2336 DR_GROUP_GAP (stmt_info) = 0;
2338 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2339 max_tree_size);
2340 if (i + 1 < group_size)
2341 res |= vect_analyze_slp_instance (vinfo, bst_map,
2342 rest, max_tree_size);
2344 return res;
2347 /* Even though the first vector did not all match, we might be able to SLP
2348 (some) of the remainder. FORNOW ignore this possibility. */
2351 /* Failed to SLP. */
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2354 return false;
2357 /* Fill in backedge SLP children in the SLP graph. */
2359 static void
2360 vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node,
2361 scalar_stmts_to_slp_tree_map_t *bst_map,
2362 hash_set<slp_tree> &visited)
2364 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
2365 || visited.add (node))
2366 return;
2368 slp_tree child;
2369 unsigned i;
2370 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2371 if (child)
2372 vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
2374 /* Inductions are not vectorized by vectorizing their defining cycle
2375 but by materializing the values from SCEV data. */
2376 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (node))
2377 == vect_induction_def)
2378 return;
2380 if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
2381 for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
2383 auto_vec<stmt_vec_info, 64> stmts;
2384 unsigned j;
2385 stmt_vec_info phi_info;
2386 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info)
2388 tree def = gimple_phi_arg_def (as_a <gphi *>(phi_info->stmt), i);
2389 stmt_vec_info def_info = vinfo->lookup_def (def);
2390 if (!def_info)
2391 break;
2392 stmts.safe_push (vect_stmt_to_vectorize (def_info));
2394 if (j != SLP_TREE_LANES (node))
2395 continue;
2396 slp_tree *edge_def = bst_map->get (stmts);
2397 if (edge_def)
2399 /* ??? We're currently not recording non-backedge children
2400 of PHIs like external reduction initial values. Avoid
2401 NULL entries in SLP_TREE_CHILDREN for those and thus
2402 for now simply only record backedge defs at a
2403 SLP_TREE_CHILDREN index possibly not matching that of
2404 the corresponding PHI argument index. */
2405 SLP_TREE_CHILDREN (node).quick_push (*edge_def);
2406 (*edge_def)->refcnt++;
2411 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2412 trees of packed scalar stmts if SLP is possible. */
2414 opt_result
2415 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2417 unsigned int i;
2418 stmt_vec_info first_element;
2420 DUMP_VECT_SCOPE ("vect_analyze_slp");
2422 scalar_stmts_to_slp_tree_map_t *bst_map
2423 = new scalar_stmts_to_slp_tree_map_t ();
2425 /* Find SLP sequences starting from groups of grouped stores. */
2426 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2427 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2429 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2431 if (loop_vinfo->reduction_chains.length () > 0)
2433 /* Find SLP sequences starting from reduction chains. */
2434 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2435 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2436 max_tree_size))
2438 /* Dissolve reduction chain group. */
2439 stmt_vec_info vinfo = first_element;
2440 stmt_vec_info last = NULL;
2441 while (vinfo)
2443 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2444 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2445 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2446 last = vinfo;
2447 vinfo = next;
2449 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2450 /* It can be still vectorized as part of an SLP reduction. */
2451 loop_vinfo->reductions.safe_push (last);
2455 /* Find SLP sequences starting from groups of reductions. */
2456 if (loop_vinfo->reductions.length () > 1)
2457 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2458 max_tree_size);
2461 /* Fill in backedges. */
2462 slp_instance instance;
2463 hash_set<slp_tree> visited;
2464 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2465 vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance),
2466 bst_map, visited);
2468 /* The map keeps a reference on SLP nodes built, release that. */
2469 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2470 it != bst_map->end (); ++it)
2471 if ((*it).second)
2472 vect_free_slp_tree ((*it).second);
2473 delete bst_map;
2475 return opt_result::success ();
2478 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2480 static void
2481 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2482 vec<slp_tree> &vertices, vec<int> &leafs)
2484 unsigned i;
2485 slp_tree child;
2487 if (visited.add (node))
2488 return;
2490 node->vertex = vertices.length ();
2491 vertices.safe_push (node);
2492 if (SLP_TREE_CHILDREN (node).is_empty ())
2493 leafs.safe_push (node->vertex);
2494 else
2495 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2496 vect_slp_build_vertices (visited, child, vertices, leafs);
2499 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2501 static void
2502 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2503 vec<int> &leafs)
2505 hash_set<slp_tree> visited;
2506 unsigned i;
2507 slp_instance instance;
2508 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2509 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2510 leafs);
2513 /* Apply (reverse) bijective PERM to VEC. */
2515 template <class T>
2516 static void
2517 vect_slp_permute (vec<unsigned> perm,
2518 vec<T> &vec, bool reverse)
2520 auto_vec<T, 64> saved;
2521 saved.create (vec.length ());
2522 for (unsigned i = 0; i < vec.length (); ++i)
2523 saved.quick_push (vec[i]);
2525 if (reverse)
2527 for (unsigned i = 0; i < vec.length (); ++i)
2528 vec[perm[i]] = saved[i];
2529 for (unsigned i = 0; i < vec.length (); ++i)
2530 gcc_assert (vec[perm[i]] == saved[i]);
2532 else
2534 for (unsigned i = 0; i < vec.length (); ++i)
2535 vec[i] = saved[perm[i]];
2536 for (unsigned i = 0; i < vec.length (); ++i)
2537 gcc_assert (vec[i] == saved[perm[i]]);
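/* As a sketch of the semantics: with PERM == { 2, 0, 1 } the forward
 direction computes vec[i] = saved[perm[i]], so { a, b, c } becomes
 { c, a, b }; with REVERSE it computes vec[perm[i]] = saved[i], the
 inverse, so { a, b, c } becomes { b, c, a }.  */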
2541 /* Return whether permutations PERM_A and PERM_B as recorded in the
2542 PERMS vector are equal. */
2544 static bool
2545 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2546 int perm_a, int perm_b)
2548 return (perm_a == perm_b
2549 || (perms[perm_a].length () == perms[perm_b].length ()
2550 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2551 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2554 /* Optimize the SLP graph of VINFO. */
2556 void
2557 vect_optimize_slp (vec_info *vinfo)
2559 if (vinfo->slp_instances.is_empty ())
2560 return;
2562 slp_tree node;
2563 unsigned i;
2564 auto_vec<slp_tree> vertices;
2565 auto_vec<int> leafs;
2566 vect_slp_build_vertices (vinfo, vertices, leafs);
2568 struct graph *slpg = new_graph (vertices.length ());
2569 FOR_EACH_VEC_ELT (vertices, i, node)
2571 unsigned j;
2572 slp_tree child;
2573 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2574 add_edge (slpg, i, child->vertex);
2577 /* Compute (reverse) postorder on the inverted graph. */
2578 auto_vec<int> ipo;
2579 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2581 auto_sbitmap n_visited (vertices.length ());
2582 auto_sbitmap n_materialize (vertices.length ());
2583 auto_vec<int> n_perm (vertices.length ());
2584 auto_vec<vec<unsigned> > perms;
2586 bitmap_clear (n_visited);
2587 bitmap_clear (n_materialize);
2588 n_perm.quick_grow_cleared (vertices.length ());
2589 perms.safe_push (vNULL); /* zero is no permute */
2591 /* Produce initial permutations. */
2592 for (i = 0; i < leafs.length (); ++i)
2594 int idx = leafs[i];
2595 slp_tree node = vertices[idx];
2597 /* Handle externals and constants optimistically throughout the
2598 iteration. */
2599 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2600 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2601 continue;
2603 /* Loads are the only thing generating permutes and leafs do not
2604 change across iterations. */
2605 bitmap_set_bit (n_visited, idx);
2606 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2607 continue;
2609 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2610 node unpermuted, record this permute. */
2611 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2612 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2613 continue;
2614 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2615 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2616 bool any_permute = false;
2617 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2619 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2620 imin = MIN (imin, idx);
2621 imax = MAX (imax, idx);
2622 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2623 any_permute = true;
2625 /* If there's no permute no need to split one out. */
2626 if (!any_permute)
2627 continue;
2628 /* If the span doesn't match we'd disrupt VF computation, avoid
2629 that for now. */
2630 if (imax - imin + 1 != SLP_TREE_LANES (node))
2631 continue;
2633 /* For now only handle true permutes, like
2634 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2635 when permuting constants and invariants, keeping the permute
2636 bijective. */
2637 auto_sbitmap load_index (SLP_TREE_LANES (node));
2638 bitmap_clear (load_index);
2639 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2640 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2641 unsigned j;
2642 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2643 if (!bitmap_bit_p (load_index, j))
2644 break;
2645 if (j != SLP_TREE_LANES (node))
2646 continue;
2648 vec<unsigned> perm = vNULL;
2649 perm.safe_grow (SLP_TREE_LANES (node), true);
2650 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2651 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2652 perms.safe_push (perm);
2653 n_perm[idx] = perms.length () - 1;
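/* For example a load permutation { 5, 4, 7, 6 } on four lanes has
 imin == 4 and imax == 7, spans exactly the four lanes and is bijective
 after biasing by imin, so perm { 1, 0, 3, 2 } is recorded for it.  */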
2656 /* Propagate permutes along the graph and compute materialization points. */
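/* Informally: a node inherits a permutation when all of its children
 agree on one, and it has to materialize its permutation when one of its
 users ends up with a different one. That way a permute shared by all
 loads can bubble up the graph and be applied once instead of per load.  */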
2657 bool changed;
2658 unsigned iteration = 0;
2661 changed = false;
2662 ++iteration;
2664 for (i = vertices.length (); i > 0 ; --i)
2666 int idx = ipo[i-1];
2667 slp_tree node = vertices[idx];
2668 /* For leafs there's nothing to do - we've seeded permutes
2669 on those above. */
2670 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2671 continue;
2673 bitmap_set_bit (n_visited, idx);
2675 /* We cannot move a permute across a store. */
2676 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2677 && DR_IS_WRITE
2678 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2679 continue;
2681 int perm = -1;
2682 for (graph_edge *succ = slpg->vertices[idx].succ;
2683 succ; succ = succ->succ_next)
2685 int succ_idx = succ->dest;
2686 /* Handle unvisited nodes optimistically. */
2687 /* ??? But for constants once we want to handle non-bijective
2688 permutes we have to verify the permute, when unifying lanes,
2689 will not unify different constants. For example see
2690 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2691 if (!bitmap_bit_p (n_visited, succ_idx))
2692 continue;
2693 int succ_perm = n_perm[succ_idx];
2694 /* Once we materialize the successor's permutation its output lanes
2695 appear unpermuted to us. */
2696 if (bitmap_bit_p (n_materialize, succ_idx))
2697 succ_perm = 0;
2698 if (perm == -1)
2699 perm = succ_perm;
2700 else if (succ_perm == 0)
2702 perm = 0;
2703 break;
2705 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2707 perm = 0;
2708 break;
2712 if (perm == -1)
2713 /* Pick up pre-computed leaf values. */
2714 perm = n_perm[idx];
2715 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2717 if (iteration > 1)
2718 /* Make sure we eventually converge. */
2719 gcc_checking_assert (perm == 0);
2720 n_perm[idx] = perm;
2721 if (perm == 0)
2722 bitmap_clear_bit (n_materialize, idx);
2723 changed = true;
2726 if (perm == 0)
2727 continue;
2729 /* Elide pruning at materialization points in the first
2730 iteration so that every node is visited at least once. */
2731 if (iteration == 1)
2732 continue;
2734 /* Decide on permute materialization. Look whether there's
2735 a use (pred) edge that is permuted differently than us.
2736 In that case mark ourselves so the permutation is applied. */
2737 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2738 for (graph_edge *pred = slpg->vertices[idx].pred;
2739 pred; pred = pred->pred_next)
2741 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2742 int pred_perm = n_perm[pred->src];
2743 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2745 all_preds_permuted = false;
2746 break;
2749 if (!all_preds_permuted)
2751 if (!bitmap_bit_p (n_materialize, idx))
2752 changed = true;
2753 bitmap_set_bit (n_materialize, idx);
2757 while (changed || iteration == 1);
2759 /* Materialize. */
2760 for (i = 0; i < vertices.length (); ++i)
2762 int perm = n_perm[i];
2763 if (perm <= 0)
2764 continue;
2766 slp_tree node = vertices[i];
2768 /* First permute invariant/external original successors. */
2769 unsigned j;
2770 slp_tree child;
2771 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2773 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2774 continue;
2776 /* If the vector is uniform there's nothing to do. */
2777 if (vect_slp_tree_uniform_p (child))
2778 continue;
2780 /* We can end up sharing some externals via two_operator
2781 handling. Be prepared to unshare those. */
2782 if (child->refcnt != 1)
2784 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2785 SLP_TREE_CHILDREN (node)[j] = child
2786 = vect_create_new_slp_node
2787 (SLP_TREE_SCALAR_OPS (child).copy ());
2789 vect_slp_permute (perms[perm],
2790 SLP_TREE_SCALAR_OPS (child), true);
2793 if (bitmap_bit_p (n_materialize, i))
2795 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2796 /* For loads simply drop the permutation, the load permutation
2797 already performs the desired permutation. */
2799 else
2801 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "inserting permute node in place of %p\n",
2804 node);
2806 /* Make a copy of NODE and in-place change it to a
2807 VEC_PERM node to permute the lanes of the copy. */
2808 slp_tree copy = new _slp_tree;
2809 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2810 SLP_TREE_CHILDREN (node) = vNULL;
2811 SLP_TREE_SCALAR_STMTS (copy)
2812 = SLP_TREE_SCALAR_STMTS (node).copy ();
2813 vect_slp_permute (perms[perm],
2814 SLP_TREE_SCALAR_STMTS (copy), true);
2815 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2816 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2817 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2818 SLP_TREE_LANE_PERMUTATION (copy)
2819 = SLP_TREE_LANE_PERMUTATION (node);
2820 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2821 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2822 copy->refcnt = 1;
2823 copy->max_nunits = node->max_nunits;
2824 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2825 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2826 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
2828 /* Now turn NODE into a VEC_PERM. */
2829 SLP_TREE_CHILDREN (node).safe_push (copy);
2830 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
2831 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2832 SLP_TREE_LANE_PERMUTATION (node)
2833 .quick_push (std::make_pair (0, perms[perm][j]));
2834 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2837 else
2839 /* Apply the reverse permutation to our stmts. */
2840 vect_slp_permute (perms[perm],
2841 SLP_TREE_SCALAR_STMTS (node), true);
2842 /* And to the load permutation, which we can simply
2843 make regular by design. */
2844 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2846 /* ??? When we handle non-bijective permutes the idea
2847 is that we can force the load-permutation to be
2848 { min, min + 1, min + 2, ... max }. But then the
2849 scalar defs might no longer match the lane content
2850 which means wrong-code with live lane vectorization.
2851 So we possibly have to have NULL entries for those. */
2852 vect_slp_permute (perms[perm],
2853 SLP_TREE_LOAD_PERMUTATION (node), true);
2858 /* Free the perms vector used for propagation. */
2859 while (!perms.is_empty ())
2860 perms.pop ().release ();
2861 free_graph (slpg);
2864 /* Now elide load permutations that are not necessary. */
2865 for (i = 0; i < leafs.length (); ++i)
2867 node = vertices[i];
2868 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2869 continue;
2871 /* In basic block vectorization we allow any subchain of an interleaving
2872 chain.
2873 FORNOW: not in loop SLP because of realignment complications. */
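/* E.g. (illustrative) a load node for { b[2], b[3] } taken out of an
 interleaving chain b[0]...b[3] needs no load permutation in BB SLP
 because its lanes are consecutive group members.  */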
2874 if (is_a <bb_vec_info> (vinfo))
2876 bool subchain_p = true;
2877 stmt_vec_info next_load_info = NULL;
2878 stmt_vec_info load_info;
2879 unsigned j;
2880 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2882 if (j != 0
2883 && (next_load_info != load_info
2884 || DR_GROUP_GAP (load_info) != 1))
2886 subchain_p = false;
2887 break;
2889 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2891 if (subchain_p)
2893 SLP_TREE_LOAD_PERMUTATION (node).release ();
2894 continue;
2897 else
2899 stmt_vec_info load_info;
2900 bool this_load_permuted = false;
2901 unsigned j;
2902 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2903 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2905 this_load_permuted = true;
2906 break;
2908 stmt_vec_info first_stmt_info
2909 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2910 if (!this_load_permuted
2911 /* The load requires permutation when unrolling exposes
2912 a gap either because the group is larger than the SLP
2913 group-size or because there is a gap between the groups. */
2914 && (known_eq (LOOP_VINFO_VECT_FACTOR
2915 (as_a <loop_vec_info> (vinfo)), 1U)
2916 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2917 && DR_GROUP_GAP (first_stmt_info) == 0)))
2919 SLP_TREE_LOAD_PERMUTATION (node).release ();
2920 continue;
2927 /* For each possible SLP instance decide whether to SLP it and calculate overall
2928 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2929 least one instance. */
2931 bool
2932 vect_make_slp_decision (loop_vec_info loop_vinfo)
2934 unsigned int i;
2935 poly_uint64 unrolling_factor = 1;
2936 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2937 slp_instance instance;
2938 int decided_to_slp = 0;
2940 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2942 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2944 /* FORNOW: SLP if you can. */
2945 /* All unroll factors have the form:
2947 GET_MODE_SIZE (vinfo->vector_mode) * X
2949 for some rational X, so they must have a common multiple. */
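/* E.g. unrolling factors of 2 and 8 combine to a common multiple of 8.  */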
2950 unrolling_factor
2951 = force_common_multiple (unrolling_factor,
2952 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2954 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2955 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2956 loop-based vectorization. Such stmts will be marked as HYBRID. */
2957 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2958 decided_to_slp++;
2961 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2963 if (decided_to_slp && dump_enabled_p ())
2965 dump_printf_loc (MSG_NOTE, vect_location,
2966 "Decided to SLP %d instances. Unrolling factor ",
2967 decided_to_slp);
2968 dump_dec (MSG_NOTE, unrolling_factor);
2969 dump_printf (MSG_NOTE, "\n");
2972 return (decided_to_slp > 0);
2975 /* Private data for vect_detect_hybrid_slp. */
2976 struct vdhs_data
2978 loop_vec_info loop_vinfo;
2979 vec<stmt_vec_info> *worklist;
2982 /* Walker for walk_gimple_op. */
2984 static tree
2985 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2987 walk_stmt_info *wi = (walk_stmt_info *)data;
2988 vdhs_data *dat = (vdhs_data *)wi->info;
2990 if (wi->is_lhs)
2991 return NULL_TREE;
2993 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2994 if (!def_stmt_info)
2995 return NULL_TREE;
2996 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2997 if (PURE_SLP_STMT (def_stmt_info))
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3001 def_stmt_info->stmt);
3002 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3003 dat->worklist->safe_push (def_stmt_info);
3006 return NULL_TREE;
3009 /* Find stmts that must be both vectorized and SLPed. */
3011 void
3012 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3014 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3016 /* All stmts participating in SLP are marked pure_slp, all other
3017 stmts are loop_vect.
3018 First collect all loop_vect stmts into a worklist. */
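/* For example, if a pure-SLP statement computes a value that is also
 used by a statement vectorized in the classical loop-based way, the
 defining statement must be vectorized both ways and is marked hybrid.  */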
3019 auto_vec<stmt_vec_info> worklist;
3020 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3022 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3023 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3024 gsi_next (&gsi))
3026 gphi *phi = gsi.phi ();
3027 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3028 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3029 worklist.safe_push (stmt_info);
3031 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3032 gsi_next (&gsi))
3034 gimple *stmt = gsi_stmt (gsi);
3035 if (is_gimple_debug (stmt))
3036 continue;
3037 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3038 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3040 for (gimple_stmt_iterator gsi2
3041 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3042 !gsi_end_p (gsi2); gsi_next (&gsi2))
3044 stmt_vec_info patt_info
3045 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3046 if (!STMT_SLP_TYPE (patt_info)
3047 && STMT_VINFO_RELEVANT (patt_info))
3048 worklist.safe_push (patt_info);
3050 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3052 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3053 worklist.safe_push (stmt_info);
3057 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3058 mark any SLP vectorized stmt as hybrid.
3059 ??? We're visiting def stmts N times (once for each non-SLP and
3060 once for each hybrid-SLP use). */
3061 walk_stmt_info wi;
3062 vdhs_data dat;
3063 dat.worklist = &worklist;
3064 dat.loop_vinfo = loop_vinfo;
3065 memset (&wi, 0, sizeof (wi));
3066 wi.info = (void *)&dat;
3067 while (!worklist.is_empty ())
3069 stmt_vec_info stmt_info = worklist.pop ();
3070 /* Since SSA operands are not set up for pattern stmts we need
3071 to use walk_gimple_op. */
3072 wi.is_lhs = 0;
3073 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3078 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3080 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3081 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3083 for (unsigned i = 0; i < bbs.length (); ++i)
3085 if (i != 0)
3086 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3087 gsi_next (&si))
3089 gphi *phi = si.phi ();
3090 gimple_set_uid (phi, 0);
3091 add_stmt (phi);
3093 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3094 !gsi_end_p (gsi); gsi_next (&gsi))
3096 gimple *stmt = gsi_stmt (gsi);
3097 gimple_set_uid (stmt, 0);
3098 if (is_gimple_debug (stmt))
3099 continue;
3100 add_stmt (stmt);
3106 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3107 stmts in the basic blocks. */
3109 _bb_vec_info::~_bb_vec_info ()
3111 /* Reset region marker. */
3112 for (unsigned i = 0; i < bbs.length (); ++i)
3114 if (i != 0)
3115 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3116 gsi_next (&si))
3118 gphi *phi = si.phi ();
3119 gimple_set_uid (phi, -1);
3121 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3122 !gsi_end_p (gsi); gsi_next (&gsi))
3124 gimple *stmt = gsi_stmt (gsi);
3125 gimple_set_uid (stmt, -1);
3130 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3131 given that child nodes have already been processed, and that
3132 their def types currently match their SLP node's def type. */
3134 static bool
3135 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3136 slp_instance node_instance,
3137 stmt_vector_for_cost *cost_vec)
3139 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3140 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3142 /* Calculate the number of vector statements to be created for the
3143 scalar stmts in this node. For SLP reductions it is equal to the
3144 number of vector statements in the children (which has already been
3145 calculated by the recursive call). Otherwise it is the number of
3146 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3147 VF divided by the number of elements in a vector. */
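/* E.g. (illustrative): DR_GROUP_SIZE == 4, VF == 4 and 8-element
 vectors give 4 * 4 / 8 == 2 vector statements.  */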
3148 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3149 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3151 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3152 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3154 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3155 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3156 break;
3159 else
3161 poly_uint64 vf;
3162 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3163 vf = loop_vinfo->vectorization_factor;
3164 else
3165 vf = 1;
3166 unsigned int group_size = SLP_TREE_LANES (node);
3167 tree vectype = SLP_TREE_VECTYPE (node);
3168 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3169 = vect_get_num_vectors (vf * group_size, vectype);
3172 /* Handle purely internal nodes. */
3173 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3174 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3176 if (is_a <bb_vec_info> (vinfo)
3177 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3178 return false;
3180 bool dummy;
3181 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3182 node, node_instance, cost_vec);
3185 /* Try to build NODE from scalars, returning true on success.
3186 NODE_INSTANCE is the SLP instance that contains NODE. */
3188 static bool
3189 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3190 slp_instance node_instance)
3192 stmt_vec_info stmt_info;
3193 unsigned int i;
3195 if (!is_a <bb_vec_info> (vinfo)
3196 || node == SLP_INSTANCE_TREE (node_instance)
3197 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3198 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3199 return false;
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_NOTE, vect_location,
3203 "Building vector operands from scalars instead\n");
3205 /* Don't remove and free the child nodes here, since they could be
3206 referenced by other structures. The analysis and scheduling phases
3207 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3208 unsigned int group_size = SLP_TREE_LANES (node);
3209 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3210 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3211 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3213 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3214 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3216 return true;
3219 /* Compute the prologue cost for invariant or constant operands represented
3220 by NODE. */
3222 static void
3223 vect_prologue_cost_for_slp (slp_tree node,
3224 stmt_vector_for_cost *cost_vec)
3226 /* There's a special case of an existing vector, which costs nothing. */
3227 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3228 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3229 return;
3230 /* Without looking at the actual initializer a vector of
3231 constants can be implemented as load from the constant pool.
3232 When all elements are the same we can use a splat. */
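/* E.g. (illustrative): an external vector { x, x, x, x } is costed as a
 single scalar_to_vec (splat), { x, y, z, w } as one vec_construct, and
 a vector of constants as one vector_load from the constant pool.  */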
3233 tree vectype = SLP_TREE_VECTYPE (node);
3234 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3235 unsigned num_vects_to_check;
3236 unsigned HOST_WIDE_INT const_nunits;
3237 unsigned nelt_limit;
3238 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3239 && ! multiple_p (const_nunits, group_size))
3241 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3242 nelt_limit = const_nunits;
3244 else
3246 /* If either the vector has variable length or the vectors
3247 are composed of repeated whole groups we only need to
3248 cost construction once. All vectors will be the same. */
3249 num_vects_to_check = 1;
3250 nelt_limit = group_size;
3252 tree elt = NULL_TREE;
3253 unsigned nelt = 0;
3254 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3256 unsigned si = j % group_size;
3257 if (nelt == 0)
3258 elt = SLP_TREE_SCALAR_OPS (node)[si];
3259 /* ??? We're just tracking whether all operands of a single
3260 vector initializer are the same, ideally we'd check if
3261 we emitted the same one already. */
3262 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3263 elt = NULL_TREE;
3264 nelt++;
3265 if (nelt == nelt_limit)
3267 record_stmt_cost (cost_vec, 1,
3268 SLP_TREE_DEF_TYPE (node) == vect_external_def
3269 ? (elt ? scalar_to_vec : vec_construct)
3270 : vector_load,
3271 NULL, vectype, 0, vect_prologue);
3272 nelt = 0;
3277 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3278 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3280 Return true if the operations are supported. */
3282 static bool
3283 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3284 slp_instance node_instance,
3285 hash_set<slp_tree> &visited,
3286 hash_set<slp_tree> &lvisited,
3287 stmt_vector_for_cost *cost_vec)
3289 int i, j;
3290 slp_tree child;
3292 /* Assume we can code-generate all invariants. */
3293 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3294 return true;
3296 /* If we already analyzed the exact same set of scalar stmts we're done.
3297 We share the generated vector stmts for those. */
3298 if (visited.contains (node)
3299 || lvisited.add (node))
3300 return true;
3302 bool res = true;
3303 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3305 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3306 visited, lvisited, cost_vec);
3307 if (!res)
3308 break;
3311 if (res)
3312 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3313 cost_vec);
3314 if (!res)
3315 lvisited.remove (node);
3317 /* When the node can be vectorized, cost the invariant nodes it references.
3318 This is not done in DFS order to allow the referring node's
3319 vectorizable_* calls to nail down the invariant nodes' vector type
3320 and possibly unshare it if it needs a different vector type than
3321 other referrers. */
3322 if (res)
3323 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3324 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
3325 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3326 /* Perform usual caching, note code-generation still
3327 code-gens these nodes multiple times but we expect
3328 to CSE them later. */
3329 && !visited.contains (child)
3330 && !lvisited.add (child))
3332 /* ??? After auditing more code paths make a "default"
3333 and push the vector type from NODE to all children
3334 if it is not already set. */
3335 /* Compute the number of vectors to be generated. */
3336 tree vector_type = SLP_TREE_VECTYPE (child);
3337 if (!vector_type)
3339 /* For shifts with a scalar argument we don't need
3340 to cost or code-generate anything.
3341 ??? Represent this more explicitly. */
3342 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3343 == shift_vec_info_type)
3344 && j == 1);
3345 continue;
3347 unsigned group_size = SLP_TREE_LANES (child);
3348 poly_uint64 vf = 1;
3349 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3350 vf = loop_vinfo->vectorization_factor;
3351 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3352 = vect_get_num_vectors (vf * group_size, vector_type);
3353 /* And cost them. */
3354 vect_prologue_cost_for_slp (child, cost_vec);
3357 /* If this node or any of its children can't be vectorized, try pruning
3358 the tree here rather than felling the whole thing. */
3359 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3361 /* We'll need to revisit this for invariant costing and number
3362 of vectorized stmt setting. */
3363 res = true;
3366 return res;
3370 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3371 region and that can be vectorized using vectorizable_live_operation
3372 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3373 scalar code computing it to be retained. */
3375 static void
3376 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3377 slp_instance instance,
3378 stmt_vector_for_cost *cost_vec,
3379 hash_set<stmt_vec_info> &svisited)
3381 unsigned i;
3382 stmt_vec_info stmt_info;
3383 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3384 bool all_visited = true;
3385 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3387 if (svisited.contains (stmt_info))
3388 continue;
3389 all_visited = false;
3390 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3391 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3392 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3393 /* Only the pattern root stmt computes the original scalar value. */
3394 continue;
3395 bool mark_visited = true;
3396 gimple *orig_stmt = orig_stmt_info->stmt;
3397 ssa_op_iter op_iter;
3398 def_operand_p def_p;
3399 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3401 imm_use_iterator use_iter;
3402 gimple *use_stmt;
3403 stmt_vec_info use_stmt_info;
3404 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3405 if (!is_gimple_debug (use_stmt))
3407 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3408 if (!use_stmt_info
3409 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3411 STMT_VINFO_LIVE_P (stmt_info) = true;
3412 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3413 NULL, node, instance, i,
3414 false, cost_vec))
3415 /* ??? So we know we can vectorize the live stmt
3416 from one SLP node. If we cannot do so from all
3417 or none consistently we'd have to record which
3418 SLP node (and lane) we want to use for the live
3419 operation. So make sure we can code-generate
3420 from all nodes. */
3421 mark_visited = false;
3422 else
3423 STMT_VINFO_LIVE_P (stmt_info) = false;
3424 BREAK_FROM_IMM_USE_STMT (use_iter);
3427 /* We have to verify whether we can insert the lane extract
3428 before all uses. The following is a conservative approximation.
3429 We cannot put this into vectorizable_live_operation because
3430 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3431 doesn't work.
3432 Note that while the fact that we emit code for loads at the
3433 first load should make this a non-problem, leafs we construct
3434 from scalars are vectorized after the last scalar def.
3435 ??? If we'd actually compute the insert location during
3436 analysis we could use something less conservative than the last
3437 scalar stmt in the node for the dominance check. */
3438 /* ??? What remains is "live" uses in vector CTORs in the same
3439 SLP graph which is where those uses can end up code-generated
3440 right after their definition instead of close to their original
3441 use. But that would restrict us to code-generate lane-extracts
3442 from the latest stmt in a node. So we compensate for this
3443 during code-generation, simply not replacing uses for those
3444 hopefully rare cases. */
3445 if (STMT_VINFO_LIVE_P (stmt_info))
3446 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3447 if (!is_gimple_debug (use_stmt)
3448 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3449 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3450 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3452 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3454 "Cannot determine insertion place for "
3455 "lane extract\n");
3456 STMT_VINFO_LIVE_P (stmt_info) = false;
3457 mark_visited = true;
3460 if (mark_visited)
3461 svisited.add (stmt_info);
3463 if (all_visited)
3464 return;
3466 slp_tree child;
3467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3468 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3469 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3470 cost_vec, svisited);
3473 /* Analyze statements in SLP instances of VINFO. Return true if the
3474 operations are supported. */
3476 bool
3477 vect_slp_analyze_operations (vec_info *vinfo)
3479 slp_instance instance;
3480 int i;
3482 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3484 hash_set<slp_tree> visited;
3485 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3487 hash_set<slp_tree> lvisited;
3488 stmt_vector_for_cost cost_vec;
3489 cost_vec.create (2);
3490 if (is_a <bb_vec_info> (vinfo))
3491 vect_location = instance->location ();
3492 if (!vect_slp_analyze_node_operations (vinfo,
3493 SLP_INSTANCE_TREE (instance),
3494 instance, visited, lvisited,
3495 &cost_vec)
3496 /* Instances with a root stmt require vectorized defs for the
3497 SLP tree root. */
3498 || (SLP_INSTANCE_ROOT_STMT (instance)
3499 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3500 != vect_internal_def)))
3502 slp_tree node = SLP_INSTANCE_TREE (instance);
3503 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3504 if (dump_enabled_p ())
3505 dump_printf_loc (MSG_NOTE, vect_location,
3506 "removing SLP instance operations starting from: %G",
3507 stmt_info->stmt);
3508 vect_free_slp_instance (instance);
3509 vinfo->slp_instances.ordered_remove (i);
3510 cost_vec.release ();
3512 else
3514 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3515 x != lvisited.end(); ++x)
3516 visited.add (*x);
3517 i++;
3519 /* For BB vectorization remember the SLP graph entry
3520 cost for later. */
3521 if (is_a <bb_vec_info> (vinfo))
3522 instance->cost_vec = cost_vec;
3523 else
3525 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3526 cost_vec.release ();
3531 /* Compute vectorizable live stmts. */
3532 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3534 hash_set<stmt_vec_info> svisited;
3535 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3537 vect_location = instance->location ();
3538 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3539 instance, &instance->cost_vec, svisited);
3543 return !vinfo->slp_instances.is_empty ();
3546 /* Get the SLP instance leader from INSTANCE_LEADER, transitively closing
3547 any chain on the way (essentially union-find with path compression). */
3549 static slp_instance
3550 get_ultimate_leader (slp_instance instance,
3551 hash_map<slp_instance, slp_instance> &instance_leader)
3553 auto_vec<slp_instance *, 8> chain;
3554 slp_instance *tem;
3555 while (*(tem = instance_leader.get (instance)) != instance)
3557 chain.safe_push (tem);
3558 instance = *tem;
3560 while (!chain.is_empty ())
3561 *chain.pop () = instance;
3562 return instance;
3565 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3567 static void
3568 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3569 slp_instance instance, slp_tree node,
3570 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3571 hash_map<slp_instance, slp_instance> &instance_leader,
3572 hash_set<slp_tree> &visited)
3574 stmt_vec_info stmt_info;
3575 unsigned i;
3577 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3579 bool existed_p;
3580 slp_instance &stmt_instance
3581 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3582 if (!existed_p)
3584 else if (stmt_instance != instance)
3586 /* If we're running into a previously marked stmt make us the
3587 leader of the current ultimate leader. This keeps the
3588 leader chain acyclic and works even when the current instance
3589 connects two previously independent graph parts. */
3590 slp_instance stmt_leader
3591 = get_ultimate_leader (stmt_instance, instance_leader);
3592 if (stmt_leader != instance)
3593 instance_leader.put (stmt_leader, instance);
3595 stmt_instance = instance;
3598 if (visited.add (node))
3599 return;
3601 slp_tree child;
3602 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3603 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3604 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3605 instance_leader, visited);
3608 /* Partition the SLP graph into pieces that can be costed independently. */
3610 static void
3611 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3613 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3615 /* First walk the SLP graph assigning each involved scalar stmt a
3616 corresponding SLP graph entry and upon visiting a previously
3617 marked stmt, make the stmts leader the current SLP graph entry. */
3618 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3619 hash_map<slp_instance, slp_instance> instance_leader;
3620 hash_set<slp_tree> visited;
3621 slp_instance instance;
3622 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3624 instance_leader.put (instance, instance);
3625 vect_bb_partition_graph_r (bb_vinfo,
3626 instance, SLP_INSTANCE_TREE (instance),
3627 stmt_to_instance, instance_leader,
3628 visited);
3631 /* Then collect entries to each independent subgraph. */
3632 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3634 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3635 leader->subgraph_entries.safe_push (instance);
3636 if (dump_enabled_p ()
3637 && leader != instance)
3638 dump_printf_loc (MSG_NOTE, vect_location,
3639 "instance %p is leader of %p\n",
3640 leader, instance);
3644 /* Compute the scalar cost of the SLP node NODE and its children
3645 and record it in COST_VEC. Do not account defs that are marked in LIFE and
3646 update LIFE according to uses of NODE. */
3648 static void
3649 vect_bb_slp_scalar_cost (vec_info *vinfo,
3650 slp_tree node, vec<bool, va_heap> *life,
3651 stmt_vector_for_cost *cost_vec,
3652 hash_set<slp_tree> &visited)
3654 unsigned i;
3655 stmt_vec_info stmt_info;
3656 slp_tree child;
3658 if (visited.add (node))
3659 return;
3661 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3663 ssa_op_iter op_iter;
3664 def_operand_p def_p;
3666 if ((*life)[i])
3667 continue;
3669 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3670 gimple *orig_stmt = orig_stmt_info->stmt;
3672 /* If there is a non-vectorized use of the defs then the scalar
3673 stmt is kept live in which case we do not account it or any
3674 required defs in the SLP children in the scalar cost. This
3675 way we make the vectorization more costly when compared to
3676 the scalar cost. */
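/* E.g. if the def of s1 in an SLP group {s1, s2} also feeds a stmt that
   is not vectorized, s1 stays live: no scalar cost is accounted as saved
   for it, which makes vectorization comparatively more expensive
   (illustrative). */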
3677 if (!STMT_VINFO_LIVE_P (stmt_info))
3679 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3681 imm_use_iterator use_iter;
3682 gimple *use_stmt;
3683 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3684 if (!is_gimple_debug (use_stmt))
3686 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3687 if (!use_stmt_info
3688 || !PURE_SLP_STMT
3689 (vect_stmt_to_vectorize (use_stmt_info)))
3691 (*life)[i] = true;
3692 BREAK_FROM_IMM_USE_STMT (use_iter);
3696 if ((*life)[i])
3697 continue;
3700 /* Count scalar stmts only once. */
3701 if (gimple_visited_p (orig_stmt))
3702 continue;
3703 gimple_set_visited (orig_stmt, true);
3705 vect_cost_for_stmt kind;
3706 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3708 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3709 kind = scalar_load;
3710 else
3711 kind = scalar_store;
3713 else if (vect_nop_conversion_p (orig_stmt_info))
3714 continue;
3715 else
3716 kind = scalar_stmt;
3717 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3720 auto_vec<bool, 20> subtree_life;
3721 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3723 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3725 /* Do not directly pass LIFE to the recursive call, copy it to
3726 confine changes in the callee to the current child/subtree. */
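/* For a VEC_PERM node only the lanes actually referenced matter; e.g.
   with SLP_TREE_LANE_PERMUTATION { (0,1), (0,0) } on a single two-lane
   child the life state of node lane 0 is propagated to child lane 1
   and that of node lane 1 to child lane 0 (illustrative). */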
3727 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3729 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3730 for (unsigned j = 0;
3731 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3733 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3734 if (perm.first == i)
3735 subtree_life[perm.second] = (*life)[j];
3738 else
3740 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3741 subtree_life.safe_splice (*life);
3743 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3744 visited);
3745 subtree_life.truncate (0);
3750 /* Check if vectorization of the basic block is profitable for the
3751 subgraph denoted by SLP_INSTANCES. */
3753 static bool
3754 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3755 vec<slp_instance> slp_instances)
3757 slp_instance instance;
3758 int i;
3759 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3760 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3762 void *vect_target_cost_data = init_cost (NULL);
3764 /* Calculate scalar cost and sum the cost for the vector stmts
3765 previously collected. */
3766 stmt_vector_for_cost scalar_costs;
3767 scalar_costs.create (0);
3768 hash_set<slp_tree> visited;
3769 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3771 auto_vec<bool, 20> life;
3772 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3773 true);
3774 vect_bb_slp_scalar_cost (bb_vinfo,
3775 SLP_INSTANCE_TREE (instance),
3776 &life, &scalar_costs, visited);
3777 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3778 instance->cost_vec.release ();
3780 /* Unset visited flag. */
3781 stmt_info_for_cost *si;
3782 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3783 gimple_set_visited (si->stmt_info->stmt, false);
3785 void *scalar_target_cost_data = init_cost (NULL);
3786 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3787 scalar_costs.release ();
3788 unsigned dummy;
3789 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3790 destroy_cost_data (scalar_target_cost_data);
3792 /* Complete the target-specific vector cost calculation. */
3793 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3794 &vec_inside_cost, &vec_epilogue_cost);
3795 destroy_cost_data (vect_target_cost_data);
3797 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3799 if (dump_enabled_p ())
3801 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3802 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3803 vec_inside_cost);
3804 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3805 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3806 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3809 /* Vectorization is profitable if its cost is less than the cost of the
3810 scalar version. Note that we err on the vector side for equal cost because
3811 the cost estimate is otherwise quite pessimistic (constant uses are
3812 free on the scalar side but cost a load on the vector side for
3813 example). */
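/* E.g. with vec_inside_cost 3, vec_prologue_cost 1, vec_epilogue_cost 0
   and scalar_cost 4 the two sides compare equal and we do vectorize
   (illustrative numbers). */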
3814 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3815 return false;
3817 return true;
3820 /* Find any vectorizable constructors and add them to the grouped_stores
3821 array. */
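/* E.g. an assignment like
     v_5 = {_1, _2, _3, _4};
   with a four-lane vector LHS qualifies, while a uniform constructor
   such as {_1, _1, _1, _1} is skipped (illustrative GIMPLE). */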
3823 static void
3824 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3826 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
3827 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
3828 !gsi_end_p (gsi); gsi_next (&gsi))
3830 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
3831 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3832 continue;
3834 tree rhs = gimple_assign_rhs1 (assign);
3835 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3836 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3837 CONSTRUCTOR_NELTS (rhs))
3838 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3839 || uniform_vector_p (rhs))
3840 continue;
3842 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3843 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3847 /* Check if the region described by BB_VINFO can be vectorized, returning
3848 true if so. When returning false, set FATAL to true if the same failure
3849 would prevent vectorization at other vector sizes, false if it is still
3850 worth trying other sizes. N_STMTS is the number of statements in the
3851 region. */
3853 static bool
3854 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
3855 vec<int> *dataref_groups)
3857 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3859 slp_instance instance;
3860 int i;
3861 poly_uint64 min_vf = 2;
3863 /* The first group of checks is independent of the vector size. */
3864 fatal = true;
3866 /* Analyze the data references. */
3868 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3870 if (dump_enabled_p ())
3871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3872 "not vectorized: unhandled data-ref in basic "
3873 "block.\n");
3874 return false;
3877 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
3879 if (dump_enabled_p ())
3880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3881 "not vectorized: unhandled data access in "
3882 "basic block.\n");
3883 return false;
3886 vect_slp_check_for_constructors (bb_vinfo);
3888 /* If there are no grouped stores and no constructors in the region
3889 there is no need to continue with pattern recog as vect_analyze_slp
3890 will fail anyway. */
3891 if (bb_vinfo->grouped_stores.is_empty ())
3893 if (dump_enabled_p ())
3894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3895 "not vectorized: no grouped stores in "
3896 "basic block.\n");
3897 return false;
3900 /* The rest of the analysis below does depend on the vector size in some way. */
3901 fatal = false;
3903 vect_pattern_recog (bb_vinfo);
3905 /* Check the SLP opportunities in the basic block, analyze and build SLP
3906 trees. */
3907 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3909 if (dump_enabled_p ())
3911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3912 "Failed to SLP the basic block.\n");
3913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3914 "not vectorized: failed to find SLP opportunities "
3915 "in basic block.\n");
3917 return false;
3920 /* Optimize permutations. */
3921 vect_optimize_slp (bb_vinfo);
3923 vect_record_base_alignments (bb_vinfo);
3925 /* Analyze and verify the alignment of data references and the
3926 dependence in the SLP instances. */
3927 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3929 vect_location = instance->location ();
3930 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
3931 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3933 slp_tree node = SLP_INSTANCE_TREE (instance);
3934 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3935 if (dump_enabled_p ())
3936 dump_printf_loc (MSG_NOTE, vect_location,
3937 "removing SLP instance operations starting from: %G",
3938 stmt_info->stmt);
3939 vect_free_slp_instance (instance);
3940 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3941 continue;
3944 /* Mark all the statements that we want to vectorize as pure SLP and
3945 relevant. */
3946 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3947 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3948 if (SLP_INSTANCE_ROOT_STMT (instance))
3949 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3951 i++;
3953 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3954 return false;
3956 if (!vect_slp_analyze_operations (bb_vinfo))
3958 if (dump_enabled_p ())
3959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3960 "not vectorized: bad operation in basic block.\n");
3961 return false;
3964 vect_bb_partition_graph (bb_vinfo);
3966 return true;
3969 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
3970 basic blocks in BBS, returning true on success.
3971 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
3973 static bool
3974 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
3975 vec<int> *dataref_groups, unsigned int n_stmts)
3977 bb_vec_info bb_vinfo;
3978 auto_vector_modes vector_modes;
3980 /* Autodetect first vector size we try. */
3981 machine_mode next_vector_mode = VOIDmode;
3982 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
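/* E.g. on x86_64 with AVX2 the target typically suggests a list like
   { V32QImode, V16QImode, ... }; the exact modes and their order are
   entirely the target's choice (illustrative). */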
3983 unsigned int mode_i = 0;
3985 vec_info_shared shared;
3987 machine_mode autodetected_vector_mode = VOIDmode;
3988 while (1)
3990 bool vectorized = false;
3991 bool fatal = false;
3992 bb_vinfo = new _bb_vec_info (bbs, &shared);
3994 bool first_time_p = shared.datarefs.is_empty ();
3995 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3996 if (first_time_p)
3997 bb_vinfo->shared->save_datarefs ();
3998 else
3999 bb_vinfo->shared->check_datarefs ();
4000 bb_vinfo->vector_mode = next_vector_mode;
4002 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4003 && dbg_cnt (vect_slp))
4005 if (dump_enabled_p ())
4007 dump_printf_loc (MSG_NOTE, vect_location,
4008 "***** Analysis succeeded with vector mode"
4009 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4010 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4013 bb_vinfo->shared->check_datarefs ();
4015 unsigned i;
4016 slp_instance instance;
4017 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4019 if (instance->subgraph_entries.is_empty ())
4020 continue;
4022 vect_location = instance->location ();
4023 if (!unlimited_cost_model (NULL)
4024 && !vect_bb_vectorization_profitable_p
4025 (bb_vinfo, instance->subgraph_entries))
4027 if (dump_enabled_p ())
4028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4029 "not vectorized: vectorization is not "
4030 "profitable.\n");
4031 continue;
4034 if (!vectorized && dump_enabled_p ())
4035 dump_printf_loc (MSG_NOTE, vect_location,
4036 "Basic block will be vectorized "
4037 "using SLP\n");
4038 vectorized = true;
4040 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4042 unsigned HOST_WIDE_INT bytes;
4043 if (dump_enabled_p ())
4045 if (GET_MODE_SIZE
4046 (bb_vinfo->vector_mode).is_constant (&bytes))
4047 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4048 "basic block part vectorized using %wu "
4049 "byte vectors\n", bytes);
4050 else
4051 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4052 "basic block part vectorized using "
4053 "variable length vectors\n");
4057 else
4059 if (dump_enabled_p ())
4060 dump_printf_loc (MSG_NOTE, vect_location,
4061 "***** Analysis failed with vector mode %s\n",
4062 GET_MODE_NAME (bb_vinfo->vector_mode));
4065 if (mode_i == 0)
4066 autodetected_vector_mode = bb_vinfo->vector_mode;
4068 if (!fatal)
4069 while (mode_i < vector_modes.length ()
4070 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4072 if (dump_enabled_p ())
4073 dump_printf_loc (MSG_NOTE, vect_location,
4074 "***** The result for vector mode %s would"
4075 " be the same\n",
4076 GET_MODE_NAME (vector_modes[mode_i]));
4077 mode_i += 1;
4080 delete bb_vinfo;
4082 if (mode_i < vector_modes.length ()
4083 && VECTOR_MODE_P (autodetected_vector_mode)
4084 && (related_vector_mode (vector_modes[mode_i],
4085 GET_MODE_INNER (autodetected_vector_mode))
4086 == autodetected_vector_mode)
4087 && (related_vector_mode (autodetected_vector_mode,
4088 GET_MODE_INNER (vector_modes[mode_i]))
4089 == vector_modes[mode_i]))
4091 if (dump_enabled_p ())
4092 dump_printf_loc (MSG_NOTE, vect_location,
4093 "***** Skipping vector mode %s, which would"
4094 " repeat the analysis for %s\n",
4095 GET_MODE_NAME (vector_modes[mode_i]),
4096 GET_MODE_NAME (autodetected_vector_mode));
4097 mode_i += 1;
4100 if (vectorized
4101 || mode_i == vector_modes.length ()
4102 || autodetected_vector_mode == VOIDmode
4103 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4104 vector sizes will fail do not bother iterating. */
4105 || fatal)
4106 return vectorized;
4108 /* Try the next biggest vector size. */
4109 next_vector_mode = vector_modes[mode_i++];
4110 if (dump_enabled_p ())
4111 dump_printf_loc (MSG_NOTE, vect_location,
4112 "***** Re-trying analysis with vector mode %s\n",
4113 GET_MODE_NAME (next_vector_mode));
4118 /* Worker for the BB vectorizer: analyze and transform the basic blocks
4119 in BBS, returning true if anything in the region was vectorized. */
4121 static bool
4122 vect_slp_bbs (vec<basic_block> bbs)
4124 vec<data_reference_p> datarefs = vNULL;
4125 auto_vec<int> dataref_groups;
4126 int insns = 0;
4127 int current_group = 0;
4129 for (unsigned i = 0; i < bbs.length (); i++)
4131 basic_block bb = bbs[i];
4132 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4133 gsi_next (&gsi))
4135 gimple *stmt = gsi_stmt (gsi);
4136 if (is_gimple_debug (stmt))
4137 continue;
4139 insns++;
4141 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4142 vect_location = stmt;
4144 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4145 &dataref_groups, current_group))
4146 ++current_group;
4148 if (insns > param_slp_max_insns_in_bb)
4150 if (dump_enabled_p ())
4151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4152 "not vectorized: too many instructions in "
4153 "region.\n");
4158 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4161 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4162 true if anything in the basic-block was vectorized. */
4164 bool
4165 vect_slp_bb (basic_block bb)
4167 auto_vec<basic_block> bbs;
4168 bbs.safe_push (bb);
4169 return vect_slp_bbs (bbs);
4172 /* Main entry for the BB vectorizer. Analyze and transform FUN, returns
4173 true if anything in any basic-block was vectorized. */
4175 bool
4176 vect_slp_function (function *fun)
4178 bool r = false;
4179 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4180 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4182 /* For the moment split the function into pieces to avoid making
4183 the iteration on the vector mode moot. Split at points we know
4184 we do not handle well, namely CFG merges (SLP discovery doesn't
4185 handle non-loop-header PHIs) and loop exits. Since pattern
4186 recog requires reverse iteration to visit uses before defs,
4187 simply chop the RPO into pieces. */
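/* E.g. a diamond A -> {B, C} -> D is chopped into the regions
   { A, B, C } and { D } since D has two predecessors (illustrative). */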
4188 auto_vec<basic_block> bbs;
4189 for (unsigned i = 0; i < n; i++)
4191 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4193 /* Split when a basic block has multiple predecessors or when the
4194 edge into it exits a loop (because of implementation issues with
4195 respect to placement of CTORs for externals). */
4196 bool split = false;
4197 edge e;
4198 if (!single_pred_p (bb)
4199 || ((e = single_pred_edge (bb)),
4200 loop_exit_edge_p (e->src->loop_father, e)))
4201 split = true;
4202 /* Split when a BB is not dominated by the first block. */
4203 else if (!bbs.is_empty ()
4204 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4205 split = true;
4207 if (split && !bbs.is_empty ())
4209 r |= vect_slp_bbs (bbs);
4210 bbs.truncate (0);
4211 bbs.quick_push (bb);
4213 else
4214 bbs.safe_push (bb);
4216 /* When the block ends in a stmt altering control flow, inserting
4217 after it would require inserting on edges. Avoid this for now. */
4218 if (gimple *last = last_stmt (bb))
4219 if (is_ctrl_altering_stmt (last))
4221 r |= vect_slp_bbs (bbs);
4222 bbs.truncate (0);
4226 if (!bbs.is_empty ())
4227 r |= vect_slp_bbs (bbs);
4229 free (rpo);
4231 return r;
4234 /* Build a variable-length vector in which the elements in ELTS are repeated
4235 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4236 RESULTS and add any new instructions to SEQ.
4238 The approach we use is:
4240 (1) Find a vector mode VM with integer elements of mode IM.
4242 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4243 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4244 from small vectors to IM.
4246 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4248 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4249 correct byte contents.
4251 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4253 We try to find the largest IM for which this sequence works, in order
4254 to cut down on the number of interleaves. */
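/* For example, four SImode elements {a,b,c,d} can be viewed as two
   DImode elements ab and cd; duplicating each across a vector and
   applying one interleaving VEC_PERM_EXPR yields the byte image of
   {a,b,c,d,a,b,c,d,...} (illustrative sketch of steps (2)-(4)). */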
4256 void
4257 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4258 vec<tree> elts, unsigned int nresults,
4259 vec<tree> &results)
4261 unsigned int nelts = elts.length ();
4262 tree element_type = TREE_TYPE (vector_type);
4264 /* (1) Find a vector mode VM with integer elements of mode IM. */
4265 unsigned int nvectors = 1;
4266 tree new_vector_type;
4267 tree permutes[2];
4268 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4269 &nvectors, &new_vector_type,
4270 permutes))
4271 gcc_unreachable ();
4273 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4274 unsigned int partial_nelts = nelts / nvectors;
4275 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4277 tree_vector_builder partial_elts;
4278 auto_vec<tree, 32> pieces (nvectors * 2);
4279 pieces.quick_grow (nvectors * 2);
4280 for (unsigned int i = 0; i < nvectors; ++i)
4282 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4283 ELTS' has mode IM. */
4284 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4285 for (unsigned int j = 0; j < partial_nelts; ++j)
4286 partial_elts.quick_push (elts[i * partial_nelts + j]);
4287 tree t = gimple_build_vector (seq, &partial_elts);
4288 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4289 TREE_TYPE (new_vector_type), t);
4291 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4292 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4295 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4296 correct byte contents.
4298 We need to repeat the following operation log2(nvectors) times:
4300 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4301 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4303 However, if each input repeats every N elements and the VF is
4304 a multiple of N * 2, the HI result is the same as the LO. */
4305 unsigned int in_start = 0;
4306 unsigned int out_start = nvectors;
4307 unsigned int hi_start = nvectors / 2;
4308 /* A bound on the number of outputs needed to produce NRESULTS results
4309 in the final iteration. */
4310 unsigned int noutputs_bound = nvectors * nresults;
4311 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4313 noutputs_bound /= 2;
4314 unsigned int limit = MIN (noutputs_bound, nvectors);
4315 for (unsigned int i = 0; i < limit; ++i)
4317 if ((i & 1) != 0
4318 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4319 2 * in_repeat))
4321 pieces[out_start + i] = pieces[out_start + i - 1];
4322 continue;
4325 tree output = make_ssa_name (new_vector_type);
4326 tree input1 = pieces[in_start + (i / 2)];
4327 tree input2 = pieces[in_start + (i / 2) + hi_start];
4328 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4329 input1, input2,
4330 permutes[i & 1]);
4331 gimple_seq_add_stmt (seq, stmt);
4332 pieces[out_start + i] = output;
4334 std::swap (in_start, out_start);
4337 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4338 results.reserve (nresults);
4339 for (unsigned int i = 0; i < nresults; ++i)
4340 if (i < nvectors)
4341 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4342 pieces[in_start + i]));
4343 else
4344 results.quick_push (results[i - nvectors]);
4348 /* For constant and loop invariant defs in OP_NODE this function creates
4349 vector defs that will be used in the vectorized stmts and stores them
4350 to SLP_TREE_VEC_DEFS of OP_NODE. */
4352 static void
4353 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4355 unsigned HOST_WIDE_INT nunits;
4356 tree vec_cst;
4357 unsigned j, number_of_places_left_in_vector;
4358 tree vector_type;
4359 tree vop;
4360 int group_size = op_node->ops.length ();
4361 unsigned int vec_num, i;
4362 unsigned number_of_copies = 1;
4363 bool constant_p;
4364 gimple_seq ctor_seq = NULL;
4365 auto_vec<tree, 16> permute_results;
4367 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4368 vector_type = SLP_TREE_VECTYPE (op_node);
4370 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4371 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4372 auto_vec<tree> voprnds (number_of_vectors);
4374 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4375 created vectors. It is greater than 1 if unrolling is performed.
4377 For example, we have two scalar operands, s1 and s2 (e.g., group of
4378 strided accesses of size two), while NUNITS is four (i.e., four scalars
4379 of this type can be packed in a vector). The output vector will contain
4380 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4381 will be 2).
4383 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4384 containing the operands.
4386 For example, NUNITS is four as before, and the group size is 8
4387 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4388 {s5, s6, s7, s8}. */
4390 /* When using duplicate_and_interleave, we just need one element for
4391 each scalar statement. */
4392 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4393 nunits = group_size;
4395 number_of_copies = nunits * number_of_vectors / group_size;
4397 number_of_places_left_in_vector = nunits;
4398 constant_p = true;
4399 tree_vector_builder elts (vector_type, nunits, 1);
4400 elts.quick_grow (nunits);
4401 stmt_vec_info insert_after = NULL;
4402 for (j = 0; j < number_of_copies; j++)
4404 tree op;
4405 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4407 /* Create 'vect_ = {op0,op1,...,opn}'. */
4408 number_of_places_left_in_vector--;
4409 tree orig_op = op;
4410 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4412 if (CONSTANT_CLASS_P (op))
4414 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4416 /* Can't use VIEW_CONVERT_EXPR for booleans because
4417 of possibly different sizes of scalar value and
4418 vector element. */
4419 if (integer_zerop (op))
4420 op = build_int_cst (TREE_TYPE (vector_type), 0);
4421 else if (integer_onep (op))
4422 op = build_all_ones_cst (TREE_TYPE (vector_type));
4423 else
4424 gcc_unreachable ();
4426 else
4427 op = fold_unary (VIEW_CONVERT_EXPR,
4428 TREE_TYPE (vector_type), op);
4429 gcc_assert (op && CONSTANT_CLASS_P (op));
4431 else
4433 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4434 gimple *init_stmt;
4435 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4437 tree true_val
4438 = build_all_ones_cst (TREE_TYPE (vector_type));
4439 tree false_val
4440 = build_zero_cst (TREE_TYPE (vector_type));
4441 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4442 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4443 op, true_val,
4444 false_val);
4446 else
4448 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4449 op);
4450 init_stmt
4451 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4452 op);
4454 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4455 op = new_temp;
4458 elts[number_of_places_left_in_vector] = op;
4459 if (!CONSTANT_CLASS_P (op))
4460 constant_p = false;
4461 /* For BB vectorization we have to compute an insert location
4462 when a def is inside the analyzed region since we cannot
4463 simply insert at the BB start in this case. */
4464 stmt_vec_info opdef;
4465 if (TREE_CODE (orig_op) == SSA_NAME
4466 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4467 && is_a <bb_vec_info> (vinfo)
4468 && (opdef = vinfo->lookup_def (orig_op)))
4470 if (!insert_after)
4471 insert_after = opdef;
4472 else
4473 insert_after = get_later_stmt (insert_after, opdef);
4476 if (number_of_places_left_in_vector == 0)
4478 if (constant_p
4479 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4480 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4481 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4482 else
4484 if (permute_results.is_empty ())
4485 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4486 elts, number_of_vectors,
4487 permute_results);
4488 vec_cst = permute_results[number_of_vectors - j - 1];
4490 if (!gimple_seq_empty_p (ctor_seq))
4492 if (insert_after)
4494 gimple_stmt_iterator gsi;
4495 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4497 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4498 gsi_insert_seq_before (&gsi, ctor_seq,
4499 GSI_CONTINUE_LINKING);
4501 else if (!stmt_ends_bb_p (insert_after->stmt))
4503 gsi = gsi_for_stmt (insert_after->stmt);
4504 gsi_insert_seq_after (&gsi, ctor_seq,
4505 GSI_CONTINUE_LINKING);
4507 else
4509 /* When we want to insert after a def whose
4510 defining stmt throws, insert on the fallthru
4511 edge. */
4512 edge e = find_fallthru_edge
4513 (gimple_bb (insert_after->stmt)->succs);
4514 basic_block new_bb
4515 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4516 gcc_assert (!new_bb);
4519 else
4520 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4521 ctor_seq = NULL;
4523 voprnds.quick_push (vec_cst);
4524 insert_after = NULL;
4525 number_of_places_left_in_vector = nunits;
4526 constant_p = true;
4527 elts.new_vector (vector_type, nunits, 1);
4528 elts.quick_grow (nunits);
4533 /* Since the vectors are created in the reverse order, we should invert
4534 them. */
4535 vec_num = voprnds.length ();
4536 for (j = vec_num; j != 0; j--)
4538 vop = voprnds[j - 1];
4539 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4542 /* In case VF is greater than the unrolling factor needed for the SLP
4543 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
4544 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4545 to replicate the vectors. */
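/* E.g. two created defs {v0, v1} with NUMBER_OF_VECTORS four are
   replicated to {v0, v1, v0, v1} (illustrative). */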
4546 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4547 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4548 i++)
4549 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4552 /* Get the Ith vectorized definition from SLP_NODE. */
4554 tree
4555 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4557 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4558 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4559 else
4560 return SLP_TREE_VEC_DEFS (slp_node)[i];
4563 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4565 void
4566 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4568 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4569 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4571 unsigned j;
4572 gimple *vec_def_stmt;
4573 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4574 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4576 else
4577 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4580 /* Get N vectorized definitions for SLP_NODE. */
4582 void
4583 vect_get_slp_defs (vec_info *,
4584 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4586 if (n == -1U)
4587 n = SLP_TREE_CHILDREN (slp_node).length ();
4589 for (unsigned i = 0; i < n; ++i)
4591 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4592 vec<tree> vec_defs = vNULL;
4593 vect_get_slp_defs (child, &vec_defs);
4594 vec_oprnds->quick_push (vec_defs);
4598 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4599 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4600 permute statements for the SLP node NODE. */
4602 bool
4603 vect_transform_slp_perm_load (vec_info *vinfo,
4604 slp_tree node, vec<tree> dr_chain,
4605 gimple_stmt_iterator *gsi, poly_uint64 vf,
4606 bool analyze_only, unsigned *n_perms)
4608 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4609 int vec_index = 0;
4610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4611 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4612 unsigned int mask_element;
4613 machine_mode mode;
4615 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4616 return false;
4618 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4620 mode = TYPE_MODE (vectype);
4621 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4623 /* Initialize the vect stmts of NODE to properly insert the generated
4624 stmts later. */
4625 if (! analyze_only)
4626 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4627 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4628 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4630 /* Generate permutation masks for every NODE. Number of masks for each NODE
4631 is equal to GROUP_SIZE.
4632 E.g., we have a group of three nodes with three loads from the same
4633 location in each node, and the vector size is 4. I.e., we have an
4634 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4635 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4636 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4639 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4640 The last mask is illegal since we assume two operands for permute
4641 operation, and the mask element values can't be outside that range.
4642 Hence, the last mask must be converted into {2,5,5,5}.
4643 For the first two permutations we need the first and the second input
4644 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4645 we need the second and the third vectors: {b1,c1,a2,b2} and
4646 {c2,a3,b3,c3}. */
4648 int vect_stmts_counter = 0;
4649 unsigned int index = 0;
4650 int first_vec_index = -1;
4651 int second_vec_index = -1;
4652 bool noop_p = true;
4653 *n_perms = 0;
4655 vec_perm_builder mask;
4656 unsigned int nelts_to_build;
4657 unsigned int nvectors_per_build;
4658 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4659 && multiple_p (nunits, group_size));
4660 if (repeating_p)
4662 /* A single vector contains a whole number of copies of the node, so:
4663 (a) all permutes can use the same mask; and
4664 (b) the permutes only need a single vector input. */
4665 mask.new_vector (nunits, group_size, 3);
4666 nelts_to_build = mask.encoded_nelts ();
4667 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4669 else
4671 /* We need to construct a separate mask for each vector statement. */
4672 unsigned HOST_WIDE_INT const_nunits, const_vf;
4673 if (!nunits.is_constant (&const_nunits)
4674 || !vf.is_constant (&const_vf))
4675 return false;
4676 mask.new_vector (const_nunits, const_nunits, 1);
4677 nelts_to_build = const_vf * group_size;
4678 nvectors_per_build = 1;
4681 unsigned int count = mask.encoded_nelts ();
4682 mask.quick_grow (count);
4683 vec_perm_indices indices;
4685 for (unsigned int j = 0; j < nelts_to_build; j++)
4687 unsigned int iter_num = j / group_size;
4688 unsigned int stmt_num = j % group_size;
4689 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4690 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4691 if (repeating_p)
4693 first_vec_index = 0;
4694 mask_element = i;
4696 else
4698 /* Enforced before the loop when !repeating_p. */
4699 unsigned int const_nunits = nunits.to_constant ();
4700 vec_index = i / const_nunits;
4701 mask_element = i % const_nunits;
4702 if (vec_index == first_vec_index
4703 || first_vec_index == -1)
4705 first_vec_index = vec_index;
4707 else if (vec_index == second_vec_index
4708 || second_vec_index == -1)
4710 second_vec_index = vec_index;
4711 mask_element += const_nunits;
4713 else
4715 if (dump_enabled_p ())
4716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4717 "permutation requires at "
4718 "least three vectors %G",
4719 stmt_info->stmt);
4720 gcc_assert (analyze_only);
4721 return false;
4724 gcc_assert (mask_element < 2 * const_nunits);
4727 if (mask_element != index)
4728 noop_p = false;
4729 mask[index++] = mask_element;
4731 if (index == count && !noop_p)
4733 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4734 if (!can_vec_perm_const_p (mode, indices))
4736 if (dump_enabled_p ())
4738 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4739 vect_location,
4740 "unsupported vect permute { ");
4741 for (i = 0; i < count; ++i)
4743 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4744 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4746 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4748 gcc_assert (analyze_only);
4749 return false;
4752 ++*n_perms;
4755 if (index == count)
4757 if (!analyze_only)
4759 tree mask_vec = NULL_TREE;
4761 if (! noop_p)
4762 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4764 if (second_vec_index == -1)
4765 second_vec_index = first_vec_index;
4767 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4769 /* Generate the permute statement if necessary. */
4770 tree first_vec = dr_chain[first_vec_index + ri];
4771 tree second_vec = dr_chain[second_vec_index + ri];
4772 gimple *perm_stmt;
4773 if (! noop_p)
4775 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4776 tree perm_dest
4777 = vect_create_destination_var (gimple_assign_lhs (stmt),
4778 vectype);
4779 perm_dest = make_ssa_name (perm_dest);
4780 perm_stmt
4781 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4782 first_vec, second_vec,
4783 mask_vec);
4784 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4785 gsi);
4787 else
4788 /* If mask was NULL_TREE generate the requested
4789 identity transform. */
4790 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4792 /* Store the vector statement in NODE. */
4793 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4797 index = 0;
4798 first_vec_index = -1;
4799 second_vec_index = -1;
4800 noop_p = true;
4804 return true;
4808 /* Vectorize the SLP permutations in NODE as specified
4809 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4810 child number and lane number.
4811 Interleaving of two two-lane two-child SLP subtrees (not supported):
4812 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4813 A blend of two four-lane two-child SLP subtrees:
4814 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4815 Highpart of a four-lane one-child SLP subtree (not supported):
4816 [ { 0, 2 }, { 0, 3 } ]
4817 Currently only a subset of these is supported by the code generated below. */
4819 static bool
4820 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4821 slp_tree node, stmt_vector_for_cost *cost_vec)
4823 tree vectype = SLP_TREE_VECTYPE (node);
4825 /* ??? We currently only support all same vector input and output types
4826 while the SLP IL should really do a concat + select and thus accept
4827 arbitrary mismatches. */
4828 slp_tree child;
4829 unsigned i;
4830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4831 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4833 if (dump_enabled_p ())
4834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4835 "Unsupported lane permutation\n");
4836 return false;
4839 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4840 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4841 if (dump_enabled_p ())
4843 dump_printf_loc (MSG_NOTE, vect_location,
4844 "vectorizing permutation");
4845 for (unsigned i = 0; i < perm.length (); ++i)
4846 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4847 dump_printf (MSG_NOTE, "\n");
4850 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4851 if (!nunits.is_constant ())
4852 return false;
4853 unsigned HOST_WIDE_INT vf = 1;
4854 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4855 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4856 return false;
4857 unsigned olanes = vf * SLP_TREE_LANES (node);
4858 gcc_assert (multiple_p (olanes, nunits));
4860 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
4861 from the { SLP operand, scalar lane } permutation recorded in the
4862 SLP node, as an intermediate step. This part should already work
4863 with SLP children with an arbitrary number of lanes. */
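/* E.g. the blend [ { 0, 0 }, { 1, 1 } ] of two single-vector two-lane
   children with VF 1 yields { { 0, 0 }, 0 }, { { 1, 0 }, 1 }
   (illustrative). */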
4864 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4865 auto_vec<unsigned> active_lane;
4866 vperm.create (olanes);
4867 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
4868 for (unsigned i = 0; i < vf; ++i)
4870 for (unsigned pi = 0; pi < perm.length (); ++pi)
4872 std::pair<unsigned, unsigned> p = perm[pi];
4873 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4874 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4875 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4876 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4877 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4879 /* Advance to the next group. */
4880 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4881 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4884 if (dump_enabled_p ())
4886 dump_printf_loc (MSG_NOTE, vect_location, "as");
4887 for (unsigned i = 0; i < vperm.length (); ++i)
4889 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4890 dump_printf (MSG_NOTE, ",");
4891 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4892 vperm[i].first.first, vperm[i].first.second,
4893 vperm[i].second);
4895 dump_printf (MSG_NOTE, "\n");
4898 /* We can only handle two-vector permutes, everything else should
4899 be lowered on the SLP level. The following is closely inspired
4900 by vect_transform_slp_perm_load and is supposed to eventually
4901 replace it.
4902 ??? As intermediate step do code-gen in the SLP tree representation
4903 somehow? */
4904 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4905 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4906 unsigned int const_nunits = nunits.to_constant ();
4907 unsigned int index = 0;
4908 unsigned int mask_element;
4909 vec_perm_builder mask;
4910 mask.new_vector (const_nunits, const_nunits, 1);
4911 unsigned int count = mask.encoded_nelts ();
4912 mask.quick_grow (count);
4913 vec_perm_indices indices;
4914 unsigned nperms = 0;
4915 for (unsigned i = 0; i < vperm.length (); ++i)
4917 mask_element = vperm[i].second;
4918 if (first_vec.first == -1U
4919 || first_vec == vperm[i].first)
4920 first_vec = vperm[i].first;
4921 else if (second_vec.first == -1U
4922 || second_vec == vperm[i].first)
4924 second_vec = vperm[i].first;
4925 mask_element += const_nunits;
4927 else
4929 if (dump_enabled_p ())
4930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4931 "permutation requires at "
4932 "least three vectors");
4933 gcc_assert (!gsi);
4934 return false;
4937 mask[index++] = mask_element;
4939 if (index == count)
4941 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4942 const_nunits);
4943 bool identity_p = indices.series_p (0, 1, 0, 1);
4944 if (!identity_p
4945 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4947 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4950 vect_location,
4951 "unsupported vect permute { ");
4952 for (i = 0; i < count; ++i)
4954 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4955 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4957 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4959 gcc_assert (!gsi);
4960 return false;
4963 if (!identity_p)
4964 nperms++;
4965 if (gsi)
4967 if (second_vec.first == -1U)
4968 second_vec = first_vec;
4970 /* Generate the permute statement if necessary. */
4971 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4972 tree first_def
4973 = vect_get_slp_vect_def (first_node, first_vec.second);
4974 gassign *perm_stmt;
4975 tree perm_dest = make_ssa_name (vectype);
4976 if (!identity_p)
4978 slp_tree second_node
4979 = SLP_TREE_CHILDREN (node)[second_vec.first];
4980 tree second_def
4981 = vect_get_slp_vect_def (second_node, second_vec.second);
4982 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4983 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4984 first_def, second_def,
4985 mask_vec);
4987 else
4988 /* We need a copy here in case the def was external. */
4989 perm_stmt = gimple_build_assign (perm_dest, first_def);
4990 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4991 /* Store the vector statement in NODE. */
4992 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4995 index = 0;
4996 first_vec = std::make_pair (-1U, -1U);
4997 second_vec = std::make_pair (-1U, -1U);
5001 if (!gsi)
5002 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5004 return true;
5007 /* Vectorize SLP instance tree in postorder. */
5009 static void
5010 vect_schedule_slp_instance (vec_info *vinfo,
5011 slp_tree node, slp_instance instance,
5012 hash_set<slp_tree> &visited)
5014 gimple_stmt_iterator si;
5015 int i;
5016 slp_tree child;
5018 /* See if we have already vectorized the node in the graph of the
5019 SLP instance. */
5020 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
5021 && SLP_TREE_VEC_STMTS (node).exists ())
5022 || SLP_TREE_VEC_DEFS (node).exists ())
5023 return;
5025 /* Vectorize externals and constants. */
5026 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5027 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5029 /* ??? vectorizable_shift can end up using a scalar operand which is
5030 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5031 node in this case. */
5032 if (!SLP_TREE_VECTYPE (node))
5033 return;
5035 vect_create_constant_vectors (vinfo, node);
5036 return;
5039 /* ??? If we'd have a way to mark backedges that would be cheaper. */
5040 if (visited.add (node))
5041 return;
5043 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5044 vect_schedule_slp_instance (vinfo, child, instance, visited);
5046 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5047 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5049 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5050 if (dump_enabled_p ())
5051 dump_printf_loc (MSG_NOTE, vect_location,
5052 "------>vectorizing SLP node starting from: %G",
5053 stmt_info->stmt);
5055 if (STMT_VINFO_DATA_REF (stmt_info)
5056 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5058 /* Vectorized loads go before the first scalar load to make it
5059 ready early, vectorized stores go before the last scalar
5060 stmt which is where all uses are ready. */
5061 stmt_vec_info last_stmt_info = NULL;
5062 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5063 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5064 else /* DR_IS_WRITE */
5065 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5066 si = gsi_for_stmt (last_stmt_info->stmt);
5068 else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
5069 == cycle_phi_info_type)
5070 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
5071 == induc_vec_info_type))
5073 /* For reduction and induction PHIs we do not use the
5074 insertion iterator. */
5075 si = gsi_none ();
5077 else
5079 /* Emit other stmts after the children vectorized defs which is
5080 earliest possible. */
5081 gimple *last_stmt = NULL;
5082 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5083 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5085 /* For fold-left reductions we are retaining the scalar
5086 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5087 set so the representation isn't perfect. Resort to the
5088 last scalar def here. */
5089 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5091 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5092 == cycle_phi_info_type);
5093 gphi *phi = as_a <gphi *>
5094 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5095 if (!last_stmt
5096 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5097 last_stmt = phi;
5099 /* We are emitting all vectorized stmts in the same place, so
5100 the stmt emitted last is also the last in dominance order.
5101 ??? Unless we have a load permutation applied and that
5102 happens to re-use an earlier generated load. */
5103 unsigned j;
5104 gimple *vstmt;
5105 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5106 if (!last_stmt
5107 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5108 last_stmt = vstmt;
5110 else if (!SLP_TREE_VECTYPE (child))
5112 /* For externals used unvectorized we look at all the scalar defs. */
5113 unsigned j;
5114 tree def;
5115 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5116 if (TREE_CODE (def) == SSA_NAME
5117 && !SSA_NAME_IS_DEFAULT_DEF (def))
5119 gimple *stmt = SSA_NAME_DEF_STMT (def);
5120 if (!last_stmt
5121 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5122 last_stmt = stmt;
5125 else
5127 /* For externals we have to look at all defs since their
5128 insertion place is decided per vector. But beware
5129 of pre-existing vectors where we need to make sure
5130 we do not insert before the region boundary. */
5131 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5132 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5133 last_stmt = gsi_stmt (gsi_after_labels
5134 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5135 else
5137 unsigned j;
5138 tree vdef;
5139 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5140 if (TREE_CODE (vdef) == SSA_NAME
5141 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5143 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5144 if (!last_stmt
5145 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5146 last_stmt = vstmt;
5150 /* This can happen when all children are pre-existing vectors or
5151 constants. */
5152 if (!last_stmt)
5153 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5154 if (is_a <gphi *> (last_stmt))
5155 si = gsi_after_labels (gimple_bb (last_stmt));
5156 else
5158 si = gsi_for_stmt (last_stmt);
5159 gsi_next (&si);
5163 bool done_p = false;
5165 /* Handle purely internal nodes. */
5166 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5168 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5169 be shared with different SLP nodes (but usually it's the same
5170 operation apart from the case the stmt is only there for denoting
5171 the actual scalar lane defs ...). So do not call vect_transform_stmt
5172 but open-code it here (partly). */
5173 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5174 gcc_assert (done);
5175 done_p = true;
5177 if (!done_p)
5178 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5181 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5182 For loop vectorization this is done in vectorizable_call, but for SLP
5183 it needs to be deferred until end of vect_schedule_slp, because multiple
5184 SLP instances may refer to the same scalar stmt. */
5186 static void
5187 vect_remove_slp_scalar_calls (vec_info *vinfo,
5188 slp_tree node, hash_set<slp_tree> &visited)
5190 gimple *new_stmt;
5191 gimple_stmt_iterator gsi;
5192 int i;
5193 slp_tree child;
5194 tree lhs;
5195 stmt_vec_info stmt_info;
5197 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5198 return;
5200 if (visited.add (node))
5201 return;
5203 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5204 vect_remove_slp_scalar_calls (vinfo, child, visited);
5206 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5208 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5209 if (!stmt || gimple_bb (stmt) == NULL)
5210 continue;
5211 if (is_pattern_stmt_p (stmt_info)
5212 || !PURE_SLP_STMT (stmt_info))
5213 continue;
5214 lhs = gimple_call_lhs (stmt);
5215 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5216 gsi = gsi_for_stmt (stmt);
5217 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5218 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5222 static void
5223 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5225 hash_set<slp_tree> visited;
5226 vect_remove_slp_scalar_calls (vinfo, node, visited);
5229 /* Vectorize the instance root. */
5231 void
5232 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5234 gassign *rstmt = NULL;
5236 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5238 gimple *child_stmt;
5239 int j;
5241 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5243 tree vect_lhs = gimple_get_lhs (child_stmt);
5244 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5245 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5246 TREE_TYPE (vect_lhs)))
5247 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5248 vect_lhs);
5249 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5250 break;
5253 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5255 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5256 gimple *child_stmt;
5257 int j;
5258 vec<constructor_elt, va_gc> *v;
5259 vec_alloc (v, nelts);
5261 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5263 CONSTRUCTOR_APPEND_ELT (v,
5264 NULL_TREE,
5265 gimple_get_lhs (child_stmt));
5267 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5268 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5269 tree r_constructor = build_constructor (rtype, v);
5270 rstmt = gimple_build_assign (lhs, r_constructor);
5273 gcc_assert (rstmt);
5275 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5276 gsi_replace (&rgsi, rstmt, true);
5279 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5281 void
5282 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5284 slp_instance instance;
5285 unsigned int i;
5287 hash_set<slp_tree> visited;
5288 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5290 slp_tree node = SLP_INSTANCE_TREE (instance);
5291 if (dump_enabled_p ())
5293 dump_printf_loc (MSG_NOTE, vect_location,
5294 "Vectorizing SLP tree:\n");
5295 if (SLP_INSTANCE_ROOT_STMT (instance))
5296 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5297 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5298 vect_print_slp_graph (MSG_NOTE, vect_location,
5299 SLP_INSTANCE_TREE (instance));
5301 /* Schedule the tree of INSTANCE. */
5302 vect_schedule_slp_instance (vinfo, node, instance, visited);
5304 if (SLP_INSTANCE_ROOT_STMT (instance))
5305 vectorize_slp_instance_root_stmt (node, instance);
5307 if (dump_enabled_p ())
5308 dump_printf_loc (MSG_NOTE, vect_location,
5309 "vectorizing stmts using SLP.\n");
5312 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5314 slp_tree root = SLP_INSTANCE_TREE (instance);
5315 stmt_vec_info store_info;
5316 unsigned int j;
5318 /* For reductions set the latch values of the vectorized PHIs. */
5319 if (instance->reduc_phis
5320 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
5321 (instance->reduc_phis)) != FOLD_LEFT_REDUCTION
5322 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
5323 (instance->reduc_phis)) != EXTRACT_LAST_REDUCTION)
5325 slp_tree slp_node = root;
5326 slp_tree phi_node = instance->reduc_phis;
5327 gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
5328 edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
5329 gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
5330 == SLP_TREE_VEC_STMTS (slp_node).length ());
5331 for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
5332 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]),
5333 vect_get_slp_vect_def (slp_node, j),
5334 e, gimple_phi_arg_location (phi, e->dest_idx));
5337 /* Remove scalar call stmts. Do not do this for basic-block
5338 vectorization as not all uses may be vectorized.
5339 ??? Why should this be necessary? DCE should be able to
5340 remove the stmts itself.
5341 ??? For BB vectorization we can as well remove scalar
5342 stmts starting from the SLP tree root if they have no
5343 uses. */
5344 if (is_a <loop_vec_info> (vinfo))
5345 vect_remove_slp_scalar_calls (vinfo, root);
5347 /* Remove the vectorized stores' original scalar stmts. */
5348 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5350 if (!STMT_VINFO_DATA_REF (store_info)
5351 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5352 break;
5354 store_info = vect_orig_stmt (store_info);
5355 /* Free the attached stmt_vec_info and remove the stmt. */
5356 vinfo->remove_stmt (store_info);