/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  delete node;
}
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (root_stmt)
    return root_stmt->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
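
/* For example, for the two-lane group { x0 = a0 + b0, x1 = a1 + b1 }
   two of these are built: oprnds_info[0] with ops == { a0, a1 } and
   oprnds_info[1] with ops == { b0, b1 }, each together with the
   corresponding def stmts.  */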

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */
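/* For instance, the constant lanes { 3, 3, 3, 3 } are uniform and can
   be code generated as a single splat, while { 3, 1, 4, 1 } are not.  */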

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */
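/* For example, for the interleaving chain a[0], a[1], a[3] (a gap of one
   element before a[3]) the returned places are 0, 1 and 3 respectively.  */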

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
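/* As an illustrative example, for COUNT == 4 and a vector mode with four
   elements the two permutations built below are the interleave-low mask
   { 0, 4, 1, 5 } and the interleave-high mask { 2, 6, 3, 7 }.  */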

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}

/* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in OPRNDS_INFO),
   check that they are of a valid type and that they match the defs of the
   first stmt of the SLP group (stored in OPRNDS_INFO).  This function tries
   to match stmts by swapping operands of STMTS[STMT_NUM] when possible.
   Non-zero SWAP indicates a swap is required for cond_expr stmts:
   SWAP is 1 if STMT is a cond and the operands of its comparison need to be
   swapped; SWAP is 2 if STMT is a cond and the code of its comparison needs
   to be inverted.
   Return -1 on a fatal error; return 1 if the error could be corrected by
   swapping operands of this node's father; return 0 if everything is OK.  */
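/* For example, if the first stmt is A1 < B1 ? X1 : Y1, a stmt
   B2 > A2 ? X2 : Y2 is matched with SWAP == 1 (swapped comparison
   operands) and a stmt A2 >= B2 ? Y2 : X2 with SWAP == 2 (inverted
   comparison code, i.e. swapped arms).  */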
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (swap != 0);
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation and use only supported operations.  Return true if so;
   otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this indicates
   that the comparison could not be carried out or that the stmts will never
   be vectorized by SLP.

   Note a COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1.  */
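/* E.g. for the group { a0 + b0, a1 + b1, a2 * b2, a3 * b3 } this sets
   MATCHES to { true, true, false, false } and returns false, while for
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } all stmts match and
   *TWO_OPERATORS is set, since the plus/minus mismatch can be handled by
   computing both and blending the lanes.  */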

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that can
	 throw internally, or that have volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (TREE_CODE (vec) != SSA_NAME
		  || !types_compatible_p (vectype, TREE_TYPE (vec)))
		{
		  if (is_a <bb_vec_info> (vinfo) && i != 0)
		    continue;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
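
/* The map above caches the result of each attempted SLP build, keyed on the
   vector of scalar stmts: a NULL entry records a failed build, while a
   non-NULL entry is re-used (with its SLP_TREE_REF_COUNT bumped) when the
   same stmts are reached again from another parent, which is what makes the
   SLP "tree" a graph.  The map holds one reference to each recorded node.  */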

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise return
   the built SLP tree; on failure *MATCHES indicates which stmts did not
   match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
    nops = gimple_phi_num_args (phi);
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						  group_size);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, nops);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
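	  /* For example, if the group loads from a[1] in lane 0 and
	     from a[0] in lane 1, the load permutation becomes { 1 0 }.  */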
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a type compatible with VECTYPE.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }
  swap = NULL;

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* A COND_EXPR has one operand slot too many here if its
	     condition is an SSA name rather than a comparison.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ... */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }
	  /* We do not undo the swapping here since it might still be
	     the better order for the second operand in case we build
	     the first one from scalars below.  */
	  ++*npermutes;
	}
fail:

      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  /* But if there's a leading vector sized set of matching stmts
	     fail here so we can split the group.  This matches the condition
	     vect_analyze_slp_instance uses.  */
	  /* ??? We might want to split here and combine the results to support
	     multiple vector sizes better.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      break;
	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Building vector operands from scalars\n");
	      this_tree_size++;
	      child = vect_create_new_slp_node (oprnd_info->ops);
	      children.safe_push (child);
	      oprnd_info->ops = vNULL;
	      oprnd_info->def_stmts = vNULL;
	      continue;
	    }
	}

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of this node are built up from uniform scalars, or if
     building them requires more than one possibly expensive vector
     construction, then just throw that away and cause the node to be built
     up from scalars instead.  The exception is the SLP node for a vector
     store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
	{
	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    all_uniform_p = false;
	  else if (!vect_slp_tree_uniform_p (child))
	    {
	      all_uniform_p = false;
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		n_vector_builds++;
	    }
	}
      if (all_uniform_p || n_vector_builds > 1)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;
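
  /* For a two-operator group like { a0 + b0, a1 - b1 } build one child
     computing all lanes with PLUS_EXPR and another computing all lanes
     with MINUS_EXPR, and make NODE a VEC_PERM_EXPR selecting lane 0 from
     the first and lane 1 from the second, i.e. the lane permutation
     { 0[0] 1[1] }.  */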
  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
1873 /* Gather loads in the SLP graph rooted at NODE and populate the INST loads array. */
1875 static void
1876 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1877 hash_set<slp_tree> &visited)
1879 if (visited.add (node))
1880 return;
1882 if (SLP_TREE_CHILDREN (node).length () == 0)
1884 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1885 return;
1886 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1887 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1888 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1889 loads.safe_push (node);
1891 else
1893 unsigned i;
1894 slp_tree child;
1895 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1896 vect_gather_slp_loads (loads, child, visited);
1900 static void
1901 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1903 hash_set<slp_tree> visited;
1904 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1908 /* Find the last scalar stmt in NODE. */
1910 stmt_vec_info
1911 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1913 stmt_vec_info last = NULL;
1914 stmt_vec_info stmt_vinfo;
1916 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1918 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1919 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1922 return last;
1925 /* Find the first scalar stmt in NODE. */
1927 stmt_vec_info
1928 vect_find_first_scalar_stmt_in_slp (slp_tree node)
1930 stmt_vec_info first = NULL;
1931 stmt_vec_info stmt_vinfo;
1933 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1935 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1936 if (!first
1937 || get_later_stmt (stmt_vinfo, first) == first)
1938 first = stmt_vinfo;
1941 return first;
1944 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1945 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1946 (also containing the first GROUP1_SIZE stmts, since stores are
1947 consecutive), the second containing the remainder.
1948 Return the first stmt in the second group. */
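/* A worked example (illustrative only): splitting a store group of
   DR_GROUP_SIZE 8 at GROUP1_SIZE 4, with an original DR_GROUP_GAP of 0,
   produces two groups of four stores.  Per the code below the second
   group gets DR_GROUP_GAP 0 + 4 = 4 and the first group's DR_GROUP_GAP
   becomes 0 + 4 = 4, so each group skips over the other group's
   elements.  */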
1950 static stmt_vec_info
1951 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1953 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1954 gcc_assert (group1_size > 0);
1955 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1956 gcc_assert (group2_size > 0);
1957 DR_GROUP_SIZE (first_vinfo) = group1_size;
1959 stmt_vec_info stmt_info = first_vinfo;
1960 for (unsigned i = group1_size; i > 1; i--)
1962 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1963 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1965 /* STMT_INFO is now the last element of the first group. */
1966 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1967 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1969 DR_GROUP_SIZE (group2) = group2_size;
1970 for (stmt_info = group2; stmt_info;
1971 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1973 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1974 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1977 /* For the second group, the DR_GROUP_GAP is that before the original group,
1978 plus skipping over the first vector. */
1979 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1981 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1982 DR_GROUP_GAP (first_vinfo) += group2_size;
1984 if (dump_enabled_p ())
1985 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1986 group1_size, group2_size);
1988 return group2;
1991 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1992 statements and a vector of NUNITS elements. */
1994 static poly_uint64
1995 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1997 return exact_div (common_multiple (nunits, group_size), group_size);
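/* E.g. with NUNITS 4 and GROUP_SIZE 6 the common multiple is 12, so the
   unrolling factor is exact_div (12, 6) = 2: two scalar iterations are
   needed before the group fills a whole number of vectors (three
   vectors of four elements from twelve scalars).  */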
2000 /* Analyze an SLP instance starting from a group of grouped stores. Call
2001 vect_build_slp_tree to build a tree of packed stmts if possible.
2002 Return FALSE if it's impossible to SLP any stmt in the loop. */
2004 static bool
2005 vect_analyze_slp_instance (vec_info *vinfo,
2006 scalar_stmts_to_slp_tree_map_t *bst_map,
2007 stmt_vec_info stmt_info, unsigned max_tree_size)
2009 slp_instance new_instance;
2010 slp_tree node;
2011 unsigned int group_size;
2012 tree vectype, scalar_type = NULL_TREE;
2013 unsigned int i;
2014 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2015 vec<stmt_vec_info> scalar_stmts;
2016 bool constructor = false;
2018 if (is_a <bb_vec_info> (vinfo))
2019 vect_location = stmt_info->stmt;
2020 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2022 scalar_type = TREE_TYPE (DR_REF (dr));
2023 group_size = DR_GROUP_SIZE (stmt_info);
2024 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2026 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2028 gcc_assert (is_a <loop_vec_info> (vinfo));
2029 vectype = STMT_VINFO_VECTYPE (stmt_info);
2030 group_size = REDUC_GROUP_SIZE (stmt_info);
2032 else if (is_gimple_assign (stmt_info->stmt)
2033 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2035 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2036 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2037 constructor = true;
2039 else
2041 gcc_assert (is_a <loop_vec_info> (vinfo));
2042 vectype = STMT_VINFO_VECTYPE (stmt_info);
2043 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2046 if (!vectype)
2048 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2050 "Build SLP failed: unsupported data-type %T\n",
2051 scalar_type);
2053 return false;
2055 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2057 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2058 scalar_stmts.create (group_size);
2059 stmt_vec_info next_info = stmt_info;
2060 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2062 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2063 while (next_info)
2065 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2066 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2069 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2071 /* Collect the reduction stmts and store them in
2072 SLP_TREE_SCALAR_STMTS. */
2073 while (next_info)
2075 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2076 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2078 /* Mark the first element of the reduction chain as reduction to properly
2079 transform the node. In the reduction analysis phase only the last
2080 element of the chain is marked as reduction. */
2081 STMT_VINFO_DEF_TYPE (stmt_info)
2082 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2083 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2084 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2086 else if (constructor)
2088 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2089 tree val;
2090 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2092 if (TREE_CODE (val) == SSA_NAME)
2094 gimple* def = SSA_NAME_DEF_STMT (val);
2095 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2096 /* Value is defined in another basic block. */
2097 if (!def_info)
2098 return false;
2099 def_info = vect_stmt_to_vectorize (def_info);
2100 scalar_stmts.safe_push (def_info);
2102 else
2103 return false;
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_NOTE, vect_location,
2107 "Analyzing vectorizable constructor: %G\n",
2108 stmt_info->stmt);
2110 else
2112 /* Collect reduction statements. */
2113 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2114 for (i = 0; reductions.iterate (i, &next_info); i++)
2115 scalar_stmts.safe_push (next_info);
2118 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location,
2121 "Starting SLP discovery for\n");
2122 for (i = 0; i < scalar_stmts.length (); ++i)
2123 dump_printf_loc (MSG_NOTE, vect_location,
2124 " %G", scalar_stmts[i]->stmt);
2127 /* Build the tree for the SLP instance. */
2128 bool *matches = XALLOCAVEC (bool, group_size);
2129 unsigned npermutes = 0;
2130 poly_uint64 max_nunits = nunits;
2131 unsigned tree_size = 0;
2132 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2133 &max_nunits, matches, &npermutes,
2134 &tree_size, bst_map);
2135 if (node != NULL)
2137 /* Calculate the unrolling factor based on the smallest type. */
2138 poly_uint64 unrolling_factor
2139 = calculate_unrolling_factor (max_nunits, group_size);
2141 if (maybe_ne (unrolling_factor, 1U)
2142 && is_a <bb_vec_info> (vinfo))
2144 unsigned HOST_WIDE_INT const_max_nunits;
2145 if (!max_nunits.is_constant (&const_max_nunits)
2146 || const_max_nunits > group_size)
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "Build SLP failed: store group "
2151 "size not a multiple of the vector size "
2152 "in basic block SLP\n");
2153 vect_free_slp_tree (node);
2154 return false;
2156 /* Fatal mismatch. */
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location,
2159 "SLP discovery succeeded but node needs "
2160 "splitting\n");
2161 memset (matches, true, group_size);
2162 matches[group_size / const_max_nunits * const_max_nunits] = false;
2163 vect_free_slp_tree (node);
2165 else
2167 /* Create a new SLP instance. */
2168 new_instance = XNEW (class _slp_instance);
2169 SLP_INSTANCE_TREE (new_instance) = node;
2170 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2171 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2172 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2173 new_instance->reduc_phis = NULL;
2174 new_instance->cost_vec = vNULL;
2175 new_instance->subgraph_entries = vNULL;
2177 vect_gather_slp_loads (new_instance, node);
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE, vect_location,
2180 "SLP size %u vs. limit %u.\n",
2181 tree_size, max_tree_size);
2183 /* Check whether any load is possibly permuted. */
2184 slp_tree load_node;
2185 bool loads_permuted = false;
2186 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2188 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2189 continue;
2190 unsigned j;
2191 stmt_vec_info load_info;
2192 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2193 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2195 loads_permuted = true;
2196 break;
2200 /* If the loads and stores can be handled with load/store-lane
2201 instructions, do not generate this SLP instance. */
2202 if (is_a <loop_vec_info> (vinfo)
2203 && loads_permuted
2204 && dr && vect_store_lanes_supported (vectype, group_size, false))
2206 slp_tree load_node;
2207 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2209 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2210 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2211 /* Use SLP for strided accesses (or if we can't load-lanes). */
2212 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2213 || ! vect_load_lanes_supported
2214 (STMT_VINFO_VECTYPE (stmt_vinfo),
2215 DR_GROUP_SIZE (stmt_vinfo), false))
2216 break;
2218 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "Built SLP cancelled: can use "
2223 "load/store-lanes\n");
2224 vect_free_slp_instance (new_instance);
2225 return false;
2229 /* If this is a reduction chain with a conversion in front,
2230 amend the SLP tree with a node for that. */
2231 if (!dr
2232 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2233 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2235 /* Get at the conversion stmt - we know it's the single use
2236 of the last stmt of the reduction chain. */
2237 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2238 use_operand_p use_p;
2239 gimple *use_stmt;
2240 bool r = single_imm_use (gimple_assign_lhs (tem),
2241 &use_p, &use_stmt);
2242 gcc_assert (r);
2243 next_info = vinfo->lookup_stmt (use_stmt);
2244 next_info = vect_stmt_to_vectorize (next_info);
2245 scalar_stmts = vNULL;
2246 scalar_stmts.create (group_size);
2247 for (unsigned i = 0; i < group_size; ++i)
2248 scalar_stmts.quick_push (next_info);
2249 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2250 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2251 SLP_TREE_CHILDREN (conv).quick_push (node);
2252 SLP_INSTANCE_TREE (new_instance) = conv;
2253 /* We also have to fake this conversion stmt as SLP reduction
2254 group so we don't have to mess with too much code
2255 elsewhere. */
2256 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2257 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2260 vinfo->slp_instances.safe_push (new_instance);
2262 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2263 the number of scalar stmts in the root in a few places.
2264 Verify that assumption holds. */
2265 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2266 .length () == group_size);
2268 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE, vect_location,
2271 "Final SLP tree for instance:\n");
2272 vect_print_slp_graph (MSG_NOTE, vect_location,
2273 SLP_INSTANCE_TREE (new_instance));
2276 return true;
2279 else
2281 /* Failed to SLP. */
2282 /* Free the allocated memory. */
2283 scalar_stmts.release ();
2286 /* Try to break the group up into pieces. */
2287 unsigned HOST_WIDE_INT const_nunits;
2288 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2289 && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))
2290 && nunits.is_constant (&const_nunits))
2292 for (i = 0; i < group_size; i++)
2293 if (!matches[i])
2294 break;
2296 /* For basic block SLP, try to break the group up into multiples of the
2297 vector size. */
2298 if (is_a <bb_vec_info> (vinfo)
2299 && (i >= const_nunits && i < group_size))
2301 /* Split into two groups at the first vector boundary before i. */
2302 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2303 unsigned group1_size = i & ~(const_nunits - 1);
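	  /* E.g. with const_nunits == 4 and the first mismatch at i == 6
	     the mask rounds i down to group1_size == 4, the last full
	     vector boundary before the mismatch.  */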
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_NOTE, vect_location,
2307 "Splitting SLP group at stmt %u\n", i);
2308 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2309 group1_size);
2310 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2311 max_tree_size);
2312 /* If the first non-match was in the middle of a vector,
2313 skip the rest of that vector. Do not bother to re-analyze
2314 single stmt groups. */
2315 if (group1_size < i)
2317 i = group1_size + const_nunits;
2318 if (i + 1 < group_size)
2319 rest = vect_split_slp_store_group (rest, const_nunits);
2321 if (i + 1 < group_size)
2322 res |= vect_analyze_slp_instance (vinfo, bst_map,
2323 rest, max_tree_size);
2324 return res;
2327 /* For loop vectorization split into arbitrary pieces of size > 1. */
2328 if (is_a <loop_vec_info> (vinfo)
2329 && (i > 1 && i < group_size))
2331 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2332 unsigned group1_size = i;
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "Splitting SLP group at stmt %u\n", i);
2338 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2339 group1_size);
2340 /* Loop vectorization cannot handle gaps in stores; make sure
2341 the split group appears as strided. */
2342 STMT_VINFO_STRIDED_P (rest) = 1;
2343 DR_GROUP_GAP (rest) = 0;
2344 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2345 DR_GROUP_GAP (stmt_info) = 0;
2347 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2348 max_tree_size);
2349 if (i + 1 < group_size)
2350 res |= vect_analyze_slp_instance (vinfo, bst_map,
2351 rest, max_tree_size);
2353 return res;
2356 /* Even though the first vector did not all match, we might be able to SLP
2357 (some) of the remainder. FORNOW ignore this possibility. */
2360 /* Failed to SLP. */
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2363 return false;
2366 /* Fill in backedge SLP children in the SLP graph. */
2368 static void
2369 vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node,
2370 scalar_stmts_to_slp_tree_map_t *bst_map,
2371 hash_set<slp_tree> &visited)
2373 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
2374 || visited.add (node))
2375 return;
2377 slp_tree child;
2378 unsigned i;
2379 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2380 if (child)
2381 vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
2383 if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
2384 for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
2386 auto_vec<stmt_vec_info, 64> stmts;
2387 unsigned j;
2388 stmt_vec_info phi_info;
2389 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info)
2391 tree def = gimple_phi_arg_def (as_a <gphi *>(phi_info->stmt), i);
2392 stmt_vec_info def_info = vinfo->lookup_def (def);
2393 if (!def_info)
2394 break;
2395 stmts.safe_push (vect_stmt_to_vectorize (def_info));
2397 if (j != SLP_TREE_LANES (node))
2398 continue;
2399 slp_tree *edge_def = bst_map->get (stmts);
2400 if (edge_def)
2402 /* ??? We're currently not recording non-backedge children
2403 of PHIs like external reduction initial values. Avoid
2404 NULL entries in SLP_TREE_CHILDREN for those and thus
2405 for now simply only record backedge defs at a
2406 SLP_TREE_CHILDREN index possibly not matching that of
2407 the corresponding PHI argument index. */
2408 SLP_TREE_CHILDREN (node).quick_push (*edge_def);
2409 (*edge_def)->refcnt++;
2414 /* Check if there are stmts in the loop that can be vectorized using SLP.
2415 Build SLP trees of packed scalar stmts if SLP is possible. */
2417 opt_result
2418 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2420 unsigned int i;
2421 stmt_vec_info first_element;
2423 DUMP_VECT_SCOPE ("vect_analyze_slp");
2425 scalar_stmts_to_slp_tree_map_t *bst_map
2426 = new scalar_stmts_to_slp_tree_map_t ();
2428 /* Find SLP sequences starting from groups of grouped stores. */
2429 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2430 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2432 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2434 if (loop_vinfo->reduction_chains.length () > 0)
2436 /* Find SLP sequences starting from reduction chains. */
2437 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2438 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2439 max_tree_size))
2441 /* Dissolve reduction chain group. */
2442 stmt_vec_info vinfo = first_element;
2443 stmt_vec_info last = NULL;
2444 while (vinfo)
2446 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2447 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2448 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2449 last = vinfo;
2450 vinfo = next;
2452 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2453 /* It can still be vectorized as part of an SLP reduction. */
2454 loop_vinfo->reductions.safe_push (last);
2458 /* Find SLP sequences starting from groups of reductions. */
2459 if (loop_vinfo->reductions.length () > 1)
2460 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2461 max_tree_size);
2464 /* Fill in backedges. */
2465 slp_instance instance;
2466 hash_set<slp_tree> visited;
2467 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2468 vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance),
2469 bst_map, visited);
2471 /* The map keeps a reference on the SLP nodes built; release that. */
2472 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2473 it != bst_map->end (); ++it)
2474 if ((*it).second)
2475 vect_free_slp_tree ((*it).second);
2476 delete bst_map;
2478 return opt_result::success ();
2481 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2483 static void
2484 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2485 vec<slp_tree> &vertices, vec<int> &leafs)
2487 unsigned i;
2488 slp_tree child;
2490 if (visited.add (node))
2491 return;
2493 node->vertex = vertices.length ();
2494 vertices.safe_push (node);
2495 if (SLP_TREE_CHILDREN (node).is_empty ())
2496 leafs.safe_push (node->vertex);
2497 else
2498 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2499 vect_slp_build_vertices (visited, child, vertices, leafs);
2502 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2504 static void
2505 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2506 vec<int> &leafs)
2508 hash_set<slp_tree> visited;
2509 unsigned i;
2510 slp_instance instance;
2511 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2512 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2513 leafs);
2516 /* Apply (reverse) bijective PERM to VEC. */
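/* For example (illustrative): with PERM = { 2, 0, 1 } and
   VEC = { a, b, c } the forward direction computes vec[i] = saved[perm[i]],
   yielding { c, a, b }, while REVERSE computes vec[perm[i]] = saved[i],
   yielding { b, c, a }, i.e. the inverse permutation.  */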
2518 template <class T>
2519 static void
2520 vect_slp_permute (vec<unsigned> perm,
2521 vec<T> &vec, bool reverse)
2523 auto_vec<T, 64> saved;
2524 saved.create (vec.length ());
2525 for (unsigned i = 0; i < vec.length (); ++i)
2526 saved.quick_push (vec[i]);
2528 if (reverse)
2530 for (unsigned i = 0; i < vec.length (); ++i)
2531 vec[perm[i]] = saved[i];
2532 for (unsigned i = 0; i < vec.length (); ++i)
2533 gcc_assert (vec[perm[i]] == saved[i]);
2535 else
2537 for (unsigned i = 0; i < vec.length (); ++i)
2538 vec[i] = saved[perm[i]];
2539 for (unsigned i = 0; i < vec.length (); ++i)
2540 gcc_assert (vec[i] == saved[perm[i]]);
2544 /* Return whether permutations PERM_A and PERM_B as recorded in the
2545 PERMS vector are equal. */
2547 static bool
2548 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2549 int perm_a, int perm_b)
2551 return (perm_a == perm_b
2552 || (perms[perm_a].length () == perms[perm_b].length ()
2553 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2554 sizeof (unsigned) * perms[perm_a].length ()) == 0));
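/* Note the memcmp is valid because permutations are stored as plain
   arrays of unsigned; e.g. two separately recorded copies of
   { 1, 0, 3, 2 } compare equal even though their indices in PERMS
   differ.  */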
2557 /* Optimize the SLP graph of VINFO. */
2559 void
2560 vect_optimize_slp (vec_info *vinfo)
2562 if (vinfo->slp_instances.is_empty ())
2563 return;
2565 slp_tree node;
2566 unsigned i;
2567 auto_vec<slp_tree> vertices;
2568 auto_vec<int> leafs;
2569 vect_slp_build_vertices (vinfo, vertices, leafs);
2571 struct graph *slpg = new_graph (vertices.length ());
2572 FOR_EACH_VEC_ELT (vertices, i, node)
2574 unsigned j;
2575 slp_tree child;
2576 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2577 add_edge (slpg, i, child->vertex);
2580 /* Compute (reverse) postorder on the inverted graph. */
2581 auto_vec<int> ipo;
2582 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2584 auto_sbitmap n_visited (vertices.length ());
2585 auto_sbitmap n_materialize (vertices.length ());
2586 auto_vec<int> n_perm (vertices.length ());
2587 auto_vec<vec<unsigned> > perms;
2589 bitmap_clear (n_visited);
2590 bitmap_clear (n_materialize);
2591 n_perm.quick_grow_cleared (vertices.length ());
2592 perms.safe_push (vNULL); /* zero is no permute */
2594 /* Produce initial permutations. */
2595 for (i = 0; i < leafs.length (); ++i)
2597 int idx = leafs[i];
2598 slp_tree node = vertices[idx];
2600 /* Handle externals and constants optimistically throughout the
2601 iteration. */
2602 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2603 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2604 continue;
2606 /* Loads are the only thing generating permutes, and leafs do not
2607 change across iterations. */
2608 bitmap_set_bit (n_visited, idx);
2609 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2610 continue;
2612 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2613 node unpermuted, record this permute. */
2614 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2615 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2616 continue;
2617 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2618 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2619 bool any_permute = false;
2620 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2622 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2623 imin = MIN (imin, idx);
2624 imax = MAX (imax, idx);
2625 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2626 any_permute = true;
2628 /* If there's no permute there's no need to split one out. */
2629 if (!any_permute)
2630 continue;
2631 /* If the span doesn't match we'd disrupt VF computation; avoid
2632 that for now. */
2633 if (imax - imin + 1 != SLP_TREE_LANES (node))
2634 continue;
2636 /* For now only handle true permutes, like
2637 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2638 when permuting constants and invariants, keeping the permute
2639 bijective. */
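	 /* E.g. a load permutation { 5, 4, 7, 6 } has imin == 4,
	    imax == 7, spans 4 == SLP_TREE_LANES and hits every lane in
	    that span exactly once, so the permute recorded below is
	    { 1, 0, 3, 2 }.  A non-bijective one like { 4, 4, 7, 7 } is
	    rejected by the bitmap check.  */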
2640 auto_sbitmap load_index (SLP_TREE_LANES (node));
2641 bitmap_clear (load_index);
2642 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2643 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2644 unsigned j;
2645 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2646 if (!bitmap_bit_p (load_index, j))
2647 break;
2648 if (j != SLP_TREE_LANES (node))
2649 continue;
2651 vec<unsigned> perm = vNULL;
2652 perm.safe_grow (SLP_TREE_LANES (node), true);
2653 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2654 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2655 perms.safe_push (perm);
2656 n_perm[idx] = perms.length () - 1;
2659 /* Propagate permutes along the graph and compute materialization points. */
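/* What follows is in essence a meet over the successor (use->def) edges:
   a not-yet-known permute (-1) takes on the first successor's permute,
   equal permutes are kept, and any disagreement, or a successor already
   known to be unpermuted, collapses to 0, the "no permute" element.
   Materialization points are then placed where a node's permute differs
   from a user's.  (This is a sketch of the intent; the code below is
   authoritative.)  */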
2660 bool changed;
2661 unsigned iteration = 0;
2664 changed = false;
2665 ++iteration;
2667 for (i = vertices.length (); i > 0 ; --i)
2669 int idx = ipo[i-1];
2670 slp_tree node = vertices[idx];
2671 /* For leafs there's nothing to do - we've seeded permutes
2672 on those above. */
2673 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2674 continue;
2676 bitmap_set_bit (n_visited, idx);
2678 /* We cannot move a permute across a store. */
2679 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2680 && DR_IS_WRITE
2681 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2682 continue;
2684 int perm = -1;
2685 for (graph_edge *succ = slpg->vertices[idx].succ;
2686 succ; succ = succ->succ_next)
2688 int succ_idx = succ->dest;
2689 /* Handle unvisited nodes optimistically. */
2690 /* ??? But for constants once we want to handle non-bijective
2691 permutes we have to verify the permute, when unifying lanes,
2692 will not unify different constants. For example see
2693 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2694 if (!bitmap_bit_p (n_visited, succ_idx))
2695 continue;
2696 int succ_perm = n_perm[succ_idx];
2697 /* Once we materialize a successor's permutation its output lanes
2698 appear unpermuted to us. */
2699 if (bitmap_bit_p (n_materialize, succ_idx))
2700 succ_perm = 0;
2701 if (perm == -1)
2702 perm = succ_perm;
2703 else if (succ_perm == 0)
2705 perm = 0;
2706 break;
2708 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2710 perm = 0;
2711 break;
2715 if (perm == -1)
2716 /* Pick up pre-computed leaf values. */
2717 perm = n_perm[idx];
2718 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2720 if (iteration > 1)
2721 /* Make sure we eventually converge. */
2722 gcc_checking_assert (perm == 0);
2723 n_perm[idx] = perm;
2724 if (perm == 0)
2725 bitmap_clear_bit (n_materialize, idx);
2726 changed = true;
2729 if (perm == 0)
2730 continue;
2732 /* Elide pruning at materialization points in the first
2733 iteration so that every node is visited at least once. */
2734 if (iteration == 1)
2735 continue;
2737 /* Decide on permute materialization. Look whether there's
2738 a use (pred) edge that is permuted differently than us.
2739 In that case mark ourselves so the permutation is applied. */
2740 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2741 for (graph_edge *pred = slpg->vertices[idx].pred;
2742 pred; pred = pred->pred_next)
2744 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2745 int pred_perm = n_perm[pred->src];
2746 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2748 all_preds_permuted = false;
2749 break;
2752 if (!all_preds_permuted)
2754 if (!bitmap_bit_p (n_materialize, idx))
2755 changed = true;
2756 bitmap_set_bit (n_materialize, idx);
2760 while (changed || iteration == 1);
2762 /* Materialize. */
2763 for (i = 0; i < vertices.length (); ++i)
2765 int perm = n_perm[i];
2766 if (perm <= 0)
2767 continue;
2769 slp_tree node = vertices[i];
2771 /* First permute invariant/external original successors. */
2772 unsigned j;
2773 slp_tree child;
2774 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2776 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2777 continue;
2779 /* If the vector is uniform there's nothing to do. */
2780 if (vect_slp_tree_uniform_p (child))
2781 continue;
2783 /* We can end up sharing some externals via two_operator
2784 handling. Be prepared to unshare those. */
2785 if (child->refcnt != 1)
2787 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2788 SLP_TREE_CHILDREN (node)[j] = child
2789 = vect_create_new_slp_node
2790 (SLP_TREE_SCALAR_OPS (child).copy ());
2792 vect_slp_permute (perms[perm],
2793 SLP_TREE_SCALAR_OPS (child), true);
2796 if (bitmap_bit_p (n_materialize, i))
2798 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2799 /* For loads simply drop the permutation; the load permutation
2800 already performs the desired permutation. */
2802 else
2804 if (dump_enabled_p ())
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "inserting permute node in place of %p\n",
2807 node);
2809 /* Make a copy of NODE and in-place change it to a
2810 VEC_PERM node to permute the lanes of the copy. */
2811 slp_tree copy = new _slp_tree;
2812 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2813 SLP_TREE_CHILDREN (node) = vNULL;
2814 SLP_TREE_SCALAR_STMTS (copy)
2815 = SLP_TREE_SCALAR_STMTS (node).copy ();
2816 vect_slp_permute (perms[perm],
2817 SLP_TREE_SCALAR_STMTS (copy), true);
2818 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2819 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2820 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2821 SLP_TREE_LANE_PERMUTATION (copy)
2822 = SLP_TREE_LANE_PERMUTATION (node);
2823 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2824 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2825 copy->refcnt = 1;
2826 copy->max_nunits = node->max_nunits;
2827 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2828 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2829 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
2831 /* Now turn NODE into a VEC_PERM. */
2832 SLP_TREE_CHILDREN (node).safe_push (copy);
2833 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
2834 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2835 SLP_TREE_LANE_PERMUTATION (node)
2836 .quick_push (std::make_pair (0, perms[perm][j]));
2837 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2840 else
2842 /* Apply the reverse permutation to our stmts. */
2843 vect_slp_permute (perms[perm],
2844 SLP_TREE_SCALAR_STMTS (node), true);
2845 /* And to the load permutation, which we can simply
2846 make regular by design. */
2847 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2849 /* ??? When we handle non-bijective permutes the idea
2850 is that we can force the load-permutation to be
2851 { min, min + 1, min + 2, ... max }. But then the
2852 scalar defs might no longer match the lane content
2853 which means wrong-code with live lane vectorization.
2854 So we possibly have to have NULL entries for those. */
2855 vect_slp_permute (perms[perm],
2856 SLP_TREE_LOAD_PERMUTATION (node), true);
2861 /* Free the perms vector used for propagation. */
2862 while (!perms.is_empty ())
2863 perms.pop ().release ();
2864 free_graph (slpg);
2867 /* Now elide load permutations that are not necessary. */
2868 for (i = 0; i < leafs.length (); ++i)
2870 node = vertices[i];
2871 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2872 continue;
2874 /* In basic block vectorization we allow any subchain of an interleaving
2875 chain.
2876 FORNOW: not in loop SLP because of realignment complications. */
2877 if (is_a <bb_vec_info> (vinfo))
2879 bool subchain_p = true;
2880 stmt_vec_info next_load_info = NULL;
2881 stmt_vec_info load_info;
2882 unsigned j;
2883 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2885 if (j != 0
2886 && (next_load_info != load_info
2887 || DR_GROUP_GAP (load_info) != 1))
2889 subchain_p = false;
2890 break;
2892 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2894 if (subchain_p)
2896 SLP_TREE_LOAD_PERMUTATION (node).release ();
2897 continue;
2900 else
2902 stmt_vec_info load_info;
2903 bool this_load_permuted = false;
2904 unsigned j;
2905 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2906 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2908 this_load_permuted = true;
2909 break;
2911 stmt_vec_info first_stmt_info
2912 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2913 if (!this_load_permuted
2914 /* The load requires permutation when unrolling exposes
2915 a gap either because the group is larger than the SLP
2916 group-size or because there is a gap between the groups. */
2917 && (known_eq (LOOP_VINFO_VECT_FACTOR
2918 (as_a <loop_vec_info> (vinfo)), 1U)
2919 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2920 && DR_GROUP_GAP (first_stmt_info) == 0)))
2922 SLP_TREE_LOAD_PERMUTATION (node).release ();
2923 continue;
2930 /* For each possible SLP instance decide whether to SLP it and calculate the
2931 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2932 to SLP at least one instance. */
2934 bool
2935 vect_make_slp_decision (loop_vec_info loop_vinfo)
2937 unsigned int i;
2938 poly_uint64 unrolling_factor = 1;
2939 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2940 slp_instance instance;
2941 int decided_to_slp = 0;
2943 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2945 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2947 /* FORNOW: SLP if you can. */
2948 /* All unroll factors have the form:
2950 GET_MODE_SIZE (vinfo->vector_mode) * X
2952 for some rational X, so they must have a common multiple. */
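      /* E.g. instances with unrolling factors 2 and 3 combine to an
	 overall unrolling factor of force_common_multiple (2, 3) == 6.  */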
2953 unrolling_factor
2954 = force_common_multiple (unrolling_factor,
2955 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2957 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2958 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2959 loop-based vectorization. Such stmts will be marked as HYBRID. */
2960 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2961 decided_to_slp++;
2964 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2966 if (decided_to_slp && dump_enabled_p ())
2968 dump_printf_loc (MSG_NOTE, vect_location,
2969 "Decided to SLP %d instances. Unrolling factor ",
2970 decided_to_slp);
2971 dump_dec (MSG_NOTE, unrolling_factor);
2972 dump_printf (MSG_NOTE, "\n");
2975 return (decided_to_slp > 0);
2978 /* Private data for vect_detect_hybrid_slp. */
2979 struct vdhs_data
2981 loop_vec_info loop_vinfo;
2982 vec<stmt_vec_info> *worklist;
2985 /* Walker for walk_gimple_op. */
2987 static tree
2988 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2990 walk_stmt_info *wi = (walk_stmt_info *)data;
2991 vdhs_data *dat = (vdhs_data *)wi->info;
2993 if (wi->is_lhs)
2994 return NULL_TREE;
2996 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2997 if (!def_stmt_info)
2998 return NULL_TREE;
2999 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3000 if (PURE_SLP_STMT (def_stmt_info))
3002 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3004 def_stmt_info->stmt);
3005 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3006 dat->worklist->safe_push (def_stmt_info);
3009 return NULL_TREE;
3012 /* Find stmts that must be both vectorized and SLPed. */
3014 void
3015 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3017 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3019 /* All stmts participating in SLP are marked pure_slp, all other
3020 stmts are loop_vect.
3021 First collect all loop_vect stmts into a worklist. */
3022 auto_vec<stmt_vec_info> worklist;
3023 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3025 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3026 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3027 gsi_next (&gsi))
3029 gphi *phi = gsi.phi ();
3030 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3031 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3032 worklist.safe_push (stmt_info);
3034 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3035 gsi_next (&gsi))
3037 gimple *stmt = gsi_stmt (gsi);
3038 if (is_gimple_debug (stmt))
3039 continue;
3040 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3041 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3043 for (gimple_stmt_iterator gsi2
3044 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3045 !gsi_end_p (gsi2); gsi_next (&gsi2))
3047 stmt_vec_info patt_info
3048 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3049 if (!STMT_SLP_TYPE (patt_info)
3050 && STMT_VINFO_RELEVANT (patt_info))
3051 worklist.safe_push (patt_info);
3053 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3055 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3056 worklist.safe_push (stmt_info);
3060 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3061 mark any SLP vectorized stmt as hybrid.
3062 ??? We're visiting def stmts N times (once for each non-SLP and
3063 once for each hybrid-SLP use). */
3064 walk_stmt_info wi;
3065 vdhs_data dat;
3066 dat.worklist = &worklist;
3067 dat.loop_vinfo = loop_vinfo;
3068 memset (&wi, 0, sizeof (wi));
3069 wi.info = (void *)&dat;
3070 while (!worklist.is_empty ())
3072 stmt_vec_info stmt_info = worklist.pop ();
3073 /* Since SSA operands are not set up for pattern stmts we need
3074 to use walk_gimple_op. */
3075 wi.is_lhs = 0;
3076 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3081 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3083 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3084 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3086 for (unsigned i = 0; i < bbs.length (); ++i)
3088 if (i != 0)
3089 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3090 gsi_next (&si))
3092 gphi *phi = si.phi ();
3093 gimple_set_uid (phi, 0);
3094 add_stmt (phi);
3096 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3097 !gsi_end_p (gsi); gsi_next (&gsi))
3099 gimple *stmt = gsi_stmt (gsi);
3100 gimple_set_uid (stmt, 0);
3101 if (is_gimple_debug (stmt))
3102 continue;
3103 add_stmt (stmt);
3109 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3110 stmts in the basic blocks. */
3112 _bb_vec_info::~_bb_vec_info ()
3114 /* Reset region marker. */
3115 for (unsigned i = 0; i < bbs.length (); ++i)
3117 if (i != 0)
3118 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3119 gsi_next (&si))
3121 gphi *phi = si.phi ();
3122 gimple_set_uid (phi, -1);
3124 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3125 !gsi_end_p (gsi); gsi_next (&gsi))
3127 gimple *stmt = gsi_stmt (gsi);
3128 gimple_set_uid (stmt, -1);
3133 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3134 given that child nodes have already been processed, and that
3135 their def types currently match their SLP node's def type. */
3137 static bool
3138 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3139 slp_instance node_instance,
3140 stmt_vector_for_cost *cost_vec)
3142 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3143 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3145 /* Calculate the number of vector statements to be created for the
3146 scalar stmts in this node. For SLP reductions it is equal to the
3147 number of vector statements in the children (which has already been
3148 calculated by the recursive call). Otherwise it is the number of
3149 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3150 VF divided by the number of elements in a vector. */
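  /* E.g. with VF 4, SLP_TREE_LANES (node) == 2 and a 4-element vector
     type this computes vect_get_num_vectors (4 * 2, vectype) == 2
     vector statements.  */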
3151 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3152 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3154 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3155 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3157 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3158 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3159 break;
3162 else
3164 poly_uint64 vf;
3165 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3166 vf = loop_vinfo->vectorization_factor;
3167 else
3168 vf = 1;
3169 unsigned int group_size = SLP_TREE_LANES (node);
3170 tree vectype = SLP_TREE_VECTYPE (node);
3171 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3172 = vect_get_num_vectors (vf * group_size, vectype);
3175 /* Handle purely internal nodes. */
3176 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3177 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3179 if (is_a <bb_vec_info> (vinfo)
3180 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3181 return false;
3183 bool dummy;
3184 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3185 node, node_instance, cost_vec);
3188 /* Try to build NODE from scalars, returning true on success.
3189 NODE_INSTANCE is the SLP instance that contains NODE. */
3191 static bool
3192 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3193 slp_instance node_instance)
3195 stmt_vec_info stmt_info;
3196 unsigned int i;
3198 if (!is_a <bb_vec_info> (vinfo)
3199 || node == SLP_INSTANCE_TREE (node_instance)
3200 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3201 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3202 return false;
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_NOTE, vect_location,
3206 "Building vector operands from scalars instead\n");
3208 /* Don't remove and free the child nodes here, since they could be
3209 referenced by other structures. The analysis and scheduling phases
3210 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3211 unsigned int group_size = SLP_TREE_LANES (node);
3212 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3213 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3214 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3216 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3217 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3219 return true;
3222 /* Compute the prologue cost for invariant or constant operands represented
3223 by NODE. */
3225 static void
3226 vect_prologue_cost_for_slp (slp_tree node,
3227 stmt_vector_for_cost *cost_vec)
3229 /* There's a special case of an existing vector, which costs nothing. */
3230 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3231 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3232 return;
3233 /* Without looking at the actual initializer a vector of
3234 constants can be implemented as a load from the constant pool.
3235 When all elements are the same we can use a splat. */
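   /* E.g. an external vector { x, x, x, x } is costed as one
      scalar_to_vec splat, an external { a, b, c, d } as one
      vec_construct, and a constant vector like { 1, 2, 3, 4 } as one
      vector_load from the constant pool.  */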
3236 tree vectype = SLP_TREE_VECTYPE (node);
3237 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3238 unsigned num_vects_to_check;
3239 unsigned HOST_WIDE_INT const_nunits;
3240 unsigned nelt_limit;
3241 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3242 && ! multiple_p (const_nunits, group_size))
3244 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3245 nelt_limit = const_nunits;
3247 else
3249 /* If either the vector has variable length or the vectors
3250 are composed of repeated whole groups we only need to
3251 cost construction once. All vectors will be the same. */
3252 num_vects_to_check = 1;
3253 nelt_limit = group_size;
3255 tree elt = NULL_TREE;
3256 unsigned nelt = 0;
3257 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3259 unsigned si = j % group_size;
3260 if (nelt == 0)
3261 elt = SLP_TREE_SCALAR_OPS (node)[si];
3262 /* ??? We're just tracking whether all operands of a single
3263 vector initializer are the same; ideally we'd check if
3264 we emitted the same one already. */
3265 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3266 elt = NULL_TREE;
3267 nelt++;
3268 if (nelt == nelt_limit)
3270 record_stmt_cost (cost_vec, 1,
3271 SLP_TREE_DEF_TYPE (node) == vect_external_def
3272 ? (elt ? scalar_to_vec : vec_construct)
3273 : vector_load,
3274 NULL, vectype, 0, vect_prologue);
3275 nelt = 0;
3280 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3281 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3283 Return true if the operations are supported. */
3285 static bool
3286 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3287 slp_instance node_instance,
3288 hash_set<slp_tree> &visited,
3289 hash_set<slp_tree> &lvisited,
3290 stmt_vector_for_cost *cost_vec)
3292 int i, j;
3293 slp_tree child;
3295 /* Assume we can code-generate all invariants. */
3296 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3297 return true;
3299 /* If we already analyzed the exact same set of scalar stmts we're done.
3300 We share the generated vector stmts for those. */
3301 if (visited.contains (node)
3302 || lvisited.add (node))
3303 return true;
3305 bool res = true;
3306 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3308 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3309 visited, lvisited, cost_vec);
3310 if (!res)
3311 break;
3314 if (res)
3315 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3316 cost_vec);
3317 if (!res)
3318 lvisited.remove (node);
3320 /* When the node can be vectorized, cost the invariant nodes it references.
3321 This is not done in DFS order to allow the referring node's
3322 vectorizable_* calls to nail down the invariant nodes' vector type
3323 and possibly unshare it if it needs a different vector type than
3324 other referrers. */
3325 if (res)
3326 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3327 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
3328 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3329 /* Perform usual caching; note code-generation still
3330 code-gens these nodes multiple times but we expect
3331 to CSE them later. */
3332 && !visited.contains (child)
3333 && !lvisited.add (child))
3335 /* ??? After auditing more code paths make a "default"
3336 and push the vector type from NODE to all children
3337 if it is not already set. */
3338 /* Compute the number of vectors to be generated. */
3339 tree vector_type = SLP_TREE_VECTYPE (child);
3340 if (!vector_type)
3342 /* For shifts with a scalar argument we don't need
3343 to cost or code-generate anything.
3344 ??? Represent this more explicitly. */
3345 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3346 == shift_vec_info_type)
3347 && j == 1);
3348 continue;
3350 unsigned group_size = SLP_TREE_LANES (child);
3351 poly_uint64 vf = 1;
3352 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3353 vf = loop_vinfo->vectorization_factor;
3354 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3355 = vect_get_num_vectors (vf * group_size, vector_type);
3356 /* And cost them. */
3357 vect_prologue_cost_for_slp (child, cost_vec);
3360 /* If this node or any of its children can't be vectorized, try pruning
3361 the tree here rather than felling the whole thing. */
3362 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3364 /* We'll need to revisit this for invariant costing and number
3365 of vectorized stmt setting. */
3366 res = true;
3369 return res;
3373 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3374 region and that can be vectorized using vectorizable_live_operation
3375 with STMT_VINFO_LIVE_P. Live operations that are not handled will
3376 cause the scalar code computing them to be retained. */
3378 static void
3379 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3380 slp_instance instance,
3381 stmt_vector_for_cost *cost_vec,
3382 hash_set<stmt_vec_info> &svisited)
3384 unsigned i;
3385 stmt_vec_info stmt_info;
3386 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3387 bool all_visited = true;
3388 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3390 if (svisited.contains (stmt_info))
3391 continue;
3392 all_visited = false;
3393 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3394 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3395 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3396 /* Only the pattern root stmt computes the original scalar value. */
3397 continue;
3398 bool mark_visited = true;
3399 gimple *orig_stmt = orig_stmt_info->stmt;
3400 ssa_op_iter op_iter;
3401 def_operand_p def_p;
3402 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3404 imm_use_iterator use_iter;
3405 gimple *use_stmt;
3406 stmt_vec_info use_stmt_info;
3407 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3408 if (!is_gimple_debug (use_stmt))
3410 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3411 if (!use_stmt_info
3412 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3414 STMT_VINFO_LIVE_P (stmt_info) = true;
3415 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3416 NULL, node, instance, i,
3417 false, cost_vec))
3418 /* ??? So we know we can vectorize the live stmt
3419 from one SLP node. If we cannot do so from all
3420 or none consistently we'd have to record which
3421 SLP node (and lane) we want to use for the live
3422 operation. So make sure we can code-generate
3423 from all nodes. */
3424 mark_visited = false;
3425 else
3426 STMT_VINFO_LIVE_P (stmt_info) = false;
3427 BREAK_FROM_IMM_USE_STMT (use_iter);
3430 /* We have to verify whether we can insert the lane extract
3431 before all uses. The following is a conservative approximation.
3432 We cannot put this into vectorizable_live_operation because
3433 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3434 doesn't work.
3435 Note that while the fact that we emit code for loads at the
3436 first load should make this a non-problem, leafs we construct
3437 from scalars are vectorized after the last scalar def.
3438 ??? If we'd actually compute the insert location during
3439 analysis we could use something less conservative than the last
3440 scalar stmt in the node for the dominance check. */
3441 /* ??? What remains is "live" uses in vector CTORs in the same
3442 SLP graph which is where those uses can end up code-generated
3443 right after their definition instead of close to their original
3444 use. But that would restrict us to code-generate lane-extracts
3445 from the latest stmt in a node. So we compensate for this
3446 during code-generation, simply not replacing uses for those
3447 hopefully rare cases. */
3448 if (STMT_VINFO_LIVE_P (stmt_info))
3449 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3450 if (!is_gimple_debug (use_stmt)
3451 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3452 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3453 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457 "Cannot determine insertion place for "
3458 "lane extract\n");
3459 STMT_VINFO_LIVE_P (stmt_info) = false;
3460 mark_visited = true;
3463 if (mark_visited)
3464 svisited.add (stmt_info);
3466 if (all_visited)
3467 return;
3469 slp_tree child;
3470 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3471 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3472 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3473 cost_vec, svisited);
3476 /* Analyze statements in SLP instances of VINFO. Return true if the
3477 operations are supported. */
3479 bool
3480 vect_slp_analyze_operations (vec_info *vinfo)
3482 slp_instance instance;
3483 int i;
3485 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3487 hash_set<slp_tree> visited;
3488 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3490 hash_set<slp_tree> lvisited;
3491 stmt_vector_for_cost cost_vec;
3492 cost_vec.create (2);
3493 if (is_a <bb_vec_info> (vinfo))
3494 vect_location = instance->location ();
3495 if (!vect_slp_analyze_node_operations (vinfo,
3496 SLP_INSTANCE_TREE (instance),
3497 instance, visited, lvisited,
3498 &cost_vec)
3499 /* Instances with a root stmt require vectorized defs for the
3500 SLP tree root. */
3501 || (SLP_INSTANCE_ROOT_STMT (instance)
3502 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3503 != vect_internal_def)))
3505 slp_tree node = SLP_INSTANCE_TREE (instance);
3506 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3507 if (dump_enabled_p ())
3508 dump_printf_loc (MSG_NOTE, vect_location,
3509 "removing SLP instance operations starting from: %G",
3510 stmt_info->stmt);
3511 vect_free_slp_instance (instance);
3512 vinfo->slp_instances.ordered_remove (i);
3513 cost_vec.release ();
3515 else
3517 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3518 x != lvisited.end(); ++x)
3519 visited.add (*x);
3520 i++;
3522 /* For BB vectorization remember the SLP graph entry
3523 cost for later. */
3524 if (is_a <bb_vec_info> (vinfo))
3525 instance->cost_vec = cost_vec;
3526 else
3528 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3529 cost_vec.release ();
3534 /* Compute vectorizable live stmts. */
3535 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3537 hash_set<stmt_vec_info> svisited;
3538 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3540 vect_location = instance->location ();
3541 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3542 instance, &instance->cost_vec, svisited);
3546 return !vinfo->slp_instances.is_empty ();
3549 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3550 transitively compressing any chain of leaders on the way. */
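/* E.g. with INSTANCE_LEADER containing A -> B, B -> C and C -> C,
   calling this on A returns C and rewrites the entries for A and B to
   point directly at C (union-find style path compression).  */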
3552 static slp_instance
3553 get_ultimate_leader (slp_instance instance,
3554 hash_map<slp_instance, slp_instance> &instance_leader)
3556 auto_vec<slp_instance *, 8> chain;
3557 slp_instance *tem;
3558 while (*(tem = instance_leader.get (instance)) != instance)
3560 chain.safe_push (tem);
3561 instance = *tem;
3563 while (!chain.is_empty ())
3564 *chain.pop () = instance;
3565 return instance;
3568 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3570 static void
3571 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3572 slp_instance instance, slp_tree node,
3573 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3574 hash_map<slp_instance, slp_instance> &instance_leader,
3575 hash_set<slp_tree> &visited)
3577 stmt_vec_info stmt_info;
3578 unsigned i;
3580 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3582 bool existed_p;
3583 slp_instance &stmt_instance
3584 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3585 if (!existed_p)
3587 else if (stmt_instance != instance)
3589 /* If we run into a previously marked stmt, make us the
3590 leader of that stmt's current ultimate leader. This keeps the
3591 leader chain acyclic and works even when the current instance
3592 connects two previously independent graph parts. */
3593 slp_instance stmt_leader
3594 = get_ultimate_leader (stmt_instance, instance_leader);
3595 if (stmt_leader != instance)
3596 instance_leader.put (stmt_leader, instance);
3598 stmt_instance = instance;
3601 if (visited.add (node))
3602 return;
3604 slp_tree child;
3605 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3606 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3607 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3608 instance_leader, visited);
3611 /* Partition the SLP graph into pieces that can be costed independently. */
3613 static void
3614 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3616 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3618 /* First walk the SLP graph assigning each involved scalar stmt a
3619 corresponding SLP graph entry and upon visiting a previously
3620 marked stmt, make the stmt's leader the current SLP graph entry. */
3621 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3622 hash_map<slp_instance, slp_instance> instance_leader;
3623 hash_set<slp_tree> visited;
3624 slp_instance instance;
3625 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3627 instance_leader.put (instance, instance);
3628 vect_bb_partition_graph_r (bb_vinfo,
3629 instance, SLP_INSTANCE_TREE (instance),
3630 stmt_to_instance, instance_leader,
3631 visited);
3634 /* Then collect entries to each independent subgraph. */
3635 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3637 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3638 leader->subgraph_entries.safe_push (instance);
3639 if (dump_enabled_p ()
3640 && leader != instance)
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "instance %p is leader of %p\n",
3643 leader, instance);
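/* Worked example (illustrative): assume instances I1 and I2 share no
   scalar stmts while I3 reuses stmts from both.  After the first walk
   instance_leader is { I1:I3, I2:I3, I3:I3 } because visiting I3's
   shared stmts re-points the ultimate leaders of I1 and I2 at I3.
   The second loop then pushes I1, I2 and I3 onto I3->subgraph_entries,
   so all three instances are costed as a single subgraph.  */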
3647 /* Compute the scalar cost of the SLP node NODE and its children
3648 and record it in COST_VEC. Do not account defs that are marked
3649 in LIFE and update LIFE according to uses of NODE. */
3651 static void
3652 vect_bb_slp_scalar_cost (vec_info *vinfo,
3653 slp_tree node, vec<bool, va_heap> *life,
3654 stmt_vector_for_cost *cost_vec,
3655 hash_set<slp_tree> &visited)
3657 unsigned i;
3658 stmt_vec_info stmt_info;
3659 slp_tree child;
3661 if (visited.add (node))
3662 return;
3664 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3666 ssa_op_iter op_iter;
3667 def_operand_p def_p;
3669 if ((*life)[i])
3670 continue;
3672 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3673 gimple *orig_stmt = orig_stmt_info->stmt;
3675 /* If there is a non-vectorized use of the defs then the scalar
3676 stmt is kept live, in which case we do not account it or any
3677 required defs in the SLP children in the scalar cost. This
3678 way we make the vectorization more costly when compared to
3679 the scalar cost. */
3680 if (!STMT_VINFO_LIVE_P (stmt_info))
3682 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3684 imm_use_iterator use_iter;
3685 gimple *use_stmt;
3686 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3687 if (!is_gimple_debug (use_stmt))
3689 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3690 if (!use_stmt_info
3691 || !PURE_SLP_STMT
3692 (vect_stmt_to_vectorize (use_stmt_info)))
3694 (*life)[i] = true;
3695 BREAK_FROM_IMM_USE_STMT (use_iter);
3699 if ((*life)[i])
3700 continue;
3703 /* Count scalar stmts only once. */
3704 if (gimple_visited_p (orig_stmt))
3705 continue;
3706 gimple_set_visited (orig_stmt, true);
3708 vect_cost_for_stmt kind;
3709 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3711 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3712 kind = scalar_load;
3713 else
3714 kind = scalar_store;
3716 else if (vect_nop_conversion_p (orig_stmt_info))
3717 continue;
3718 else
3719 kind = scalar_stmt;
3720 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3723 auto_vec<bool, 20> subtree_life;
3724 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3726 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3728 /* Do not directly pass LIFE to the recursive call, copy it to
3729 confine changes in the callee to the current child/subtree. */
3730 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3732 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3733 for (unsigned j = 0;
3734 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3736 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3737 if (perm.first == i)
3738 subtree_life[perm.second] = (*life)[j];
3741 else
3743 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3744 subtree_life.safe_splice (*life);
3746 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3747 visited);
3748 subtree_life.truncate (0);
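/* Illustrative example: if lane 1 of an add node also feeds a scalar
   stmt outside the SLP graph, (*life)[1] is set above, the add's
   scalar cost is not recorded, and the flag is propagated to lane 1
   of the operand subtrees via the SUBTREE_LIFE copy.  */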
3753 /* Check if vectorization of the basic block is profitable for the
3754 subgraph denoted by SLP_INSTANCES. */
3756 static bool
3757 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3758 vec<slp_instance> slp_instances)
3760 slp_instance instance;
3761 int i;
3762 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3763 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3765 void *vect_target_cost_data = init_cost (NULL);
3767 /* Calculate scalar cost and sum the cost for the vector stmts
3768 previously collected. */
3769 stmt_vector_for_cost scalar_costs;
3770 scalar_costs.create (0);
3771 hash_set<slp_tree> visited;
3772 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3774 auto_vec<bool, 20> life;
3775 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3776 true);
3777 vect_bb_slp_scalar_cost (bb_vinfo,
3778 SLP_INSTANCE_TREE (instance),
3779 &life, &scalar_costs, visited);
3780 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3781 instance->cost_vec.release ();
3783 /* Unset visited flag. */
3784 stmt_info_for_cost *si;
3785 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3786 gimple_set_visited (si->stmt_info->stmt, false);
3788 void *scalar_target_cost_data = init_cost (NULL);
3789 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3790 scalar_costs.release ();
3791 unsigned dummy;
3792 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3793 destroy_cost_data (scalar_target_cost_data);
3795 /* Complete the target-specific vector cost calculation. */
3796 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3797 &vec_inside_cost, &vec_epilogue_cost);
3798 destroy_cost_data (vect_target_cost_data);
3800 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3802 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3805 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3806 vec_inside_cost);
3807 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3808 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3809 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3812 /* Vectorization is profitable if its cost is less than the cost of the
3813 scalar version. Note that we err on the vector side for equal cost because
3814 the cost estimate is otherwise quite pessimistic (constant uses are
3815 free on the scalar side but cost a load on the vector side for
3816 example). */
3817 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3818 return false;
3820 return true;
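/* For example (made-up numbers): vec_inside_cost = 4,
   vec_prologue_cost = 2 and vec_epilogue_cost = 0 give a total vector
   cost of 6, so a subgraph with scalar_cost = 8 is vectorized while
   one with scalar_cost = 5 is rejected; scalar_cost = 6 is also
   vectorized because ties err on the vector side.  */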
3823 /* Find any vectorizable constructors and add them to the grouped_stores
3824 array. */
3826 static void
3827 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3829 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
3830 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
3831 !gsi_end_p (gsi); gsi_next (&gsi))
3833 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
3834 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3835 continue;
3837 tree rhs = gimple_assign_rhs1 (assign);
3838 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3839 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3840 CONSTRUCTOR_NELTS (rhs))
3841 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3842 || uniform_vector_p (rhs))
3843 continue;
3845 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3846 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
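/* For example, an assignment like

     v_1 = {a_2, b_3, c_4, d_5};

   where v_1 has type vector(4) int is recorded as a seed for SLP
   discovery, while a uniform CTOR such as {a_2, a_2, a_2, a_2}, one
   with vector-typed elements, or one not filling the whole vector is
   skipped by the checks above.  */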
3850 /* Check if the region described by BB_VINFO can be vectorized, returning
3851 true if so. When returning false, set FATAL to true if the same failure
3852 would prevent vectorization at other vector sizes, false if it is still
3853 worth trying other sizes. N_STMTS is the number of statements in the
3854 region. */
3856 static bool
3857 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
3858 vec<int> *dataref_groups)
3860 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3862 slp_instance instance;
3863 int i;
3864 poly_uint64 min_vf = 2;
3866 /* The first group of checks is independent of the vector size. */
3867 fatal = true;
3869 /* Analyze the data references. */
3871 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3873 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3875 "not vectorized: unhandled data-ref in basic "
3876 "block.\n");
3877 return false;
3880 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
3882 if (dump_enabled_p ())
3883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3884 "not vectorized: unhandled data access in "
3885 "basic block.\n");
3886 return false;
3889 vect_slp_check_for_constructors (bb_vinfo);
3891 /* If there are no grouped stores and no constructors in the region
3892 there is no need to continue with pattern recog as vect_analyze_slp
3893 will fail anyway. */
3894 if (bb_vinfo->grouped_stores.is_empty ())
3896 if (dump_enabled_p ())
3897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3898 "not vectorized: no grouped stores in "
3899 "basic block.\n");
3900 return false;
3903 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
3904 fatal = false;
3906 vect_pattern_recog (bb_vinfo);
3908 /* Check the SLP opportunities in the basic block, analyze and build SLP
3909 trees. */
3910 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3912 if (dump_enabled_p ())
3914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3915 "Failed to SLP the basic block.\n");
3916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3917 "not vectorized: failed to find SLP opportunities "
3918 "in basic block.\n");
3920 return false;
3923 /* Optimize permutations. */
3924 vect_optimize_slp (bb_vinfo);
3926 vect_record_base_alignments (bb_vinfo);
3928 /* Analyze and verify the alignment of data references and the
3929 dependence in the SLP instances. */
3930 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3932 vect_location = instance->location ();
3933 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
3934 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3936 slp_tree node = SLP_INSTANCE_TREE (instance);
3937 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3938 if (dump_enabled_p ())
3939 dump_printf_loc (MSG_NOTE, vect_location,
3940 "removing SLP instance operations starting from: %G",
3941 stmt_info->stmt);
3942 vect_free_slp_instance (instance);
3943 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3944 continue;
3947 /* Mark all the statements that we want to vectorize as pure SLP and
3948 relevant. */
3949 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3950 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3951 if (SLP_INSTANCE_ROOT_STMT (instance))
3952 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3954 i++;
3956 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3957 return false;
3959 if (!vect_slp_analyze_operations (bb_vinfo))
3961 if (dump_enabled_p ())
3962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3963 "not vectorized: bad operation in basic block.\n");
3964 return false;
3967 vect_bb_partition_graph (bb_vinfo);
3969 return true;
3972 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
3973 basic blocks in BBS, returning true on success.
3974 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
3976 static bool
3977 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
3978 vec<int> *dataref_groups, unsigned int n_stmts)
3980 bb_vec_info bb_vinfo;
3981 auto_vector_modes vector_modes;
3983 /* Autodetect the first vector mode we try. */
3984 machine_mode next_vector_mode = VOIDmode;
3985 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3986 unsigned int mode_i = 0;
3988 vec_info_shared shared;
3990 machine_mode autodetected_vector_mode = VOIDmode;
3991 while (1)
3993 bool vectorized = false;
3994 bool fatal = false;
3995 bb_vinfo = new _bb_vec_info (bbs, &shared);
3997 bool first_time_p = shared.datarefs.is_empty ();
3998 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3999 if (first_time_p)
4000 bb_vinfo->shared->save_datarefs ();
4001 else
4002 bb_vinfo->shared->check_datarefs ();
4003 bb_vinfo->vector_mode = next_vector_mode;
4005 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4006 && dbg_cnt (vect_slp))
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_NOTE, vect_location,
4011 "***** Analysis succeeded with vector mode"
4012 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4013 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4016 bb_vinfo->shared->check_datarefs ();
4018 unsigned i;
4019 slp_instance instance;
4020 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4022 if (instance->subgraph_entries.is_empty ())
4023 continue;
4025 vect_location = instance->location ();
4026 if (!unlimited_cost_model (NULL)
4027 && !vect_bb_vectorization_profitable_p
4028 (bb_vinfo, instance->subgraph_entries))
4030 if (dump_enabled_p ())
4031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4032 "not vectorized: vectorization is not "
4033 "profitable.\n");
4034 continue;
4037 if (!vectorized && dump_enabled_p ())
4038 dump_printf_loc (MSG_NOTE, vect_location,
4039 "Basic block will be vectorized "
4040 "using SLP\n");
4041 vectorized = true;
4043 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4045 unsigned HOST_WIDE_INT bytes;
4046 if (dump_enabled_p ())
4048 if (GET_MODE_SIZE
4049 (bb_vinfo->vector_mode).is_constant (&bytes))
4050 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4051 "basic block part vectorized using %wu "
4052 "byte vectors\n", bytes);
4053 else
4054 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4055 "basic block part vectorized using "
4056 "variable length vectors\n");
4060 else
4062 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 "***** Analysis failed with vector mode %s\n",
4065 GET_MODE_NAME (bb_vinfo->vector_mode));
4068 if (mode_i == 0)
4069 autodetected_vector_mode = bb_vinfo->vector_mode;
4071 if (!fatal)
4072 while (mode_i < vector_modes.length ()
4073 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4075 if (dump_enabled_p ())
4076 dump_printf_loc (MSG_NOTE, vect_location,
4077 "***** The result for vector mode %s would"
4078 " be the same\n",
4079 GET_MODE_NAME (vector_modes[mode_i]));
4080 mode_i += 1;
4083 delete bb_vinfo;
4085 if (mode_i < vector_modes.length ()
4086 && VECTOR_MODE_P (autodetected_vector_mode)
4087 && (related_vector_mode (vector_modes[mode_i],
4088 GET_MODE_INNER (autodetected_vector_mode))
4089 == autodetected_vector_mode)
4090 && (related_vector_mode (autodetected_vector_mode,
4091 GET_MODE_INNER (vector_modes[mode_i]))
4092 == vector_modes[mode_i]))
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location,
4096 "***** Skipping vector mode %s, which would"
4097 " repeat the analysis for %s\n",
4098 GET_MODE_NAME (vector_modes[mode_i]),
4099 GET_MODE_NAME (autodetected_vector_mode));
4100 mode_i += 1;
4103 if (vectorized
4104 || mode_i == vector_modes.length ()
4105 || autodetected_vector_mode == VOIDmode
4106 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4107 vector sizes will fail do not bother iterating. */
4108 || fatal)
4109 return vectorized;
4111 /* Try the next biggest vector size. */
4112 next_vector_mode = vector_modes[mode_i++];
4113 if (dump_enabled_p ())
4114 dump_printf_loc (MSG_NOTE, vect_location,
4115 "***** Re-trying analysis with vector mode %s\n",
4116 GET_MODE_NAME (next_vector_mode));
4121 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4122 true if anything in any of the basic blocks was vectorized. */
4124 static bool
4125 vect_slp_bbs (vec<basic_block> bbs)
4127 vec<data_reference_p> datarefs = vNULL;
4128 auto_vec<int> dataref_groups;
4129 int insns = 0;
4130 int current_group = 0;
4132 for (unsigned i = 0; i < bbs.length (); i++)
4134 basic_block bb = bbs[i];
4135 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4136 gsi_next (&gsi))
4138 gimple *stmt = gsi_stmt (gsi);
4139 if (is_gimple_debug (stmt))
4140 continue;
4142 insns++;
4144 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4145 vect_location = stmt;
4147 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4148 &dataref_groups, current_group))
4149 ++current_group;
4151 if (insns > param_slp_max_insns_in_bb)
4153 if (dump_enabled_p ())
4154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4155 "not vectorized: too many instructions in "
4156 "region.\n");
4161 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4164 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4165 true if anything in the basic-block was vectorized. */
4167 bool
4168 vect_slp_bb (basic_block bb)
4170 auto_vec<basic_block> bbs;
4171 bbs.safe_push (bb);
4172 return vect_slp_bbs (bbs);
4175 /* Main entry for the BB vectorizer. Analyze and transform the whole
4176 function FUN, returns true if anything in it was vectorized. */
4178 bool
4179 vect_slp_function (function *fun)
4181 bool r = false;
4182 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4183 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4185 /* For the moment split the function into pieces to avoid making
4186 the iteration on the vector mode moot. Split at points we know
4187 we do not handle well, which are CFG merges (SLP discovery doesn't
4188 handle non-loop-header PHIs) and loop exits. Since pattern
4189 recog requires reverse iteration to visit uses before defs
4190 simply chop RPO into pieces. */
4191 auto_vec<basic_block> bbs;
4192 for (unsigned i = 0; i < n; i++)
4194 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4196 /* Split when a basic block has multiple predecessors or when the
4197 edge into it exits a loop (because of implementation issues with
4198 respect to placement of CTORs for externals). */
4199 bool split = false;
4200 edge e;
4201 if (!single_pred_p (bb)
4202 || ((e = single_pred_edge (bb)),
4203 loop_exit_edge_p (e->src->loop_father, e)))
4204 split = true;
4205 /* Split when a BB is not dominated by the first block. */
4206 else if (!bbs.is_empty ()
4207 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4208 split = true;
4210 if (split && !bbs.is_empty ())
4212 r |= vect_slp_bbs (bbs);
4213 bbs.truncate (0);
4214 bbs.quick_push (bb);
4216 else
4217 bbs.safe_push (bb);
4219 /* When we have a stmt ending this block we have to insert on
4220 edges when inserting after it. Avoid this for now. */
4221 if (gimple *last = last_stmt (bb))
4222 if (is_ctrl_altering_stmt (last))
4224 r |= vect_slp_bbs (bbs);
4225 bbs.truncate (0);
4229 if (!bbs.is_empty ())
4230 r |= vect_slp_bbs (bbs);
4232 free (rpo);
4234 return r;
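/* For example, if bb3 merges control flow from bb2 and bb4, the
   blocks collected so far are handed to vect_slp_bbs as one region
   and a fresh region is started at bb3; a block ended by a
   control-altering stmt likewise terminates the current region.  */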
4237 /* Build a variable-length vector in which the elements in ELTS are repeated
4238 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4239 RESULTS and add any new instructions to SEQ.
4241 The approach we use is:
4243 (1) Find a vector mode VM with integer elements of mode IM.
4245 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4246 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4247 from small vectors to IM.
4249 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4251 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4252 correct byte contents.
4254 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4256 We try to find the largest IM for which this sequence works, in order
4257 to cut down on the number of interleaves. */
4259 void
4260 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4261 vec<tree> elts, unsigned int nresults,
4262 vec<tree> &results)
4264 unsigned int nelts = elts.length ();
4265 tree element_type = TREE_TYPE (vector_type);
4267 /* (1) Find a vector mode VM with integer elements of mode IM. */
4268 unsigned int nvectors = 1;
4269 tree new_vector_type;
4270 tree permutes[2];
4271 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4272 &nvectors, &new_vector_type,
4273 permutes))
4274 gcc_unreachable ();
4276 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4277 unsigned int partial_nelts = nelts / nvectors;
4278 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4280 tree_vector_builder partial_elts;
4281 auto_vec<tree, 32> pieces (nvectors * 2);
4282 pieces.quick_grow (nvectors * 2);
4283 for (unsigned int i = 0; i < nvectors; ++i)
4285 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4286 ELTS' has mode IM. */
4287 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4288 for (unsigned int j = 0; j < partial_nelts; ++j)
4289 partial_elts.quick_push (elts[i * partial_nelts + j]);
4290 tree t = gimple_build_vector (seq, &partial_elts);
4291 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4292 TREE_TYPE (new_vector_type), t);
4294 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4295 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4298 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4299 correct byte contents.
4301 We need to repeat the following operation log2(nvectors) times:
4303 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4304 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4306 However, if each input repeats every N elements and the VF is
4307 a multiple of N * 2, the HI result is the same as the LO. */
4308 unsigned int in_start = 0;
4309 unsigned int out_start = nvectors;
4310 unsigned int hi_start = nvectors / 2;
4311 /* A bound on the number of outputs needed to produce NRESULTS results
4312 in the final iteration. */
4313 unsigned int noutputs_bound = nvectors * nresults;
4314 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4316 noutputs_bound /= 2;
4317 unsigned int limit = MIN (noutputs_bound, nvectors);
4318 for (unsigned int i = 0; i < limit; ++i)
4320 if ((i & 1) != 0
4321 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4322 2 * in_repeat))
4324 pieces[out_start + i] = pieces[out_start + i - 1];
4325 continue;
4328 tree output = make_ssa_name (new_vector_type);
4329 tree input1 = pieces[in_start + (i / 2)];
4330 tree input2 = pieces[in_start + (i / 2) + hi_start];
4331 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4332 input1, input2,
4333 permutes[i & 1]);
4334 gimple_seq_add_stmt (seq, stmt);
4335 pieces[out_start + i] = output;
4337 std::swap (in_start, out_start);
4340 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4341 results.reserve (nresults);
4342 for (unsigned int i = 0; i < nresults; ++i)
4343 if (i < nvectors)
4344 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4345 pieces[in_start + i]));
4346 else
4347 results.quick_push (results[i - nvectors]);
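/* Worked example (illustrative): for ELTS = {a, b, c, d} with 32-bit
   elements the code may pick IM = DImode, so step (2) view-converts
   pairs into ELTS' = { ab, cd }.  Step (3) duplicates these into
   {ab, ab, ...} and {cd, cd, ...}, a single interleaving
   VEC_PERM_EXPR in step (4) produces {ab, cd, ab, cd, ...}, and the
   VIEW_CONVERT_EXPR in step (5) yields the required 32-bit vector
   {a, b, c, d, a, b, c, d, ...}.  */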
4351 /* For constant and loop invariant defs in OP_NODE this function creates
4352 vector defs that will be used in the vectorized stmts and stores them
4353 to SLP_TREE_VEC_DEFS of OP_NODE. */
4355 static void
4356 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4358 unsigned HOST_WIDE_INT nunits;
4359 tree vec_cst;
4360 unsigned j, number_of_places_left_in_vector;
4361 tree vector_type;
4362 tree vop;
4363 int group_size = op_node->ops.length ();
4364 unsigned int vec_num, i;
4365 unsigned number_of_copies = 1;
4366 bool constant_p;
4367 gimple_seq ctor_seq = NULL;
4368 auto_vec<tree, 16> permute_results;
4370 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4371 vector_type = SLP_TREE_VECTYPE (op_node);
4373 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4374 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4375 auto_vec<tree> voprnds (number_of_vectors);
4377 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4378 created vectors. It is greater than 1 if unrolling is performed.
4380 For example, we have two scalar operands, s1 and s2 (e.g., group of
4381 strided accesses of size two), while NUNITS is four (i.e., four scalars
4382 of this type can be packed in a vector). The output vector will contain
4383 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4384 will be 2).
4386 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4387 containing the operands.
4389 For example, NUNITS is four as before, and the group size is 8
4390 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4391 {s5, s6, s7, s8}. */
4393 /* When using duplicate_and_interleave, we just need one element for
4394 each scalar statement. */
4395 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4396 nunits = group_size;
4398 number_of_copies = nunits * number_of_vectors / group_size;
4400 number_of_places_left_in_vector = nunits;
4401 constant_p = true;
4402 tree_vector_builder elts (vector_type, nunits, 1);
4403 elts.quick_grow (nunits);
4404 stmt_vec_info insert_after = NULL;
4405 for (j = 0; j < number_of_copies; j++)
4407 tree op;
4408 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4410 /* Create 'vect_ = {op0,op1,...,opn}'. */
4411 number_of_places_left_in_vector--;
4412 tree orig_op = op;
4413 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4415 if (CONSTANT_CLASS_P (op))
4417 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4419 /* Can't use VIEW_CONVERT_EXPR for booleans because
4420 of possibly different sizes of scalar value and
4421 vector element. */
4422 if (integer_zerop (op))
4423 op = build_int_cst (TREE_TYPE (vector_type), 0);
4424 else if (integer_onep (op))
4425 op = build_all_ones_cst (TREE_TYPE (vector_type));
4426 else
4427 gcc_unreachable ();
4429 else
4430 op = fold_unary (VIEW_CONVERT_EXPR,
4431 TREE_TYPE (vector_type), op);
4432 gcc_assert (op && CONSTANT_CLASS_P (op));
4434 else
4436 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4437 gimple *init_stmt;
4438 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4440 tree true_val
4441 = build_all_ones_cst (TREE_TYPE (vector_type));
4442 tree false_val
4443 = build_zero_cst (TREE_TYPE (vector_type));
4444 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4445 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4446 op, true_val,
4447 false_val);
4449 else
4451 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4452 op);
4453 init_stmt
4454 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4455 op);
4457 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4458 op = new_temp;
4461 elts[number_of_places_left_in_vector] = op;
4462 if (!CONSTANT_CLASS_P (op))
4463 constant_p = false;
4464 /* For BB vectorization we have to compute an insert location
4465 when a def is inside the analyzed region since we cannot
4466 simply insert at the BB start in this case. */
4467 stmt_vec_info opdef;
4468 if (TREE_CODE (orig_op) == SSA_NAME
4469 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4470 && is_a <bb_vec_info> (vinfo)
4471 && (opdef = vinfo->lookup_def (orig_op)))
4473 if (!insert_after)
4474 insert_after = opdef;
4475 else
4476 insert_after = get_later_stmt (insert_after, opdef);
4479 if (number_of_places_left_in_vector == 0)
4481 if (constant_p
4482 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4483 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4484 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4485 else
4487 if (permute_results.is_empty ())
4488 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4489 elts, number_of_vectors,
4490 permute_results);
4491 vec_cst = permute_results[number_of_vectors - j - 1];
4493 if (!gimple_seq_empty_p (ctor_seq))
4495 if (insert_after)
4497 gimple_stmt_iterator gsi;
4498 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4500 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4501 gsi_insert_seq_before (&gsi, ctor_seq,
4502 GSI_CONTINUE_LINKING);
4504 else if (!stmt_ends_bb_p (insert_after->stmt))
4506 gsi = gsi_for_stmt (insert_after->stmt);
4507 gsi_insert_seq_after (&gsi, ctor_seq,
4508 GSI_CONTINUE_LINKING);
4510 else
4512 /* When we want to insert after a def whose defining
4513 stmt throws, insert on the fallthru
4514 edge instead. */
4515 edge e = find_fallthru_edge
4516 (gimple_bb (insert_after->stmt)->succs);
4517 basic_block new_bb
4518 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4519 gcc_assert (!new_bb);
4522 else
4523 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4524 ctor_seq = NULL;
4526 voprnds.quick_push (vec_cst);
4527 insert_after = NULL;
4528 number_of_places_left_in_vector = nunits;
4529 constant_p = true;
4530 elts.new_vector (vector_type, nunits, 1);
4531 elts.quick_grow (nunits);
4536 /* Since the vectors are created in reverse order, we have to reverse
4537 them here. */
4538 vec_num = voprnds.length ();
4539 for (j = vec_num; j != 0; j--)
4541 vop = voprnds[j - 1];
4542 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4545 /* If the VF is greater than the unrolling factor needed for the SLP
4546 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4547 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4548 to replicate the vectors. */
4549 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4550 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4551 i++)
4552 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
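/* For example, with the two defs {s1,s2,s3,s4} and {s5,s6,s7,s8}
   created above but NUMBER_OF_VECTORS == 4, the loop re-pushes both
   defs once more so each of the four vector stmts has an operand.  */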
4555 /* Get the Ith vectorized definition from SLP_NODE. */
4557 tree
4558 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4560 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4561 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4562 else
4563 return SLP_TREE_VEC_DEFS (slp_node)[i];
4566 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4568 void
4569 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4571 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4572 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4574 unsigned j;
4575 gimple *vec_def_stmt;
4576 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4577 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4579 else
4580 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4583 /* Get N vectorized definitions for SLP_NODE. */
4585 void
4586 vect_get_slp_defs (vec_info *,
4587 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4589 if (n == -1U)
4590 n = SLP_TREE_CHILDREN (slp_node).length ();
4592 for (unsigned i = 0; i < n; ++i)
4594 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4595 vec<tree> vec_defs = vNULL;
4596 vect_get_slp_defs (child, &vec_defs);
4597 vec_oprnds->quick_push (vec_defs);
4601 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4602 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4603 permute statements for the SLP node NODE. */
4605 bool
4606 vect_transform_slp_perm_load (vec_info *vinfo,
4607 slp_tree node, vec<tree> dr_chain,
4608 gimple_stmt_iterator *gsi, poly_uint64 vf,
4609 bool analyze_only, unsigned *n_perms)
4611 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4612 int vec_index = 0;
4613 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4614 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4615 unsigned int mask_element;
4616 machine_mode mode;
4618 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4619 return false;
4621 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4623 mode = TYPE_MODE (vectype);
4624 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4626 /* Initialize the vect stmts of NODE to properly insert the generated
4627 stmts later. */
4628 if (! analyze_only)
4629 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4630 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4631 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4633 /* Generate permutation masks for every NODE. Number of masks for each NODE
4634 is equal to GROUP_SIZE.
4635 E.g., we have a group of three nodes with three loads from the same
4636 location in each node, and the vector size is 4. I.e., we have an
4637 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4638 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4639 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4642 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4643 The last mask is illegal since we assume two operands for permute
4644 operation, and the mask element values can't be outside that range.
4645 Hence, the last mask must be converted into {2,5,5,5}.
4646 For the first two permutations we need the first and the second input
4647 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4648 we need the second and the third vectors: {b1,c1,a2,b2} and
4649 {c2,a3,b3,c3}. */
4651 int vect_stmts_counter = 0;
4652 unsigned int index = 0;
4653 int first_vec_index = -1;
4654 int second_vec_index = -1;
4655 bool noop_p = true;
4656 *n_perms = 0;
4658 vec_perm_builder mask;
4659 unsigned int nelts_to_build;
4660 unsigned int nvectors_per_build;
4661 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4662 && multiple_p (nunits, group_size));
4663 if (repeating_p)
4665 /* A single vector contains a whole number of copies of the node, so:
4666 (a) all permutes can use the same mask; and
4667 (b) the permutes only need a single vector input. */
4668 mask.new_vector (nunits, group_size, 3);
4669 nelts_to_build = mask.encoded_nelts ();
4670 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4672 else
4674 /* We need to construct a separate mask for each vector statement. */
4675 unsigned HOST_WIDE_INT const_nunits, const_vf;
4676 if (!nunits.is_constant (&const_nunits)
4677 || !vf.is_constant (&const_vf))
4678 return false;
4679 mask.new_vector (const_nunits, const_nunits, 1);
4680 nelts_to_build = const_vf * group_size;
4681 nvectors_per_build = 1;
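/* Illustrative example of the repeating_p case above: with
   GROUP_SIZE == 2, DR_GROUP_SIZE == 2 and a load permutation of
   {1, 0}, the mask series built below is {1, 0, 3, 2, 5, 4, ...}:
   two patterns (one per group member), each encoded with three
   elements, shared by every vector stmt of the node.  */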
4684 unsigned int count = mask.encoded_nelts ();
4685 mask.quick_grow (count);
4686 vec_perm_indices indices;
4688 for (unsigned int j = 0; j < nelts_to_build; j++)
4690 unsigned int iter_num = j / group_size;
4691 unsigned int stmt_num = j % group_size;
4692 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4693 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4694 if (repeating_p)
4696 first_vec_index = 0;
4697 mask_element = i;
4699 else
4701 /* Enforced before the loop when !repeating_p. */
4702 unsigned int const_nunits = nunits.to_constant ();
4703 vec_index = i / const_nunits;
4704 mask_element = i % const_nunits;
4705 if (vec_index == first_vec_index
4706 || first_vec_index == -1)
4708 first_vec_index = vec_index;
4710 else if (vec_index == second_vec_index
4711 || second_vec_index == -1)
4713 second_vec_index = vec_index;
4714 mask_element += const_nunits;
4716 else
4718 if (dump_enabled_p ())
4719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4720 "permutation requires at "
4721 "least three vectors %G",
4722 stmt_info->stmt);
4723 gcc_assert (analyze_only);
4724 return false;
4727 gcc_assert (mask_element < 2 * const_nunits);
4730 if (mask_element != index)
4731 noop_p = false;
4732 mask[index++] = mask_element;
4734 if (index == count && !noop_p)
4736 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4737 if (!can_vec_perm_const_p (mode, indices))
4739 if (dump_enabled_p ())
4741 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4742 vect_location,
4743 "unsupported vect permute { ");
4744 for (i = 0; i < count; ++i)
4746 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4747 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4749 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4751 gcc_assert (analyze_only);
4752 return false;
4755 ++*n_perms;
4758 if (index == count)
4760 if (!analyze_only)
4762 tree mask_vec = NULL_TREE;
4764 if (! noop_p)
4765 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4767 if (second_vec_index == -1)
4768 second_vec_index = first_vec_index;
4770 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4772 /* Generate the permute statement if necessary. */
4773 tree first_vec = dr_chain[first_vec_index + ri];
4774 tree second_vec = dr_chain[second_vec_index + ri];
4775 gimple *perm_stmt;
4776 if (! noop_p)
4778 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4779 tree perm_dest
4780 = vect_create_destination_var (gimple_assign_lhs (stmt),
4781 vectype);
4782 perm_dest = make_ssa_name (perm_dest);
4783 perm_stmt
4784 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4785 first_vec, second_vec,
4786 mask_vec);
4787 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4788 gsi);
4790 else
4791 /* If mask was NULL_TREE generate the requested
4792 identity transform. */
4793 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4795 /* Store the vector statement in NODE. */
4796 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4800 index = 0;
4801 first_vec_index = -1;
4802 second_vec_index = -1;
4803 noop_p = true;
4807 return true;
4811 /* Vectorize the SLP permutations in NODE as specified
4812 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4813 child number and lane number.
4814 Interleaving of two two-lane two-child SLP subtrees (not supported):
4815 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4816 A blend of two four-lane two-child SLP subtrees:
4817 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4818 Highpart of a four-lane one-child SLP subtree (not supported):
4819 [ { 0, 2 }, { 0, 3 } ]
4820 Currently only a subset of this is supported by the code generation below. */
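/* Illustrative example: for the blend above on V4SI children the code
   below computes the permute mask {0, 5, 2, 7}; lanes taken from the
   second input vector are offset by the number of vector elements.  */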
4822 static bool
4823 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4824 slp_tree node, stmt_vector_for_cost *cost_vec)
4826 tree vectype = SLP_TREE_VECTYPE (node);
4828 /* ??? We currently only support all same vector input and output types
4829 while the SLP IL should really do a concat + select and thus accept
4830 arbitrary mismatches. */
4831 slp_tree child;
4832 unsigned i;
4833 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4834 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4836 if (dump_enabled_p ())
4837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4838 "Unsupported lane permutation\n");
4839 return false;
4842 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4843 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4844 if (dump_enabled_p ())
4846 dump_printf_loc (MSG_NOTE, vect_location,
4847 "vectorizing permutation");
4848 for (unsigned i = 0; i < perm.length (); ++i)
4849 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4850 dump_printf (MSG_NOTE, "\n");
4853 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4854 if (!nunits.is_constant ())
4855 return false;
4856 unsigned HOST_WIDE_INT vf = 1;
4857 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4858 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4859 return false;
4860 unsigned olanes = vf * SLP_TREE_LANES (node);
4861 gcc_assert (multiple_p (olanes, nunits));
4863 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4864 from the { SLP operand, scalar lane } permutation as recorded in the
4865 SLP node as an intermediate step. This part should already work
4866 with SLP children with arbitrary number of lanes. */
4867 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4868 auto_vec<unsigned> active_lane;
4869 vperm.create (olanes);
4870 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
4871 for (unsigned i = 0; i < vf; ++i)
4873 for (unsigned pi = 0; pi < perm.length (); ++pi)
4875 std::pair<unsigned, unsigned> p = perm[pi];
4876 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4877 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4878 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4879 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4880 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4882 /* Advance to the next group. */
4883 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4884 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4887 if (dump_enabled_p ())
4889 dump_printf_loc (MSG_NOTE, vect_location, "as");
4890 for (unsigned i = 0; i < vperm.length (); ++i)
4892 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4893 dump_printf (MSG_NOTE, ",");
4894 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4895 vperm[i].first.first, vperm[i].first.second,
4896 vperm[i].second);
4898 dump_printf (MSG_NOTE, "\n");
4901 /* We can only handle two-vector permutes, everything else should
4902 be lowered on the SLP level. The following is closely inspired
4903 by vect_transform_slp_perm_load and is supposed to eventually
4904 replace it.
4905 ??? As intermediate step do code-gen in the SLP tree representation
4906 somehow? */
4907 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4908 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4909 unsigned int const_nunits = nunits.to_constant ();
4910 unsigned int index = 0;
4911 unsigned int mask_element;
4912 vec_perm_builder mask;
4913 mask.new_vector (const_nunits, const_nunits, 1);
4914 unsigned int count = mask.encoded_nelts ();
4915 mask.quick_grow (count);
4916 vec_perm_indices indices;
4917 unsigned nperms = 0;
4918 for (unsigned i = 0; i < vperm.length (); ++i)
4920 mask_element = vperm[i].second;
4921 if (first_vec.first == -1U
4922 || first_vec == vperm[i].first)
4923 first_vec = vperm[i].first;
4924 else if (second_vec.first == -1U
4925 || second_vec == vperm[i].first)
4927 second_vec = vperm[i].first;
4928 mask_element += const_nunits;
4930 else
4932 if (dump_enabled_p ())
4933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4934 "permutation requires at "
4935 "least three vectors");
4936 gcc_assert (!gsi);
4937 return false;
4940 mask[index++] = mask_element;
4942 if (index == count)
4944 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4945 const_nunits);
4946 bool identity_p = indices.series_p (0, 1, 0, 1);
4947 if (!identity_p
4948 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4950 if (dump_enabled_p ())
4952 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4953 vect_location,
4954 "unsupported vect permute { ");
4955 for (i = 0; i < count; ++i)
4957 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4958 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4960 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4962 gcc_assert (!gsi);
4963 return false;
4966 if (!identity_p)
4967 nperms++;
4968 if (gsi)
4970 if (second_vec.first == -1U)
4971 second_vec = first_vec;
4973 /* Generate the permute statement if necessary. */
4974 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4975 tree first_def
4976 = vect_get_slp_vect_def (first_node, first_vec.second);
4977 gassign *perm_stmt;
4978 tree perm_dest = make_ssa_name (vectype);
4979 if (!identity_p)
4981 slp_tree second_node
4982 = SLP_TREE_CHILDREN (node)[second_vec.first];
4983 tree second_def
4984 = vect_get_slp_vect_def (second_node, second_vec.second);
4985 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4986 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4987 first_def, second_def,
4988 mask_vec);
4990 else
4991 /* We need a copy here in case the def was external. */
4992 perm_stmt = gimple_build_assign (perm_dest, first_def);
4993 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4994 /* Store the vector statement in NODE. */
4995 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4998 index = 0;
4999 first_vec = std::make_pair (-1U, -1U);
5000 second_vec = std::make_pair (-1U, -1U);
5004 if (!gsi)
5005 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5007 return true;
5010 /* Vectorize SLP instance tree in postorder. */
5012 static void
5013 vect_schedule_slp_instance (vec_info *vinfo,
5014 slp_tree node, slp_instance instance,
5015 hash_set<slp_tree> &visited)
5017 gimple_stmt_iterator si;
5018 int i;
5019 slp_tree child;
5021 /* See if we have already vectorized the node in the graph of the
5022 SLP instance. */
5023 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
5024 && SLP_TREE_VEC_STMTS (node).exists ())
5025 || SLP_TREE_VEC_DEFS (node).exists ())
5026 return;
5028 /* Vectorize externals and constants. */
5029 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5030 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5032 /* ??? vectorizable_shift can end up using a scalar operand which is
5033 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5034 node in this case. */
5035 if (!SLP_TREE_VECTYPE (node))
5036 return;
5038 vect_create_constant_vectors (vinfo, node);
5039 return;
5042 /* ??? If we'd have a way to mark backedges that would be cheaper. */
5043 if (visited.add (node))
5044 return;
5046 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5047 vect_schedule_slp_instance (vinfo, child, instance, visited);
5049 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5050 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5052 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5053 if (dump_enabled_p ())
5054 dump_printf_loc (MSG_NOTE, vect_location,
5055 "------>vectorizing SLP node starting from: %G",
5056 stmt_info->stmt);
5058 if (STMT_VINFO_DATA_REF (stmt_info)
5059 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5061 /* Vectorized loads go before the first scalar load to make it
5062 ready early, vectorized stores go before the last scalar
5063 stmt which is where all uses are ready. */
5064 stmt_vec_info last_stmt_info = NULL;
5065 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5066 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5067 else /* DR_IS_WRITE */
5068 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5069 si = gsi_for_stmt (last_stmt_info->stmt);
5071 else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
5072 == cycle_phi_info_type)
5073 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
5074 == induc_vec_info_type))
5076 /* For reduction and induction PHIs we do not use the
5077 insertion iterator. */
5078 si = gsi_none ();
5080 else
5082 /* Emit other stmts after the children's vectorized defs, which is
5083 the earliest place possible. */
5084 gimple *last_stmt = NULL;
5085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5086 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5088 /* For fold-left reductions we are retaining the scalar
5089 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5090 set so the representation isn't perfect. Resort to the
5091 last scalar def here. */
5092 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5094 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5095 == cycle_phi_info_type);
5096 gphi *phi = as_a <gphi *>
5097 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5098 if (!last_stmt
5099 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5100 last_stmt = phi;
5102 /* We are emitting all vectorized stmts of a child in the same
5103 place, so the last one emitted is the latest.
5104 ??? Unless we have a load permutation applied and that
5105 figures to re-use an earlier generated load. */
5106 unsigned j;
5107 gimple *vstmt;
5108 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5109 if (!last_stmt
5110 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5111 last_stmt = vstmt;
5113 else if (!SLP_TREE_VECTYPE (child))
5115 /* For externals used unvectorized we look at all the scalar defs. */
5116 unsigned j;
5117 tree def;
5118 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5119 if (TREE_CODE (def) == SSA_NAME
5120 && !SSA_NAME_IS_DEFAULT_DEF (def))
5122 gimple *stmt = SSA_NAME_DEF_STMT (def);
5123 if (!last_stmt
5124 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5125 last_stmt = stmt;
5128 else
5130 /* For externals we have to look at all defs since their
5131 insertion place is decided per vector. But beware
5132 of pre-existing vectors where we need to make sure
5133 we do not insert before the region boundary. */
5134 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5135 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5136 last_stmt = gsi_stmt (gsi_after_labels
5137 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5138 else
5140 unsigned j;
5141 tree vdef;
5142 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5143 if (TREE_CODE (vdef) == SSA_NAME
5144 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5146 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5147 if (!last_stmt
5148 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5149 last_stmt = vstmt;
5153 /* This can happen when all children are pre-existing vectors or
5154 constants. */
5155 if (!last_stmt)
5156 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5157 if (is_a <gphi *> (last_stmt))
5158 si = gsi_after_labels (gimple_bb (last_stmt));
5159 else
5161 si = gsi_for_stmt (last_stmt);
5162 gsi_next (&si);
5166 bool done_p = false;
5168 /* Handle purely internal nodes. */
5169 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5171 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5172 be shared with different SLP nodes (but usually it's the same
5173 operation apart from the case the stmt is only there for denoting
5174 the actual scalar lane defs ...). So do not call vect_transform_stmt
5175 but open-code it here (partly). */
5176 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5177 gcc_assert (done);
5178 done_p = true;
5180 if (!done_p)
5181 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5184 /* Replace scalar calls from SLP node NODE by assigning zero to their lhs.
5185 For loop vectorization this is done in vectorizable_call, but for SLP
5186 it needs to be deferred until end of vect_schedule_slp, because multiple
5187 SLP instances may refer to the same scalar stmt. */
5189 static void
5190 vect_remove_slp_scalar_calls (vec_info *vinfo,
5191 slp_tree node, hash_set<slp_tree> &visited)
5193 gimple *new_stmt;
5194 gimple_stmt_iterator gsi;
5195 int i;
5196 slp_tree child;
5197 tree lhs;
5198 stmt_vec_info stmt_info;
5200 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5201 return;
5203 if (visited.add (node))
5204 return;
5206 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5207 vect_remove_slp_scalar_calls (vinfo, child, visited);
5209 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5211 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5212 if (!stmt || gimple_bb (stmt) == NULL)
5213 continue;
5214 if (is_pattern_stmt_p (stmt_info)
5215 || !PURE_SLP_STMT (stmt_info))
5216 continue;
5217 lhs = gimple_call_lhs (stmt);
5218 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5219 gsi = gsi_for_stmt (stmt);
5220 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5221 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
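/* For example, a vectorized scalar call

     lhs_1 = foo (a_2);

   is replaced by

     lhs_1 = 0;

   which keeps lhs_1 defined and the IL valid; the now-dead def is
   left for DCE to remove.  */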
5225 static void
5226 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5228 hash_set<slp_tree> visited;
5229 vect_remove_slp_scalar_calls (vinfo, node, visited);
5232 /* Vectorize the instance root. */
5234 void
5235 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5237 gassign *rstmt = NULL;
5239 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5241 gimple *child_stmt;
5242 int j;
5244 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5246 tree vect_lhs = gimple_get_lhs (child_stmt);
5247 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5248 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5249 TREE_TYPE (vect_lhs)))
5250 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5251 vect_lhs);
5252 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5253 break;
5256 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5258 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5259 gimple *child_stmt;
5260 int j;
5261 vec<constructor_elt, va_gc> *v;
5262 vec_alloc (v, nelts);
5264 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5266 CONSTRUCTOR_APPEND_ELT (v,
5267 NULL_TREE,
5268 gimple_get_lhs (child_stmt));
5270 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5271 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5272 tree r_constructor = build_constructor (rtype, v);
5273 rstmt = gimple_build_assign (lhs, r_constructor);
5276 gcc_assert (rstmt);
5278 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5279 gsi_replace (&rgsi, rstmt, true);
5282 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5284 void
5285 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5287 slp_instance instance;
5288 unsigned int i;
5290 hash_set<slp_tree> visited;
5291 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5293 slp_tree node = SLP_INSTANCE_TREE (instance);
5294 if (dump_enabled_p ())
5296 dump_printf_loc (MSG_NOTE, vect_location,
5297 "Vectorizing SLP tree:\n");
5298 if (SLP_INSTANCE_ROOT_STMT (instance))
5299 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5300 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5301 vect_print_slp_graph (MSG_NOTE, vect_location,
5302 SLP_INSTANCE_TREE (instance));
5304 /* Schedule the tree of INSTANCE. */
5305 vect_schedule_slp_instance (vinfo, node, instance, visited);
5307 if (SLP_INSTANCE_ROOT_STMT (instance))
5308 vectorize_slp_instance_root_stmt (node, instance);
5310 if (dump_enabled_p ())
5311 dump_printf_loc (MSG_NOTE, vect_location,
5312 "vectorizing stmts using SLP.\n");
5315 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5317 slp_tree root = SLP_INSTANCE_TREE (instance);
5318 stmt_vec_info store_info;
5319 unsigned int j;
5321 /* For reductions set the latch values of the vectorized PHIs. */
5322 if (instance->reduc_phis
5323 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
5324 (instance->reduc_phis)) != FOLD_LEFT_REDUCTION
5325 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
5326 (instance->reduc_phis)) != EXTRACT_LAST_REDUCTION)
5328 slp_tree slp_node = root;
5329 slp_tree phi_node = instance->reduc_phis;
5330 gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
5331 edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
5332 gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
5333 == SLP_TREE_VEC_STMTS (slp_node).length ());
5334 for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
5335 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]),
5336 vect_get_slp_vect_def (slp_node, j),
5337 e, gimple_phi_arg_location (phi, e->dest_idx));
5340 /* Remove scalar call stmts. Do not do this for basic-block
5341 vectorization as not all uses may be vectorized.
5342 ??? Why should this be necessary? DCE should be able to
5343 remove the stmts itself.
5344 ??? For BB vectorization we can as well remove scalar
5345 stmts starting from the SLP tree root if they have no
5346 uses. */
5347 if (is_a <loop_vec_info> (vinfo))
5348 vect_remove_slp_scalar_calls (vinfo, root);
5350 /* Remove the vectorized stores' original scalar stmts. */
5351 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5353 if (!STMT_VINFO_DATA_REF (store_info)
5354 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5355 break;
5357 store_info = vect_orig_stmt (store_info);
5358 /* Free the attached stmt_vec_info and remove the stmt. */
5359 vinfo->remove_stmt (store_info);