gcc/tree-vect-slp.c

/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2016 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"

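/* For orientation (an illustrative sketch added for exposition, not code
   from the original sources): basic-block SLP takes a group of isomorphic
   scalar stmts such as

       a[0] = b[0] + c[0];
       a[1] = b[1] + c[1];
       a[2] = b[2] + c[2];
       a[3] = b[3] + c[3];

   and packs them into a single vector statement

       va{0:3} = vb{0:3} + vc{0:3};

   by building an SLP tree whose root holds the four stores and whose
   children hold the def stmts of each operand position.  */
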
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  gimple *stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    /* After transform some stmts are removed and thus their vinfo is gone.  */
    if (vinfo_for_stmt (stmt))
      {
        gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
        STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
      }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  enum vect_def_type first_dt;
  tree first_op_type;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}

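/* For example (an illustration, not code from this file): in the chain
   starting at FIRST_STMT

       a[0] = x0;   gap 0, place 0
       a[1] = x1;   gap 1, place 1
       a[3] = x3;   gap 2, place 3

   the place of each stmt is the running sum of the GROUP_GAPs, so the
   store to a[3] gets place 1 + 2 == 3.  */
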
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1; if the error could be corrected by swapping operands of the
   operation return 1; if everything is ok return 0.  */

static int
vect_get_and_check_slp_defs (vec_info *vinfo,
                             gimple *stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && vect_stmt_in_region_p (vinfo, def_stmt)
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
            case GIMPLE_ASSIGN:
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: cannot swap operands of "
                               "shared stmt ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          return -1;
        }

      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "swapped operands to match def types in ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }

  return 0;
}

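/* As an illustration of the swapping logic above (a sketch, not code
   from this file): for the group

       x0 = a0 + b0;
       x1 = b1 + a1;

   the defs collected for operand zero would be { a0, b1 }, which
   mismatch; since PLUS_EXPR is commutative the second stmt is retried
   and committed with its operands swapped (x1 = a1 + b1), making both
   operand vectors uniform.  */
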
/* Verify that the scalar stmts STMTS are isomorphic, do not require any
   data permutation, and are not of unsupported types of operation.
   Return true if so; otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (is_a <bb_vec_info> (vinfo)
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple *first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

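/* The two-operator path above is what lets a group such as (an
   illustration, not code from this file)

       a[0] = b[0] + c[0];
       a[1] = b[1] - c[1];

   build: both b + c and b - c are computed as full vectors and the
   lanes are blended with a VEC_PERM_EXPR whose selector picks the
   PLUS lanes at even indices and the MINUS lanes at odd indices.  */
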
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return the built SLP tree.  */

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<gimple *> stmts, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0, this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (vinfo,
                              stmts, group_size, nops,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      switch (vect_get_and_check_slp_defs (vinfo, stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        &this_loads, matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p
                    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_loads.truncate (old_nloads);
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && ! two_operators
          /* Do so only if the number of not successful permutes was not more
             than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Verify if we can safely swap or if we committed to a specific
             operand order already.  */
          for (j = 0; j < group_size; ++j)
            if (!matches[j]
                && STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmts[j])) != 0)
              {
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: cannot swap operands "
                                     "of shared stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[j], 0);
                  }
                goto fail;
              }

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            &this_loads, tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  gimple_set_plf (stmt, GF_PLF_1, false);
                }
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  if (!matches[j])
                    {
                      /* Avoid swapping operands twice.  */
                      if (gimple_plf (stmt, GF_PLF_1))
                        continue;
                      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                         gimple_assign_rhs2_ptr (stmt));
                      gimple_set_plf (stmt, GF_PLF_1, true);
                    }
                }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  {
                    gimple *stmt = stmts[j];
                    gcc_assert (gimple_plf (stmt, GF_PLF_1) == ! matches[j]);
                  }

              /* If we have all children of child built up from scalars then
                 just throw that away and build it up this node from scalars.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p
                        (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_loads.truncate (old_nloads);
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "Building parent vector operands from "
                                       "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, location_t loc, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

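/* For example (an illustration, not code from this file): with
   PERMUTATION = { 2, 0, 1 } the scalar stmts { s0, s1, s2 } are
   rearranged to { s1, s2, s0 }, since each stmt at index i is moved
   to index permutation[i].  */
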
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;
      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  unsigned int unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (unrolling_factor == 1
          || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
              && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}

/* Find the last scalar stmt of the SLP group of stmts in NODE
   (for a store group this is the last store in the chain).  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
        = (STMT_VINFO_STRIDED_P (stmt_info)
           ? VMAT_STRIDED_SLP
           : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost,
                               memory_access_type, vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
              /* If the load is permuted then the alignment is determined by
                 the first group element not by the first scalar stmt DR.  */
              stmt = GROUP_FIRST_ELEMENT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
              /* Record the cost for the permutation.  */
              record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                                stmt_info, 0, vect_body);
              /* And adjust the number of loads performed.  */
              unsigned nunits
                = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
              ncopies_for_cost
                = (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
                   + nunits - 1) / nunits;
              ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
            }
          /* Record the cost for the vector loads.  */
          vect_model_load_cost (stmt_info, ncopies_for_cost,
                                memory_access_type, node, prologue_cost_vec,
                                body_cost_vec);
          return;
        }
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
        {
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                            stmt_info, 0, vect_body);
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                            stmt_info, 0, vect_body);
        }
    }

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree op = gimple_op (stmt, i);
      gimple *def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ??? We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_slp_cost ===\n");

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
        {
        case DOT_PROD_EXPR:
        case SAD_EXPR:
          nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
                                (TREE_TYPE (gimple_assign_rhs1 (stmt))));
          break;
        default:;
        }
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, &body_cost_vec,
                           ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

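/* A worked instance of the ncopies computation above (an illustration,
   not from this file): with GROUP_SIZE 8 and NUNITS 4,
   least_common_multiple (4, 8) / 4 == 2, so each stmt in the SLP tree
   is costed as two vector stmts; with GROUP_SIZE 2 and NUNITS 4 the
   unrolled group fits in one vector and a single copy is costed.  */
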
/* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
   one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
   the first GROUP1_SIZE stmts, since stores are consecutive), the second
   containing the remainder.
   Return the first stmt in the second group.  */

static gimple *
vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
{
  stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
  gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
  gcc_assert (group1_size > 0);
  int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  GROUP_SIZE (first_vinfo) = group1_size;

  gimple *stmt = first_stmt;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
    }
  /* STMT is now the last element of the first group.  */
  gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
  GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;

  GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
  for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
    {
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
      gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
    }

  /* For the second group, the GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  GROUP_GAP (vinfo_for_stmt (group2)) =
    GROUP_GAP (first_vinfo) + group1_size;

  /* GROUP_GAP of the first group now has to skip over the second group too.  */
  GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
                     group1_size, group2_size);

  return group2;
}

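/* For example (an illustration, not code from this file): splitting a
   store group of size 7 with GROUP1_SIZE 4 leaves stores 0..3 in the
   first group, whose GROUP_GAP grows by 3 to step over the second
   group, and makes store 4 the GROUP_FIRST_ELEMENT of a new group of
   size 3 whose leading GROUP_GAP additionally skips the first four
   stores.  */
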
1690 /* Analyze an SLP instance starting from a group of grouped stores. Call
1691 vect_build_slp_tree to build a tree of packed stmts if possible.
1692 Return FALSE if it's impossible to SLP any stmt in the loop. */
1694 static bool
1695 vect_analyze_slp_instance (vec_info *vinfo,
1696 gimple *stmt, unsigned max_tree_size)
1698 slp_instance new_instance;
1699 slp_tree node;
1700 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1701 unsigned int unrolling_factor = 1, nunits;
1702 tree vectype, scalar_type = NULL_TREE;
1703 gimple *next;
1704 unsigned int i;
1705 unsigned int max_nunits = 0;
1706 vec<slp_tree> loads;
1707 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1708 vec<gimple *> scalar_stmts;
1710 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1712 if (dr)
1714 scalar_type = TREE_TYPE (DR_REF (dr));
1715 vectype = get_vectype_for_scalar_type (scalar_type);
1717 else
1719 gcc_assert (is_a <loop_vec_info> (vinfo));
1720 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1723 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1725 else
1727 gcc_assert (is_a <loop_vec_info> (vinfo));
1728 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1729 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1732 if (!vectype)
1734 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 "Build SLP failed: unsupported data-type ");
1738 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1739 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1742 return false;
1744 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1746 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1747 scalar_stmts.create (group_size);
1748 next = stmt;
1749 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1751 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1752 while (next)
1754 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1755 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1756 scalar_stmts.safe_push (
1757 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1758 else
1759 scalar_stmts.safe_push (next);
1760 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1762 /* Mark the first element of the reduction chain as reduction to properly
1763 transform the node. In the reduction analysis phase only the last
1764 element of the chain is marked as reduction. */
1765 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1766 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
1768 else
1770 /* Collect reduction statements. */
1771 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1772 for (i = 0; reductions.iterate (i, &next); i++)
1773 scalar_stmts.safe_push (next);
1776 loads.create (group_size);
1778 /* Build the tree for the SLP instance. */
1779 bool *matches = XALLOCAVEC (bool, group_size);
1780 unsigned npermutes = 0;
1781 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1782 &max_nunits, &loads, matches, &npermutes,
1783 NULL, max_tree_size);
1784 if (node != NULL)
1786 /* Calculate the unrolling factor based on the smallest type. */
1787 unrolling_factor
1788 = least_common_multiple (max_nunits, group_size) / group_size;
1790 if (unrolling_factor != 1
1791 && is_a <bb_vec_info> (vinfo))
1794 if (max_nunits > group_size)
1796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1797 "Build SLP failed: store group "
1798 "size not a multiple of the vector size "
1799 "in basic block SLP\n");
1800 vect_free_slp_tree (node);
1801 loads.release ();
1802 return false;
1804 /* Fatal mismatch. */
1805 matches[group_size/max_nunits * max_nunits] = false;
1806 vect_free_slp_tree (node);
1807 loads.release ();
1809 else
1811 /* Create a new SLP instance. */
1812 new_instance = XNEW (struct _slp_instance);
1813 SLP_INSTANCE_TREE (new_instance) = node;
1814 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1815 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1816 SLP_INSTANCE_LOADS (new_instance) = loads;
1818 /* Compute the load permutation. */
1819 slp_tree load_node;
1820 bool loads_permuted = false;
1821 FOR_EACH_VEC_ELT (loads, i, load_node)
1823 vec<unsigned> load_permutation;
1824 int j;
1825 gimple *load, *first_stmt;
1826 bool this_load_permuted = false;
1827 load_permutation.create (group_size);
1828 first_stmt = GROUP_FIRST_ELEMENT
1829 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1830 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1832 int load_place = vect_get_place_in_interleaving_chain
1833 (load, first_stmt);
1834 gcc_assert (load_place != -1);
1835 if (load_place != j)
1836 this_load_permuted = true;
1837 load_permutation.safe_push (load_place);
1839 if (!this_load_permuted
1840 /* The load requires permutation when unrolling exposes
1841 a gap either because the group is larger than the SLP
1842 group-size or because there is a gap between the groups. */
1843 && (unrolling_factor == 1
1844 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1845 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
1847 load_permutation.release ();
1848 continue;
1850 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1851 loads_permuted = true;
1854 if (loads_permuted)
1856 if (!vect_supported_load_permutation_p (new_instance))
1858 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "Build SLP failed: unsupported load "
1862 "permutation ");
1863 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
1864 TDF_SLIM, stmt, 0);
1866 vect_free_slp_instance (new_instance);
1867 return false;
1871 /* If the loads and stores can be handled with load/store-lan
1872 instructions do not generate this SLP instance. */
1873 if (is_a <loop_vec_info> (vinfo)
1874 && loads_permuted
1875 && dr && vect_store_lanes_supported (vectype, group_size))
1877 slp_tree load_node;
1878 FOR_EACH_VEC_ELT (loads, i, load_node)
1880 gimple *first_stmt = GROUP_FIRST_ELEMENT
1881 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1882 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
1883 /* Use SLP for strided accesses (or if we
1884 can't load-lanes). */
1885 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
1886 || ! vect_load_lanes_supported
1887 (STMT_VINFO_VECTYPE (stmt_vinfo),
1888 GROUP_SIZE (stmt_vinfo)))
1889 break;
1891 if (i == loads.length ())
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "Built SLP cancelled: can use "
1896 "load/store-lanes\n");
1897 vect_free_slp_instance (new_instance);
1898 return false;
1902 vinfo->slp_instances.safe_push (new_instance);
1904 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "Final SLP tree for instance:\n");
1908 vect_print_slp_tree (MSG_NOTE, vect_location, node);
1911 return true;
1914 else
1916 /* Failed to SLP. */
1917 /* Free the allocated memory. */
1918 scalar_stmts.release ();
1919 loads.release ();
1922 /* For basic block SLP, try to break the group up into multiples of the
1923 vector size. */
1924 if (is_a <bb_vec_info> (vinfo)
1925 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1926 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1928 /* We consider breaking the group only on VF boundaries from the existing
1929 start. */
1930 for (i = 0; i < group_size; i++)
1931 if (!matches[i]) break;
1933 if (i >= nunits && i < group_size)
1935 /* Split into two groups at the first vector boundary before i. */
1936 gcc_assert ((nunits & (nunits - 1)) == 0);
1937 unsigned group1_size = i & ~(nunits - 1);
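/* E.g. (hypothetical values): with nunits == 4 and the first mismatch at
   i == 6, group1_size becomes 6 & ~3 == 4; the first subgroup covers
   scalar stmts 0..3 and the code below additionally skips the partially
   matched stmts 4..7 before retrying the remainder. */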
1939 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
1940 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
1941 /* If the first non-match was in the middle of a vector,
1942 skip the rest of that vector. */
1943 if (group1_size < i)
1945 i = group1_size + nunits;
1946 if (i < group_size)
1947 rest = vect_split_slp_store_group (rest, nunits);
1949 if (i < group_size)
1950 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
1951 return res;
1953 /* Even though the first vector did not all match, we might be able to SLP
1954 (some of) the remainder. FORNOW ignore this possibility. */
1957 return false;
1961 /* Check if there are stmts in the loop that can be vectorized using SLP.
1962 Build SLP trees of packed scalar stmts if SLP is possible. */
1964 bool
1965 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
1967 unsigned int i;
1968 gimple *first_element;
1969 bool ok = false;
1971 if (dump_enabled_p ())
1972 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1974 /* Find SLP sequences starting from groups of grouped stores. */
1975 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
1976 if (vect_analyze_slp_instance (vinfo, first_element, max_tree_size))
1977 ok = true;
1979 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1981 if (loop_vinfo->reduction_chains.length () > 0)
1983 /* Find SLP sequences starting from reduction chains. */
1984 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
1985 if (vect_analyze_slp_instance (vinfo, first_element,
1986 max_tree_size))
1987 ok = true;
1988 else
1989 return false;
1991 /* Don't try to vectorize SLP reductions if a reduction chain was
1992 detected. */
1993 return ok;
1996 /* Find SLP sequences starting from groups of reductions. */
1997 if (loop_vinfo->reductions.length () > 1
1998 && vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
1999 max_tree_size))
2000 ok = true;
2003 return true;
2007 /* For each possible SLP instance decide whether to SLP it and calculate overall
2008 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2009 least one instance. */
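/* For example (hypothetical values): given two instances with unrolling
   factors 2 and 4, the loop below keeps the maximum, so the loop-wide SLP
   unrolling factor becomes 4. */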
2011 bool
2012 vect_make_slp_decision (loop_vec_info loop_vinfo)
2014 unsigned int i, unrolling_factor = 1;
2015 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2016 slp_instance instance;
2017 int decided_to_slp = 0;
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2021 "\n");
2023 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2025 /* FORNOW: SLP if you can. */
2026 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
2027 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
2029 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2030 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2031 loop-based vectorization. Such stmts will be marked as HYBRID. */
2032 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2033 decided_to_slp++;
2036 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2038 if (decided_to_slp && dump_enabled_p ())
2039 dump_printf_loc (MSG_NOTE, vect_location,
2040 "Decided to SLP %d instances. Unrolling factor %d\n",
2041 decided_to_slp, unrolling_factor);
2043 return (decided_to_slp > 0);
2047 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2048 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
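/* For example (hypothetical): a pure-SLP stmt whose def is also used by a
   stmt that is only loop-vectorized must become hybrid, since its value is
   needed both inside the SLP tree and by the loop-based vectorizer. */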
2050 static void
2051 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2053 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2054 imm_use_iterator imm_iter;
2055 gimple *use_stmt;
2056 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2057 slp_tree child;
2058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2059 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2060 int j;
2062 /* Propagate hybrid down the SLP tree. */
2063 if (stype == hybrid)
2065 else if (HYBRID_SLP_STMT (stmt_vinfo))
2066 stype = hybrid;
2067 else
2069 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2070 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2071 /* If we get a pattern stmt here we have to use the LHS of the
2072 original stmt for immediate uses. */
2073 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2074 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2075 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2076 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2077 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2079 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2080 continue;
2081 use_vinfo = vinfo_for_stmt (use_stmt);
2082 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2083 && STMT_VINFO_RELATED_STMT (use_vinfo))
2084 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2085 if (!STMT_SLP_TYPE (use_vinfo)
2086 && (STMT_VINFO_RELEVANT (use_vinfo)
2087 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2088 && !(gimple_code (use_stmt) == GIMPLE_PHI
2089 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2091 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2094 "def in non-SLP stmt: ");
2095 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2097 stype = hybrid;
2102 if (stype == hybrid
2103 && !HYBRID_SLP_STMT (stmt_vinfo))
2105 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2108 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2110 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2113 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2114 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2115 vect_detect_hybrid_slp_stmts (child, i, stype);
2118 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2120 static tree
2121 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2123 walk_stmt_info *wi = (walk_stmt_info *)data;
2124 struct loop *loopp = (struct loop *)wi->info;
2126 if (wi->is_lhs)
2127 return NULL_TREE;
2129 if (TREE_CODE (*tp) == SSA_NAME
2130 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2132 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2133 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2134 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2136 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2139 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2141 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2145 return NULL_TREE;
2148 static tree
2149 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2150 walk_stmt_info *)
2152 /* If the stmt is in a SLP instance then this isn't a reason
2153 to mark use definitions in other SLP instances as hybrid. */
2154 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2155 *handled = true;
2156 return NULL_TREE;
2159 /* Find stmts that must be both vectorized and SLPed. */
2161 void
2162 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2164 unsigned int i;
2165 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2166 slp_instance instance;
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2170 "\n");
2172 /* First walk all pattern stmts in the loop and mark the defs of their
2173 uses as hybrid, because immediate uses in pattern stmts are not recorded. */
2174 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2176 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2177 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2178 gsi_next (&gsi))
2180 gimple *stmt = gsi_stmt (gsi);
2181 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2182 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2184 walk_stmt_info wi;
2185 memset (&wi, 0, sizeof (wi));
2186 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2187 gimple_stmt_iterator gsi2
2188 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2189 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2190 vect_detect_hybrid_slp_1, &wi);
2191 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2192 vect_detect_hybrid_slp_2,
2193 vect_detect_hybrid_slp_1, &wi);
2198 /* Then walk the SLP instance trees marking stmts with uses in
2199 non-SLP stmts as hybrid, also propagating hybrid down the
2200 SLP tree, collecting the above info on-the-fly. */
2201 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2203 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2204 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2205 i, pure_slp);
2210 /* Create and initialize a new bb_vec_info struct for BB, as well as
2211 stmt_vec_info structs for all the stmts in it. */
2213 static bb_vec_info
2214 new_bb_vec_info (gimple_stmt_iterator region_begin,
2215 gimple_stmt_iterator region_end)
2217 basic_block bb = gsi_bb (region_begin);
2218 bb_vec_info res = NULL;
2219 gimple_stmt_iterator gsi;
2221 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2222 res->kind = vec_info::bb;
2223 BB_VINFO_BB (res) = bb;
2224 res->region_begin = region_begin;
2225 res->region_end = region_end;
2227 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2228 gsi_next (&gsi))
2230 gimple *stmt = gsi_stmt (gsi);
2231 gimple_set_uid (stmt, 0);
2232 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
2235 BB_VINFO_GROUPED_STORES (res).create (10);
2236 BB_VINFO_SLP_INSTANCES (res).create (2);
2237 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2239 bb->aux = res;
2240 return res;
2244 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2245 stmts in the basic block. */
2247 static void
2248 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2250 slp_instance instance;
2251 unsigned i;
2253 if (!bb_vinfo)
2254 return;
2256 vect_destroy_datarefs (bb_vinfo);
2257 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2258 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2259 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
2260 vect_free_slp_instance (instance);
2261 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2262 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2264 for (gimple_stmt_iterator si = bb_vinfo->region_begin;
2265 gsi_stmt (si) != gsi_stmt (bb_vinfo->region_end); gsi_next (&si))
2267 gimple *stmt = gsi_stmt (si);
2268 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2270 if (stmt_info)
2271 /* Free stmt_vec_info. */
2272 free_stmt_vec_info (stmt);
2274 /* Reset region marker. */
2275 gimple_set_uid (stmt, -1);
2278 BB_VINFO_BB (bb_vinfo)->aux = NULL;
2279 free (bb_vinfo);
2283 /* Analyze statements contained in SLP tree node after recursively analyzing
2284 the subtree. Return TRUE if the operations are supported. */
2286 static bool
2287 vect_slp_analyze_node_operations (slp_tree node)
2289 bool dummy;
2290 int i, j;
2291 gimple *stmt;
2292 slp_tree child;
2294 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2295 return true;
2297 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2298 if (!vect_slp_analyze_node_operations (child))
2299 return false;
2301 bool res = true;
2302 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2304 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2305 gcc_assert (stmt_info);
2306 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2308 /* Push SLP node def-type to stmt operands. */
2309 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2310 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2311 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[i]))
2312 = SLP_TREE_DEF_TYPE (child);
2313 res = vect_analyze_stmt (stmt, &dummy, node);
2314 /* Restore def-types. */
2315 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2316 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2317 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[i]))
2318 = vect_internal_def;
2319 if (! res)
2320 break;
2323 return res;
2327 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2328 operations are supported. */
2330 bool
2331 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2333 slp_instance instance;
2334 int i;
2336 if (dump_enabled_p ())
2337 dump_printf_loc (MSG_NOTE, vect_location,
2338 "=== vect_slp_analyze_operations ===\n");
2340 for (i = 0; slp_instances.iterate (i, &instance); )
2342 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "removing SLP instance operations starting from: ");
2346 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2347 SLP_TREE_SCALAR_STMTS
2348 (SLP_INSTANCE_TREE (instance))[0], 0);
2349 vect_free_slp_instance (instance);
2350 slp_instances.ordered_remove (i);
2352 else
2354 /* Compute the costs of the SLP instance. */
2355 vect_analyze_slp_cost (instance, data);
2356 i++;
2360 if (!slp_instances.length ())
2361 return false;
2363 return true;
2367 /* Compute the scalar cost of the SLP node NODE and its children
2368 and return it. Do not account defs that are marked in LIFE and
2369 update LIFE according to uses of NODE. */
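/* LIFE has one entry per scalar stmt in the group; an entry set to true
   means the scalar stmt must stay live because one of its defs escapes
   the SLP region, so neither it nor the defs it requires from the SLP
   children are counted in the scalar cost below. */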
2371 static unsigned
2372 vect_bb_slp_scalar_cost (basic_block bb,
2373 slp_tree node, vec<bool, va_heap> *life)
2375 unsigned scalar_cost = 0;
2376 unsigned i;
2377 gimple *stmt;
2378 slp_tree child;
2380 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2382 unsigned stmt_cost;
2383 ssa_op_iter op_iter;
2384 def_operand_p def_p;
2385 stmt_vec_info stmt_info;
2387 if ((*life)[i])
2388 continue;
2390 /* If there is a non-vectorized use of the defs then the scalar
2391 stmt is kept live in which case we do not account it or any
2392 required defs in the SLP children in the scalar cost. This
2393 way we make the vectorization more costly when compared to
2394 the scalar cost. */
2395 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2397 imm_use_iterator use_iter;
2398 gimple *use_stmt;
2399 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2400 if (!is_gimple_debug (use_stmt)
2401 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2402 use_stmt)
2403 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2405 (*life)[i] = true;
2406 BREAK_FROM_IMM_USE_STMT (use_iter);
2409 if ((*life)[i])
2410 continue;
2412 /* Count scalar stmts only once. */
2413 if (gimple_visited_p (stmt))
2414 continue;
2415 gimple_set_visited (stmt, true);
2417 stmt_info = vinfo_for_stmt (stmt);
2418 if (STMT_VINFO_DATA_REF (stmt_info))
2420 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2421 stmt_cost = vect_get_stmt_cost (scalar_load);
2422 else
2423 stmt_cost = vect_get_stmt_cost (scalar_store);
2425 else
2426 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2428 scalar_cost += stmt_cost;
2431 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2432 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2433 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2435 return scalar_cost;
2438 /* Check if vectorization of the basic block is profitable. */
2440 static bool
2441 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2443 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2444 slp_instance instance;
2445 int i;
2446 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2447 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2449 /* Calculate scalar cost. */
2450 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2452 auto_vec<bool, 20> life;
2453 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2454 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2455 SLP_INSTANCE_TREE (instance),
2456 &life);
2459 /* Unset visited flag. */
2460 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2461 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2462 gimple_set_visited (gsi_stmt (gsi), false);
2464 /* Complete the target-specific cost calculation. */
2465 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2466 &vec_inside_cost, &vec_epilogue_cost);
2468 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2470 if (dump_enabled_p ())
2472 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2473 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2474 vec_inside_cost);
2475 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2476 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2477 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2480 /* Vectorization is profitable if its cost is not more than the cost of
2481 the scalar version. Note that we err on the vector side for equal cost
2482 because the cost estimate is otherwise quite pessimistic (constant uses
2483 are free on the scalar side but cost a load on the vector side for
2484 example). */
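/* Worked example (hypothetical numbers): with vec_prologue_cost == 2,
   vec_epilogue_cost == 1, vec_inside_cost == 8 and scalar_cost == 11,
   we get 2 + 1 + 8 == 11, which is not greater than 11, so the block is
   still considered profitable to vectorize. */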
2485 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2486 return false;
2488 return true;
2491 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2492 if so and sets fatal to true if failure is independent of
2493 current_vector_size. */
2495 static bb_vec_info
2496 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2497 gimple_stmt_iterator region_end,
2498 vec<data_reference_p> datarefs, int n_stmts,
2499 bool &fatal)
2501 bb_vec_info bb_vinfo;
2502 slp_instance instance;
2503 int i;
2504 int min_vf = 2;
2506 /* The first group of checks is independent of the vector size. */
2507 fatal = true;
2509 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2513 "not vectorized: too many instructions in "
2514 "basic block.\n");
2515 free_data_refs (datarefs);
2516 return NULL;
2519 bb_vinfo = new_bb_vec_info (region_begin, region_end);
2520 if (!bb_vinfo)
2521 return NULL;
2523 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2525 /* Analyze the data references. */
2527 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2529 if (dump_enabled_p ())
2530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2531 "not vectorized: unhandled data-ref in basic "
2532 "block.\n");
2534 destroy_bb_vec_info (bb_vinfo);
2535 return NULL;
2538 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542 "not vectorized: not enough data-refs in "
2543 "basic block.\n");
2545 destroy_bb_vec_info (bb_vinfo);
2546 return NULL;
2549 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2553 "not vectorized: unhandled data access in "
2554 "basic block.\n");
2556 destroy_bb_vec_info (bb_vinfo);
2557 return NULL;
2560 /* If there are no grouped stores in the region there is no need
2561 to continue with pattern recog as vect_analyze_slp will fail
2562 anyway. */
2563 if (bb_vinfo->grouped_stores.is_empty ())
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "not vectorized: no grouped stores in "
2568 "basic block.\n");
2570 destroy_bb_vec_info (bb_vinfo);
2571 return NULL;
2574 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is no longer fatal. */
2575 fatal = false;
2577 vect_pattern_recog (bb_vinfo);
2579 /* Check the SLP opportunities in the basic block, analyze and build SLP
2580 trees. */
2581 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2583 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586 "Failed to SLP the basic block.\n");
2587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2588 "not vectorized: failed to find SLP opportunities "
2589 "in basic block.\n");
2592 destroy_bb_vec_info (bb_vinfo);
2593 return NULL;
2596 /* Analyze and verify the alignment of data references and the
2597 dependence in the SLP instances. */
2598 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2600 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2601 || ! vect_slp_analyze_instance_dependence (instance))
2603 dump_printf_loc (MSG_NOTE, vect_location,
2604 "removing SLP instance operations starting from: ");
2605 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2606 SLP_TREE_SCALAR_STMTS
2607 (SLP_INSTANCE_TREE (instance))[0], 0);
2608 vect_free_slp_instance (instance);
2609 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2610 continue;
2613 /* Mark all the statements that we want to vectorize as pure SLP and
2614 relevant. */
2615 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2616 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2618 i++;
2620 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2622 destroy_bb_vec_info (bb_vinfo);
2623 return NULL;
2626 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2627 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "not vectorized: bad operation in basic block.\n");
2633 destroy_bb_vec_info (bb_vinfo);
2634 return NULL;
2637 /* Cost model: check if the vectorization is worthwhile. */
2638 if (!unlimited_cost_model (NULL)
2639 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2643 "not vectorized: vectorization is not "
2644 "profitable.\n");
2646 destroy_bb_vec_info (bb_vinfo);
2647 return NULL;
2650 if (dump_enabled_p ())
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "Basic block will be vectorized using SLP\n");
2654 return bb_vinfo;
2658 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
2659 true if anything in the basic-block was vectorized. */
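/* The block is processed as a sequence of regions: stmts are accumulated
   until one is found whose data references cannot be analyzed, the region
   before it is analyzed (re-trying the remaining vector sizes, largest
   first, on failure), and the offending stmt is skipped before scanning
   resumes. */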
2661 bool
2662 vect_slp_bb (basic_block bb)
2664 bb_vec_info bb_vinfo;
2665 gimple_stmt_iterator gsi;
2666 unsigned int vector_sizes;
2667 bool any_vectorized = false;
2669 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2672 /* Autodetect first vector size we try. */
2673 current_vector_size = 0;
2674 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2676 gsi = gsi_start_bb (bb);
2678 while (1)
2680 if (gsi_end_p (gsi))
2681 break;
2683 gimple_stmt_iterator region_begin = gsi;
2684 vec<data_reference_p> datarefs = vNULL;
2685 int insns = 0;
2687 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2689 gimple *stmt = gsi_stmt (gsi);
2690 if (is_gimple_debug (stmt))
2691 continue;
2692 insns++;
2694 if (gimple_location (stmt) != UNKNOWN_LOCATION)
2695 vect_location = gimple_location (stmt);
2697 if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
2698 break;
2701 /* Skip leading unhandled stmts. */
2702 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
2704 gsi_next (&gsi);
2705 continue;
2708 gimple_stmt_iterator region_end = gsi;
2710 bool vectorized = false;
2711 bool fatal = false;
2712 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
2713 datarefs, insns, fatal);
2714 if (bb_vinfo
2715 && dbg_cnt (vect_slp))
2717 if (dump_enabled_p ())
2718 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
2720 vect_schedule_slp (bb_vinfo);
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "basic block part vectorized\n");
2726 destroy_bb_vec_info (bb_vinfo);
2728 vectorized = true;
2730 else
2731 destroy_bb_vec_info (bb_vinfo);
2733 any_vectorized |= vectorized;
2735 vector_sizes &= ~current_vector_size;
2736 if (vectorized
2737 || vector_sizes == 0
2738 || current_vector_size == 0
2739 /* If vect_slp_analyze_bb_1 signaled that analysis for all
2740 vector sizes will fail do not bother iterating. */
2741 || fatal)
2743 if (gsi_end_p (region_end))
2744 break;
2746 /* Skip the unhandled stmt. */
2747 gsi_next (&gsi);
2749 /* And reset vector sizes. */
2750 current_vector_size = 0;
2751 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2753 else
2755 /* Try the next biggest vector size. */
2756 current_vector_size = 1 << floor_log2 (vector_sizes);
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "***** Re-trying analysis with "
2760 "vector size %d\n", current_vector_size);
2762 /* Start over. */
2763 gsi = region_begin;
2767 return any_vectorized;
2771 /* Return true if the vector type of the boolean constant that is operand
2772 OPNUM of statement STMT is a boolean vector. */
2774 static bool
2775 vect_mask_constant_operand_p (gimple *stmt, int opnum)
2777 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2778 enum tree_code code = gimple_expr_code (stmt);
2779 tree op, vectype;
2780 gimple *def_stmt;
2781 enum vect_def_type dt;
2783 /* For comparisons and COND_EXPRs the type is chosen depending
2784 on the other comparison operand. */
2785 if (TREE_CODE_CLASS (code) == tcc_comparison)
2787 if (opnum)
2788 op = gimple_assign_rhs1 (stmt);
2789 else
2790 op = gimple_assign_rhs2 (stmt);
2792 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
2793 &dt, &vectype))
2794 gcc_unreachable ();
2796 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
2799 if (code == COND_EXPR)
2801 tree cond = gimple_assign_rhs1 (stmt);
2803 if (TREE_CODE (cond) == SSA_NAME)
2804 return false;
2806 if (opnum)
2807 op = TREE_OPERAND (cond, 1);
2808 else
2809 op = TREE_OPERAND (cond, 0);
2811 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
2812 &dt, &vectype))
2813 gcc_unreachable ();
2815 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
2818 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
2822 /* For constant and loop invariant defs of SLP_NODE this function returns
2823 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2824 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2825 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2826 REDUC_INDEX is the index of the reduction operand in the statements, unless
2827 it is -1. */
2829 static void
2830 vect_get_constant_vectors (tree op, slp_tree slp_node,
2831 vec<tree> *vec_oprnds,
2832 unsigned int op_num, unsigned int number_of_vectors,
2833 int reduc_index)
2835 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2836 gimple *stmt = stmts[0];
2837 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2838 unsigned nunits;
2839 tree vec_cst;
2840 tree *elts;
2841 unsigned j, number_of_places_left_in_vector;
2842 tree vector_type;
2843 tree vop;
2844 int group_size = stmts.length ();
2845 unsigned int vec_num, i;
2846 unsigned number_of_copies = 1;
2847 vec<tree> voprnds;
2848 voprnds.create (number_of_vectors);
2849 bool constant_p, is_store;
2850 tree neutral_op = NULL;
2851 enum tree_code code = gimple_expr_code (stmt);
2852 gimple *def_stmt;
2853 struct loop *loop;
2854 gimple_seq ctor_seq = NULL;
2856 /* Check if vector type is a boolean vector. */
2857 if (TREE_CODE (TREE_TYPE (op)) == BOOLEAN_TYPE
2858 && vect_mask_constant_operand_p (stmt, op_num))
2859 vector_type
2860 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
2861 else
2862 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2863 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2865 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2866 && reduc_index != -1)
2868 op_num = reduc_index;
2869 op = gimple_op (stmt, op_num + 1);
2870 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2871 we need either neutral operands or the original operands. See
2872 get_initial_def_for_reduction() for details. */
2873 switch (code)
2875 case WIDEN_SUM_EXPR:
2876 case DOT_PROD_EXPR:
2877 case SAD_EXPR:
2878 case PLUS_EXPR:
2879 case MINUS_EXPR:
2880 case BIT_IOR_EXPR:
2881 case BIT_XOR_EXPR:
2882 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2883 neutral_op = build_real (TREE_TYPE (op), dconst0);
2884 else
2885 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2887 break;
2889 case MULT_EXPR:
2890 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2891 neutral_op = build_real (TREE_TYPE (op), dconst1);
2892 else
2893 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2895 break;
2897 case BIT_AND_EXPR:
2898 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2899 break;
2901 /* For MIN/MAX we don't have an easy neutral operand but
2902 the initial values can be used fine here. Only for
2903 a reduction chain we have to force a neutral element. */
2904 case MAX_EXPR:
2905 case MIN_EXPR:
2906 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2907 neutral_op = NULL;
2908 else
2910 def_stmt = SSA_NAME_DEF_STMT (op);
2911 loop = (gimple_bb (stmt))->loop_father;
2912 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2913 loop_preheader_edge (loop));
2915 break;
2917 default:
2918 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2919 neutral_op = NULL;
2923 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2925 is_store = true;
2926 op = gimple_assign_rhs1 (stmt);
2928 else
2929 is_store = false;
2931 gcc_assert (op);
2933 if (CONSTANT_CLASS_P (op))
2934 constant_p = true;
2935 else
2936 constant_p = false;
2938 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2939 created vectors. It is greater than 1 if unrolling is performed.
2941 For example, we have two scalar operands, s1 and s2 (e.g., group of
2942 strided accesses of size two), while NUNITS is four (i.e., four scalars
2943 of this type can be packed in a vector). The output vector will contain
2944 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2945 will be 2).
2947 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2948 containing the operands.
2950 For example, NUNITS is four as before, and the group size is 8
2951 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2952 {s5, s6, s7, s8}. */
2954 number_of_copies = nunits * number_of_vectors / group_size;
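/* Plugging in the examples above (hypothetical values): nunits == 4 with
   one vector and group size two gives 4 * 1 / 2 == 2 copies; nunits == 4
   with two vectors and group size eight gives 4 * 2 / 8 == 1 copy. */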
2956 number_of_places_left_in_vector = nunits;
2957 elts = XALLOCAVEC (tree, nunits);
2958 bool place_after_defs = false;
2959 for (j = 0; j < number_of_copies; j++)
2961 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2963 if (is_store)
2964 op = gimple_assign_rhs1 (stmt);
2965 else
2967 switch (code)
2969 case COND_EXPR:
2971 tree cond = gimple_assign_rhs1 (stmt);
2972 if (TREE_CODE (cond) == SSA_NAME)
2973 op = gimple_op (stmt, op_num + 1);
2974 else if (op_num == 0 || op_num == 1)
2975 op = TREE_OPERAND (cond, op_num);
2976 else
2978 if (op_num == 2)
2979 op = gimple_assign_rhs2 (stmt);
2980 else
2981 op = gimple_assign_rhs3 (stmt);
2984 break;
2986 case CALL_EXPR:
2987 op = gimple_call_arg (stmt, op_num);
2988 break;
2990 case LSHIFT_EXPR:
2991 case RSHIFT_EXPR:
2992 case LROTATE_EXPR:
2993 case RROTATE_EXPR:
2994 op = gimple_op (stmt, op_num + 1);
2995 /* Unlike the other binary operators, shifts/rotates have
2996 the shift count being int, instead of the same type as
2997 the lhs, so make sure the scalar is the right type if
2998 we are dealing with vectors of
2999 long long/long/short/char. */
3000 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3001 op = fold_convert (TREE_TYPE (vector_type), op);
3002 break;
3004 default:
3005 op = gimple_op (stmt, op_num + 1);
3006 break;
3010 if (reduc_index != -1)
3012 loop = (gimple_bb (stmt))->loop_father;
3013 def_stmt = SSA_NAME_DEF_STMT (op);
3015 gcc_assert (loop);
3017 /* Get the def before the loop. In a reduction chain we have only
3018 one initial value. */
3019 if ((j != (number_of_copies - 1)
3020 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
3021 && i != 0))
3022 && neutral_op)
3023 op = neutral_op;
3024 else
3025 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
3026 loop_preheader_edge (loop));
3029 /* Create 'vect_ = {op0,op1,...,opn}'. */
3030 number_of_places_left_in_vector--;
3031 tree orig_op = op;
3032 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3034 if (CONSTANT_CLASS_P (op))
3036 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3038 /* Can't use VIEW_CONVERT_EXPR for booleans because
3039 of possibly different sizes of scalar value and
3040 vector element. */
3041 if (integer_zerop (op))
3042 op = build_int_cst (TREE_TYPE (vector_type), 0);
3043 else if (integer_onep (op))
3044 op = build_all_ones_cst (TREE_TYPE (vector_type));
3045 else
3046 gcc_unreachable ();
3048 else
3049 op = fold_unary (VIEW_CONVERT_EXPR,
3050 TREE_TYPE (vector_type), op);
3051 gcc_assert (op && CONSTANT_CLASS_P (op));
3053 else
3055 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3056 gimple *init_stmt;
3057 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3059 tree true_val
3060 = build_all_ones_cst (TREE_TYPE (vector_type));
3061 tree false_val
3062 = build_zero_cst (TREE_TYPE (vector_type));
3063 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3064 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3065 op, true_val,
3066 false_val);
3068 else
3070 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3071 op);
3072 init_stmt
3073 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3074 op);
3076 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3077 op = new_temp;
3080 elts[number_of_places_left_in_vector] = op;
3081 if (!CONSTANT_CLASS_P (op))
3082 constant_p = false;
3083 if (TREE_CODE (orig_op) == SSA_NAME
3084 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3085 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3086 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3087 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3088 place_after_defs = true;
3090 if (number_of_places_left_in_vector == 0)
3092 number_of_places_left_in_vector = nunits;
3094 if (constant_p)
3095 vec_cst = build_vector (vector_type, elts);
3096 else
3098 vec<constructor_elt, va_gc> *v;
3099 unsigned k;
3100 vec_alloc (v, nunits);
3101 for (k = 0; k < nunits; ++k)
3102 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
3103 vec_cst = build_constructor (vector_type, v);
3105 tree init;
3106 gimple_stmt_iterator gsi;
3107 if (place_after_defs)
3109 gsi = gsi_for_stmt
3110 (vect_find_last_scalar_stmt_in_slp (slp_node));
3111 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3113 else
3114 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3115 if (ctor_seq != NULL)
3117 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3118 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
3119 GSI_SAME_STMT);
3120 ctor_seq = NULL;
3122 voprnds.quick_push (init);
3123 place_after_defs = false;
3128 /* Since the vectors are created in the reverse order, we should invert
3129 them. */
3130 vec_num = voprnds.length ();
3131 for (j = vec_num; j != 0; j--)
3133 vop = voprnds[j - 1];
3134 vec_oprnds->quick_push (vop);
3137 voprnds.release ();
3139 /* If VF is greater than the unrolling factor needed for the SLP
3140 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3141 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3142 to replicate the vectors. */
3143 while (number_of_vectors > vec_oprnds->length ())
3145 tree neutral_vec = NULL;
3147 if (neutral_op)
3149 if (!neutral_vec)
3150 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3152 vec_oprnds->quick_push (neutral_vec);
3154 else
3156 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3157 vec_oprnds->quick_push (vop);
3163 /* Get vectorized definitions from SLP_NODE that contains corresponding
3164 vectorized def-stmts. */
3166 static void
3167 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3169 tree vec_oprnd;
3170 gimple *vec_def_stmt;
3171 unsigned int i;
3173 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3175 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3177 gcc_assert (vec_def_stmt);
3178 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3179 vec_oprnds->quick_push (vec_oprnd);
3184 /* Get vectorized definitions for SLP_NODE.
3185 If the scalar definitions are loop invariants or constants, collect them and
3186 call vect_get_constant_vectors() to create vector stmts.
3187 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3188 must be stored in the corresponding child of SLP_NODE, and we call
3189 vect_get_slp_vect_defs () to retrieve them. */
3191 void
3192 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3193 vec<vec<tree> > *vec_oprnds, int reduc_index)
3195 gimple *first_stmt;
3196 int number_of_vects = 0, i;
3197 unsigned int child_index = 0;
3198 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3199 slp_tree child = NULL;
3200 vec<tree> vec_defs;
3201 tree oprnd;
3202 bool vectorized_defs;
3204 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3205 FOR_EACH_VEC_ELT (ops, i, oprnd)
3207 /* For each operand we check if it has vectorized definitions in a child
3208 node or we need to create them (for invariants and constants). We
3209 check if the LHS of the first stmt of the next child matches OPRND.
3210 If it does, we found the correct child. Otherwise, we call
3211 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
3212 to check this child node for the next operand. */
3213 vectorized_defs = false;
3214 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3216 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3218 /* We have to check both pattern and original def, if available. */
3219 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3221 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3222 gimple *related
3223 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3225 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
3226 || (related
3227 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3229 /* The number of vector defs is determined by the number of
3230 vector statements in the node from which we get those
3231 statements. */
3232 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3233 vectorized_defs = true;
3234 child_index++;
3237 else
3238 child_index++;
3241 if (!vectorized_defs)
3243 if (i == 0)
3245 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3246 /* Number of vector stmts was calculated according to LHS in
3247 vect_schedule_slp_instance (), fix it by replacing LHS with
3248 RHS, if necessary. See vect_get_smallest_scalar_type () for
3249 details. */
3250 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3251 &rhs_size_unit);
3252 if (rhs_size_unit != lhs_size_unit)
3254 number_of_vects *= rhs_size_unit;
3255 number_of_vects /= lhs_size_unit;
3260 /* Allocate memory for vectorized defs. */
3261 vec_defs = vNULL;
3262 vec_defs.create (number_of_vects);
3264 /* For reduction defs we call vect_get_constant_vectors (), since we are
3265 looking for initial loop invariant values. */
3266 if (vectorized_defs && reduc_index == -1)
3267 /* The defs are already vectorized. */
3268 vect_get_slp_vect_defs (child, &vec_defs);
3269 else
3270 /* Build vectors from scalar defs. */
3271 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3272 number_of_vects, reduc_index);
3274 vec_oprnds->quick_push (vec_defs);
3276 /* For reductions, we only need initial values. */
3277 if (reduc_index != -1)
3278 return;
3283 /* Create NCOPIES permutation statements using the vector mask MASK (already
3284 built by the caller as a vector constant) and two input vectors placed in
3285 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
3286 shifting by STRIDE elements of DR_CHAIN for every copy.
3287 (STRIDE is the number of vectorized stmts for NODE divided by the number of
3288 copies).
3289 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
3290 the created stmts must be inserted. */
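/* For illustration (hypothetical values): with NCOPIES == 2, four vector
   stmts for NODE and eight vectors in DR_CHAIN, stride_out is 2 and
   stride_in is 4: the second copy of each permutation reads its inputs
   four vectors further along DR_CHAIN and is stored two slots further in
   SLP_TREE_VEC_STMTS. */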
3292 static inline void
3293 vect_create_mask_and_perm (gimple *stmt,
3294 tree mask, int first_vec_indx, int second_vec_indx,
3295 gimple_stmt_iterator *gsi, slp_tree node,
3296 tree vectype, vec<tree> dr_chain,
3297 int ncopies, int vect_stmts_counter)
3299 tree perm_dest;
3300 gimple *perm_stmt = NULL;
3301 int i, stride_in, stride_out;
3302 tree first_vec, second_vec, data_ref;
3304 stride_out = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3305 stride_in = dr_chain.length () / ncopies;
3307 /* Initialize the vect stmts of NODE to properly insert the generated
3308 stmts later. */
3309 for (i = SLP_TREE_VEC_STMTS (node).length ();
3310 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3311 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3313 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3314 for (i = 0; i < ncopies; i++)
3316 first_vec = dr_chain[first_vec_indx];
3317 second_vec = dr_chain[second_vec_indx];
3319 /* Generate the permute statement if necessary. */
3320 if (mask)
3322 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3323 first_vec, second_vec, mask);
3324 data_ref = make_ssa_name (perm_dest, perm_stmt);
3325 gimple_set_lhs (perm_stmt, data_ref);
3326 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3328 else
3329 /* If mask was NULL_TREE generate the requested identity transform. */
3330 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
3332 /* Store the vector statement in NODE. */
3333 SLP_TREE_VEC_STMTS (node)[stride_out * i + vect_stmts_counter]
3334 = perm_stmt;
3336 first_vec_indx += stride_in;
3337 second_vec_indx += stride_in;
3342 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3343 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3344 permute statements for the SLP node NODE of the SLP instance
3345 SLP_NODE_INSTANCE. */
3347 bool
3348 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3349 gimple_stmt_iterator *gsi, int vf,
3350 slp_instance slp_node_instance, bool analyze_only)
3352 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3353 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3354 tree mask_element_type = NULL_TREE, mask_type;
3355 int nunits, vec_index = 0;
3356 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3357 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3358 int unroll_factor, mask_element, ncopies;
3359 unsigned char *mask;
3360 machine_mode mode;
3362 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3363 return false;
3365 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3367 mode = TYPE_MODE (vectype);
3369 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3370 same size as the vector element being permuted. */
3371 mask_element_type = lang_hooks.types.type_for_mode
3372 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3373 mask_type = get_vectype_for_scalar_type (mask_element_type);
3374 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3375 mask = XALLOCAVEC (unsigned char, nunits);
3376 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3378 /* Number of copies is determined by the final vectorization factor
3379 relative to the SLP_NODE_INSTANCE unrolling factor. */
3380 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3382 /* Generate permutation masks for every NODE. Number of masks for each NODE
3383 is equal to GROUP_SIZE.
3384 E.g., we have a group of three nodes with three loads from the same
3385 location in each node, and the vector size is 4. I.e., we have an
3386 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3387 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3388 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3391 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3392 The last mask is illegal since we assume two operands for permute
3393 operation, and the mask element values can't be outside that range.
3394 Hence, the last mask must be converted into {2,5,5,5}.
3395 For the first two permutations we need the first and the second input
3396 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3397 we need the second and the third vectors: {b1,c1,a2,b2} and
3398 {c2,a3,b3,c3}. */
3400 int vect_stmts_counter = 0;
3401 int index = 0;
3402 int first_vec_index = -1;
3403 int second_vec_index = -1;
3404 bool noop_p = true;
3406 for (int j = 0; j < unroll_factor; j++)
3408 for (int k = 0; k < group_size; k++)
3410 int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3411 + j * STMT_VINFO_GROUP_SIZE (stmt_info));
3412 vec_index = i / nunits;
3413 mask_element = i % nunits;
3414 if (vec_index == first_vec_index
3415 || first_vec_index == -1)
3417 first_vec_index = vec_index;
3419 else if (vec_index == second_vec_index
3420 || second_vec_index == -1)
3422 second_vec_index = vec_index;
3423 mask_element += nunits;
3425 else
3427 if (dump_enabled_p ())
3429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3430 "permutation requires at "
3431 "least three vectors ");
3432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3433 stmt, 0);
3435 return false;
3438 gcc_assert (mask_element >= 0
3439 && mask_element < 2 * nunits);
3440 if (mask_element != index)
3441 noop_p = false;
3442 mask[index++] = mask_element;
3444 if (index == nunits)
3446 if (! noop_p
3447 && ! can_vec_perm_p (mode, false, mask))
3449 if (dump_enabled_p ())
3451 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3452 vect_location,
3453 "unsupported vect permute { ");
3454 for (i = 0; i < nunits; ++i)
3455 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ", mask[i]);
3456 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3458 return false;
3461 if (!analyze_only)
3463 tree mask_vec = NULL_TREE;
3465 if (! noop_p)
3467 tree *mask_elts = XALLOCAVEC (tree, nunits);
3468 for (int l = 0; l < nunits; ++l)
3469 mask_elts[l] = build_int_cst (mask_element_type,
3470 mask[l]);
3471 mask_vec = build_vector (mask_type, mask_elts);
3474 if (second_vec_index == -1)
3475 second_vec_index = first_vec_index;
3476 vect_create_mask_and_perm (stmt, mask_vec, first_vec_index,
3477 second_vec_index,
3478 gsi, node, vectype, dr_chain,
3479 ncopies, vect_stmts_counter++);
3482 index = 0;
3483 first_vec_index = -1;
3484 second_vec_index = -1;
3485 noop_p = true;
3490 return true;
3495 /* Vectorize SLP instance tree in postorder. */
3497 static bool
3498 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3499 unsigned int vectorization_factor)
3501 gimple *stmt;
3502 bool grouped_store, is_store;
3503 gimple_stmt_iterator si;
3504 stmt_vec_info stmt_info;
3505 unsigned int vec_stmts_size, nunits, group_size;
3506 tree vectype;
3507 int i, j;
3508 slp_tree child;
3510 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3511 return false;
3513 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3514 vect_schedule_slp_instance (child, instance, vectorization_factor);
3516 /* Push SLP node def-type to stmts. */
3517 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3518 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
3520 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
3522 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3523 stmt_info = vinfo_for_stmt (stmt);
3525 /* VECTYPE is the type of the destination. */
3526 vectype = STMT_VINFO_VECTYPE (stmt_info);
3527 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3528 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3530 /* For each SLP instance calculate number of vector stmts to be created
3531 for the scalar stmts in each node of the SLP tree. Number of vector
3532 elements in one vector iteration is the number of scalar elements in
3533 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3534 size.
3535 Unless this is a SLP reduction in which case the number of vector
3536 stmts is equal to the number of vector stmts of the children. */
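/* E.g. (hypothetical values): with VF == 4, GROUP_SIZE == 2 and nunits
   == 4, vec_stmts_size below is 4 * 2 / 4 == 2 vector stmts per node. */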
3537 if (GROUP_FIRST_ELEMENT (stmt_info)
3538 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3539 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3540 else
3541 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3543 if (!SLP_TREE_VEC_STMTS (node).exists ())
3545 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3546 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE,vect_location,
3552 "------>vectorizing SLP node starting from: ");
3553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3556 /* Vectorized stmts go before the last scalar stmt which is where
3557 all uses are ready. */
3558 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3560 /* Mark the first element of the reduction chain as reduction to properly
3561 transform the node. In the analysis phase only the last element of the
3562 chain is marked as reduction. */
3563 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3564 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3566 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3567 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3570 /* Handle two-operation SLP nodes by vectorizing the group with
3571 both operations and then performing a merge. */
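/* E.g. (hypothetical): for the scalar group {a + b, a - b, c + d, c - d}
   the whole group is vectorized once as all-plus and once as all-minus,
   and a VEC_PERM_EXPR with mask {0, n + 1, 2, n + 3} (n == nunits)
   selects the even lanes from the plus result and the odd lanes from the
   minus result, matching targets' addsub instructions. */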
3572 if (SLP_TREE_TWO_OPERATORS (node))
3574 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3575 enum tree_code ocode = ERROR_MARK;
3576 gimple *ostmt;
3577 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3578 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3579 if (gimple_assign_rhs_code (ostmt) != code0)
3581 mask[i] = 1;
3582 ocode = gimple_assign_rhs_code (ostmt);
3584 else
3585 mask[i] = 0;
3586 if (ocode != ERROR_MARK)
3588 vec<gimple *> v0;
3589 vec<gimple *> v1;
3590 unsigned j;
3591 tree tmask = NULL_TREE;
3592 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3593 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3594 SLP_TREE_VEC_STMTS (node).truncate (0);
3595 gimple_assign_set_rhs_code (stmt, ocode);
3596 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3597 gimple_assign_set_rhs_code (stmt, code0);
3598 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3599 SLP_TREE_VEC_STMTS (node).truncate (0);
3600 tree meltype = build_nonstandard_integer_type
3601 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3602 tree mvectype = get_same_sized_vectype (meltype, vectype);
3603 unsigned k = 0, l;
3604 for (j = 0; j < v0.length (); ++j)
3606 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3607 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3609 if (k >= group_size)
3610 k = 0;
3611 melts[l] = build_int_cst
3612 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3614 tmask = build_vector (mvectype, melts);
3616 /* ??? Not all targets support a VEC_PERM_EXPR with a
3617 constant mask that would translate to a vec_merge RTX
3618 (with their vec_perm_const_ok). We can either not
3619 vectorize in that case or let veclower do its job.
3620 Unfortunately that isn't too great and at least for
3621 plus/minus we'd eventually like to match targets
3622 vector addsub instructions. */
3623 gimple *vstmt;
3624 vstmt = gimple_build_assign (make_ssa_name (vectype),
3625 VEC_PERM_EXPR,
3626 gimple_assign_lhs (v0[j]),
3627 gimple_assign_lhs (v1[j]), tmask);
3628 vect_finish_stmt_generation (stmt, vstmt, &si);
3629 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3631 v0.release ();
3632 v1.release ();
3633 return false;
3636 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3638 /* Restore stmt def-types. */
3639 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3640 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3641 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
3642 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
3644 return is_store;
3647 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
3648 For loop vectorization this is done in vectorizable_call, but for SLP
3649 it needs to be deferred until end of vect_schedule_slp, because multiple
3650 SLP instances may refer to the same scalar stmt. */
3652 static void
3653 vect_remove_slp_scalar_calls (slp_tree node)
3655 gimple *stmt, *new_stmt;
3656 gimple_stmt_iterator gsi;
3657 int i;
3658 slp_tree child;
3659 tree lhs;
3660 stmt_vec_info stmt_info;
3662 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3663 return;
3665 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3666 vect_remove_slp_scalar_calls (child);
3668 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3670 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3671 continue;
3672 stmt_info = vinfo_for_stmt (stmt);
3673 if (stmt_info == NULL
3674 || is_pattern_stmt_p (stmt_info)
3675 || !PURE_SLP_STMT (stmt_info))
3676 continue;
3677 lhs = gimple_call_lhs (stmt);
3678 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3679 set_vinfo_for_stmt (new_stmt, stmt_info);
3680 set_vinfo_for_stmt (stmt, NULL);
3681 STMT_VINFO_STMT (stmt_info) = new_stmt;
3682 gsi = gsi_for_stmt (stmt);
3683 gsi_replace (&gsi, new_stmt, false);
3684 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3688 /* Generate vector code for all SLP instances in the loop/basic block. */
3690 bool
3691 vect_schedule_slp (vec_info *vinfo)
3693 vec<slp_instance> slp_instances;
3694 slp_instance instance;
3695 unsigned int i, vf;
3696 bool is_store = false;
3698 slp_instances = vinfo->slp_instances;
3699 if (is_a <loop_vec_info> (vinfo))
3700 vf = as_a <loop_vec_info> (vinfo)->vectorization_factor;
3701 else
3702 vf = 1;
3704 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3706 /* Schedule the tree of INSTANCE. */
3707 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3708 instance, vf);
3709 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "vectorizing stmts using SLP.\n");
3714 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3716 slp_tree root = SLP_INSTANCE_TREE (instance);
3717 gimple *store;
3718 unsigned int j;
3719 gimple_stmt_iterator gsi;
3721 /* Remove scalar call stmts. Do not do this for basic-block
3722 vectorization as not all uses may be vectorized.
3723 ??? Why should this be necessary? DCE should be able to
3724 remove the stmts itself.
3725 ??? For BB vectorization we can as well remove scalar
3726 stmts starting from the SLP tree root if they have no
3727 uses. */
3728 if (is_a <loop_vec_info> (vinfo))
3729 vect_remove_slp_scalar_calls (root);
3731 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3732 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3734 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3735 break;
3737 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3738 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3739 /* Free the attached stmt_vec_info and remove the stmt. */
3740 gsi = gsi_for_stmt (store);
3741 unlink_stmt_vdef (store);
3742 gsi_remove (&gsi, true);
3743 release_defs (store);
3744 free_stmt_vec_info (store);
3748 return is_store;