/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"   /* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the source-code location of basic block BB: return the location
   of the first stmt in BB that has one, and UNKNOWN_LOC otherwise.  */

LOC
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOC;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOC)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOC;
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;
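
  /* For illustration: for a COND_EXPR right-hand side such as
     x_1 = a_2 < b_3 ? c_4 : d_5, gimple_num_ops is 4 and the extra slot
     reserved above accounts for the embedded comparison, which
     vect_get_and_check_slp_defs later splits into its two operands.  */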

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
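
/* For example, given an interleaving chain s0 -> s1 -> s2 (linked through
   GROUP_NEXT_ELEMENT and starting at FIRST_STMT == s0), the function above
   returns 0, 1 or 2 for STMT == s0, s1 or s2 respectively, and -1 for a
   stmt that belongs to a different chain.  */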

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  */

static bool
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int op_idx = 1;
  tree compare_rhs = NULL_TREE;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        number_of_oprnds++;
    }
  else
    return false;

  for (i = 0; i < number_of_oprnds; i++)
    {
      if (compare_rhs)
        {
          oprnd = compare_rhs;
          compare_rhs = NULL_TREE;
        }
      else
        oprnd = gimple_op (stmt, op_idx++);

      oprnd_info = (*oprnds_info)[i];

      if (COMPARISON_CLASS_P (oprnd))
        {
          compare_rhs = TREE_OPERAND (oprnd, 1);
          oprnd = TREE_OPERAND (oprnd, 0);
        }

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return false;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return false;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return false;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return false;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }
    }

  return true;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation, and are not of unsupported operation types.  Return true
   if so; otherwise return false and indicate in *MATCHES which stmts are
   not isomorphic to the first one.  If MATCHES[0] is false then this
   indicates the comparison could not be carried out or the stmts will
   never be vectorized by SLP.  */
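/* For instance, for the group { a0 + b0, a1 + b1, a2 - b2, a3 - b3 } the
   first two stmts match the PLUS_EXPR of stmt 0 while the last two do not,
   so *MATCHES becomes { true, true, false, false } and the function
   returns false.  */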

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (is_gimple_call (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (stmt)
              || gimple_call_tail_p (stmt)
              || gimple_call_noreturn_p (stmt)
              || !gimple_call_nothrow_p (stmt)
              || gimple_call_chain (stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return false) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return true.  *MATCHES records which scalar stmts (if any) failed to
   match the first one.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes)
{
  unsigned nops, i, this_npermutes = 0;
  gimple stmt;

  if (!matches)
    matches = XALLOCAVEC (bool, group_size);
  if (!npermutes)
    npermutes = &this_npermutes;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
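  /* (Grouped loads are the leaves of the SLP tree: their operands are
     memory addresses rather than SSA defs, so there is nothing further
     to recurse on.)  */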
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                        stmt, (i == 0), &oprnds_info))
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }
    }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      bool *matches = XALLOCAVEC (bool, group_size);
      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches, npermutes))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
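      /* For example, for the group { a0 * b0, b1 * a1 } the recursive build
         of operand zero sees { a0, b1 } and fails; because MULT_EXPR is
         commutative, swapping the defs of the mismatched lane yields the
         operand vectors { a0, a1 } and { b0, b1 }, which may succeed.  */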
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          /* Swap mismatched definition stmts.  */
          for (unsigned j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
              }
          /* And try again ...  */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */
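  /* E.g., for a reduction over a group of size 4 where every load node
     carries the permutation { 2, 0, 3, 1 }, each index is covered exactly
     once, so the whole instance can simply be rearranged in that order and
     no runtime permutation is needed.  */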

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
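  /* E.g., with GROUP_SIZE == 2 this accepts two load nodes whose
     permutations are { 0, 0 } and { 1, 1 }: each node reads one distinct
     element of the chain, and together they cover the whole group.  */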
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}

/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}

/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     least_common_multiple (GROUP_SIZE, NUNITS) / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
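
  /* E.g., for NUNITS == 4 and GROUP_SIZE == 6, least_common_multiple
     returns 12 and NCOPIES_FOR_COST is 3: three 4-lane vector stmts cover
     two copies of the 6-stmt group.  */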

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                           instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
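
  /* E.g., for NUNITS == 4 and GROUP_SIZE == 6 the unrolling factor is
     lcm (4, 6) / 6 == 2: two copies of the group are needed to fill a
     whole number of vectors.  */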

  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, NULL, NULL))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted)
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
            = vect_find_first_load_in_slp_instance (new_instance);
        }

      /* Compute the costs of this SLP instance.  */
      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
                             new_instance, TYPE_VECTOR_SUBPARTS (vectype));

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}

/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.\n");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
    ok = true;

  return true;
}

/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
                     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
        unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Decided to SLP %d instances. Unrolling factor %d\n",
                     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}

/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
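/* E.g., if a PURE_SLP stmt in NODE defines an SSA name that is also used
   by a relevant stmt outside the SLP tree (say a scalar use feeding a
   reduction), the node's stmts must additionally be covered by loop-based
   vectorization, i.e. be marked HYBRID.  */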

static void
vect_detect_hybrid_slp_stmts (slp_tree node)
{
  int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
  gimple stmt = stmts[0];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = NULL;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
  basic_block bb = NULL;

  if (!node)
    return;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);
  else
    bb = BB_VINFO_BB (bb_vinfo);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
        && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
        if (gimple_bb (use_stmt)
            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
                || bb == gimple_bb (use_stmt))
            && (stmt_vinfo = vinfo_for_stmt (use_stmt))
            && !STMT_SLP_TYPE (stmt_vinfo)
            && (STMT_VINFO_RELEVANT (stmt_vinfo)
                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
            && !(gimple_code (use_stmt) == GIMPLE_PHI
                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                    == vect_reduction_def))
          vect_mark_slp_stmts (node, hybrid, i);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_detect_hybrid_slp_stmts (child);
}

/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
                     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}

/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
        /* Free stmt_vec_info.  */
        free_stmt_vec_info (stmt);
    }

  vect_destroy_datarefs (NULL, bb_vinfo);
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  free (bb_vinfo);
  bb->aux = NULL;
}

/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
        return false;
    }

  return true;
}

/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
                                             SLP_INSTANCE_TREE (instance)))
        {
          vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
        }
      else
        i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}

/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */
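/* (LIFE has one entry per scalar stmt of NODE; an entry is set to true
   when the stmt's def is used outside the vectorized region, so the stmt
   stays live either way and its cost is not counted against the vector
   version.)  */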

static unsigned
vect_bb_slp_scalar_cost (basic_block bb,
                         slp_tree node, vec<bool, va_heap> *life)
{
  unsigned scalar_cost = 0;
  unsigned i;
  gimple stmt;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      unsigned stmt_cost;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      stmt_vec_info stmt_info;

      if ((*life)[i])
        continue;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
         required defs in the SLP children in the scalar cost.  This
         way we make the vectorization more costly when compared to
         the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
        {
          imm_use_iterator use_iter;
          gimple use_stmt;
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
            if (gimple_code (use_stmt) == GIMPLE_PHI
                || gimple_bb (use_stmt) != bb
                || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt)))
              {
                (*life)[i] = true;
                BREAK_FROM_IMM_USE_STMT (use_iter);
              }
        }
      if ((*life)[i])
        continue;

      stmt_info = vinfo_for_stmt (stmt);
      if (STMT_VINFO_DATA_REF (stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
            stmt_cost = vect_get_stmt_cost (scalar_load);
          else
            stmt_cost = vect_get_stmt_cost (scalar_store);
        }
      else
        stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);

  return scalar_cost;
}
1996 /* Check if vectorization of the basic block is profitable. */
1998 static bool
1999 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2001 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2002 slp_instance instance;
2003 int i, j;
2004 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2005 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2006 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2007 stmt_vec_info stmt_info = NULL;
2008 stmt_vector_for_cost body_cost_vec;
2009 stmt_info_for_cost *ci;
2011 /* Calculate vector costs. */
2012 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2014 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2016 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2018 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2019 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2020 stmt_info, ci->misalign, vect_body);
2024 /* Calculate scalar cost. */
2025 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2027 stack_vec<bool, 20> life;
2028 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2029 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2030 SLP_INSTANCE_TREE (instance),
2031 &life);
2034 /* Complete the target-specific cost calculation. */
2035 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2036 &vec_inside_cost, &vec_epilogue_cost);
2038 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2040 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2043 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2044 vec_inside_cost);
2045 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2046 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2047 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2050 /* Vectorization is profitable if its cost is less than the cost of the
2051 scalar version. */
2052 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2053 return false;
2055 return true;
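/* Worked example (illustrative numbers): with vec_inside_cost == 4,
   vec_prologue_cost == 1 and vec_epilogue_cost == 1, the total vector
   cost is 6, so a scalar cost of 6 is rejected (6 >= 6) while a scalar
   cost of 7 is accepted (6 < 7).  */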
2058 /* Check if the basic block can be vectorized. */
2060 static bb_vec_info
2061 vect_slp_analyze_bb_1 (basic_block bb)
2063 bb_vec_info bb_vinfo;
2064 vec<slp_instance> slp_instances;
2065 slp_instance instance;
2066 int i;
2067 int min_vf = 2;
2069 bb_vinfo = new_bb_vec_info (bb);
2070 if (!bb_vinfo)
2071 return NULL;
2073 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2077 "not vectorized: unhandled data-ref in basic "
2078 "block.\n");
2080 destroy_bb_vec_info (bb_vinfo);
2081 return NULL;
2084 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2086 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2088 "not vectorized: not enough data-refs in "
2089 "basic block.\n");
2091 destroy_bb_vec_info (bb_vinfo);
2092 return NULL;
2095 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2097 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099 "not vectorized: unhandled data access in "
2100 "basic block.\n");
2102 destroy_bb_vec_info (bb_vinfo);
2103 return NULL;
2106 vect_pattern_recog (NULL, bb_vinfo);
2108 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "not vectorized: unhandled data dependence "
2113 "in basic block.\n");
2115 destroy_bb_vec_info (bb_vinfo);
2116 return NULL;
2119 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 "not vectorized: bad data alignment in basic "
2124 "block.\n");
2126 destroy_bb_vec_info (bb_vinfo);
2127 return NULL;
2130 /* Check the SLP opportunities in the basic block, analyze and build SLP
2131 trees. */
2132 if (!vect_analyze_slp (NULL, bb_vinfo))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "not vectorized: failed to find SLP opportunities "
2137 "in basic block.\n");
2139 destroy_bb_vec_info (bb_vinfo);
2140 return NULL;
2143 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2145 /* Mark all the statements that we want to vectorize as pure SLP and
2146 relevant. */
2147 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2149 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2150 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2153 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2155 if (dump_enabled_p ())
2156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157 "not vectorized: unsupported alignment in basic "
2158 "block.\n");
2159 destroy_bb_vec_info (bb_vinfo);
2160 return NULL;
2163 if (!vect_slp_analyze_operations (bb_vinfo))
2165 if (dump_enabled_p ())
2166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2167 "not vectorized: bad operation in basic block.\n");
2169 destroy_bb_vec_info (bb_vinfo);
2170 return NULL;
2173 /* Cost model: check if the vectorization is worthwhile. */
2174 if (!unlimited_cost_model ()
2175 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2179 "not vectorized: vectorization is not "
2180 "profitable.\n");
2182 destroy_bb_vec_info (bb_vinfo);
2183 return NULL;
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_NOTE, vect_location,
2188 "Basic block will be vectorized using SLP\n");
2190 return bb_vinfo;
2194 bb_vec_info
2195 vect_slp_analyze_bb (basic_block bb)
2197 bb_vec_info bb_vinfo;
2198 int insns = 0;
2199 gimple_stmt_iterator gsi;
2200 unsigned int vector_sizes;
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2205 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2207 gimple stmt = gsi_stmt (gsi);
2208 if (!is_gimple_debug (stmt)
2209 && !gimple_nop_p (stmt)
2210 && gimple_code (stmt) != GIMPLE_LABEL)
2211 insns++;
2214 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2218 "not vectorized: too many instructions in "
2219 "basic block.\n");
2221 return NULL;
2224 /* Autodetect the first vector size we try. */
2225 current_vector_size = 0;
2226 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2228 while (1)
2230 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2231 if (bb_vinfo)
2232 return bb_vinfo;
2234 destroy_bb_vec_info (bb_vinfo);
2236 vector_sizes &= ~current_vector_size;
2237 if (vector_sizes == 0
2238 || current_vector_size == 0)
2239 return NULL;
2241 /* Try the largest remaining vector size. */
2242 current_vector_size = 1 << floor_log2 (vector_sizes);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE, vect_location,
2245 "***** Re-trying analysis with "
2246 "vector size %d\n", current_vector_size);
2251 /* SLP costs are calculated according to the SLP instance unrolling factor
2252 (i.e., the number of created vector stmts depends on the unrolling factor).
2253 However, the actual number of vector stmts for every SLP node depends on
2254 VF, which is set later in vect_analyze_operations (). Hence, the SLP
2255 costs should be updated. In this function we assume that the inside costs
2256 calculated in vect_model_xxx_cost are linear in ncopies. */
2258 void
2259 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2261 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2262 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2263 slp_instance instance;
2264 stmt_vector_for_cost body_cost_vec;
2265 stmt_info_for_cost *si;
2266 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_NOTE, vect_location,
2270 "=== vect_update_slp_costs_according_to_vf ===\n");
2272 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2274 /* We assume that costs are linear in ncopies. */
2275 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2277 /* Record the instance's instructions in the target cost model.
2278 This was delayed until here because the count of instructions
2279 isn't known beforehand. */
2280 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2282 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2283 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2284 vinfo_for_stmt (si->stmt), si->misalign,
2285 vect_body);
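/* Worked example (illustrative): if the final VF is 8 and
   SLP_INSTANCE_UNROLLING_FACTOR is 2, then ncopies == 4 and a body cost
   entry recorded with count 3 is added to the target cost data with
   count 3 * 4 == 12.  */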
2290 /* For constant and loop invariant defs of SLP_NODE this function returns
2291 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2292 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2293 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2294 REDUC_INDEX is the index of the reduction operand in the statements, unless
2295 it is -1. */
2297 static void
2298 vect_get_constant_vectors (tree op, slp_tree slp_node,
2299 vec<tree> *vec_oprnds,
2300 unsigned int op_num, unsigned int number_of_vectors,
2301 int reduc_index)
2303 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2304 gimple stmt = stmts[0];
2305 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2306 unsigned nunits;
2307 tree vec_cst;
2308 tree *elts;
2309 unsigned j, number_of_places_left_in_vector;
2310 tree vector_type;
2311 tree vop;
2312 int group_size = stmts.length ();
2313 unsigned int vec_num, i;
2314 unsigned number_of_copies = 1;
2315 vec<tree> voprnds;
2316 voprnds.create (number_of_vectors);
2317 bool constant_p, is_store;
2318 tree neutral_op = NULL;
2319 enum tree_code code = gimple_expr_code (stmt);
2320 gimple def_stmt;
2321 struct loop *loop;
2322 gimple_seq ctor_seq = NULL;
2324 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2325 && reduc_index != -1)
2327 op_num = reduc_index - 1;
2328 op = gimple_op (stmt, reduc_index);
2329 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2330 we need either neutral operands or the original operands. See
2331 get_initial_def_for_reduction() for details. */
2332 switch (code)
2334 case WIDEN_SUM_EXPR:
2335 case DOT_PROD_EXPR:
2336 case PLUS_EXPR:
2337 case MINUS_EXPR:
2338 case BIT_IOR_EXPR:
2339 case BIT_XOR_EXPR:
2340 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2341 neutral_op = build_real (TREE_TYPE (op), dconst0);
2342 else
2343 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2345 break;
2347 case MULT_EXPR:
2348 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2349 neutral_op = build_real (TREE_TYPE (op), dconst1);
2350 else
2351 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2353 break;
2355 case BIT_AND_EXPR:
2356 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2357 break;
2359 case MAX_EXPR:
2360 case MIN_EXPR:
2361 def_stmt = SSA_NAME_DEF_STMT (op);
2362 loop = (gimple_bb (stmt))->loop_father;
2363 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2364 loop_preheader_edge (loop));
2365 break;
2367 default:
2368 neutral_op = NULL;
2372 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2374 is_store = true;
2375 op = gimple_assign_rhs1 (stmt);
2377 else
2378 is_store = false;
2380 gcc_assert (op);
2382 if (CONSTANT_CLASS_P (op))
2383 constant_p = true;
2384 else
2385 constant_p = false;
2387 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2388 gcc_assert (vector_type);
2389 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2391 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2392 created vectors. It is greater than 1 if unrolling is performed.
2394 For example, we have two scalar operands, s1 and s2 (e.g., group of
2395 strided accesses of size two), while NUNITS is four (i.e., four scalars
2396 of this type can be packed in a vector). The output vector will contain
2397 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2398 will be 2).
2400 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2401 containing the operands.
2403 For example, NUNITS is four as before, and the group size is 8
2404 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2405 {s5, s6, s7, s8}. */
2407 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
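/* E.g. (illustrative): with nunits == 4 and group_size == 3,
   least_common_multiple (4, 3) == 12, so number_of_copies == 4 and the
   twelve collected scalars fill exactly three four-element vectors.  */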
2409 number_of_places_left_in_vector = nunits;
2410 elts = XALLOCAVEC (tree, nunits);
2411 for (j = 0; j < number_of_copies; j++)
2413 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2415 if (is_store)
2416 op = gimple_assign_rhs1 (stmt);
2417 else
2419 switch (code)
2421 case COND_EXPR:
2422 if (op_num == 0 || op_num == 1)
2424 tree cond = gimple_assign_rhs1 (stmt);
2425 op = TREE_OPERAND (cond, op_num);
2427 else
2429 if (op_num == 2)
2430 op = gimple_assign_rhs2 (stmt);
2431 else
2432 op = gimple_assign_rhs3 (stmt);
2434 break;
2436 case CALL_EXPR:
2437 op = gimple_call_arg (stmt, op_num);
2438 break;
2440 case LSHIFT_EXPR:
2441 case RSHIFT_EXPR:
2442 case LROTATE_EXPR:
2443 case RROTATE_EXPR:
2444 op = gimple_op (stmt, op_num + 1);
2445 /* Unlike the other binary operators, the shift count of
2446 shifts/rotates has type int rather than the type of the
2447 lhs, so make sure the scalar constant has the right type
2448 if we are dealing with vectors of
2449 long long/long/short/char. */
2450 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2451 op = fold_convert (TREE_TYPE (vector_type), op);
2452 break;
2454 default:
2455 op = gimple_op (stmt, op_num + 1);
2456 break;
2460 if (reduc_index != -1)
2462 loop = (gimple_bb (stmt))->loop_father;
2463 def_stmt = SSA_NAME_DEF_STMT (op);
2465 gcc_assert (loop);
2467 /* Get the def before the loop. In reduction chain we have only
2468 one initial value. */
2469 if ((j != (number_of_copies - 1)
2470 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2471 && i != 0))
2472 && neutral_op)
2473 op = neutral_op;
2474 else
2475 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2476 loop_preheader_edge (loop));
2479 /* Create 'vect_ = {op0,op1,...,opn}'. */
2480 number_of_places_left_in_vector--;
2481 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2483 if (CONSTANT_CLASS_P (op))
2485 op = fold_unary (VIEW_CONVERT_EXPR,
2486 TREE_TYPE (vector_type), op);
2487 gcc_assert (op && CONSTANT_CLASS_P (op));
2489 else
2491 tree new_temp
2492 = make_ssa_name (TREE_TYPE (vector_type), NULL);
2493 gimple init_stmt;
2494 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2495 op);
2496 init_stmt
2497 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2498 new_temp, op, NULL_TREE);
2499 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2500 op = new_temp;
2503 elts[number_of_places_left_in_vector] = op;
2504 if (!CONSTANT_CLASS_P (op))
2505 constant_p = false;
2507 if (number_of_places_left_in_vector == 0)
2509 number_of_places_left_in_vector = nunits;
2511 if (constant_p)
2512 vec_cst = build_vector (vector_type, elts);
2513 else
2515 vec<constructor_elt, va_gc> *v;
2516 unsigned k;
2517 vec_alloc (v, nunits);
2518 for (k = 0; k < nunits; ++k)
2519 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2520 vec_cst = build_constructor (vector_type, v);
2522 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2523 vector_type, NULL));
2524 if (ctor_seq != NULL)
2526 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2527 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2528 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2529 GSI_SAME_STMT);
2530 ctor_seq = NULL;
2536 /* Since the vectors are created in reverse order, put them back in the
2537 original order. */
2538 vec_num = voprnds.length ();
2539 for (j = vec_num; j != 0; j--)
2541 vop = voprnds[j - 1];
2542 vec_oprnds->quick_push (vop);
2545 voprnds.release ();
2547 /* If VF is greater than the unrolling factor needed for the SLP group of
2548 stmts, the NUMBER_OF_VECTORS to be created is greater than
2549 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2550 to replicate the vectors. */
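/* E.g. (a sketch): if NUMBER_OF_VECTORS is 4 but only {v0, v1} have
   been built, the loop below pushes either two neutral vectors (when
   NEUTRAL_OP is known, as for reductions) or v0 and v1 again, giving
   {v0, v1, v0, v1}.  */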
2551 while (number_of_vectors > vec_oprnds->length ())
2553 tree neutral_vec = NULL;
2555 if (neutral_op)
2557 if (!neutral_vec)
2558 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2560 vec_oprnds->quick_push (neutral_vec);
2562 else
2564 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2565 vec_oprnds->quick_push (vop);
2571 /* Get vectorized definitions from SLP_NODE that contains corresponding
2572 vectorized def-stmts. */
2574 static void
2575 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2577 tree vec_oprnd;
2578 gimple vec_def_stmt;
2579 unsigned int i;
2581 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2583 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2585 gcc_assert (vec_def_stmt);
2586 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2587 vec_oprnds->quick_push (vec_oprnd);
2592 /* Get vectorized definitions for SLP_NODE.
2593 If the scalar definitions are loop invariants or constants, collect them and
2594 call vect_get_constant_vectors() to create vector stmts.
2595 Otherwise, the def-stmts must already be vectorized and the vectorized
2596 stmts stored in the corresponding child of SLP_NODE, in which case we
2597 call vect_get_slp_vect_defs () to retrieve them. */
2599 void
2600 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2601 vec<vec<tree> > *vec_oprnds, int reduc_index)
2603 gimple first_stmt;
2604 int number_of_vects = 0, i;
2605 unsigned int child_index = 0;
2606 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2607 slp_tree child = NULL;
2608 vec<tree> vec_defs;
2609 tree oprnd;
2610 bool vectorized_defs;
2612 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2613 FOR_EACH_VEC_ELT (ops, i, oprnd)
2615 /* For each operand we check if it has vectorized definitions in a child
2616 node or we need to create them (for invariants and constants). We
2617 check if the LHS of the first stmt of the next child matches OPRND.
2618 If it does, we have found the correct child. Otherwise, we call
2619 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2620 this child node is checked again for the next operand. */
2621 vectorized_defs = false;
2622 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2624 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2626 /* We have to check both pattern and original def, if available. */
2627 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2628 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2630 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2631 || (related
2632 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2634 /* The number of vector defs is determined by the number of
2635 vector statements in the node from which we get those
2636 statements. */
2637 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2638 vectorized_defs = true;
2639 child_index++;
2643 if (!vectorized_defs)
2645 if (i == 0)
2647 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2648 /* Number of vector stmts was calculated according to LHS in
2649 vect_schedule_slp_instance (), fix it by replacing LHS with
2650 RHS, if necessary. See vect_get_smallest_scalar_type () for
2651 details. */
2652 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2653 &rhs_size_unit);
2654 if (rhs_size_unit != lhs_size_unit)
2656 number_of_vects *= rhs_size_unit;
2657 number_of_vects /= lhs_size_unit;
2662 /* Allocate memory for vectorized defs. */
2663 vec_defs = vNULL;
2664 vec_defs.create (number_of_vects);
2666 /* For reduction defs we call vect_get_constant_vectors (), since we are
2667 looking for initial loop invariant values. */
2668 if (vectorized_defs && reduc_index == -1)
2669 /* The defs are already vectorized. */
2670 vect_get_slp_vect_defs (child, &vec_defs);
2671 else
2672 /* Build vectors from scalar defs. */
2673 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2674 number_of_vects, reduc_index);
2676 vec_oprnds->quick_push (vec_defs);
2678 /* For reductions, we only need initial values. */
2679 if (reduc_index != -1)
2680 return;
2685 /* Create NCOPIES permutation statements using the vector mask MASK and two
2686 input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2687 for the first copy, shifting by STRIDE elements of DR_CHAIN for every
2688 subsequent copy.
2689 (STRIDE is the number of vectorized stmts for NODE divided by the number
2690 of copies.)
2691 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2692 where the created stmts must be inserted. */
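/* E.g. (illustrative): with four vector stmts for NODE and NCOPIES == 2,
   STRIDE is 2; the first copy permutes DR_CHAIN[FIRST_VEC_INDX] with
   DR_CHAIN[SECOND_VEC_INDX], and the second copy permutes the vectors
   found two positions further along DR_CHAIN.  */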
2694 static inline void
2695 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2696 tree mask, int first_vec_indx, int second_vec_indx,
2697 gimple_stmt_iterator *gsi, slp_tree node,
2698 tree vectype, vec<tree> dr_chain,
2699 int ncopies, int vect_stmts_counter)
2701 tree perm_dest;
2702 gimple perm_stmt = NULL;
2703 stmt_vec_info next_stmt_info;
2704 int i, stride;
2705 tree first_vec, second_vec, data_ref;
2707 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2709 /* Initialize the vect stmts of NODE to properly insert the generated
2710 stmts later. */
2711 for (i = SLP_TREE_VEC_STMTS (node).length ();
2712 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2713 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2715 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2716 for (i = 0; i < ncopies; i++)
2718 first_vec = dr_chain[first_vec_indx];
2719 second_vec = dr_chain[second_vec_indx];
2721 /* Generate the permute statement. */
2722 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2723 first_vec, second_vec, mask);
2724 data_ref = make_ssa_name (perm_dest, perm_stmt);
2725 gimple_set_lhs (perm_stmt, data_ref);
2726 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2728 /* Store the vector statement in NODE. */
2729 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2731 first_vec_indx += stride;
2732 second_vec_indx += stride;
2735 /* Mark the scalar stmt as vectorized. */
2736 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2737 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2741 /* Given FIRST_MASK_ELEMENT, the mask element in element representation,
2742 return in CURRENT_MASK_ELEMENT its equivalent in target-specific
2743 representation. Check that the mask is valid and return FALSE if not.
2744 Set NEED_NEXT_VECTOR to TRUE if the permutation requires moving to the
2745 next vector, i.e., the current first vector is no longer needed. */
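/* A sketch of the adjustment below, assuming MASK_NUNITS == 4: mask
   values 0-3 select from the first input vector and 4-7 from the
   second; a value of 9 falls outside both, so the two-vector window is
   shifted to the second and third input vectors and all mask values
   emitted so far are reduced by 4 (9 becomes 5).  */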
2747 static bool
2748 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2749 int mask_nunits, bool only_one_vec, int index,
2750 unsigned char *mask, int *current_mask_element,
2751 bool *need_next_vector, int *number_of_mask_fixes,
2752 bool *mask_fixed, bool *needs_first_vector)
2754 int i;
2756 /* Convert to target specific representation. */
2757 *current_mask_element = first_mask_element + m;
2758 /* Adjust the value in case it's a mask for second and third vectors. */
2759 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2761 if (*current_mask_element < mask_nunits)
2762 *needs_first_vector = true;
2764 /* We have only one input vector to permute but the mask accesses values in
2765 the next vector as well. */
2766 if (only_one_vec && *current_mask_element >= mask_nunits)
2768 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771 "permutation requires at least two vectors ");
2772 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2773 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2776 return false;
2779 /* The mask requires the next vector. */
2780 if (*current_mask_element >= mask_nunits * 2)
2782 if (*needs_first_vector || *mask_fixed)
2784 /* We either need the first vector too or have already moved to the
2785 next vector. In both cases, this permutation needs three
2786 vectors. */
2787 if (dump_enabled_p ())
2789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2790 "permutation requires at "
2791 "least three vectors ");
2792 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2793 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2796 return false;
2799 /* We move to the next vector, dropping the first one and working with
2800 the second and the third - we need to adjust the values of the mask
2801 accordingly. */
2802 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2804 for (i = 0; i < index; i++)
2805 mask[i] -= mask_nunits * *number_of_mask_fixes;
2807 (*number_of_mask_fixes)++;
2808 *mask_fixed = true;
2811 *need_next_vector = *mask_fixed;
2813 /* This was the last element of this mask. Start a new one. */
2814 if (index == mask_nunits - 1)
2816 *number_of_mask_fixes = 1;
2817 *mask_fixed = false;
2818 *needs_first_vector = false;
2821 return true;
2825 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2826 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2827 permute statements for the SLP node NODE of the SLP instance
2828 SLP_NODE_INSTANCE. */
2830 bool
2831 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
2832 gimple_stmt_iterator *gsi, int vf,
2833 slp_instance slp_node_instance, bool analyze_only)
2835 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2836 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2837 tree mask_element_type = NULL_TREE, mask_type;
2838 int i, j, k, nunits, vec_index = 0, scalar_index;
2839 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2840 gimple next_scalar_stmt;
2841 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2842 int first_mask_element;
2843 int index, unroll_factor, current_mask_element, ncopies;
2844 unsigned char *mask;
2845 bool only_one_vec = false, need_next_vector = false;
2846 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2847 int number_of_mask_fixes = 1;
2848 bool mask_fixed = false;
2849 bool needs_first_vector = false;
2850 enum machine_mode mode;
2852 mode = TYPE_MODE (vectype);
2854 if (!can_vec_perm_p (mode, false, NULL))
2856 if (dump_enabled_p ())
2858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2859 "no vect permute for ");
2860 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2861 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2863 return false;
2866 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2867 same size as the vector element being permuted. */
2868 mask_element_type = lang_hooks.types.type_for_mode
2869 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2870 mask_type = get_vectype_for_scalar_type (mask_element_type);
2871 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2872 mask = XALLOCAVEC (unsigned char, nunits);
2873 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2875 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2876 unrolling factor. */
2877 orig_vec_stmts_num = group_size *
2878 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2879 if (orig_vec_stmts_num == 1)
2880 only_one_vec = true;
2882 /* The number of copies is determined by the final vectorization factor
2883 relative to the SLP_NODE_INSTANCE unrolling factor. */
2884 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2886 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2887 return false;
2889 /* Generate permutation masks for every NODE. Number of masks for each NODE
2890 is equal to GROUP_SIZE.
2891 E.g., we have a group of three nodes with three loads from the same
2892 location in each node, and the vector size is 4. I.e., we have an
2893 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2894 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2895 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2898 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2899 The last mask is illegal since we assume two operands for the permute
2900 operation, so mask element values cannot exceed 2 * NUNITS - 1 (here 7).
2901 Hence, the last mask must be converted into {2,5,5,5}.
2902 For the first two permutations we need the first and the second input
2903 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2904 we need the second and the third vectors: {b1,c1,a2,b2} and
2905 {c2,a3,b3,c3}. */
2908 scalar_index = 0;
2909 index = 0;
2910 vect_stmts_counter = 0;
2911 vec_index = 0;
2912 first_vec_index = vec_index++;
2913 if (only_one_vec)
2914 second_vec_index = first_vec_index;
2915 else
2916 second_vec_index = vec_index++;
2918 for (j = 0; j < unroll_factor; j++)
2920 for (k = 0; k < group_size; k++)
2922 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
2923 first_mask_element = i + j * group_size;
2924 if (!vect_get_mask_element (stmt, first_mask_element, 0,
2925 nunits, only_one_vec, index,
2926 mask, &current_mask_element,
2927 &need_next_vector,
2928 &number_of_mask_fixes, &mask_fixed,
2929 &needs_first_vector))
2930 return false;
2931 mask[index++] = current_mask_element;
2933 if (index == nunits)
2935 index = 0;
2936 if (!can_vec_perm_p (mode, false, mask))
2938 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2941 vect_location,
2942 "unsupported vect permute { ");
2943 for (i = 0; i < nunits; ++i)
2944 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2945 mask[i]);
2946 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2948 return false;
2951 if (!analyze_only)
2953 int l;
2954 tree mask_vec, *mask_elts;
2955 mask_elts = XALLOCAVEC (tree, nunits);
2956 for (l = 0; l < nunits; ++l)
2957 mask_elts[l] = build_int_cst (mask_element_type,
2958 mask[l]);
2959 mask_vec = build_vector (mask_type, mask_elts);
2961 if (need_next_vector)
2963 first_vec_index = second_vec_index;
2964 second_vec_index = vec_index;
2967 next_scalar_stmt
2968 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
2970 vect_create_mask_and_perm (stmt, next_scalar_stmt,
2971 mask_vec, first_vec_index, second_vec_index,
2972 gsi, node, vectype, dr_chain,
2973 ncopies, vect_stmts_counter++);
2980 return true;
2985 /* Vectorize SLP instance tree in postorder. */
2987 static bool
2988 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
2989 unsigned int vectorization_factor)
2991 gimple stmt;
2992 bool grouped_store, is_store;
2993 gimple_stmt_iterator si;
2994 stmt_vec_info stmt_info;
2995 unsigned int vec_stmts_size, nunits, group_size;
2996 tree vectype;
2997 int i;
2998 slp_tree child;
3000 if (!node)
3001 return false;
3003 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3004 vect_schedule_slp_instance (child, instance, vectorization_factor);
3006 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3007 stmt_info = vinfo_for_stmt (stmt);
3009 /* VECTYPE is the type of the destination. */
3010 vectype = STMT_VINFO_VECTYPE (stmt_info);
3011 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3012 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3014 /* For each SLP instance calculate the number of vector stmts to be created
3015 for the scalar stmts in each node of the SLP tree. The number of vector
3016 elements in one vector iteration is the number of scalar elements in
3017 one scalar iteration (GROUP_SIZE) multiplied by VF and divided by the
3018 vector size. */
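/* Worked example (illustrative): for basic-block SLP the vectorization
   factor is 1, so with GROUP_SIZE == 8 and NUNITS == 4 the node needs
   (1 * 8) / 4 == 2 vector stmts.  */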
3019 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3021 if (!SLP_TREE_VEC_STMTS (node).exists ())
3023 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3024 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3027 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_NOTE,vect_location,
3030 "------>vectorizing SLP node starting from: ");
3031 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3032 dump_printf (MSG_NOTE, "\n");
3035 /* Loads should be inserted before the first load. */
3036 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3037 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3038 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3039 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
3040 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3041 else if (is_pattern_stmt_p (stmt_info))
3042 si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3043 else
3044 si = gsi_for_stmt (stmt);
3046 /* Stores should be inserted just before the last store. */
3047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3048 && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3050 gimple last_store = vect_find_last_store_in_slp_instance (instance);
3051 if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3052 last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3053 si = gsi_for_stmt (last_store);
3056 /* Mark the first element of the reduction chain as a reduction to properly
3057 transform the node. In the analysis phase only the last element of the
3058 chain is marked as a reduction. */
3059 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3060 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3062 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3063 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3066 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3067 return is_store;
3070 /* Replace scalar calls from SLP node NODE with assignments setting their
3071 lhs to zero. For loop vectorization this is done in vectorizable_call,
3072 but for SLP it needs to be deferred until the end of vect_schedule_slp,
3073 because multiple SLP instances may refer to the same scalar stmt. */
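/* E.g. (a sketch): a vectorized scalar call

     x_1 = foo (a_2);

   is replaced below by the assignment

     x_1 = 0;

   which keeps the stmt_vec_info attached so the dead assignment can be
   cleaned up later.  */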
3075 static void
3076 vect_remove_slp_scalar_calls (slp_tree node)
3078 gimple stmt, new_stmt;
3079 gimple_stmt_iterator gsi;
3080 int i;
3081 slp_tree child;
3082 tree lhs;
3083 stmt_vec_info stmt_info;
3085 if (!node)
3086 return;
3088 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3089 vect_remove_slp_scalar_calls (child);
3091 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3093 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3094 continue;
3095 stmt_info = vinfo_for_stmt (stmt);
3096 if (stmt_info == NULL
3097 || is_pattern_stmt_p (stmt_info)
3098 || !PURE_SLP_STMT (stmt_info))
3099 continue;
3100 lhs = gimple_call_lhs (stmt);
3101 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3102 set_vinfo_for_stmt (new_stmt, stmt_info);
3103 set_vinfo_for_stmt (stmt, NULL);
3104 STMT_VINFO_STMT (stmt_info) = new_stmt;
3105 gsi = gsi_for_stmt (stmt);
3106 gsi_replace (&gsi, new_stmt, false);
3107 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3111 /* Generate vector code for all SLP instances in the loop/basic block. */
3113 bool
3114 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3116 vec<slp_instance> slp_instances;
3117 slp_instance instance;
3118 unsigned int i, vf;
3119 bool is_store = false;
3121 if (loop_vinfo)
3123 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3124 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3126 else
3128 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3129 vf = 1;
3132 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3134 /* Schedule the tree of INSTANCE. */
3135 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3136 instance, vf);
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "vectorizing stmts using SLP.\n");
3142 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3144 slp_tree root = SLP_INSTANCE_TREE (instance);
3145 gimple store;
3146 unsigned int j;
3147 gimple_stmt_iterator gsi;
3149 /* Remove scalar call stmts. Do not do this for basic-block
3150 vectorization as not all uses may be vectorized.
3151 ??? Why should this be necessary? DCE should be able to
3152 remove the stmts itself.
3153 ??? For BB vectorization we can as well remove scalar
3154 stmts starting from the SLP tree root if they have no
3155 uses. */
3156 if (loop_vinfo)
3157 vect_remove_slp_scalar_calls (root);
3159 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3160 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3162 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3163 break;
3165 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3166 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3167 /* Free the attached stmt_vec_info and remove the stmt. */
3168 gsi = gsi_for_stmt (store);
3169 unlink_stmt_vdef (store);
3170 gsi_remove (&gsi, true);
3171 release_defs (store);
3172 free_stmt_vec_info (store);
3176 return is_store;
3180 /* Vectorize the basic block. */
3182 void
3183 vect_slp_transform_bb (basic_block bb)
3185 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3186 gimple_stmt_iterator si;
3188 gcc_assert (bb_vinfo);
3190 if (dump_enabled_p ())
3191 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3193 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3195 gimple stmt = gsi_stmt (si);
3196 stmt_vec_info stmt_info;
3198 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_NOTE, vect_location,
3201 "------>SLPing statement: ");
3202 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3203 dump_printf (MSG_NOTE, "\n");
3206 stmt_info = vinfo_for_stmt (stmt);
3207 gcc_assert (stmt_info);
3209 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3210 if (STMT_SLP_TYPE (stmt_info))
3212 vect_schedule_slp (NULL, bb_vinfo);
3213 break;
3217 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE, vect_location,
3219 "BASIC BLOCK VECTORIZED\n");
3221 destroy_bb_vec_info (bb_vinfo);