/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if it succeeds, and UNKNOWN_LOC if not.  */

LOC
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOC;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOC)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOC;
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree ((slp_tree) child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);

  return node;
}

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_def_type = NULL_TREE;
      oprnd_info->first_const_oprnd = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  */

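/* For instance (an illustrative example, not from this file): for the group

     a[0] = b[0] + x;
     a[1] = b[1] + x;

   operand 0 of each stmt is defined by a load from B (vect_internal_def)
   while operand 1 is the invariant X (vect_external_def); the defs seen for
   the second stmt must match those recorded for the first one.  */
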
static bool
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             slp_tree slp_node, gimple stmt,
                             int ncopies_for_cost, bool first,
                             vec<slp_oprnd_info> *oprnds_info,
                             stmt_vector_for_cost *prologue_cost_vec,
                             stmt_vector_for_cost *body_cost_vec)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def, def_op0 = NULL_TREE;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  enum vect_def_type dt_op0 = vect_uninitialized_def;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree lhs = gimple_get_lhs (stmt);
  struct loop *loop = NULL;
  enum tree_code rhs_code;
  bool different_types = false;
  bool pattern = false;
  slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
  int op_idx = 1;
  tree compare_rhs = NULL_TREE;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        number_of_oprnds++;
    }
  else
    return false;

  for (i = 0; i < number_of_oprnds; i++)
    {
      if (compare_rhs)
        {
          oprnd = compare_rhs;
          compare_rhs = NULL_TREE;
        }
      else
        oprnd = gimple_op (stmt, op_idx++);

      oprnd_info = (*oprnds_info)[i];

      if (COMPARISON_CLASS_P (oprnd))
        {
          compare_rhs = TREE_OPERAND (oprnd, 1);
          oprnd = TREE_OPERAND (oprnd, 0);
        }

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
            }

          return false;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                }

              return false;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.");
              return false;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt: ");
                return false;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          if (def)
            {
              oprnd_info->first_def_type = TREE_TYPE (def);
              oprnd_info->first_const_oprnd = NULL_TREE;
            }
          else
            {
              oprnd_info->first_def_type = NULL_TREE;
              oprnd_info->first_const_oprnd = oprnd;
            }

          if (i == 0)
            {
              def_op0 = def;
              dt_op0 = dt;
              /* Analyze costs (for the first stmt of the group only).  */
              if (REFERENCE_CLASS_P (lhs))
                /* Store.  */
                vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                                       dt, slp_node, prologue_cost_vec,
                                       body_cost_vec);
              else
                {
                  enum vect_def_type dts[2];
                  dts[0] = dt;
                  dts[1] = vect_uninitialized_def;
                  /* Not memory operation (we don't call this function for
                     loads).  */
                  vect_model_simple_cost (stmt_info, ncopies_for_cost, dts,
                                          prologue_cost_vec, body_cost_vec);
                }
            }
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def))
               || (oprnd_info->first_def_type != NULL_TREE
                   && def
                   && !types_compatible_p (oprnd_info->first_def_type,
                                           TREE_TYPE (def))))
              || (!def
                  && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
                                          TREE_TYPE (oprnd)))
              || different_types)
            {
              if (number_of_oprnds != 2)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different types ");

                  return false;
                }

              /* Try to swap operands in case of binary operation.  */
              if (i == 0)
                different_types = true;
              else
                {
                  oprnd0_info = (*oprnds_info)[0];
                  if (is_gimple_assign (stmt)
                      && (rhs_code = gimple_assign_rhs_code (stmt))
                      && TREE_CODE_CLASS (rhs_code) == tcc_binary
                      && commutative_tree_code (rhs_code)
                      && oprnd0_info->first_dt == dt
                      && oprnd_info->first_dt == dt_op0
                      && def_op0 && def
                      && !(oprnd0_info->first_def_type
                           && !types_compatible_p (oprnd0_info->first_def_type,
                                                   TREE_TYPE (def)))
                      && !(oprnd_info->first_def_type
                           && !types_compatible_p (oprnd_info->first_def_type,
                                                   TREE_TYPE (def_op0))))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "Swapping operands of ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                        }

                      swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                          gimple_assign_rhs2_ptr (stmt));
                    }
                  else
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: different types ");

                      return false;
                    }
                }
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          if (different_types)
            {
              oprnd0_info = (*oprnds_info)[0];
              oprnd1_info = (*oprnds_info)[1];
              if (i == 0)
                oprnd1_info->def_stmts.quick_push (def_stmt);
              else
                oprnd0_info->def_stmts.quick_push (def_stmt);
            }
          else
            oprnd_info->def_stmts.quick_push (def_stmt);

          break;
        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
            }

          return false;
        }
    }

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation, or are of unsupported operation types.  Otherwise, return
   TRUE.  */

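/* E.g., an illustrative group of four isomorphic stores

     a[0] = b[0] + c[0];  a[1] = b[1] + c[1];
     a[2] = b[2] + c[2];  a[3] = b[3] + c[3];

   yields a root node holding the four stores and one child node per
   operand: the loads from B and the loads from C.  */
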
static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size, int *outside_cost,
                     int ncopies_for_cost, unsigned int *max_nunits,
                     vec<int> *load_permutation,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor, bool *loads_permuted,
                     stmt_vector_for_cost *prologue_cost_vec,
                     stmt_vector_for_cost *body_cost_vec)
{
  unsigned int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool stop_recursion = false, need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  bool permutation = false;
  unsigned int load_place;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  vec<slp_oprnd_info> oprnds_info;
  unsigned int nops;
  slp_oprnd_info oprnd_info;
  tree cond;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  oprnds_info = vect_create_oprnd_info (nops, group_size);

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (is_gimple_call (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (stmt)
              || gimple_call_tail_p (stmt)
              || gimple_call_noreturn_p (stmt)
              || !gimple_call_nothrow_p (stmt)
              || gimple_call_chain (stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.");
                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.");
                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
                                                stmt, ncopies_for_cost,
                                                (i == 0), &oprnds_info,
                                                prologue_cost_vec,
                                                body_cost_vec))
                {
                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  The only exception is complex
                     numbers.  */
                  if (prev_first_load != first_load
                      && rhs_code != REALPART_EXPR
                      && rhs_code != IMAGPART_EXPR)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }

                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }

                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }

                  /* Analyze costs (for the first stmt in the group).  */
                  vect_model_load_cost (vinfo_for_stmt (stmt),
                                        ncopies_for_cost, false, *node,
                                        prologue_cost_vec, body_cost_vec);
                }

              /* Store the place of this load in the interleaving chain.  In
                 case that permutation is needed we later decide if a specific
                 permutation is supported.  */
              load_place = vect_get_place_in_interleaving_chain (stmt,
                                                                 first_load);
              if (load_place != i)
                permutation = true;

              load_permutation->safe_push (load_place);

              /* We stop the tree when we reach a group of loads.  */
              stop_recursion = true;
              continue;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not a grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Non-grouped loads are not supported.  */
              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }
          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }

          /* Find the def-stmts.  */
          if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
                                            ncopies_for_cost, (i == 0),
                                            &oprnds_info, prologue_cost_vec,
                                            body_cost_vec))
            {
              vect_free_oprnd_info (oprnds_info);
              return false;
            }
        }
    }
  /* Grouped loads were reached - stop the recursion.  */
  if (stop_recursion)
    {
      loads->safe_push (*node);
      if (permutation)
        {
          gimple first_stmt = stmts[0];
          *loads_permuted = true;
          (void) record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                   vinfo_for_stmt (first_stmt), 0, vect_body);
        }
      else
        {
          /* We don't check complex-number chains here, so we set
             LOADS_PERMUTED for a further check in
             vect_supported_load_permutation_p.  */
          if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
            *loads_permuted = true;
        }

      vect_free_oprnd_info (oprnds_info);
      return true;
    }
  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child
          || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
                                   outside_cost, ncopies_for_cost,
                                   max_nunits, load_permutation, loads,
                                   vectorization_factor, loads_permuted,
                                   prologue_cost_vec, body_cost_vec))
        {
          if (child)
            oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      oprnd_info->def_stmts.create (0);
      SLP_TREE_CHILDREN (*node).quick_push (child);
    }

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, (slp_tree) child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts ((slp_tree) child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant ((slp_tree) child);
}

/* Check if the permutation required by the SLP INSTANCE is supported.
   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */

static bool
vect_supported_slp_permutation_p (slp_instance instance)
{
  slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  vec<slp_tree> sorted_loads = vNULL;
  int index;
  slp_tree *tmp_loads = NULL;
  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
  slp_tree load;

  /* FORNOW: The only supported loads permutation is loads from the same
     location in all the loads in the node, when the data-refs in
     nodes of LOADS constitute an interleaving chain.
     Sort the nodes according to the order of accesses in the chain.  */
  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
  for (i = 0, j = 0;
       SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
       && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
       i += group_size, j++)
    {
      gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
      /* Check that the loads are all in the same interleaving chain.  */
      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data "
                               "permutation ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                scalar_stmt, 0);
            }

          free (tmp_loads);
          return false;
        }

      tmp_loads[index] = load;
    }

  sorted_loads.create (group_size);
  for (i = 0; i < group_size; i++)
    sorted_loads.safe_push (tmp_loads[i]);

  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_LOADS (instance) = sorted_loads;
  free (tmp_loads);

  if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
                                     instance, true))
    return false;

  return true;
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<int> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int index, i;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts ((slp_tree) child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);

  for (i = 0; i < group_size; i++)
    tmp_stmts.safe_push (NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      index = permutation[i];
      tmp_stmts[index] = stmt;
    }

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

/* Check if the required load permutation is supported.
   LOAD_PERMUTATION contains a list of indices of the loads.
   In SLP this permutation is relative to the order of grouped stores that are
   the base of the SLP instance.  */

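/* E.g. (illustrative), for a group of size 4 the identity permutation
   {0 1 2 3} needs no data shuffling, whereas {1 0 3 2} does and is kept
   only if a supported permutation can implement it.  */
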
static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                   vec<int> load_permutation)
{
  int i = 0, j, prev = -1, next, k, number_of_groups;
  bool supported, bad_permutation = false;
  sbitmap load_index;
  slp_tree node, other_complex_node;
  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
  unsigned complex_numbers = 0;
  struct data_reference *dr;
  bb_vec_info bb_vinfo;

  /* FORNOW: permutations are only supported in SLP.  */
  if (!slp_instn)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (load_permutation, i, next)
        dump_printf (MSG_NOTE, "%d ", next);
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;

      stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (stmt)
          && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
              || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
        complex_numbers++;
    }

  /* Complex operands can be swapped as following:
      real_c = real_b + real_a;
      imag_c = imag_a + imag_b;
     i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
     {real_a, imag_a} and {real_b, imag_b}.  We check here that if interleaving
     chains are mixed, they match the above pattern.  */
  if (complex_numbers)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
            {
              if (j == 0)
                first = stmt;
              else
                {
                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
                    {
                      if (complex_numbers != 2)
                        return false;

                      if (i == 0)
                        k = 1;
                      else
                        k = 0;

                      other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
                      other_node_first
                        = SLP_TREE_SCALAR_STMTS (other_complex_node)[0];

                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                          != other_node_first)
                        return false;
                    }
                }
            }
        }
    }
  /* We checked that this case is ok, so there is no need to proceed with
     permutation tests.  */
  if (complex_numbers == 2
      && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
    {
      SLP_INSTANCE_LOADS (slp_instn).release ();
      SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
      return true;
    }
  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
     instance, not all the loads belong to the same node or interleaving
     group.  Hence, we need to divide them into groups according to
     GROUP_SIZE.  */
  number_of_groups = load_permutation.length () / group_size;

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      int first_group_load_index;

      /* Compare all the permutation sequences to the first one.  */
      for (i = 1; i < number_of_groups; i++)
        {
          k = 0;
          for (j = i * group_size; j < i * group_size + group_size; j++)
            {
              next = load_permutation[j];
              first_group_load_index = load_permutation[k];

              if (next != first_group_load_index)
                {
                  bad_permutation = true;
                  break;
                }

              k++;
            }

          if (bad_permutation)
            break;
        }

      if (!bad_permutation)
        {
          /* Check that the loads in the first sequence are different and there
             are no gaps between them.  */
          load_index = sbitmap_alloc (group_size);
          bitmap_clear (load_index);
          for (k = 0; k < group_size; k++)
            {
              first_group_load_index = load_permutation[k];
              if (bitmap_bit_p (load_index, first_group_load_index))
                {
                  bad_permutation = true;
                  break;
                }

              bitmap_set_bit (load_index, first_group_load_index);
            }

          if (!bad_permutation)
            for (k = 0; k < group_size; k++)
              if (!bitmap_bit_p (load_index, k))
                {
                  bad_permutation = true;
                  break;
                }

          sbitmap_free (load_index);
        }

      if (!bad_permutation)
        {
          /* This permutation is valid for reduction.  Since the order of the
             statements in the nodes is not important unless they are memory
             accesses, we can rearrange the statements in all the nodes
             according to the order of the loads.  */
          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                    load_permutation);
          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
          return true;
        }
    }
  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
  bad_permutation = false;
  /* Check that for every node in the instance the loads form a subchain.  */
  if (bb_vinfo)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          first_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (!first_load)
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
              else if (first_load
                       != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
                {
                  bad_permutation = true;
                  break;
                }

              if (j != 0 && next_load != load)
                {
                  bad_permutation = true;
                  break;
                }

              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }

          if (bad_permutation)
            break;
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.  */
      if (!bad_permutation)
        {
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
            {
              first_load = SLP_TREE_SCALAR_STMTS (node)[0];
              if (first_load
                  != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
                {
                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
                  if (vect_supportable_dr_alignment (dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "unsupported unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            first_load, 0);
                        }
                      bad_permutation = true;
                      break;
                    }
                }
            }

          if (!bad_permutation)
            {
              SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
              return true;
            }
        }
    }
  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
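  /* E.g., for group_size == 2 the only accepted pattern is {0 0 1 1}:
     each position of the group repeats one load index GROUP_SIZE times.  */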
  if (load_permutation.length ()
      != (unsigned int) (group_size * group_size))
    return false;

  supported = true;
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  for (j = 0; j < group_size; j++)
    {
      for (i = j * group_size, k = 0;
           load_permutation.iterate (i, &next) && k < group_size;
           i++, k++)
        {
          if (i != j * group_size && next != prev)
            {
              supported = false;
              break;
            }

          prev = next;
        }

      if (bitmap_bit_p (load_index, prev))
        {
          supported = false;
          break;
        }

      bitmap_set_bit (load_index, prev);
    }

  for (j = 0; j < group_size; j++)
    if (!bitmap_bit_p (load_index, j))
      {
        sbitmap_free (load_index);
        return false;
      }

  sbitmap_free (load_index);

  if (supported && i == group_size * group_size
      && vect_supported_slp_permutation_p (slp_instn))
    return true;

  return false;
}

/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}

/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

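/* E.g. (illustrative), four adjacent stores a[0] .. a[3] form a group of
   size 4 that becomes the root of the instance; their rhs operands seed
   the recursive tree build.  */
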
static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int outside_cost = 0, ncopies_for_cost, i;
  unsigned int max_nunits = 0;
  vec<int> load_permutation;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  bool loads_permuted = false;
  vec<gimple> scalar_stmts;
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  stmt_info_for_cost *si;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
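  /* E.g., with nunits == 4 and group_size == 6 this gives
     least_common_multiple (4, 6) / 6 == 12 / 6 == 2: the group has to be
     unrolled twice to fill whole vectors.  */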
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  ncopies_for_cost = unrolling_factor * group_size / nunits;
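  /* E.g., with unrolling_factor == 1, group_size == 8 and nunits == 4
     this gives ncopies_for_cost == 2: two vector stmts cover the eight
     scalars.  */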
  load_permutation.create (group_size * group_size);
  loads.create (group_size);
  prologue_cost_vec.create (10);
  body_cost_vec.create (10);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &outside_cost, ncopies_for_cost,
                           &max_nunits, &load_permutation, &loads,
                           vectorization_factor, &loads_permuted,
                           &prologue_cost_vec, &body_cost_vec))
    {
      void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                    : BB_VINFO_TARGET_COST_DATA (bb_vinfo));

      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP");
          vect_free_slp_tree (node);
          body_cost_vec.release ();
          prologue_cost_vec.release ();
          load_permutation.release ();
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = body_cost_vec;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
      SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance, group_size,
                                                  load_permutation))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_slp_instance (new_instance);
              prologue_cost_vec.release ();
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
            = vect_find_first_load_in_slp_instance (new_instance);
        }
      else
        SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();

      /* Record the prologue costs, which were delayed until we were
         sure that SLP was successful.  Unlike the body costs, we know
         the final values now regardless of the loop vectorization factor.  */
      FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
        {
          struct _stmt_vec_info *stmt_info
            = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
          (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                                si->misalign, vect_prologue);
        }

      prologue_cost_vec.release ();

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }
  else
    {
      body_cost_vec.release ();
      prologue_cost_vec.release ();
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  load_permutation.release ();
  loads.release ();

  return false;
}

/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
    ok = true;

  return true;
}

/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
        unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                     "Decided to SLP %d instances. Unrolling factor %d",
                     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}

/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */

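/* E.g. (illustrative), a stmt computed in an SLP node whose result is also
   consumed by a stmt that is only loop-vectorized must be marked HYBRID,
   since both vectorization schemes need it.  */
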
static void
vect_detect_hybrid_slp_stmts (slp_tree node)
{
  int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
  gimple stmt = stmts[0];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  slp_void_p child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = NULL;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
  basic_block bb = NULL;

  if (!node)
    return;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);
  else
    bb = BB_VINFO_BB (bb_vinfo);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
        && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
        if (gimple_bb (use_stmt)
            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
                || bb == gimple_bb (use_stmt))
            && (stmt_vinfo = vinfo_for_stmt (use_stmt))
            && !STMT_SLP_TYPE (stmt_vinfo)
            && (STMT_VINFO_RELEVANT (stmt_vinfo)
                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
            && !(gimple_code (use_stmt) == GIMPLE_PHI
                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                    == vect_reduction_def))
          vect_mark_slp_stmts (node, hybrid, i);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_detect_hybrid_slp_stmts ((slp_tree) child);
}

/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}

/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
        /* Free stmt_vec_info.  */
        free_stmt_vec_info (stmt);
    }

  free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  free (bb_vinfo);
  bb->aux = NULL;
}

/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, (slp_tree) child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
        return false;
    }

  return true;
}

/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
                                             SLP_INSTANCE_TREE (instance)))
        {
          vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
        }
      else
        i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}

2007 /* Check if vectorization of the basic block is profitable. */
2009 static bool
2010 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2012 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2013 slp_instance instance;
2014 int i, j;
2015 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2016 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2017 unsigned int stmt_cost;
2018 gimple stmt;
2019 gimple_stmt_iterator si;
2020 basic_block bb = BB_VINFO_BB (bb_vinfo);
2021 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2022 stmt_vec_info stmt_info = NULL;
2023 stmt_vector_for_cost body_cost_vec;
2024 stmt_info_for_cost *ci;
2026 /* Calculate vector costs. */
2027 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2029 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2031 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2033 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2034 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2035 stmt_info, ci->misalign, vect_body);
2039 /* Calculate scalar cost. */
2040 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2042 stmt = gsi_stmt (si);
2043 stmt_info = vinfo_for_stmt (stmt);
2045 if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
2046 || !PURE_SLP_STMT (stmt_info))
2047 continue;
2049 if (STMT_VINFO_DATA_REF (stmt_info))
2051 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2052 stmt_cost = vect_get_stmt_cost (scalar_load);
2053 else
2054 stmt_cost = vect_get_stmt_cost (scalar_store);
2056 else
2057 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2059 scalar_cost += stmt_cost;
2062 /* Complete the target-specific cost calculation. */
2063 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2064 &vec_inside_cost, &vec_epilogue_cost);
2066 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2068 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2071 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2072 vec_inside_cost);
2073 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2074 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2075 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2078 /* Vectorization is profitable if its cost is less than the cost of the
2079 scalar version. */
2080 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2081 return false;
2083 return true;
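/* Worked example (illustrative, assuming default cost hooks where each
   scalar load, store and statement costs 1): four scalar loads plus four
   scalar stores give SCALAR_COST == 8.  If SLP replaces them with one
   vector load and one vector store, VEC_INSIDE_COST == 2, and with an
   empty prologue and epilogue VEC_OUTSIDE_COST == 0; since 0 + 2 < 8,
   vectorizing the block is considered profitable.  */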
2086 /* Check if the basic block can be vectorized. */
2088 static bb_vec_info
2089 vect_slp_analyze_bb_1 (basic_block bb)
2091 bb_vec_info bb_vinfo;
2092 vec<slp_instance> slp_instances;
2093 slp_instance instance;
2094 int i;
2095 int min_vf = 2;
2097 bb_vinfo = new_bb_vec_info (bb);
2098 if (!bb_vinfo)
2099 return NULL;
2101 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2103 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2105 "not vectorized: unhandled data-ref in basic "
2106 "block.\n");
2108 destroy_bb_vec_info (bb_vinfo);
2109 return NULL;
2112 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "not vectorized: not enough data-refs in "
2117 "basic block.\n");
2119 destroy_bb_vec_info (bb_vinfo);
2120 return NULL;
2123 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2125 if (dump_enabled_p ())
2126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2127 "not vectorized: unhandled data access in "
2128 "basic block.\n");
2130 destroy_bb_vec_info (bb_vinfo);
2131 return NULL;
2134 vect_pattern_recog (NULL, bb_vinfo);
2136 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2138 if (dump_enabled_p ())
2139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2140 "not vectorized: unhandled data dependence "
2141 "in basic block.\n");
2143 destroy_bb_vec_info (bb_vinfo);
2144 return NULL;
2147 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "not vectorized: bad data alignment in basic "
2152 "block.\n");
2154 destroy_bb_vec_info (bb_vinfo);
2155 return NULL;
2158 /* Check the SLP opportunities in the basic block, analyze and build SLP
2159 trees. */
2160 if (!vect_analyze_slp (NULL, bb_vinfo))
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2164 "not vectorized: failed to find SLP opportunities "
2165 "in basic block.\n");
2167 destroy_bb_vec_info (bb_vinfo);
2168 return NULL;
2171 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2173 /* Mark all the statements that we want to vectorize as pure SLP and
2174 relevant. */
2175 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2177 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2178 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2181 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185 "not vectorized: unsupported alignment in basic "
2186 "block.\n");
2187 destroy_bb_vec_info (bb_vinfo);
2188 return NULL;
2191 if (!vect_slp_analyze_operations (bb_vinfo))
2193 if (dump_enabled_p ())
2194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195 "not vectorized: bad operation in basic block.\n");
2197 destroy_bb_vec_info (bb_vinfo);
2198 return NULL;
2201 /* Cost model: check if the vectorization is worthwhile. */
2202 if (flag_vect_cost_model
2203 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2205 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 "not vectorized: vectorization is not "
2208 "profitable.\n");
2210 destroy_bb_vec_info (bb_vinfo);
2211 return NULL;
2214 if (dump_enabled_p ())
2215 dump_printf_loc (MSG_NOTE, vect_location,
2216 "Basic block will be vectorized using SLP\n");
2218 return bb_vinfo;
2222 bb_vec_info
2223 vect_slp_analyze_bb (basic_block bb)
2225 bb_vec_info bb_vinfo;
2226 int insns = 0;
2227 gimple_stmt_iterator gsi;
2228 unsigned int vector_sizes;
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2233 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2235 gimple stmt = gsi_stmt (gsi);
2236 if (!is_gimple_debug (stmt)
2237 && !gimple_nop_p (stmt)
2238 && gimple_code (stmt) != GIMPLE_LABEL)
2239 insns++;
2242 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "not vectorized: too many instructions in "
2247 "basic block.\n");
2249 return NULL;
2252 /* Autodetect the first vector size we try. */
2253 current_vector_size = 0;
2254 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2256 while (1)
2258 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2259 if (bb_vinfo)
2260 return bb_vinfo;
2262 destroy_bb_vec_info (bb_vinfo);
2264 vector_sizes &= ~current_vector_size;
2265 if (vector_sizes == 0
2266 || current_vector_size == 0)
2267 return NULL;
2269 /* Try the largest remaining vector size. */
2270 current_vector_size = 1 << floor_log2 (vector_sizes);
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "***** Re-trying analysis with "
2274 "vector size %d\n", current_vector_size);
2279 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2280 the number of created vector stmts depends on the unrolling factor).
2281 However, the actual number of vector stmts for every SLP node depends on
2282 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2283 should be updated. In this function we assume that the inside costs
2284 calculated in vect_model_xxx_cost are linear in ncopies. */
2286 void
2287 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2289 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2290 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2291 slp_instance instance;
2292 stmt_vector_for_cost body_cost_vec;
2293 stmt_info_for_cost *si;
2294 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "=== vect_update_slp_costs_according_to_vf ===");
2300 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2302 /* We assume that costs are linear in ncopies. */
2303 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2305 /* Record the instance's instructions in the target cost model.
2306 This was delayed until here because the count of instructions
2307 isn't known beforehand. */
2308 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2310 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2311 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2312 vinfo_for_stmt (si->stmt), si->misalign,
2313 vect_body);
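/* In numbers (illustrative): with a final VF of 8 and an SLP instance
   whose unrolling factor is 2, NCOPIES == 8 / 2 == 4, so a body-cost
   entry recorded with count 3 during analysis is entered into the
   target cost model as 3 * 4 == 12 statements.  */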
2318 /* For constant and loop invariant defs of SLP_NODE this function returns
2319 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2320 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2321 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2322 REDUC_INDEX is the index of the reduction operand in the statements, unless
2323 it is -1. */
2325 static void
2326 vect_get_constant_vectors (tree op, slp_tree slp_node,
2327 vec<tree> *vec_oprnds,
2328 unsigned int op_num, unsigned int number_of_vectors,
2329 int reduc_index)
2331 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2332 gimple stmt = stmts[0];
2333 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2334 unsigned nunits;
2335 tree vec_cst;
2336 tree *elts;
2337 unsigned j, number_of_places_left_in_vector;
2338 tree vector_type;
2339 tree vop;
2340 int group_size = stmts.length ();
2341 unsigned int vec_num, i;
2342 unsigned number_of_copies = 1;
2343 vec<tree> voprnds;
2344 voprnds.create (number_of_vectors);
2345 bool constant_p, is_store;
2346 tree neutral_op = NULL;
2347 enum tree_code code = gimple_expr_code (stmt);
2348 gimple def_stmt;
2349 struct loop *loop;
2350 gimple_seq ctor_seq = NULL;
2352 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2353 && reduc_index != -1)
2355 op_num = reduc_index - 1;
2356 op = gimple_op (stmt, reduc_index);
2357 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2358 we need either neutral operands or the original operands. See
2359 get_initial_def_for_reduction() for details. */
2360 switch (code)
2362 case WIDEN_SUM_EXPR:
2363 case DOT_PROD_EXPR:
2364 case PLUS_EXPR:
2365 case MINUS_EXPR:
2366 case BIT_IOR_EXPR:
2367 case BIT_XOR_EXPR:
2368 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2369 neutral_op = build_real (TREE_TYPE (op), dconst0);
2370 else
2371 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2373 break;
2375 case MULT_EXPR:
2376 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2377 neutral_op = build_real (TREE_TYPE (op), dconst1);
2378 else
2379 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2381 break;
2383 case BIT_AND_EXPR:
2384 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2385 break;
2387 case MAX_EXPR:
2388 case MIN_EXPR:
2389 def_stmt = SSA_NAME_DEF_STMT (op);
2390 loop = (gimple_bb (stmt))->loop_father;
2391 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2392 loop_preheader_edge (loop));
2393 break;
2395 default:
2396 neutral_op = NULL;
2400 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2402 is_store = true;
2403 op = gimple_assign_rhs1 (stmt);
2405 else
2406 is_store = false;
2408 gcc_assert (op);
2410 if (CONSTANT_CLASS_P (op))
2411 constant_p = true;
2412 else
2413 constant_p = false;
2415 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2416 gcc_assert (vector_type);
2417 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2419 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2420 created vectors. It is greater than 1 if unrolling is performed.
2422 For example, we have two scalar operands, s1 and s2 (e.g., group of
2423 strided accesses of size two), while NUNITS is four (i.e., four scalars
2424 of this type can be packed in a vector). The output vector will contain
2425 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2426 will be 2).
2428 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2429 containing the operands.
2431 For example, NUNITS is four as before, and the group size is 8
2432 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2433 {s5, s6, s7, s8}. */
2435 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
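/* The formula above in numbers (illustrative): for NUNITS == 4,
   GROUP_SIZE == 2 gives lcm (4, 2) / 2 == 2 copies ({s1, s2, s1, s2});
   GROUP_SIZE == 8 gives lcm (4, 8) / 8 == 1 copy split over two
   vectors; GROUP_SIZE == 3 gives lcm (4, 3) / 3 == 4 copies filling
   three vectors of four elements each.  */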
2437 number_of_places_left_in_vector = nunits;
2438 elts = XALLOCAVEC (tree, nunits);
2439 for (j = 0; j < number_of_copies; j++)
2441 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2443 if (is_store)
2444 op = gimple_assign_rhs1 (stmt);
2445 else
2447 switch (code)
2449 case COND_EXPR:
2450 if (op_num == 0 || op_num == 1)
2452 tree cond = gimple_assign_rhs1 (stmt);
2453 op = TREE_OPERAND (cond, op_num);
2455 else
2457 if (op_num == 2)
2458 op = gimple_assign_rhs2 (stmt);
2459 else
2460 op = gimple_assign_rhs3 (stmt);
2462 break;
2464 case CALL_EXPR:
2465 op = gimple_call_arg (stmt, op_num);
2466 break;
2468 case LSHIFT_EXPR:
2469 case RSHIFT_EXPR:
2470 case LROTATE_EXPR:
2471 case RROTATE_EXPR:
2472 op = gimple_op (stmt, op_num + 1);
2473 /* Unlike the other binary operators, shifts/rotates have
2474 the shift count being int, instead of the same type as
2475 the lhs, so make sure the scalar is the right type if
2476 we are dealing with vectors of
2477 long long/long/short/char. */
2478 if (op_num == 1 && constant_p)
2479 op = fold_convert (TREE_TYPE (vector_type), op);
2480 break;
2482 default:
2483 op = gimple_op (stmt, op_num + 1);
2484 break;
2488 if (reduc_index != -1)
2490 loop = (gimple_bb (stmt))->loop_father;
2491 def_stmt = SSA_NAME_DEF_STMT (op);
2493 gcc_assert (loop);
2495 /* Get the def before the loop. In a reduction chain we have only
2496 one initial value. */
2497 if ((j != (number_of_copies - 1)
2498 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2499 && i != 0))
2500 && neutral_op)
2501 op = neutral_op;
2502 else
2503 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2504 loop_preheader_edge (loop));
2507 /* Create 'vect_ = {op0,op1,...,opn}'. */
2508 number_of_places_left_in_vector--;
2509 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2511 if (constant_p)
2513 op = fold_unary (VIEW_CONVERT_EXPR,
2514 TREE_TYPE (vector_type), op);
2515 gcc_assert (op && CONSTANT_CLASS_P (op));
2517 else
2519 tree new_temp
2520 = make_ssa_name (TREE_TYPE (vector_type), NULL);
2521 gimple init_stmt;
2522 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2523 op);
2524 init_stmt
2525 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2526 new_temp, op, NULL_TREE);
2527 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2528 op = new_temp;
2531 elts[number_of_places_left_in_vector] = op;
2533 if (number_of_places_left_in_vector == 0)
2535 number_of_places_left_in_vector = nunits;
2537 if (constant_p)
2538 vec_cst = build_vector (vector_type, elts);
2539 else
2541 vec<constructor_elt, va_gc> *v;
2542 unsigned k;
2543 vec_alloc (v, nunits);
2544 for (k = 0; k < nunits; ++k)
2545 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2546 vec_cst = build_constructor (vector_type, v);
2548 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2549 vector_type, NULL));
2550 if (ctor_seq != NULL)
2552 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2553 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2554 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2555 GSI_SAME_STMT);
2556 ctor_seq = NULL;
2562 /* Since the vectors are created in reverse order, we need to reverse
2563 them here. */
2564 vec_num = voprnds.length ();
2565 for (j = vec_num; j != 0; j--)
2567 vop = voprnds[j - 1];
2568 vec_oprnds->quick_push (vop);
2571 voprnds.release ();
2573 /* If VF is greater than the unrolling factor needed for the SLP
2574 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2575 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2576 to replicate the vectors. */
2577 while (number_of_vectors > vec_oprnds->length ())
2579 tree neutral_vec = NULL;
2581 if (neutral_op)
2583 if (!neutral_vec)
2584 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2586 vec_oprnds->quick_push (neutral_vec);
2588 else
2590 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2591 vec_oprnds->quick_push (vop);
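/* Replication in numbers (illustrative): if NUMBER_OF_VECTORS == 4 but
   only {v0, v1} were created, the loop above pushes v0 and v1 again,
   yielding {v0, v1, v0, v1}.  For reductions with a known NEUTRAL_OP a
   splat of the neutral value is pushed instead, because only the first
   copies may carry the initial values.  */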
2597 /* Get vectorized definitions from SLP_NODE that contains corresponding
2598 vectorized def-stmts. */
2600 static void
2601 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2603 tree vec_oprnd;
2604 gimple vec_def_stmt;
2605 unsigned int i;
2607 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2609 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2611 gcc_assert (vec_def_stmt);
2612 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2613 vec_oprnds->quick_push (vec_oprnd);
2618 /* Get vectorized definitions for SLP_NODE.
2619 If the scalar definitions are loop invariants or constants, collect them and
2620 call vect_get_constant_vectors() to create vector stmts.
2621 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2622 must be stored in the corresponding child of SLP_NODE, and we call
2623 vect_get_slp_vect_defs () to retrieve them. */
2625 void
2626 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2627 vec<vec<tree> > *vec_oprnds, int reduc_index)
2629 gimple first_stmt;
2630 int number_of_vects = 0, i;
2631 unsigned int child_index = 0;
2632 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2633 slp_tree child = NULL;
2634 vec<tree> vec_defs;
2635 tree oprnd;
2636 bool vectorized_defs;
2638 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2639 FOR_EACH_VEC_ELT (ops, i, oprnd)
2641 /* For each operand we check if it has vectorized definitions in a child
2642 node or we need to create them (for invariants and constants). We
2643 check if the LHS of the first stmt of the next child matches OPRND.
2644 If it does, we found the correct child. Otherwise, we call
2645 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2646 this child node is checked again for the next operand. */
2647 vectorized_defs = false;
2648 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2650 child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
2652 /* We have to check both pattern and original def, if available. */
2653 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2654 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2656 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2657 || (related
2658 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2660 /* The number of vector defs is determined by the number of
2661 vector statements in the node from which we get those
2662 statements. */
2663 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2664 vectorized_defs = true;
2665 child_index++;
2669 if (!vectorized_defs)
2671 if (i == 0)
2673 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2674 /* The number of vector stmts was calculated according to the LHS in
2675 vect_schedule_slp_instance (); fix it by replacing the LHS with the
2676 RHS, if necessary. See vect_get_smallest_scalar_type () for
2677 details. */
2678 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2679 &rhs_size_unit);
2680 if (rhs_size_unit != lhs_size_unit)
2682 number_of_vects *= rhs_size_unit;
2683 number_of_vects /= lhs_size_unit;
2688 /* Allocate memory for vectorized defs. */
2689 vec_defs = vNULL;
2690 vec_defs.create (number_of_vects);
2692 /* For reduction defs we call vect_get_constant_vectors (), since we are
2693 looking for initial loop invariant values. */
2694 if (vectorized_defs && reduc_index == -1)
2695 /* The defs are already vectorized. */
2696 vect_get_slp_vect_defs (child, &vec_defs);
2697 else
2698 /* Build vectors from scalar defs. */
2699 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2700 number_of_vects, reduc_index);
2702 vec_oprnds->quick_push (vec_defs);
2704 /* For reductions, we only need initial values. */
2705 if (reduc_index != -1)
2706 return;
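/* Hypothetical walk-through of the matching scheme above: for an SLP
   node of statements c = a + b where the a operands are computed by a
   child node and the b operands are constants, the LHS of the child's
   first statement matches OPRND a, so its vectorized defs are fetched
   directly; no child matches b, so vector constants are built via
   vect_get_constant_vectors ().  */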
2711 /* Create NCOPIES permutation statements using the vector mask MASK and
2712 two input vectors placed in
2713 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2714 shifting by STRIDE elements of DR_CHAIN for every copy.
2715 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2716 copies).
2717 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2718 the created stmts must be inserted. */
2720 static inline void
2721 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2722 tree mask, int first_vec_indx, int second_vec_indx,
2723 gimple_stmt_iterator *gsi, slp_tree node,
2724 tree vectype, vec<tree> dr_chain,
2725 int ncopies, int vect_stmts_counter)
2727 tree perm_dest;
2728 gimple perm_stmt = NULL;
2729 stmt_vec_info next_stmt_info;
2730 int i, stride;
2731 tree first_vec, second_vec, data_ref;
2733 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2735 /* Initialize the vect stmts of NODE to properly insert the generated
2736 stmts later. */
2737 for (i = SLP_TREE_VEC_STMTS (node).length ();
2738 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2739 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2741 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2742 for (i = 0; i < ncopies; i++)
2744 first_vec = dr_chain[first_vec_indx];
2745 second_vec = dr_chain[second_vec_indx];
2747 /* Generate the permute statement. */
2748 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2749 first_vec, second_vec, mask);
2750 data_ref = make_ssa_name (perm_dest, perm_stmt);
2751 gimple_set_lhs (perm_stmt, data_ref);
2752 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2754 /* Store the vector statement in NODE. */
2755 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2757 first_vec_indx += stride;
2758 second_vec_indx += stride;
2761 /* Mark the scalar stmt as vectorized. */
2762 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2763 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2767 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2768 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2769 representation. Check that the mask is valid and return FALSE if not.
2770 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2771 the next vector, i.e., the current first vector is not needed. */
2773 static bool
2774 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2775 int mask_nunits, bool only_one_vec, int index,
2776 unsigned char *mask, int *current_mask_element,
2777 bool *need_next_vector, int *number_of_mask_fixes,
2778 bool *mask_fixed, bool *needs_first_vector)
2780 int i;
2782 /* Convert to target specific representation. */
2783 *current_mask_element = first_mask_element + m;
2784 /* Adjust the value in case it's a mask for second and third vectors. */
2785 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2787 if (*current_mask_element < mask_nunits)
2788 *needs_first_vector = true;
2790 /* We have only one input vector to permute but the mask accesses values in
2791 the next vector as well. */
2792 if (only_one_vec && *current_mask_element >= mask_nunits)
2794 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2797 "permutation requires at least two vectors ");
2798 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2801 return false;
2804 /* The mask requires the next vector. */
2805 if (*current_mask_element >= mask_nunits * 2)
2807 if (*needs_first_vector || *mask_fixed)
2809 /* We either need the first vector too or have already moved to the
2810 next vector. In both cases, this permutation needs three
2811 vectors. */
2812 if (dump_enabled_p ())
2814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2815 "permutation requires at "
2816 "least three vectors ");
2817 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2820 return false;
2823 /* We move to the next vector, dropping the first one and working with
2824 the second and the third - we need to adjust the values of the mask
2825 accordingly. */
2826 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2828 for (i = 0; i < index; i++)
2829 mask[i] -= mask_nunits * *number_of_mask_fixes;
2831 (*number_of_mask_fixes)++;
2832 *mask_fixed = true;
2835 *need_next_vector = *mask_fixed;
2837 /* This was the last element of this mask. Start a new one. */
2838 if (index == mask_nunits - 1)
2840 *number_of_mask_fixes = 1;
2841 *mask_fixed = false;
2842 *needs_first_vector = false;
2845 return true;
2849 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2850 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2851 permute statements for SLP_NODE_INSTANCE. */
2852 bool
2853 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2854 gimple_stmt_iterator *gsi, int vf,
2855 slp_instance slp_node_instance, bool analyze_only)
2857 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2858 tree mask_element_type = NULL_TREE, mask_type;
2859 int i, j, k, nunits, vec_index = 0, scalar_index;
2860 slp_tree node;
2861 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2862 gimple next_scalar_stmt;
2863 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2864 int first_mask_element;
2865 int index, unroll_factor, current_mask_element, ncopies;
2866 unsigned char *mask;
2867 bool only_one_vec = false, need_next_vector = false;
2868 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2869 int number_of_mask_fixes = 1;
2870 bool mask_fixed = false;
2871 bool needs_first_vector = false;
2872 enum machine_mode mode;
2874 mode = TYPE_MODE (vectype);
2876 if (!can_vec_perm_p (mode, false, NULL))
2878 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881 "no vect permute for ");
2882 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2884 return false;
2887 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2888 same size as the vector element being permuted. */
2889 mask_element_type = lang_hooks.types.type_for_mode
2890 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2891 mask_type = get_vectype_for_scalar_type (mask_element_type);
2892 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2893 mask = XALLOCAVEC (unsigned char, nunits);
2894 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2896 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2897 unrolling factor. */
2898 orig_vec_stmts_num = group_size *
2899 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2900 if (orig_vec_stmts_num == 1)
2901 only_one_vec = true;
2903 /* Number of copies is determined by the final vectorization factor
2904 relative to the SLP_NODE_INSTANCE unrolling factor. */
2905 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2907 /* Generate permutation masks for every NODE. Number of masks for each NODE
2908 is equal to GROUP_SIZE.
2909 E.g., we have a group of three nodes with three loads from the same
2910 location in each node, and the vector size is 4. I.e., we have an
2911 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2912 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2913 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2916 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2917 The last mask is illegal since we assume two operands for the permute
2918 operation, and the mask element values can't be outside that range.
2919 Hence, the last mask must be converted into {2,5,5,5}.
2920 For the first two permutations we need the first and the second input
2921 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2922 we need the second and the third vectors: {b1,c1,a2,b2} and
2923 {c2,a3,b3,c3}. */
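/* Tracing the a's from the example above (illustrative): with I == 0,
   FIRST_MASK_ELEMENT takes the values 0, 3, 6 and 9 as J advances, and
   vect_get_mask_element () folds each value back into the two-operand
   range, turning the final {6,9,9,9} into the legal {2,5,5,5} once the
   window moves past the first vector.  */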
2925 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2927 scalar_index = 0;
2928 index = 0;
2929 vect_stmts_counter = 0;
2930 vec_index = 0;
2931 first_vec_index = vec_index++;
2932 if (only_one_vec)
2933 second_vec_index = first_vec_index;
2934 else
2935 second_vec_index = vec_index++;
2937 for (j = 0; j < unroll_factor; j++)
2939 for (k = 0; k < group_size; k++)
2941 first_mask_element = i + j * group_size;
2942 if (!vect_get_mask_element (stmt, first_mask_element, 0,
2943 nunits, only_one_vec, index,
2944 mask, &current_mask_element,
2945 &need_next_vector,
2946 &number_of_mask_fixes, &mask_fixed,
2947 &needs_first_vector))
2948 return false;
2949 mask[index++] = current_mask_element;
2951 if (index == nunits)
2953 tree mask_vec, *mask_elts;
2954 int l;
2956 if (!can_vec_perm_p (mode, false, mask))
2958 if (dump_enabled_p ())
2960 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2961 vect_location,
2962 "unsupported vect permute { ");
2963 for (i = 0; i < nunits; ++i)
2964 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2965 mask[i]);
2966 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2968 return false;
2971 mask_elts = XALLOCAVEC (tree, nunits);
2972 for (l = 0; l < nunits; ++l)
2973 mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
2974 mask_vec = build_vector (mask_type, mask_elts);
2975 index = 0;
2977 if (!analyze_only)
2979 if (need_next_vector)
2981 first_vec_index = second_vec_index;
2982 second_vec_index = vec_index;
2985 next_scalar_stmt
2986 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
2988 vect_create_mask_and_perm (stmt, next_scalar_stmt,
2989 mask_vec, first_vec_index, second_vec_index,
2990 gsi, node, vectype, dr_chain,
2991 ncopies, vect_stmts_counter++);
2998 return true;
3003 /* Vectorize SLP instance tree in postorder. */
3005 static bool
3006 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3007 unsigned int vectorization_factor)
3009 gimple stmt;
3010 bool grouped_store, is_store;
3011 gimple_stmt_iterator si;
3012 stmt_vec_info stmt_info;
3013 unsigned int vec_stmts_size, nunits, group_size;
3014 tree vectype;
3015 int i;
3016 slp_tree loads_node;
3017 slp_void_p child;
3019 if (!node)
3020 return false;
3022 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3023 vect_schedule_slp_instance ((slp_tree) child, instance,
3024 vectorization_factor);
3026 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3027 stmt_info = vinfo_for_stmt (stmt);
3029 /* VECTYPE is the type of the destination. */
3030 vectype = STMT_VINFO_VECTYPE (stmt_info);
3031 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3032 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3034 /* For each SLP instance calculate the number of vector stmts to be
3035 created for the scalar stmts in each node of the SLP tree. The
3036 number of vector stmts is the number of scalar elements in one scalar
3037 iteration (GROUP_SIZE) multiplied by VF and divided by the vector
3038 size. */
3039 vec_stmts_size = (vectorization_factor * group_size) / nunits;
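/* In numbers (illustrative): with VECTORIZATION_FACTOR == 4,
   GROUP_SIZE == 2 and NUNITS == 4, each scalar iteration contributes
   two elements, so 4 * 2 / 4 == 2 vector stmts are created per node.  */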
3041 /* In case of load permutation we have to allocate vectorized statements for
3042 all the nodes that participate in that permutation. */
3043 if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3045 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
3047 if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
3049 SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
3050 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
3055 if (!SLP_TREE_VEC_STMTS (node).exists ())
3057 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3058 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3061 if (dump_enabled_p ())
3063 dump_printf_loc (MSG_NOTE,vect_location,
3064 "------>vectorizing SLP node starting from: ");
3065 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3068 /* Loads should be inserted before the first load. */
3069 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3070 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3071 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3072 && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3073 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3074 else if (is_pattern_stmt_p (stmt_info))
3075 si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3076 else
3077 si = gsi_for_stmt (stmt);
3079 /* Stores should be inserted just before the last store. */
3080 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3081 && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3083 gimple last_store = vect_find_last_store_in_slp_instance (instance);
3084 if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3085 last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3086 si = gsi_for_stmt (last_store);
3089 /* Mark the first element of the reduction chain as reduction to properly
3090 transform the node. In the analysis phase only the last element of the
3091 chain is marked as reduction. */
3092 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3093 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3095 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3096 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3099 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3100 return is_store;
3103 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3104 For loop vectorization this is done in vectorizable_call, but for SLP
3105 it needs to be deferred until end of vect_schedule_slp, because multiple
3106 SLP instances may refer to the same scalar stmt. */
3108 static void
3109 vect_remove_slp_scalar_calls (slp_tree node)
3111 gimple stmt, new_stmt;
3112 gimple_stmt_iterator gsi;
3113 int i;
3114 slp_void_p child;
3115 tree lhs;
3116 stmt_vec_info stmt_info;
3118 if (!node)
3119 return;
3121 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3122 vect_remove_slp_scalar_calls ((slp_tree) child);
3124 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3126 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3127 continue;
3128 stmt_info = vinfo_for_stmt (stmt);
3129 if (stmt_info == NULL
3130 || is_pattern_stmt_p (stmt_info)
3131 || !PURE_SLP_STMT (stmt_info))
3132 continue;
3133 lhs = gimple_call_lhs (stmt);
3134 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3135 set_vinfo_for_stmt (new_stmt, stmt_info);
3136 set_vinfo_for_stmt (stmt, NULL);
3137 STMT_VINFO_STMT (stmt_info) = new_stmt;
3138 gsi = gsi_for_stmt (stmt);
3139 gsi_replace (&gsi, new_stmt, false);
3140 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
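/* For illustration (hypothetical statement): a scalar call such as
   x_1 = powf (a_2, b_3) whose results were fully vectorized is
   rewritten as x_1 = 0.0f; the zeroing assignment merely keeps the IL
   valid until DCE removes the dead scalar statement.  */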
3144 /* Generate vector code for all SLP instances in the loop/basic block. */
3146 bool
3147 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3149 vec<slp_instance> slp_instances;
3150 slp_instance instance;
3151 slp_tree loads_node;
3152 unsigned int i, j, vf;
3153 bool is_store = false;
3155 if (loop_vinfo)
3157 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3158 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3160 else
3162 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3163 vf = 1;
3166 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3168 /* Schedule the tree of INSTANCE. */
3169 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3170 instance, vf);
3172 /* Clear STMT_VINFO_VEC_STMT of all loads. With shared loads
3173 between SLP instances we fail to properly initialize the
3174 vectorized SLP stmts and confuse different load permutations. */
3175 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
3176 STMT_VINFO_VEC_STMT
3177 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
3179 if (dump_enabled_p ())
3180 dump_printf_loc (MSG_NOTE, vect_location,
3181 "vectorizing stmts using SLP.");
3184 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3186 slp_tree root = SLP_INSTANCE_TREE (instance);
3187 gimple store;
3188 unsigned int j;
3189 gimple_stmt_iterator gsi;
3191 /* Remove scalar call stmts. Do not do this for basic-block
3192 vectorization as not all uses may be vectorized.
3193 ??? Why should this be necessary? DCE should be able to
3194 remove the stmts itself.
3195 ??? For BB vectorization we can as well remove scalar
3196 stmts starting from the SLP tree root if they have no
3197 uses. */
3198 if (loop_vinfo)
3199 vect_remove_slp_scalar_calls (root);
3201 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3202 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3204 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3205 break;
3207 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3208 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3209 /* Free the attached stmt_vec_info and remove the stmt. */
3210 gsi = gsi_for_stmt (store);
3211 unlink_stmt_vdef (store);
3212 gsi_remove (&gsi, true);
3213 release_defs (store);
3214 free_stmt_vec_info (store);
3218 return is_store;
3222 /* Vectorize the basic block. */
3224 void
3225 vect_slp_transform_bb (basic_block bb)
3227 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3228 gimple_stmt_iterator si;
3230 gcc_assert (bb_vinfo);
3232 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3235 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3237 gimple stmt = gsi_stmt (si);
3238 stmt_vec_info stmt_info;
3240 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "------>SLPing statement: ");
3244 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3247 stmt_info = vinfo_for_stmt (stmt);
3248 gcc_assert (stmt_info);
3250 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3251 if (STMT_SLP_TYPE (stmt_info))
3253 vect_schedule_slp (NULL, bb_vinfo);
3254 break;
3258 if (dump_enabled_p ())
3259 dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
3261 destroy_bb_vec_info (bb_vinfo);