Mark ChangeLog
[official-gcc.git] / gcc / tree-vect-slp.c
blobd00a0d40b1fd4d9aa43908ae0e45086336e2f810
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "target.h"
30 #include "basic-block.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-pass.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "recog.h" /* FIXME: for insn_data */
37 #include "optabs.h"
38 #include "tree-vectorizer.h"
39 #include "langhooks.h"
41 /* Extract the location of the basic block in the source code.
42 Return the basic block location if succeed and NULL if not. */
44 LOC
45 find_bb_location (basic_block bb)
47 gimple stmt = NULL;
48 gimple_stmt_iterator si;
50 if (!bb)
51 return UNKNOWN_LOC;
53 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
55 stmt = gsi_stmt (si);
56 if (gimple_location (stmt) != UNKNOWN_LOC)
57 return gimple_location (stmt);
60 return UNKNOWN_LOC;
64 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
66 static void
67 vect_free_slp_tree (slp_tree node)
69 int i;
70 slp_void_p child;
72 if (!node)
73 return;
75 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
76 vect_free_slp_tree ((slp_tree) child);
78 SLP_TREE_CHILDREN (node).release ();
79 SLP_TREE_SCALAR_STMTS (node).release ();
80 SLP_TREE_VEC_STMTS (node).release ();
82 free (node);
86 /* Free the memory allocated for the SLP instance. */
88 void
89 vect_free_slp_instance (slp_instance instance)
91 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
92 SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
93 SLP_INSTANCE_LOADS (instance).release ();
94 SLP_INSTANCE_BODY_COST_VEC (instance).release ();
95 free (instance);
99 /* Create an SLP node for SCALAR_STMTS. */
101 static slp_tree
102 vect_create_new_slp_node (vec<gimple> scalar_stmts)
104 slp_tree node;
105 gimple stmt = scalar_stmts[0];
106 unsigned int nops;
108 if (is_gimple_call (stmt))
109 nops = gimple_call_num_args (stmt);
110 else if (is_gimple_assign (stmt))
112 nops = gimple_num_ops (stmt) - 1;
113 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
114 nops++;
116 else
117 return NULL;
119 node = XNEW (struct _slp_tree);
120 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
121 SLP_TREE_VEC_STMTS (node).create (0);
122 SLP_TREE_CHILDREN (node).create (nops);
124 return node;
128 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
129 operand. */
130 static vec<slp_oprnd_info>
131 vect_create_oprnd_info (int nops, int group_size)
133 int i;
134 slp_oprnd_info oprnd_info;
135 vec<slp_oprnd_info> oprnds_info;
137 oprnds_info.create (nops);
138 for (i = 0; i < nops; i++)
140 oprnd_info = XNEW (struct _slp_oprnd_info);
141 oprnd_info->def_stmts.create (group_size);
142 oprnd_info->first_dt = vect_uninitialized_def;
143 oprnd_info->first_def_type = NULL_TREE;
144 oprnd_info->first_const_oprnd = NULL_TREE;
145 oprnd_info->first_pattern = false;
146 oprnds_info.quick_push (oprnd_info);
149 return oprnds_info;
153 /* Free operands info. */
155 static void
156 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
158 int i;
159 slp_oprnd_info oprnd_info;
161 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
163 oprnd_info->def_stmts.release ();
164 XDELETE (oprnd_info);
167 oprnds_info.release ();
171 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
172 they are of a valid type and that they match the defs of the first stmt of
173 the SLP group (stored in OPRNDS_INFO). */
175 static bool
176 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
177 slp_tree slp_node, gimple stmt,
178 int ncopies_for_cost, bool first,
179 vec<slp_oprnd_info> *oprnds_info,
180 stmt_vector_for_cost *prologue_cost_vec,
181 stmt_vector_for_cost *body_cost_vec)
183 tree oprnd;
184 unsigned int i, number_of_oprnds;
185 tree def, def_op0 = NULL_TREE;
186 gimple def_stmt;
187 enum vect_def_type dt = vect_uninitialized_def;
188 enum vect_def_type dt_op0 = vect_uninitialized_def;
189 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
190 tree lhs = gimple_get_lhs (stmt);
191 struct loop *loop = NULL;
192 enum tree_code rhs_code;
193 bool different_types = false;
194 bool pattern = false;
195 slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
196 int op_idx = 1;
197 tree compare_rhs = NULL_TREE;
199 if (loop_vinfo)
200 loop = LOOP_VINFO_LOOP (loop_vinfo);
202 if (is_gimple_call (stmt))
204 number_of_oprnds = gimple_call_num_args (stmt);
205 op_idx = 3;
207 else if (is_gimple_assign (stmt))
209 number_of_oprnds = gimple_num_ops (stmt) - 1;
210 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
211 number_of_oprnds++;
213 else
214 return false;
216 for (i = 0; i < number_of_oprnds; i++)
218 if (compare_rhs)
220 oprnd = compare_rhs;
221 compare_rhs = NULL_TREE;
223 else
224 oprnd = gimple_op (stmt, op_idx++);
226 oprnd_info = (*oprnds_info)[i];
228 if (COMPARISON_CLASS_P (oprnd))
230 compare_rhs = TREE_OPERAND (oprnd, 1);
231 oprnd = TREE_OPERAND (oprnd, 0);
234 if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
235 &def, &dt)
236 || (!def_stmt && dt != vect_constant_def))
238 if (dump_enabled_p ())
240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
241 "Build SLP failed: can't find def for ");
242 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
245 return false;
248 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
249 from the pattern. Check that all the stmts of the node are in the
250 pattern. */
251 if (def_stmt && gimple_bb (def_stmt)
252 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
253 || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
254 && gimple_code (def_stmt) != GIMPLE_PHI))
255 && vinfo_for_stmt (def_stmt)
256 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
257 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
258 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
260 pattern = true;
261 if (!first && !oprnd_info->first_pattern)
263 if (dump_enabled_p ())
265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
266 "Build SLP failed: some of the stmts"
267 " are in a pattern, and others are not ");
268 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
271 return false;
274 def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
275 dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
277 if (dt == vect_unknown_def_type)
279 if (dump_enabled_p ())
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "Unsupported pattern.");
282 return false;
285 switch (gimple_code (def_stmt))
287 case GIMPLE_PHI:
288 def = gimple_phi_result (def_stmt);
289 break;
291 case GIMPLE_ASSIGN:
292 def = gimple_assign_lhs (def_stmt);
293 break;
295 default:
296 if (dump_enabled_p ())
297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
298 "unsupported defining stmt: ");
299 return false;
303 if (first)
305 oprnd_info->first_dt = dt;
306 oprnd_info->first_pattern = pattern;
307 if (def)
309 oprnd_info->first_def_type = TREE_TYPE (def);
310 oprnd_info->first_const_oprnd = NULL_TREE;
312 else
314 oprnd_info->first_def_type = NULL_TREE;
315 oprnd_info->first_const_oprnd = oprnd;
318 if (i == 0)
320 def_op0 = def;
321 dt_op0 = dt;
322 /* Analyze costs (for the first stmt of the group only). */
323 if (REFERENCE_CLASS_P (lhs))
324 /* Store. */
325 vect_model_store_cost (stmt_info, ncopies_for_cost, false,
326 dt, slp_node, prologue_cost_vec,
327 body_cost_vec);
328 else
330 enum vect_def_type dts[2];
331 dts[0] = dt;
332 dts[1] = vect_uninitialized_def;
333 /* Not memory operation (we don't call this function for
334 loads). */
335 vect_model_simple_cost (stmt_info, ncopies_for_cost, dts,
336 prologue_cost_vec, body_cost_vec);
340 else
342 /* Not first stmt of the group, check that the def-stmt/s match
343 the def-stmt/s of the first stmt. Allow different definition
344 types for reduction chains: the first stmt must be a
345 vect_reduction_def (a phi node), and the rest
346 vect_internal_def. */
347 if (((oprnd_info->first_dt != dt
348 && !(oprnd_info->first_dt == vect_reduction_def
349 && dt == vect_internal_def))
350 || (oprnd_info->first_def_type != NULL_TREE
351 && def
352 && !types_compatible_p (oprnd_info->first_def_type,
353 TREE_TYPE (def))))
354 || (!def
355 && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
356 TREE_TYPE (oprnd)))
357 || different_types)
359 if (number_of_oprnds != 2)
361 if (dump_enabled_p ())
362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
363 "Build SLP failed: different types ");
365 return false;
368 /* Try to swap operands in case of binary operation. */
369 if (i == 0)
370 different_types = true;
371 else
373 oprnd0_info = (*oprnds_info)[0];
374 if (is_gimple_assign (stmt)
375 && (rhs_code = gimple_assign_rhs_code (stmt))
376 && TREE_CODE_CLASS (rhs_code) == tcc_binary
377 && commutative_tree_code (rhs_code)
378 && oprnd0_info->first_dt == dt
379 && oprnd_info->first_dt == dt_op0
380 && def_op0 && def
381 && !(oprnd0_info->first_def_type
382 && !types_compatible_p (oprnd0_info->first_def_type,
383 TREE_TYPE (def)))
384 && !(oprnd_info->first_def_type
385 && !types_compatible_p (oprnd_info->first_def_type,
386 TREE_TYPE (def_op0))))
388 if (dump_enabled_p ())
390 dump_printf_loc (MSG_NOTE, vect_location,
391 "Swapping operands of ");
392 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
395 swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
396 gimple_assign_rhs2_ptr (stmt));
398 else
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
402 "Build SLP failed: different types ");
404 return false;
410 /* Check the types of the definitions. */
411 switch (dt)
413 case vect_constant_def:
414 case vect_external_def:
415 case vect_reduction_def:
416 break;
418 case vect_internal_def:
419 if (different_types)
421 oprnd0_info = (*oprnds_info)[0];
422 oprnd1_info = (*oprnds_info)[0];
423 if (i == 0)
424 oprnd1_info->def_stmts.quick_push (def_stmt);
425 else
426 oprnd0_info->def_stmts.quick_push (def_stmt);
428 else
429 oprnd_info->def_stmts.quick_push (def_stmt);
431 break;
433 default:
434 /* FORNOW: Not supported. */
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
438 "Build SLP failed: illegal type of def ");
439 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
442 return false;
446 return true;
450 /* Recursively build an SLP tree starting from NODE.
451 Fail (and return FALSE) if def-stmts are not isomorphic, require data
452 permutation or are of unsupported types of operation. Otherwise, return
453 TRUE. */
455 static bool
456 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
457 slp_tree *node, unsigned int group_size, int *outside_cost,
458 int ncopies_for_cost, unsigned int *max_nunits,
459 vec<int> *load_permutation,
460 vec<slp_tree> *loads,
461 unsigned int vectorization_factor, bool *loads_permuted,
462 stmt_vector_for_cost *prologue_cost_vec,
463 stmt_vector_for_cost *body_cost_vec)
465 unsigned int i;
466 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
467 gimple stmt = stmts[0];
468 enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
469 enum tree_code first_cond_code = ERROR_MARK;
470 tree lhs;
471 bool stop_recursion = false, need_same_oprnds = false;
472 tree vectype, scalar_type, first_op1 = NULL_TREE;
473 unsigned int ncopies;
474 optab optab;
475 int icode;
476 enum machine_mode optab_op2_mode;
477 enum machine_mode vec_mode;
478 struct data_reference *first_dr;
479 HOST_WIDE_INT dummy;
480 bool permutation = false;
481 unsigned int load_place;
482 gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
483 vec<slp_oprnd_info> oprnds_info;
484 unsigned int nops;
485 slp_oprnd_info oprnd_info;
486 tree cond;
488 if (is_gimple_call (stmt))
489 nops = gimple_call_num_args (stmt);
490 else if (is_gimple_assign (stmt))
492 nops = gimple_num_ops (stmt) - 1;
493 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
494 nops++;
496 else
497 return false;
499 oprnds_info = vect_create_oprnd_info (nops, group_size);
501 /* For every stmt in NODE find its def stmt/s. */
502 FOR_EACH_VEC_ELT (stmts, i, stmt)
504 if (dump_enabled_p ())
506 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
507 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
510 /* Fail to vectorize statements marked as unvectorizable. */
511 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
513 if (dump_enabled_p ())
515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
516 "Build SLP failed: unvectorizable statement ");
517 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
520 vect_free_oprnd_info (oprnds_info);
521 return false;
524 lhs = gimple_get_lhs (stmt);
525 if (lhs == NULL_TREE)
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "Build SLP failed: not GIMPLE_ASSIGN nor "
531 "GIMPLE_CALL ");
532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
535 vect_free_oprnd_info (oprnds_info);
536 return false;
539 if (is_gimple_assign (stmt)
540 && gimple_assign_rhs_code (stmt) == COND_EXPR
541 && (cond = gimple_assign_rhs1 (stmt))
542 && !COMPARISON_CLASS_P (cond))
544 if (dump_enabled_p ())
546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
547 "Build SLP failed: condition is not "
548 "comparison ");
549 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
552 vect_free_oprnd_info (oprnds_info);
553 return false;
556 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
557 vectype = get_vectype_for_scalar_type (scalar_type);
558 if (!vectype)
560 if (dump_enabled_p ())
562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
563 "Build SLP failed: unsupported data-type ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
565 scalar_type);
568 vect_free_oprnd_info (oprnds_info);
569 return false;
572 /* In case of multiple types we need to detect the smallest type. */
573 if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
575 *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
576 if (bb_vinfo)
577 vectorization_factor = *max_nunits;
580 ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
582 if (is_gimple_call (stmt))
584 rhs_code = CALL_EXPR;
585 if (gimple_call_internal_p (stmt)
586 || gimple_call_tail_p (stmt)
587 || gimple_call_noreturn_p (stmt)
588 || !gimple_call_nothrow_p (stmt)
589 || gimple_call_chain (stmt))
591 if (dump_enabled_p ())
593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
594 "Build SLP failed: unsupported call type ");
595 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
598 vect_free_oprnd_info (oprnds_info);
599 return false;
602 else
603 rhs_code = gimple_assign_rhs_code (stmt);
605 /* Check the operation. */
606 if (i == 0)
608 first_stmt_code = rhs_code;
610 /* Shift arguments should be equal in all the packed stmts for a
611 vector shift with scalar shift operand. */
612 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
613 || rhs_code == LROTATE_EXPR
614 || rhs_code == RROTATE_EXPR)
616 vec_mode = TYPE_MODE (vectype);
618 /* First see if we have a vector/vector shift. */
619 optab = optab_for_tree_code (rhs_code, vectype,
620 optab_vector);
622 if (!optab
623 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
625 /* No vector/vector shift, try for a vector/scalar shift. */
626 optab = optab_for_tree_code (rhs_code, vectype,
627 optab_scalar);
629 if (!optab)
631 if (dump_enabled_p ())
632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
633 "Build SLP failed: no optab.");
634 vect_free_oprnd_info (oprnds_info);
635 return false;
637 icode = (int) optab_handler (optab, vec_mode);
638 if (icode == CODE_FOR_nothing)
640 if (dump_enabled_p ())
641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
642 "Build SLP failed: "
643 "op not supported by target.");
644 vect_free_oprnd_info (oprnds_info);
645 return false;
647 optab_op2_mode = insn_data[icode].operand[2].mode;
648 if (!VECTOR_MODE_P (optab_op2_mode))
650 need_same_oprnds = true;
651 first_op1 = gimple_assign_rhs2 (stmt);
655 else if (rhs_code == WIDEN_LSHIFT_EXPR)
657 need_same_oprnds = true;
658 first_op1 = gimple_assign_rhs2 (stmt);
661 else
663 if (first_stmt_code != rhs_code
664 && (first_stmt_code != IMAGPART_EXPR
665 || rhs_code != REALPART_EXPR)
666 && (first_stmt_code != REALPART_EXPR
667 || rhs_code != IMAGPART_EXPR)
668 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
669 && (first_stmt_code == ARRAY_REF
670 || first_stmt_code == INDIRECT_REF
671 || first_stmt_code == COMPONENT_REF
672 || first_stmt_code == MEM_REF)))
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Build SLP failed: different operation "
678 "in stmt ");
679 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
682 vect_free_oprnd_info (oprnds_info);
683 return false;
686 if (need_same_oprnds
687 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
689 if (dump_enabled_p ())
691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
692 "Build SLP failed: different shift "
693 "arguments in ");
694 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
697 vect_free_oprnd_info (oprnds_info);
698 return false;
701 if (rhs_code == CALL_EXPR)
703 gimple first_stmt = stmts[0];
704 if (gimple_call_num_args (stmt) != nops
705 || !operand_equal_p (gimple_call_fn (first_stmt),
706 gimple_call_fn (stmt), 0)
707 || gimple_call_fntype (first_stmt)
708 != gimple_call_fntype (stmt))
710 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "Build SLP failed: different calls in ");
714 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
715 stmt, 0);
718 vect_free_oprnd_info (oprnds_info);
719 return false;
724 /* Grouped store or load. */
725 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
727 if (REFERENCE_CLASS_P (lhs))
729 /* Store. */
730 if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
731 stmt, ncopies_for_cost,
732 (i == 0), &oprnds_info,
733 prologue_cost_vec,
734 body_cost_vec))
736 vect_free_oprnd_info (oprnds_info);
737 return false;
740 else
742 /* Load. */
743 /* FORNOW: Check that there is no gap between the loads. */
744 if ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
745 && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
746 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
747 && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
749 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "Build SLP failed: grouped "
753 "loads have gaps ");
754 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
755 stmt, 0);
758 vect_free_oprnd_info (oprnds_info);
759 return false;
762 /* Check that the size of interleaved loads group is not
763 greater than the SLP group size. */
764 if (loop_vinfo
765 && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
767 if (dump_enabled_p ())
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
770 "Build SLP failed: the number "
771 "of interleaved loads is greater than "
772 "the SLP group size ");
773 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
774 stmt, 0);
777 vect_free_oprnd_info (oprnds_info);
778 return false;
781 old_first_load = first_load;
782 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
783 if (prev_first_load)
785 /* Check that there are no loads from different interleaving
786 chains in the same node. The only exception is complex
787 numbers. */
788 if (prev_first_load != first_load
789 && rhs_code != REALPART_EXPR
790 && rhs_code != IMAGPART_EXPR)
792 if (dump_enabled_p ())
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
795 vect_location,
796 "Build SLP failed: different "
797 "interleaving chains in one node ");
798 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
799 stmt, 0);
802 vect_free_oprnd_info (oprnds_info);
803 return false;
806 else
807 prev_first_load = first_load;
809 /* In some cases a group of loads is just the same load
810 repeated N times. Only analyze its cost once. */
811 if (first_load == stmt && old_first_load != first_load)
813 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
814 if (vect_supportable_dr_alignment (first_dr, false)
815 == dr_unaligned_unsupported)
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
820 vect_location,
821 "Build SLP failed: unsupported "
822 "unaligned load ");
823 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
824 stmt, 0);
827 vect_free_oprnd_info (oprnds_info);
828 return false;
831 /* Analyze costs (for the first stmt in the group). */
832 vect_model_load_cost (vinfo_for_stmt (stmt),
833 ncopies_for_cost, false, *node,
834 prologue_cost_vec, body_cost_vec);
837 /* Store the place of this load in the interleaving chain. In
838 case that permutation is needed we later decide if a specific
839 permutation is supported. */
840 load_place = vect_get_place_in_interleaving_chain (stmt,
841 first_load);
842 if (load_place != i)
843 permutation = true;
845 load_permutation->safe_push (load_place);
847 /* We stop the tree when we reach a group of loads. */
848 stop_recursion = true;
849 continue;
851 } /* Grouped access. */
852 else
854 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
856 /* Not grouped load. */
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
860 "Build SLP failed: not grouped load ");
861 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
864 /* FORNOW: Not grouped loads are not supported. */
865 vect_free_oprnd_info (oprnds_info);
866 return false;
869 /* Not memory operation. */
870 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
871 && TREE_CODE_CLASS (rhs_code) != tcc_unary
872 && rhs_code != COND_EXPR
873 && rhs_code != CALL_EXPR)
875 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "Build SLP failed: operation");
879 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
880 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
883 vect_free_oprnd_info (oprnds_info);
884 return false;
887 if (rhs_code == COND_EXPR)
889 tree cond_expr = gimple_assign_rhs1 (stmt);
891 if (i == 0)
892 first_cond_code = TREE_CODE (cond_expr);
893 else if (first_cond_code != TREE_CODE (cond_expr))
895 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Build SLP failed: different"
899 " operation");
900 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
901 stmt, 0);
904 vect_free_oprnd_info (oprnds_info);
905 return false;
909 /* Find the def-stmts. */
910 if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
911 ncopies_for_cost, (i == 0),
912 &oprnds_info, prologue_cost_vec,
913 body_cost_vec))
915 vect_free_oprnd_info (oprnds_info);
916 return false;
921 /* Grouped loads were reached - stop the recursion. */
922 if (stop_recursion)
924 loads->safe_push (*node);
925 if (permutation)
927 gimple first_stmt = stmts[0];
928 *loads_permuted = true;
929 (void) record_stmt_cost (body_cost_vec, group_size, vec_perm,
930 vinfo_for_stmt (first_stmt), 0, vect_body);
932 else
934 /* We don't check here complex numbers chains, so we set
935 LOADS_PERMUTED for further check in
936 vect_supported_load_permutation_p. */
937 if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
938 *loads_permuted = true;
941 vect_free_oprnd_info (oprnds_info);
942 return true;
945 /* Create SLP_TREE nodes for the definition node/s. */
946 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
948 slp_tree child;
950 if (oprnd_info->first_dt != vect_internal_def)
951 continue;
953 child = vect_create_new_slp_node (oprnd_info->def_stmts);
954 if (!child
955 || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
956 outside_cost, ncopies_for_cost,
957 max_nunits, load_permutation, loads,
958 vectorization_factor, loads_permuted,
959 prologue_cost_vec, body_cost_vec))
961 if (child)
962 oprnd_info->def_stmts = vNULL;
963 vect_free_slp_tree (child);
964 vect_free_oprnd_info (oprnds_info);
965 return false;
968 oprnd_info->def_stmts.create (0);
969 SLP_TREE_CHILDREN (*node).quick_push (child);
972 vect_free_oprnd_info (oprnds_info);
973 return true;
976 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
978 static void
979 vect_print_slp_tree (int dump_kind, slp_tree node)
981 int i;
982 gimple stmt;
983 slp_void_p child;
985 if (!node)
986 return;
988 dump_printf (dump_kind, "node ");
989 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
991 dump_printf (dump_kind, "\n\tstmt %d ", i);
992 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
994 dump_printf (dump_kind, "\n");
996 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
997 vect_print_slp_tree (dump_kind, (slp_tree) child);
1001 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1002 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1003 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1004 stmts in NODE are to be marked. */
1006 static void
1007 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1009 int i;
1010 gimple stmt;
1011 slp_void_p child;
1013 if (!node)
1014 return;
1016 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1017 if (j < 0 || i == j)
1018 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1020 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1021 vect_mark_slp_stmts ((slp_tree) child, mark, j);
1025 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1027 static void
1028 vect_mark_slp_stmts_relevant (slp_tree node)
1030 int i;
1031 gimple stmt;
1032 stmt_vec_info stmt_info;
1033 slp_void_p child;
1035 if (!node)
1036 return;
1038 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1040 stmt_info = vinfo_for_stmt (stmt);
1041 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1042 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1043 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1046 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1047 vect_mark_slp_stmts_relevant ((slp_tree) child);
1051 /* Check if the permutation required by the SLP INSTANCE is supported.
1052 Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */
1054 static bool
1055 vect_supported_slp_permutation_p (slp_instance instance)
1057 slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
1058 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1059 gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
1060 vec<slp_tree> sorted_loads = vNULL;
1061 int index;
1062 slp_tree *tmp_loads = NULL;
1063 int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
1064 slp_tree load;
1066 /* FORNOW: The only supported loads permutation is loads from the same
1067 location in all the loads in the node, when the data-refs in
1068 nodes of LOADS constitute an interleaving chain.
1069 Sort the nodes according to the order of accesses in the chain. */
1070 tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
1071 for (i = 0, j = 0;
1072 SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
1073 && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
1074 i += group_size, j++)
1076 gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
1077 /* Check that the loads are all in the same interleaving chain. */
1078 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
1080 if (dump_enabled_p ())
1082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1083 "Build SLP failed: unsupported data "
1084 "permutation ");
1085 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1086 scalar_stmt, 0);
1089 free (tmp_loads);
1090 return false;
1093 tmp_loads[index] = load;
1096 sorted_loads.create (group_size);
1097 for (i = 0; i < group_size; i++)
1098 sorted_loads.safe_push (tmp_loads[i]);
1100 SLP_INSTANCE_LOADS (instance).release ();
1101 SLP_INSTANCE_LOADS (instance) = sorted_loads;
1102 free (tmp_loads);
1104 if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
1105 SLP_INSTANCE_UNROLLING_FACTOR (instance),
1106 instance, true))
1107 return false;
1109 return true;
1113 /* Rearrange the statements of NODE according to PERMUTATION. */
1115 static void
1116 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1117 vec<int> permutation)
1119 gimple stmt;
1120 vec<gimple> tmp_stmts;
1121 unsigned int index, i;
1122 slp_void_p child;
1124 if (!node)
1125 return;
1127 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1128 vect_slp_rearrange_stmts ((slp_tree) child, group_size, permutation);
1130 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1131 tmp_stmts.create (group_size);
1133 for (i = 0; i < group_size; i++)
1134 tmp_stmts.safe_push (NULL);
1136 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1138 index = permutation[i];
1139 tmp_stmts[index] = stmt;
1142 SLP_TREE_SCALAR_STMTS (node).release ();
1143 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1147 /* Check if the required load permutation is supported.
1148 LOAD_PERMUTATION contains a list of indices of the loads.
1149 In SLP this permutation is relative to the order of grouped stores that are
1150 the base of the SLP instance. */
1152 static bool
1153 vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
1154 vec<int> load_permutation)
1156 int i = 0, j, prev = -1, next, k, number_of_groups;
1157 bool supported, bad_permutation = false;
1158 sbitmap load_index;
1159 slp_tree node, other_complex_node;
1160 gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
1161 unsigned complex_numbers = 0;
1162 struct data_reference *dr;
1163 bb_vec_info bb_vinfo;
1165 /* FORNOW: permutations are only supported in SLP. */
1166 if (!slp_instn)
1167 return false;
1169 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1172 FOR_EACH_VEC_ELT (load_permutation, i, next)
1173 dump_printf (MSG_NOTE, "%d ", next);
1176 /* In case of reduction every load permutation is allowed, since the order
1177 of the reduction statements is not important (as opposed to the case of
1178 grouped stores). The only condition we need to check is that all the
1179 load nodes are of the same size and have the same permutation (and then
1180 rearrange all the nodes of the SLP instance according to this
1181 permutation). */
1183 /* Check that all the load nodes are of the same size. */
1184 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1186 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1187 return false;
1189 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1190 if (is_gimple_assign (stmt)
1191 && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
1192 || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
1193 complex_numbers++;
1196 /* Complex operands can be swapped as following:
1197 real_c = real_b + real_a;
1198 imag_c = imag_a + imag_b;
1199 i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
1200 {real_a, imag_a} and {real_b, imag_b}. We check here that if interleaving
1201 chains are mixed, they match the above pattern. */
1202 if (complex_numbers)
1204 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1206 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
1208 if (j == 0)
1209 first = stmt;
1210 else
1212 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
1214 if (complex_numbers != 2)
1215 return false;
1217 if (i == 0)
1218 k = 1;
1219 else
1220 k = 0;
1222 other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
1223 other_node_first =
1224 SLP_TREE_SCALAR_STMTS (other_complex_node)[0];
1226 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1227 != other_node_first)
1228 return false;
1235 /* We checked that this case ok, so there is no need to proceed with
1236 permutation tests. */
1237 if (complex_numbers == 2
1238 && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
1240 SLP_INSTANCE_LOADS (slp_instn).release ();
1241 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
1242 return true;
1245 node = SLP_INSTANCE_TREE (slp_instn);
1246 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1247 /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
1248 instance, not all the loads belong to the same node or interleaving
1249 group. Hence, we need to divide them into groups according to
1250 GROUP_SIZE. */
1251 number_of_groups = load_permutation.length () / group_size;
1253 /* Reduction (there are no data-refs in the root).
1254 In reduction chain the order of the loads is important. */
1255 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1256 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1258 int first_group_load_index;
1260 /* Compare all the permutation sequences to the first one. */
1261 for (i = 1; i < number_of_groups; i++)
1263 k = 0;
1264 for (j = i * group_size; j < i * group_size + group_size; j++)
1266 next = load_permutation[j];
1267 first_group_load_index = load_permutation[k];
1269 if (next != first_group_load_index)
1271 bad_permutation = true;
1272 break;
1275 k++;
1278 if (bad_permutation)
1279 break;
1282 if (!bad_permutation)
1284 /* Check that the loads in the first sequence are different and there
1285 are no gaps between them. */
1286 load_index = sbitmap_alloc (group_size);
1287 bitmap_clear (load_index);
1288 for (k = 0; k < group_size; k++)
1290 first_group_load_index = load_permutation[k];
1291 if (bitmap_bit_p (load_index, first_group_load_index))
1293 bad_permutation = true;
1294 break;
1297 bitmap_set_bit (load_index, first_group_load_index);
1300 if (!bad_permutation)
1301 for (k = 0; k < group_size; k++)
1302 if (!bitmap_bit_p (load_index, k))
1304 bad_permutation = true;
1305 break;
1308 sbitmap_free (load_index);
1311 if (!bad_permutation)
1313 /* This permutation is valid for reduction. Since the order of the
1314 statements in the nodes is not important unless they are memory
1315 accesses, we can rearrange the statements in all the nodes
1316 according to the order of the loads. */
1317 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1318 load_permutation);
1319 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
1320 return true;
1324 /* In basic block vectorization we allow any subchain of an interleaving
1325 chain.
1326 FORNOW: not supported in loop SLP because of realignment compications. */
1327 bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
1328 bad_permutation = false;
1329 /* Check that for every node in the instance the loads form a subchain. */
1330 if (bb_vinfo)
1332 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1334 next_load = NULL;
1335 first_load = NULL;
1336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1338 if (!first_load)
1339 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
1340 else if (first_load
1341 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
1343 bad_permutation = true;
1344 break;
1347 if (j != 0 && next_load != load)
1349 bad_permutation = true;
1350 break;
1353 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1356 if (bad_permutation)
1357 break;
1360 /* Check that the alignment of the first load in every subchain, i.e.,
1361 the first statement in every load node, is supported. */
1362 if (!bad_permutation)
1364 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1366 first_load = SLP_TREE_SCALAR_STMTS (node)[0];
1367 if (first_load
1368 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1370 dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1371 if (vect_supportable_dr_alignment (dr, false)
1372 == dr_unaligned_unsupported)
1374 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1377 vect_location,
1378 "unsupported unaligned load ");
1379 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1380 first_load, 0);
1382 bad_permutation = true;
1383 break;
1388 if (!bad_permutation)
1390 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
1391 return true;
1396 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1397 GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1398 well (unless it's reduction). */
1399 if (load_permutation.length ()
1400 != (unsigned int) (group_size * group_size))
1401 return false;
1403 supported = true;
1404 load_index = sbitmap_alloc (group_size);
1405 bitmap_clear (load_index);
1406 for (j = 0; j < group_size; j++)
1408 for (i = j * group_size, k = 0;
1409 load_permutation.iterate (i, &next) && k < group_size;
1410 i++, k++)
1412 if (i != j * group_size && next != prev)
1414 supported = false;
1415 break;
1418 prev = next;
1421 if (bitmap_bit_p (load_index, prev))
1423 supported = false;
1424 break;
1427 bitmap_set_bit (load_index, prev);
1430 for (j = 0; j < group_size; j++)
1431 if (!bitmap_bit_p (load_index, j))
1433 sbitmap_free (load_index);
1434 return false;
1437 sbitmap_free (load_index);
1439 if (supported && i == group_size * group_size
1440 && vect_supported_slp_permutation_p (slp_instn))
1441 return true;
1443 return false;
1447 /* Find the first load in the loop that belongs to INSTANCE.
1448 When loads are in several SLP nodes, there can be a case in which the first
1449 load does not appear in the first SLP node to be transformed, causing
1450 incorrect order of statements. Since we generate all the loads together,
1451 they must be inserted before the first load of the SLP instance and not
1452 before the first load of the first node of the instance. */
1454 static gimple
1455 vect_find_first_load_in_slp_instance (slp_instance instance)
1457 int i, j;
1458 slp_tree load_node;
1459 gimple first_load = NULL, load;
1461 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
1462 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1463 first_load = get_earlier_stmt (load, first_load);
1465 return first_load;
1469 /* Find the last store in SLP INSTANCE. */
1471 static gimple
1472 vect_find_last_store_in_slp_instance (slp_instance instance)
1474 int i;
1475 slp_tree node;
1476 gimple last_store = NULL, store;
1478 node = SLP_INSTANCE_TREE (instance);
1479 for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
1480 last_store = get_later_stmt (store, last_store);
1482 return last_store;
1486 /* Analyze an SLP instance starting from a group of grouped stores. Call
1487 vect_build_slp_tree to build a tree of packed stmts if possible.
1488 Return FALSE if it's impossible to SLP any stmt in the loop. */
1490 static bool
1491 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1492 gimple stmt)
1494 slp_instance new_instance;
1495 slp_tree node;
1496 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1497 unsigned int unrolling_factor = 1, nunits;
1498 tree vectype, scalar_type = NULL_TREE;
1499 gimple next;
1500 unsigned int vectorization_factor = 0;
1501 int outside_cost = 0, ncopies_for_cost, i;
1502 unsigned int max_nunits = 0;
1503 vec<int> load_permutation;
1504 vec<slp_tree> loads;
1505 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1506 bool loads_permuted = false;
1507 vec<gimple> scalar_stmts;
1508 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1509 stmt_info_for_cost *si;
1511 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1513 if (dr)
1515 scalar_type = TREE_TYPE (DR_REF (dr));
1516 vectype = get_vectype_for_scalar_type (scalar_type);
1518 else
1520 gcc_assert (loop_vinfo);
1521 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1524 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1526 else
1528 gcc_assert (loop_vinfo);
1529 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1530 group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1533 if (!vectype)
1535 if (dump_enabled_p ())
1537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1538 "Build SLP failed: unsupported data-type ");
1539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1542 return false;
1545 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1546 if (loop_vinfo)
1547 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1548 else
1549 vectorization_factor = nunits;
1551 /* Calculate the unrolling factor. */
1552 unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1553 if (unrolling_factor != 1 && !loop_vinfo)
1555 if (dump_enabled_p ())
1556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1557 "Build SLP failed: unrolling required in basic"
1558 " block SLP");
1560 return false;
1563 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1564 scalar_stmts.create (group_size);
1565 next = stmt;
1566 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1568 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1569 while (next)
1571 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1572 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1573 scalar_stmts.safe_push (
1574 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1575 else
1576 scalar_stmts.safe_push (next);
1577 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1580 else
1582 /* Collect reduction statements. */
1583 vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1584 for (i = 0; reductions.iterate (i, &next); i++)
1585 scalar_stmts.safe_push (next);
1588 node = vect_create_new_slp_node (scalar_stmts);
1590 /* Calculate the number of vector stmts to create based on the unrolling
1591 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1592 GROUP_SIZE / NUNITS otherwise. */
1593 ncopies_for_cost = unrolling_factor * group_size / nunits;
1595 load_permutation.create (group_size * group_size);
1596 loads.create (group_size);
1597 prologue_cost_vec.create (10);
1598 body_cost_vec.create (10);
1600 /* Build the tree for the SLP instance. */
1601 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1602 &outside_cost, ncopies_for_cost,
1603 &max_nunits, &load_permutation, &loads,
1604 vectorization_factor, &loads_permuted,
1605 &prologue_cost_vec, &body_cost_vec))
1607 void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1608 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1610 /* Calculate the unrolling factor based on the smallest type. */
1611 if (max_nunits > nunits)
1612 unrolling_factor = least_common_multiple (max_nunits, group_size)
1613 / group_size;
1615 if (unrolling_factor != 1 && !loop_vinfo)
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1619 "Build SLP failed: unrolling required in basic"
1620 " block SLP");
1621 vect_free_slp_tree (node);
1622 body_cost_vec.release ();
1623 prologue_cost_vec.release ();
1624 load_permutation.release ();
1625 loads.release ();
1626 return false;
1629 /* Create a new SLP instance. */
1630 new_instance = XNEW (struct _slp_instance);
1631 SLP_INSTANCE_TREE (new_instance) = node;
1632 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1633 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1634 SLP_INSTANCE_BODY_COST_VEC (new_instance) = body_cost_vec;
1635 SLP_INSTANCE_LOADS (new_instance) = loads;
1636 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1637 SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
1639 if (loads_permuted)
1641 if (!vect_supported_load_permutation_p (new_instance, group_size,
1642 load_permutation))
1644 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "Build SLP failed: unsupported load "
1648 "permutation ");
1649 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1652 vect_free_slp_instance (new_instance);
1653 prologue_cost_vec.release ();
1654 return false;
1657 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1658 = vect_find_first_load_in_slp_instance (new_instance);
1660 else
1661 SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();
1663 /* Record the prologue costs, which were delayed until we were
1664 sure that SLP was successful. Unlike the body costs, we know
1665 the final values now regardless of the loop vectorization factor. */
1666 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1668 struct _stmt_vec_info *stmt_info
1669 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1670 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1671 si->misalign, vect_prologue);
1674 prologue_cost_vec.release ();
1676 if (loop_vinfo)
1677 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1678 else
1679 BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1681 if (dump_enabled_p ())
1682 vect_print_slp_tree (MSG_NOTE, node);
1684 return true;
1686 else
1688 body_cost_vec.release ();
1689 prologue_cost_vec.release ();
1692 /* Failed to SLP. */
1693 /* Free the allocated memory. */
1694 vect_free_slp_tree (node);
1695 load_permutation.release ();
1696 loads.release ();
1698 return false;
1702 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
1703 trees of packed scalar stmts if SLP is possible. */
1705 bool
1706 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
1708 unsigned int i;
1709 vec<gimple> grouped_stores;
1710 vec<gimple> reductions = vNULL;
1711 vec<gimple> reduc_chains = vNULL;
1712 gimple first_element;
1713 bool ok = false;
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");
1718 if (loop_vinfo)
1720 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1721 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1722 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1724 else
1725 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1727 /* Find SLP sequences starting from groups of grouped stores. */
1728 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1729 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
1730 ok = true;
1732 if (bb_vinfo && !ok)
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1736 "Failed to SLP the basic block.");
1738 return false;
1741 if (loop_vinfo
1742 && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1744 /* Find SLP sequences starting from reduction chains. */
1745 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1746 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
1747 ok = true;
1748 else
1749 return false;
1751 /* Don't try to vectorize SLP reductions if reduction chain was
1752 detected. */
1753 return ok;
1756 /* Find SLP sequences starting from groups of reductions. */
1757 if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1758 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
1759 ok = true;
1761 return true;
1765 /* For each possible SLP instance decide whether to SLP it and calculate overall
1766 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1767 least one instance. */
1769 bool
1770 vect_make_slp_decision (loop_vec_info loop_vinfo)
1772 unsigned int i, unrolling_factor = 1;
1773 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1774 slp_instance instance;
1775 int decided_to_slp = 0;
1777 if (dump_enabled_p ())
1778 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");
1780 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1782 /* FORNOW: SLP if you can. */
1783 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1784 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1786 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1787 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1788 loop-based vectorization. Such stmts will be marked as HYBRID. */
1789 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1790 decided_to_slp++;
1793 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1795 if (decided_to_slp && dump_enabled_p ())
1796 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
1797 "Decided to SLP %d instances. Unrolling factor %d",
1798 decided_to_slp, unrolling_factor);
1800 return (decided_to_slp > 0);
1804 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1805 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1807 static void
1808 vect_detect_hybrid_slp_stmts (slp_tree node)
1810 int i;
1811 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
1812 gimple stmt = stmts[0];
1813 imm_use_iterator imm_iter;
1814 gimple use_stmt;
1815 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1816 slp_void_p child;
1817 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1818 struct loop *loop = NULL;
1819 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
1820 basic_block bb = NULL;
1822 if (!node)
1823 return;
1825 if (loop_vinfo)
1826 loop = LOOP_VINFO_LOOP (loop_vinfo);
1827 else
1828 bb = BB_VINFO_BB (bb_vinfo);
1830 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1831 if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
1832 && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1833 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1834 if (gimple_bb (use_stmt)
1835 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1836 || bb == gimple_bb (use_stmt))
1837 && (stmt_vinfo = vinfo_for_stmt (use_stmt))
1838 && !STMT_SLP_TYPE (stmt_vinfo)
1839 && (STMT_VINFO_RELEVANT (stmt_vinfo)
1840 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))
1841 || (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
1842 && STMT_VINFO_RELATED_STMT (stmt_vinfo)
1843 && !STMT_SLP_TYPE (vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo)))))
1844 && !(gimple_code (use_stmt) == GIMPLE_PHI
1845 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
1846 == vect_reduction_def))
1847 vect_mark_slp_stmts (node, hybrid, i);
1849 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1850 vect_detect_hybrid_slp_stmts ((slp_tree) child);
1854 /* Find stmts that must be both vectorized and SLPed. */
1856 void
1857 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1859 unsigned int i;
1860 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1861 slp_instance instance;
1863 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");
1866 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1867 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
1871 /* Create and initialize a new bb_vec_info struct for BB, as well as
1872 stmt_vec_info structs for all the stmts in it. */
1874 static bb_vec_info
1875 new_bb_vec_info (basic_block bb)
1877 bb_vec_info res = NULL;
1878 gimple_stmt_iterator gsi;
1880 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1881 BB_VINFO_BB (res) = bb;
1883 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1885 gimple stmt = gsi_stmt (gsi);
1886 gimple_set_uid (stmt, 0);
1887 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1890 BB_VINFO_GROUPED_STORES (res).create (10);
1891 BB_VINFO_SLP_INSTANCES (res).create (2);
1892 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1894 bb->aux = res;
1895 return res;
1899 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1900 stmts in the basic block. */
1902 static void
1903 destroy_bb_vec_info (bb_vec_info bb_vinfo)
1905 vec<slp_instance> slp_instances;
1906 slp_instance instance;
1907 basic_block bb;
1908 gimple_stmt_iterator si;
1909 unsigned i;
1911 if (!bb_vinfo)
1912 return;
1914 bb = BB_VINFO_BB (bb_vinfo);
1916 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1918 gimple stmt = gsi_stmt (si);
1919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1921 if (stmt_info)
1922 /* Free stmt_vec_info. */
1923 free_stmt_vec_info (stmt);
1926 free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
1927 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
1928 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
1929 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1930 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1931 vect_free_slp_instance (instance);
1932 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
1933 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1934 free (bb_vinfo);
1935 bb->aux = NULL;
1939 /* Analyze statements contained in SLP tree node after recursively analyzing
1940 the subtree. Return TRUE if the operations are supported. */
1942 static bool
1943 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
1945 bool dummy;
1946 int i;
1947 gimple stmt;
1948 slp_void_p child;
1950 if (!node)
1951 return true;
1953 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1954 if (!vect_slp_analyze_node_operations (bb_vinfo, (slp_tree) child))
1955 return false;
1957 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1959 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1960 gcc_assert (stmt_info);
1961 gcc_assert (PURE_SLP_STMT (stmt_info));
1963 if (!vect_analyze_stmt (stmt, &dummy, node))
1964 return false;
1967 return true;
1971 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
1972 operations are supported. */
1974 static bool
1975 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
1977 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1978 slp_instance instance;
1979 int i;
1981 for (i = 0; slp_instances.iterate (i, &instance); )
1983 if (!vect_slp_analyze_node_operations (bb_vinfo,
1984 SLP_INSTANCE_TREE (instance)))
1986 vect_free_slp_instance (instance);
1987 slp_instances.ordered_remove (i);
1989 else
1990 i++;
1993 if (!slp_instances.length ())
1994 return false;
1996 return true;
1999 /* Check if vectorization of the basic block is profitable. */
2001 static bool
2002 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2004 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2005 slp_instance instance;
2006 int i, j;
2007 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2008 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2009 unsigned int stmt_cost;
2010 gimple stmt;
2011 gimple_stmt_iterator si;
2012 basic_block bb = BB_VINFO_BB (bb_vinfo);
2013 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2014 stmt_vec_info stmt_info = NULL;
2015 stmt_vector_for_cost body_cost_vec;
2016 stmt_info_for_cost *ci;
2018 /* Calculate vector costs. */
2019 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2021 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2023 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2025 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2026 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2027 stmt_info, ci->misalign, vect_body);
2031 /* Calculate scalar cost. */
2032 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2034 stmt = gsi_stmt (si);
2035 stmt_info = vinfo_for_stmt (stmt);
2037 if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
2038 || !PURE_SLP_STMT (stmt_info))
2039 continue;
2041 if (STMT_VINFO_DATA_REF (stmt_info))
2043 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2044 stmt_cost = vect_get_stmt_cost (scalar_load);
2045 else
2046 stmt_cost = vect_get_stmt_cost (scalar_store);
2048 else
2049 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2051 scalar_cost += stmt_cost;
2054 /* Complete the target-specific cost calculation. */
2055 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2056 &vec_inside_cost, &vec_epilogue_cost);
2058 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2060 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2063 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2064 vec_inside_cost);
2065 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2066 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2067 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d", scalar_cost);
2070 /* Vectorization is profitable if its cost is less than the cost of scalar
2071 version. */
2072 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2073 return false;
2075 return true;
2078 /* Check if the basic block can be vectorized. */
2080 static bb_vec_info
2081 vect_slp_analyze_bb_1 (basic_block bb)
2083 bb_vec_info bb_vinfo;
2084 vec<ddr_p> ddrs;
2085 vec<slp_instance> slp_instances;
2086 slp_instance instance;
2087 int i;
2088 int min_vf = 2;
2089 int max_vf = MAX_VECTORIZATION_FACTOR;
2091 bb_vinfo = new_bb_vec_info (bb);
2092 if (!bb_vinfo)
2093 return NULL;
2095 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2097 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099 "not vectorized: unhandled data-ref in basic "
2100 "block.\n");
2102 destroy_bb_vec_info (bb_vinfo);
2103 return NULL;
2106 ddrs = BB_VINFO_DDRS (bb_vinfo);
2107 if (!ddrs.length ())
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111 "not vectorized: not enough data-refs in "
2112 "basic block.\n");
2114 destroy_bb_vec_info (bb_vinfo);
2115 return NULL;
2118 vect_pattern_recog (NULL, bb_vinfo);
2120 if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf)
2121 || min_vf > max_vf)
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125 "not vectorized: unhandled data dependence "
2126 "in basic block.\n");
2128 destroy_bb_vec_info (bb_vinfo);
2129 return NULL;
2132 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "not vectorized: bad data alignment in basic "
2137 "block.\n");
2139 destroy_bb_vec_info (bb_vinfo);
2140 return NULL;
2143 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: unhandled data access in "
2148 "basic block.\n");
2150 destroy_bb_vec_info (bb_vinfo);
2151 return NULL;
2154 /* Check the SLP opportunities in the basic block, analyze and build SLP
2155 trees. */
2156 if (!vect_analyze_slp (NULL, bb_vinfo))
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2160 "not vectorized: failed to find SLP opportunities "
2161 "in basic block.\n");
2163 destroy_bb_vec_info (bb_vinfo);
2164 return NULL;
2167 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2169 /* Mark all the statements that we want to vectorize as pure SLP and
2170 relevant. */
2171 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2173 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2174 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2177 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2181 "not vectorized: unsupported alignment in basic "
2182 "block.\n");
2183 destroy_bb_vec_info (bb_vinfo);
2184 return NULL;
2187 if (!vect_slp_analyze_operations (bb_vinfo))
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191 "not vectorized: bad operation in basic block.\n");
2193 destroy_bb_vec_info (bb_vinfo);
2194 return NULL;
2197 /* Cost model: check if the vectorization is worthwhile. */
2198 if (flag_vect_cost_model
2199 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 "not vectorized: vectorization is not "
2204 "profitable.\n");
2206 destroy_bb_vec_info (bb_vinfo);
2207 return NULL;
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_NOTE, vect_location,
2212 "Basic block will be vectorized using SLP\n");
2214 return bb_vinfo;
2218 bb_vec_info
2219 vect_slp_analyze_bb (basic_block bb)
2221 bb_vec_info bb_vinfo;
2222 int insns = 0;
2223 gimple_stmt_iterator gsi;
2224 unsigned int vector_sizes;
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2229 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2231 gimple stmt = gsi_stmt (gsi);
2232 if (!is_gimple_debug (stmt)
2233 && !gimple_nop_p (stmt)
2234 && gimple_code (stmt) != GIMPLE_LABEL)
2235 insns++;
2238 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "not vectorized: too many instructions in "
2243 "basic block.\n");
2245 return NULL;
2248 /* Autodetect first vector size we try. */
2249 current_vector_size = 0;
2250 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2252 while (1)
2254 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2255 if (bb_vinfo)
2256 return bb_vinfo;
2258 destroy_bb_vec_info (bb_vinfo);
2260 vector_sizes &= ~current_vector_size;
2261 if (vector_sizes == 0
2262 || current_vector_size == 0)
2263 return NULL;
2265 /* Try the next biggest vector size. */
2266 current_vector_size = 1 << floor_log2 (vector_sizes);
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE, vect_location,
2269 "***** Re-trying analysis with "
2270 "vector size %d\n", current_vector_size);
2275 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2276 the number of created vector stmts depends on the unrolling factor).
2277 However, the actual number of vector stmts for every SLP node depends on
2278 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2279 should be updated. In this function we assume that the inside costs
2280 calculated in vect_model_xxx_cost are linear in ncopies. */
2282 void
2283 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2285 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2286 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2287 slp_instance instance;
2288 stmt_vector_for_cost body_cost_vec;
2289 stmt_info_for_cost *si;
2290 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "=== vect_update_slp_costs_according_to_vf ===");
2296 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2298 /* We assume that costs are linear in ncopies. */
2299 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2301 /* Record the instance's instructions in the target cost model.
2302 This was delayed until here because the count of instructions
2303 isn't known beforehand. */
2304 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2306 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2307 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2308 vinfo_for_stmt (si->stmt), si->misalign,
2309 vect_body);
2314 /* For constant and loop invariant defs of SLP_NODE this function returns
2315 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2316 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2317 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2318 REDUC_INDEX is the index of the reduction operand in the statements, unless
2319 it is -1. */
2321 static void
2322 vect_get_constant_vectors (tree op, slp_tree slp_node,
2323 vec<tree> *vec_oprnds,
2324 unsigned int op_num, unsigned int number_of_vectors,
2325 int reduc_index)
2327 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2328 gimple stmt = stmts[0];
2329 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2330 unsigned nunits;
2331 tree vec_cst;
2332 tree *elts;
2333 unsigned j, number_of_places_left_in_vector;
2334 tree vector_type;
2335 tree vop;
2336 int group_size = stmts.length ();
2337 unsigned int vec_num, i;
2338 unsigned number_of_copies = 1;
2339 vec<tree> voprnds;
2340 voprnds.create (number_of_vectors);
2341 bool constant_p, is_store;
2342 tree neutral_op = NULL;
2343 enum tree_code code = gimple_expr_code (stmt);
2344 gimple def_stmt;
2345 struct loop *loop;
2346 gimple_seq ctor_seq = NULL;
2348 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2349 && reduc_index != -1)
2351 op_num = reduc_index - 1;
2352 op = gimple_op (stmt, reduc_index);
2353 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2354 we need either neutral operands or the original operands. See
2355 get_initial_def_for_reduction() for details. */
2356 switch (code)
2358 case WIDEN_SUM_EXPR:
2359 case DOT_PROD_EXPR:
2360 case PLUS_EXPR:
2361 case MINUS_EXPR:
2362 case BIT_IOR_EXPR:
2363 case BIT_XOR_EXPR:
2364 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2365 neutral_op = build_real (TREE_TYPE (op), dconst0);
2366 else
2367 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2369 break;
2371 case MULT_EXPR:
2372 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2373 neutral_op = build_real (TREE_TYPE (op), dconst1);
2374 else
2375 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2377 break;
2379 case BIT_AND_EXPR:
2380 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2381 break;
2383 /* For MIN/MAX we don't have an easy neutral operand but
2384 the initial values can be used fine here. Only for
2385 a reduction chain we have to force a neutral element. */
2386 case MAX_EXPR:
2387 case MIN_EXPR:
2388 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2389 neutral_op = NULL;
2390 else
2392 def_stmt = SSA_NAME_DEF_STMT (op);
2393 loop = (gimple_bb (stmt))->loop_father;
2394 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2395 loop_preheader_edge (loop));
2397 break;
2399 default:
2400 neutral_op = NULL;
2404 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2406 is_store = true;
2407 op = gimple_assign_rhs1 (stmt);
2409 else
2410 is_store = false;
2412 gcc_assert (op);
2414 if (CONSTANT_CLASS_P (op))
2415 constant_p = true;
2416 else
2417 constant_p = false;
2419 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2420 gcc_assert (vector_type);
2421 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2423 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2424 created vectors. It is greater than 1 if unrolling is performed.
2426 For example, we have two scalar operands, s1 and s2 (e.g., group of
2427 strided accesses of size two), while NUNITS is four (i.e., four scalars
2428 of this type can be packed in a vector). The output vector will contain
2429 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2430 will be 2).
2432 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2433 containing the operands.
2435 For example, NUNITS is four as before, and the group size is 8
2436 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2437 {s5, s6, s7, s8}. */
2439 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2441 number_of_places_left_in_vector = nunits;
2442 elts = XALLOCAVEC (tree, nunits);
2443 for (j = 0; j < number_of_copies; j++)
2445 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2447 if (is_store)
2448 op = gimple_assign_rhs1 (stmt);
2449 else
2451 switch (code)
2453 case COND_EXPR:
2454 if (op_num == 0 || op_num == 1)
2456 tree cond = gimple_assign_rhs1 (stmt);
2457 op = TREE_OPERAND (cond, op_num);
2459 else
2461 if (op_num == 2)
2462 op = gimple_assign_rhs2 (stmt);
2463 else
2464 op = gimple_assign_rhs3 (stmt);
2466 break;
2468 case CALL_EXPR:
2469 op = gimple_call_arg (stmt, op_num);
2470 break;
2472 case LSHIFT_EXPR:
2473 case RSHIFT_EXPR:
2474 case LROTATE_EXPR:
2475 case RROTATE_EXPR:
2476 op = gimple_op (stmt, op_num + 1);
2477 /* Unlike the other binary operators, shifts/rotates have
2478 the shift count being int, instead of the same type as
2479 the lhs, so make sure the scalar is the right type if
2480 we are dealing with vectors of
2481 long long/long/short/char. */
2482 if (op_num == 1 && constant_p)
2483 op = fold_convert (TREE_TYPE (vector_type), op);
2484 break;
2486 default:
2487 op = gimple_op (stmt, op_num + 1);
2488 break;
2492 if (reduc_index != -1)
2494 loop = (gimple_bb (stmt))->loop_father;
2495 def_stmt = SSA_NAME_DEF_STMT (op);
2497 gcc_assert (loop);
2499 /* Get the def before the loop. In reduction chain we have only
2500 one initial value. */
2501 if ((j != (number_of_copies - 1)
2502 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2503 && i != 0))
2504 && neutral_op)
2505 op = neutral_op;
2506 else
2507 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2508 loop_preheader_edge (loop));
2511 /* Create 'vect_ = {op0,op1,...,opn}'. */
2512 number_of_places_left_in_vector--;
2513 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2515 if (constant_p)
2517 op = fold_unary (VIEW_CONVERT_EXPR,
2518 TREE_TYPE (vector_type), op);
2519 gcc_assert (op && CONSTANT_CLASS_P (op));
2521 else
2523 tree new_temp
2524 = make_ssa_name (TREE_TYPE (vector_type), NULL);
2525 gimple init_stmt;
2526 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2527 op);
2528 init_stmt
2529 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2530 new_temp, op, NULL_TREE);
2531 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2532 op = new_temp;
2535 elts[number_of_places_left_in_vector] = op;
2537 if (number_of_places_left_in_vector == 0)
2539 number_of_places_left_in_vector = nunits;
2541 if (constant_p)
2542 vec_cst = build_vector (vector_type, elts);
2543 else
2545 vec<constructor_elt, va_gc> *v;
2546 unsigned k;
2547 vec_alloc (v, nunits);
2548 for (k = 0; k < nunits; ++k)
2549 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2550 vec_cst = build_constructor (vector_type, v);
2552 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2553 vector_type, NULL));
2554 if (ctor_seq != NULL)
2556 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2557 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2558 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2559 GSI_SAME_STMT);
2560 ctor_seq = NULL;
2566 /* Since the vectors are created in the reverse order, we should invert
2567 them. */
2568 vec_num = voprnds.length ();
2569 for (j = vec_num; j != 0; j--)
2571 vop = voprnds[j - 1];
2572 vec_oprnds->quick_push (vop);
2575 voprnds.release ();
2577 /* In case that VF is greater than the unrolling factor needed for the SLP
2578 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2579 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2580 to replicate the vectors. */
2581 while (number_of_vectors > vec_oprnds->length ())
2583 tree neutral_vec = NULL;
2585 if (neutral_op)
2587 if (!neutral_vec)
2588 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2590 vec_oprnds->quick_push (neutral_vec);
2592 else
2594 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2595 vec_oprnds->quick_push (vop);
2601 /* Get vectorized definitions from SLP_NODE that contains corresponding
2602 vectorized def-stmts. */
2604 static void
2605 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2607 tree vec_oprnd;
2608 gimple vec_def_stmt;
2609 unsigned int i;
2611 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2613 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2615 gcc_assert (vec_def_stmt);
2616 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2617 vec_oprnds->quick_push (vec_oprnd);
2622 /* Get vectorized definitions for SLP_NODE.
2623 If the scalar definitions are loop invariants or constants, collect them and
2624 call vect_get_constant_vectors() to create vector stmts.
2625 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2626 must be stored in the corresponding child of SLP_NODE, and we call
2627 vect_get_slp_vect_defs () to retrieve them. */
2629 void
2630 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2631 vec<vec<tree> > *vec_oprnds, int reduc_index)
2633 gimple first_stmt;
2634 int number_of_vects = 0, i;
2635 unsigned int child_index = 0;
2636 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2637 slp_tree child = NULL;
2638 vec<tree> vec_defs;
2639 tree oprnd;
2640 bool vectorized_defs;
2642 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2643 FOR_EACH_VEC_ELT (ops, i, oprnd)
2645 /* For each operand we check if it has vectorized definitions in a child
2646 node or we need to create them (for invariants and constants). We
2647 check if the LHS of the first stmt of the next child matches OPRND.
2648 If it does, we found the correct child. Otherwise, we call
2649 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2650 to check this child node for the next operand. */
2651 vectorized_defs = false;
2652 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2654 child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
2656 /* We have to check both pattern and original def, if available. */
2657 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2658 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2660 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2661 || (related
2662 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2664 /* The number of vector defs is determined by the number of
2665 vector statements in the node from which we get those
2666 statements. */
2667 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2668 vectorized_defs = true;
2669 child_index++;
2673 if (!vectorized_defs)
2675 if (i == 0)
2677 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2678 /* Number of vector stmts was calculated according to LHS in
2679 vect_schedule_slp_instance (), fix it by replacing LHS with
2680 RHS, if necessary. See vect_get_smallest_scalar_type () for
2681 details. */
2682 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2683 &rhs_size_unit);
2684 if (rhs_size_unit != lhs_size_unit)
2686 number_of_vects *= rhs_size_unit;
2687 number_of_vects /= lhs_size_unit;
2692 /* Allocate memory for vectorized defs. */
2693 vec_defs = vNULL;
2694 vec_defs.create (number_of_vects);
2696 /* For reduction defs we call vect_get_constant_vectors (), since we are
2697 looking for initial loop invariant values. */
2698 if (vectorized_defs && reduc_index == -1)
2699 /* The defs are already vectorized. */
2700 vect_get_slp_vect_defs (child, &vec_defs);
2701 else
2702 /* Build vectors from scalar defs. */
2703 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2704 number_of_vects, reduc_index);
2706 vec_oprnds->quick_push (vec_defs);
2708 /* For reductions, we only need initial values. */
2709 if (reduc_index != -1)
2710 return;
2715 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2716 building a vector of type MASK_TYPE from it) and two input vectors placed in
2717 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2718 shifting by STRIDE elements of DR_CHAIN for every copy.
2719 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2720 copies).
2721 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2722 the created stmts must be inserted. */
2724 static inline void
2725 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2726 tree mask, int first_vec_indx, int second_vec_indx,
2727 gimple_stmt_iterator *gsi, slp_tree node,
2728 tree vectype, vec<tree> dr_chain,
2729 int ncopies, int vect_stmts_counter)
2731 tree perm_dest;
2732 gimple perm_stmt = NULL;
2733 stmt_vec_info next_stmt_info;
2734 int i, stride;
2735 tree first_vec, second_vec, data_ref;
2737 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2739 /* Initialize the vect stmts of NODE to properly insert the generated
2740 stmts later. */
2741 for (i = SLP_TREE_VEC_STMTS (node).length ();
2742 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2743 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2745 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2746 for (i = 0; i < ncopies; i++)
2748 first_vec = dr_chain[first_vec_indx];
2749 second_vec = dr_chain[second_vec_indx];
2751 /* Generate the permute statement. */
2752 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2753 first_vec, second_vec, mask);
2754 data_ref = make_ssa_name (perm_dest, perm_stmt);
2755 gimple_set_lhs (perm_stmt, data_ref);
2756 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2758 /* Store the vector statement in NODE. */
2759 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2761 first_vec_indx += stride;
2762 second_vec_indx += stride;
2765 /* Mark the scalar stmt as vectorized. */
2766 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2767 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2771 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2772 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2773 representation. Check that the mask is valid and return FALSE if not.
2774 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2775 the next vector, i.e., the current first vector is not needed. */
2777 static bool
2778 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2779 int mask_nunits, bool only_one_vec, int index,
2780 unsigned char *mask, int *current_mask_element,
2781 bool *need_next_vector, int *number_of_mask_fixes,
2782 bool *mask_fixed, bool *needs_first_vector)
2784 int i;
2786 /* Convert to target specific representation. */
2787 *current_mask_element = first_mask_element + m;
2788 /* Adjust the value in case it's a mask for second and third vectors. */
2789 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2791 if (*current_mask_element < mask_nunits)
2792 *needs_first_vector = true;
2794 /* We have only one input vector to permute but the mask accesses values in
2795 the next vector as well. */
2796 if (only_one_vec && *current_mask_element >= mask_nunits)
2798 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "permutation requires at least two vectors ");
2802 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2805 return false;
2808 /* The mask requires the next vector. */
2809 if (*current_mask_element >= mask_nunits * 2)
2811 if (*needs_first_vector || *mask_fixed)
2813 /* We either need the first vector too or have already moved to the
2814 next vector. In both cases, this permutation needs three
2815 vectors. */
2816 if (dump_enabled_p ())
2818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2819 "permutation requires at "
2820 "least three vectors ");
2821 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2824 return false;
2827 /* We move to the next vector, dropping the first one and working with
2828 the second and the third - we need to adjust the values of the mask
2829 accordingly. */
2830 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2832 for (i = 0; i < index; i++)
2833 mask[i] -= mask_nunits * *number_of_mask_fixes;
2835 (*number_of_mask_fixes)++;
2836 *mask_fixed = true;
2839 *need_next_vector = *mask_fixed;
2841 /* This was the last element of this mask. Start a new one. */
2842 if (index == mask_nunits - 1)
2844 *number_of_mask_fixes = 1;
2845 *mask_fixed = false;
2846 *needs_first_vector = false;
2849 return true;
2853 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2854 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2855 permute statements for SLP_NODE_INSTANCE. */
2856 bool
2857 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2858 gimple_stmt_iterator *gsi, int vf,
2859 slp_instance slp_node_instance, bool analyze_only)
2861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2862 tree mask_element_type = NULL_TREE, mask_type;
2863 int i, j, k, nunits, vec_index = 0, scalar_index;
2864 slp_tree node;
2865 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2866 gimple next_scalar_stmt;
2867 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2868 int first_mask_element;
2869 int index, unroll_factor, current_mask_element, ncopies;
2870 unsigned char *mask;
2871 bool only_one_vec = false, need_next_vector = false;
2872 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2873 int number_of_mask_fixes = 1;
2874 bool mask_fixed = false;
2875 bool needs_first_vector = false;
2876 enum machine_mode mode;
2878 mode = TYPE_MODE (vectype);
2880 if (!can_vec_perm_p (mode, false, NULL))
2882 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885 "no vect permute for ");
2886 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2888 return false;
2891 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2892 same size as the vector element being permuted. */
2893 mask_element_type = lang_hooks.types.type_for_mode
2894 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2895 mask_type = get_vectype_for_scalar_type (mask_element_type);
2896 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2897 mask = XALLOCAVEC (unsigned char, nunits);
2898 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2900 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2901 unrolling factor. */
2902 orig_vec_stmts_num = group_size *
2903 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2904 if (orig_vec_stmts_num == 1)
2905 only_one_vec = true;
2907 /* Number of copies is determined by the final vectorization factor
2908 relatively to SLP_NODE_INSTANCE unrolling factor. */
2909 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2911 /* Generate permutation masks for every NODE. Number of masks for each NODE
2912 is equal to GROUP_SIZE.
2913 E.g., we have a group of three nodes with three loads from the same
2914 location in each node, and the vector size is 4. I.e., we have a
2915 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2916 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2917 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2920 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2921 The last mask is illegal since we assume two operands for permute
2922 operation, and the mask element values can't be outside that range.
2923 Hence, the last mask must be converted into {2,5,5,5}.
2924 For the first two permutations we need the first and the second input
2925 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2926 we need the second and the third vectors: {b1,c1,a2,b2} and
2927 {c2,a3,b3,c3}. */
2929 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2931 scalar_index = 0;
2932 index = 0;
2933 vect_stmts_counter = 0;
2934 vec_index = 0;
2935 first_vec_index = vec_index++;
2936 if (only_one_vec)
2937 second_vec_index = first_vec_index;
2938 else
2939 second_vec_index = vec_index++;
2941 for (j = 0; j < unroll_factor; j++)
2943 for (k = 0; k < group_size; k++)
2945 first_mask_element = i + j * group_size;
2946 if (!vect_get_mask_element (stmt, first_mask_element, 0,
2947 nunits, only_one_vec, index,
2948 mask, &current_mask_element,
2949 &need_next_vector,
2950 &number_of_mask_fixes, &mask_fixed,
2951 &needs_first_vector))
2952 return false;
2953 mask[index++] = current_mask_element;
2955 if (index == nunits)
2957 tree mask_vec, *mask_elts;
2958 int l;
2960 if (!can_vec_perm_p (mode, false, mask))
2962 if (dump_enabled_p ())
2964 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2965 vect_location,
2966 "unsupported vect permute { ");
2967 for (i = 0; i < nunits; ++i)
2968 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2969 mask[i]);
2970 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2972 return false;
2975 mask_elts = XALLOCAVEC (tree, nunits);
2976 for (l = 0; l < nunits; ++l)
2977 mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
2978 mask_vec = build_vector (mask_type, mask_elts);
2979 index = 0;
2981 if (!analyze_only)
2983 if (need_next_vector)
2985 first_vec_index = second_vec_index;
2986 second_vec_index = vec_index;
2989 next_scalar_stmt
2990 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
2992 vect_create_mask_and_perm (stmt, next_scalar_stmt,
2993 mask_vec, first_vec_index, second_vec_index,
2994 gsi, node, vectype, dr_chain,
2995 ncopies, vect_stmts_counter++);
3002 return true;
3007 /* Vectorize SLP instance tree in postorder. */
3009 static bool
3010 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3011 unsigned int vectorization_factor)
3013 gimple stmt;
3014 bool grouped_store, is_store;
3015 gimple_stmt_iterator si;
3016 stmt_vec_info stmt_info;
3017 unsigned int vec_stmts_size, nunits, group_size;
3018 tree vectype;
3019 int i;
3020 slp_tree loads_node;
3021 slp_void_p child;
3023 if (!node)
3024 return false;
3026 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3027 vect_schedule_slp_instance ((slp_tree) child, instance,
3028 vectorization_factor);
3030 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3031 stmt_info = vinfo_for_stmt (stmt);
3033 /* VECTYPE is the type of the destination. */
3034 vectype = STMT_VINFO_VECTYPE (stmt_info);
3035 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3036 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3038 /* For each SLP instance calculate number of vector stmts to be created
3039 for the scalar stmts in each node of the SLP tree. Number of vector
3040 elements in one vector iteration is the number of scalar elements in
3041 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3042 size. */
3043 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3045 /* In case of load permutation we have to allocate vectorized statements for
3046 all the nodes that participate in that permutation. */
3047 if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3049 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
3051 if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
3053 SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
3054 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
3059 if (!SLP_TREE_VEC_STMTS (node).exists ())
3061 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3062 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3065 if (dump_enabled_p ())
3067 dump_printf_loc (MSG_NOTE,vect_location,
3068 "------>vectorizing SLP node starting from: ");
3069 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3072 /* Loads should be inserted before the first load. */
3073 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3074 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3075 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3076 && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3077 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3078 else if (is_pattern_stmt_p (stmt_info))
3079 si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3080 else
3081 si = gsi_for_stmt (stmt);
3083 /* Stores should be inserted just before the last store. */
3084 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3085 && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3087 gimple last_store = vect_find_last_store_in_slp_instance (instance);
3088 if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3089 last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3090 si = gsi_for_stmt (last_store);
3093 /* Mark the first element of the reduction chain as reduction to properly
3094 transform the node. In the analysis phase only the last element of the
3095 chain is marked as reduction. */
3096 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3097 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3099 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3100 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3103 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3104 return is_store;
3107 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3108 For loop vectorization this is done in vectorizable_call, but for SLP
3109 it needs to be deferred until end of vect_schedule_slp, because multiple
3110 SLP instances may refer to the same scalar stmt. */
3112 static void
3113 vect_remove_slp_scalar_calls (slp_tree node)
3115 gimple stmt, new_stmt;
3116 gimple_stmt_iterator gsi;
3117 int i;
3118 slp_void_p child;
3119 tree lhs;
3120 stmt_vec_info stmt_info;
3122 if (!node)
3123 return;
3125 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3126 vect_remove_slp_scalar_calls ((slp_tree) child);
3128 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3130 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3131 continue;
3132 stmt_info = vinfo_for_stmt (stmt);
3133 if (stmt_info == NULL
3134 || is_pattern_stmt_p (stmt_info)
3135 || !PURE_SLP_STMT (stmt_info))
3136 continue;
3137 lhs = gimple_call_lhs (stmt);
3138 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3139 set_vinfo_for_stmt (new_stmt, stmt_info);
3140 set_vinfo_for_stmt (stmt, NULL);
3141 STMT_VINFO_STMT (stmt_info) = new_stmt;
3142 gsi = gsi_for_stmt (stmt);
3143 gsi_replace (&gsi, new_stmt, false);
3144 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3148 /* Generate vector code for all SLP instances in the loop/basic block. */
3150 bool
3151 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3153 vec<slp_instance> slp_instances;
3154 slp_instance instance;
3155 slp_tree loads_node;
3156 unsigned int i, j, vf;
3157 bool is_store = false;
3159 if (loop_vinfo)
3161 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3162 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3164 else
3166 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3167 vf = 1;
3170 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3172 /* Schedule the tree of INSTANCE. */
3173 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3174 instance, vf);
3176 /* Clear STMT_VINFO_VEC_STMT of all loads. With shared loads
3177 between SLP instances we fail to properly initialize the
3178 vectorized SLP stmts and confuse different load permutations. */
3179 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
3180 STMT_VINFO_VEC_STMT
3181 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "vectorizing stmts using SLP.");
3188 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3190 slp_tree root = SLP_INSTANCE_TREE (instance);
3191 gimple store;
3192 unsigned int j;
3193 gimple_stmt_iterator gsi;
3195 /* Remove scalar call stmts. Do not do this for basic-block
3196 vectorization as not all uses may be vectorized.
3197 ??? Why should this be necessary? DCE should be able to
3198 remove the stmts itself.
3199 ??? For BB vectorization we can as well remove scalar
3200 stmts starting from the SLP tree root if they have no
3201 uses. */
3202 if (loop_vinfo)
3203 vect_remove_slp_scalar_calls (root);
3205 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3206 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3208 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3209 break;
3211 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3212 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3213 /* Free the attached stmt_vec_info and remove the stmt. */
3214 gsi = gsi_for_stmt (store);
3215 unlink_stmt_vdef (store);
3216 gsi_remove (&gsi, true);
3217 release_defs (store);
3218 free_stmt_vec_info (store);
3222 return is_store;
3226 /* Vectorize the basic block. */
3228 void
3229 vect_slp_transform_bb (basic_block bb)
3231 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3232 gimple_stmt_iterator si;
3234 gcc_assert (bb_vinfo);
3236 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3239 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3241 gimple stmt = gsi_stmt (si);
3242 stmt_vec_info stmt_info;
3244 if (dump_enabled_p ())
3246 dump_printf_loc (MSG_NOTE, vect_location,
3247 "------>SLPing statement: ");
3248 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3251 stmt_info = vinfo_for_stmt (stmt);
3252 gcc_assert (stmt_info);
3254 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3255 if (STMT_SLP_TYPE (stmt_info))
3257 vect_schedule_slp (NULL, bb_vinfo);
3258 break;
3262 if (dump_enabled_p ())
3263 dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
3265 destroy_bb_vec_info (bb_vinfo);