[official-gcc.git] / gcc / tree-vect-slp.c
blob e925f57a2f5f015d342c9406ecfcd3967e5cab68 (2013-03-28, Richard Biener <rguenther@suse.de>)
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
/* Extract the location of the basic block in the source code.
   Return the basic block location if it succeeds and UNKNOWN_LOC if not.  */
LOC
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOC;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOC)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOC;
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree ((slp_tree) child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);

  return node;
}
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_def_type = NULL_TREE;
      oprnd_info->first_const_oprnd = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  */
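/* For instance (an illustrative sketch), for the group

     x0 = a0 + b0;
     x1 = a1 + b1;

   operand slot 0 collects the defs of a0 and a1 and slot 1 those of b0
   and b1; the build fails if, say, a0 is defined by a stmt inside the
   loop (vect_internal_def) while a1 is a loop invariant
   (vect_external_def).  */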
static bool
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             slp_tree slp_node, gimple stmt,
                             int ncopies_for_cost, bool first,
                             vec<slp_oprnd_info> *oprnds_info,
                             stmt_vector_for_cost *prologue_cost_vec,
                             stmt_vector_for_cost *body_cost_vec)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def, def_op0 = NULL_TREE;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  enum vect_def_type dt_op0 = vect_uninitialized_def;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree lhs = gimple_get_lhs (stmt);
  struct loop *loop = NULL;
  enum tree_code rhs_code;
  bool different_types = false;
  bool pattern = false;
  slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
  int op_idx = 1;
  tree compare_rhs = NULL_TREE;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        number_of_oprnds++;
    }
  else
    return false;

  for (i = 0; i < number_of_oprnds; i++)
    {
      if (compare_rhs)
        {
          oprnd = compare_rhs;
          compare_rhs = NULL_TREE;
        }
      else
        oprnd = gimple_op (stmt, op_idx++);

      oprnd_info = (*oprnds_info)[i];

      if (COMPARISON_CLASS_P (oprnd))
        {
          compare_rhs = TREE_OPERAND (oprnd, 1);
          oprnd = TREE_OPERAND (oprnd, 0);
        }

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
            }

          return false;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                }

              return false;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.");
              return false;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt: ");
                return false;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          if (def)
            {
              oprnd_info->first_def_type = TREE_TYPE (def);
              oprnd_info->first_const_oprnd = NULL_TREE;
            }
          else
            {
              oprnd_info->first_def_type = NULL_TREE;
              oprnd_info->first_const_oprnd = oprnd;
            }

          if (i == 0)
            {
              def_op0 = def;
              dt_op0 = dt;
              /* Analyze costs (for the first stmt of the group only).  */
              if (REFERENCE_CLASS_P (lhs))
                /* Store.  */
                vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                                       dt, slp_node, prologue_cost_vec,
                                       body_cost_vec);
              else
                {
                  enum vect_def_type dts[2];
                  dts[0] = dt;
                  dts[1] = vect_uninitialized_def;
                  /* Not memory operation (we don't call this function for
                     loads).  */
                  vect_model_simple_cost (stmt_info, ncopies_for_cost, dts,
                                          prologue_cost_vec, body_cost_vec);
                }
            }
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def))
               || (oprnd_info->first_def_type != NULL_TREE
                   && def
                   && !types_compatible_p (oprnd_info->first_def_type,
                                           TREE_TYPE (def))))
              || (!def
                  && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
                                          TREE_TYPE (oprnd)))
              || different_types)
            {
              if (number_of_oprnds != 2)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different types ");

                  return false;
                }

              /* Try to swap operands in case of binary operation.  */
              if (i == 0)
                different_types = true;
              else
                {
                  oprnd0_info = (*oprnds_info)[0];
                  if (is_gimple_assign (stmt)
                      && (rhs_code = gimple_assign_rhs_code (stmt))
                      && TREE_CODE_CLASS (rhs_code) == tcc_binary
                      && commutative_tree_code (rhs_code)
                      && oprnd0_info->first_dt == dt
                      && oprnd_info->first_dt == dt_op0
                      && def_op0 && def
                      && !(oprnd0_info->first_def_type
                           && !types_compatible_p (oprnd0_info->first_def_type,
                                                   TREE_TYPE (def)))
                      && !(oprnd_info->first_def_type
                           && !types_compatible_p (oprnd_info->first_def_type,
                                                   TREE_TYPE (def_op0))))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "Swapping operands of ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                        }

                      swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                          gimple_assign_rhs2_ptr (stmt));
                    }
                  else
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: different types ");

                      return false;
                    }
                }
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          if (different_types)
            {
              oprnd0_info = (*oprnds_info)[0];
              oprnd1_info = (*oprnds_info)[1];
              if (i == 0)
                oprnd1_info->def_stmts.quick_push (def_stmt);
              else
                oprnd0_info->def_stmts.quick_push (def_stmt);
            }
          else
            oprnd_info->def_stmts.quick_push (def_stmt);

          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
            }

          return false;
        }
    }

  return true;
}
/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise, return
   TRUE.  */
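/* For example (an illustrative sketch), with group_size 2 the store group

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];

   yields a root node holding the two stores and one child node holding
   the two additions; the recursion then reaches the load groups
   {b[0], b[1]} and {c[0], c[1]} and stops there.  */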
static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size, int *outside_cost,
                     int ncopies_for_cost, unsigned int *max_nunits,
                     vec<int> *load_permutation,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor, bool *loads_permuted,
                     stmt_vector_for_cost *prologue_cost_vec,
                     stmt_vector_for_cost *body_cost_vec)
{
  unsigned int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool stop_recursion = false, need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  unsigned int ncopies;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  bool permutation = false;
  unsigned int load_place;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  vec<slp_oprnd_info> oprnds_info;
  unsigned int nops;
  slp_oprnd_info oprnd_info;
  tree cond;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  oprnds_info = vect_create_oprnd_info (nops, group_size);

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
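      /* E.g., a vectorization factor of 8 with a 4-element vectype means
         each scalar stmt in the group needs ncopies == 2 vector stmts.  */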
      if (is_gimple_call (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (stmt)
              || gimple_call_tail_p (stmt)
              || gimple_call_noreturn_p (stmt)
              || !gimple_call_nothrow_p (stmt)
              || gimple_call_chain (stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.");
                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.");
                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
                                                stmt, ncopies_for_cost,
                                                (i == 0), &oprnds_info,
                                                prologue_cost_vec,
                                                body_cost_vec))
                {
                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }
          else
            {
              /* Load.  */
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ???  We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((ncopies > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  The only exception is complex
                     numbers.  */
                  if (prev_first_load != first_load
                      && rhs_code != REALPART_EXPR
                      && rhs_code != IMAGPART_EXPR)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }

                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }

                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }

                  /* Analyze costs (for the first stmt in the group).  */
                  vect_model_load_cost (vinfo_for_stmt (stmt),
                                        ncopies_for_cost, false, *node,
                                        prologue_cost_vec, body_cost_vec);
                }

              /* Store the place of this load in the interleaving chain.  In
                 case that permutation is needed we later decide if a specific
                 permutation is supported.  */
              load_place = vect_get_place_in_interleaving_chain (stmt,
                                                                 first_load);
              if (load_place != i)
                permutation = true;

              load_permutation->safe_push (load_place);

              /* We stop the tree when we reach a group of loads.  */
              stop_recursion = true;
              continue;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }

          /* Find the def-stmts.  */
          if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
                                            ncopies_for_cost, (i == 0),
                                            &oprnds_info, prologue_cost_vec,
                                            body_cost_vec))
            {
              vect_free_oprnd_info (oprnds_info);
              return false;
            }
        }
    }

  /* Grouped loads were reached - stop the recursion.  */
  if (stop_recursion)
    {
      loads->safe_push (*node);
      if (permutation)
        {
          gimple first_stmt = stmts[0];
          *loads_permuted = true;
          (void) record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                   vinfo_for_stmt (first_stmt), 0, vect_body);
        }
      else
        {
          /* We don't check here complex numbers chains, so we set
             LOADS_PERMUTED for further check in
             vect_supported_load_permutation_p.  */
          if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
            *loads_permuted = true;
        }

      vect_free_oprnd_info (oprnds_info);
      return true;
    }

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child
          || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
                                   outside_cost, ncopies_for_cost,
                                   max_nunits, load_permutation, loads,
                                   vectorization_factor, loads_permuted,
                                   prologue_cost_vec, body_cost_vec))
        {
          if (child)
            oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      oprnd_info->def_stmts.create (0);
      SLP_TREE_CHILDREN (*node).quick_push (child);
    }

  vect_free_oprnd_info (oprnds_info);
  return true;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, (slp_tree) child);
}
/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts ((slp_tree) child, mark, j);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant ((slp_tree) child);
}
/* Check if the permutation required by the SLP INSTANCE is supported.
   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */

static bool
vect_supported_slp_permutation_p (slp_instance instance)
{
  slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  vec<slp_tree> sorted_loads = vNULL;
  int index;
  slp_tree *tmp_loads = NULL;
  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
  slp_tree load;

  /* FORNOW: The only supported loads permutation is loads from the same
     location in all the loads in the node, when the data-refs in
     nodes of LOADS constitute an interleaving chain.
     Sort the nodes according to the order of accesses in the chain.  */
  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
  for (i = 0, j = 0;
       SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
       && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
       i += group_size, j++)
    {
      gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
      /* Check that the loads are all in the same interleaving chain.  */
      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data "
                               "permutation ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                scalar_stmt, 0);
            }

          free (tmp_loads);
          return false;
        }

      tmp_loads[index] = load;
    }

  sorted_loads.create (group_size);
  for (i = 0; i < group_size; i++)
    sorted_loads.safe_push (tmp_loads[i]);

  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_LOADS (instance) = sorted_loads;
  free (tmp_loads);

  if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
                                     instance, true))
    return false;

  return true;
}
/* Rearrange the statements of NODE according to PERMUTATION.  */
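/* E.g., with group_size 2 and PERMUTATION { 1, 0 } the scalar stmt at
   index 0 moves to index 1 and vice versa in every node of the tree.  */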
static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<int> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int index, i;
  slp_void_p child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts ((slp_tree) child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);

  for (i = 0; i < group_size; i++)
    tmp_stmts.safe_push (NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      index = permutation[i];
      tmp_stmts[index] = stmt;
    }

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
/* Check if the required load permutation is supported.
   LOAD_PERMUTATION contains a list of indices of the loads.
   In SLP this permutation is relative to the order of grouped stores that are
   the base of the SLP instance.  */
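/* E.g., if the first store of the group consumes b[1] and the second
   consumes b[0], the loads appear in swapped order relative to their
   interleaving chain and LOAD_PERMUTATION holds { 1, 0 }.  */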
static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                   vec<int> load_permutation)
{
  int i = 0, j, prev = -1, next, k, number_of_groups;
  bool supported, bad_permutation = false;
  sbitmap load_index;
  slp_tree node, other_complex_node;
  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
  unsigned complex_numbers = 0;
  struct data_reference *dr;
  bb_vec_info bb_vinfo;

  /* FORNOW: permutations are only supported in SLP.  */
  if (!slp_instn)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (load_permutation, i, next)
        dump_printf (MSG_NOTE, "%d ", next);
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;

      stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (stmt)
          && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
              || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
        complex_numbers++;
    }

  /* Complex operands can be swapped as follows:
      real_c = real_b + real_a;
      imag_c = imag_a + imag_b;
     i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
     {real_a, imag_a} and {real_b, imag_b}.  We check here that if interleaving
     chains are mixed, they match the above pattern.  */
  if (complex_numbers)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
            {
              if (j == 0)
                first = stmt;
              else
                {
                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
                    {
                      if (complex_numbers != 2)
                        return false;

                      if (i == 0)
                        k = 1;
                      else
                        k = 0;

                      other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
                      other_node_first =
                        SLP_TREE_SCALAR_STMTS (other_complex_node)[0];

                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                          != other_node_first)
                        return false;
                    }
                }
            }
        }
    }

  /* We checked that this case is ok, so there is no need to proceed with
     permutation tests.  */
  if (complex_numbers == 2
      && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
    {
      SLP_INSTANCE_LOADS (slp_instn).release ();
      SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
      return true;
    }

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
     instance, not all the loads belong to the same node or interleaving
     group.  Hence, we need to divide them into groups according to
     GROUP_SIZE.  */
  number_of_groups = load_permutation.length () / group_size;

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      int first_group_load_index;

      /* Compare all the permutation sequences to the first one.  */
      for (i = 1; i < number_of_groups; i++)
        {
          k = 0;
          for (j = i * group_size; j < i * group_size + group_size; j++)
            {
              next = load_permutation[j];
              first_group_load_index = load_permutation[k];

              if (next != first_group_load_index)
                {
                  bad_permutation = true;
                  break;
                }

              k++;
            }

          if (bad_permutation)
            break;
        }

      if (!bad_permutation)
        {
          /* Check that the loads in the first sequence are different and there
             are no gaps between them.  */
          load_index = sbitmap_alloc (group_size);
          bitmap_clear (load_index);
          for (k = 0; k < group_size; k++)
            {
              first_group_load_index = load_permutation[k];
              if (bitmap_bit_p (load_index, first_group_load_index))
                {
                  bad_permutation = true;
                  break;
                }

              bitmap_set_bit (load_index, first_group_load_index);
            }

          if (!bad_permutation)
            for (k = 0; k < group_size; k++)
              if (!bitmap_bit_p (load_index, k))
                {
                  bad_permutation = true;
                  break;
                }

          sbitmap_free (load_index);
        }

      if (!bad_permutation)
        {
          /* This permutation is valid for reduction.  Since the order of the
             statements in the nodes is not important unless they are memory
             accesses, we can rearrange the statements in all the nodes
             according to the order of the loads.  */
          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                    load_permutation);
          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
          return true;
        }
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
  bad_permutation = false;
  /* Check that for every node in the instance the loads form a subchain.  */
  if (bb_vinfo)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          first_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (!first_load)
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
              else if (first_load
                       != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
                {
                  bad_permutation = true;
                  break;
                }

              if (j != 0 && next_load != load)
                {
                  bad_permutation = true;
                  break;
                }

              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }

          if (bad_permutation)
            break;
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.  */
      if (!bad_permutation)
        {
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
            {
              first_load = SLP_TREE_SCALAR_STMTS (node)[0];
              if (first_load
                  != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
                {
                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
                  if (vect_supportable_dr_alignment (dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "unsupported unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            first_load, 0);
                        }
                      bad_permutation = true;
                      break;
                    }
                }
            }

          if (!bad_permutation)
            {
              SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
              return true;
            }
        }
    }
  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
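  /* E.g., for group_size 2 the permutation { 0, 0, 1, 1 } (of length
     group_size * group_size) is accepted below, while { 0, 1, 1, 0 }
     is not.  */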
  if (load_permutation.length ()
      != (unsigned int) (group_size * group_size))
    return false;

  supported = true;
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  for (j = 0; j < group_size; j++)
    {
      for (i = j * group_size, k = 0;
           load_permutation.iterate (i, &next) && k < group_size;
           i++, k++)
        {
          if (i != j * group_size && next != prev)
            {
              supported = false;
              break;
            }

          prev = next;
        }

      if (bitmap_bit_p (load_index, prev))
        {
          supported = false;
          break;
        }

      bitmap_set_bit (load_index, prev);
    }

  for (j = 0; j < group_size; j++)
    if (!bitmap_bit_p (load_index, j))
      {
        sbitmap_free (load_index);
        return false;
      }

  sbitmap_free (load_index);

  if (supported && i == group_size * group_size
      && vect_supported_slp_permutation_p (slp_instn))
    return true;

  return false;
}
/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}
/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */
static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int outside_cost = 0, ncopies_for_cost, i;
  unsigned int max_nunits = 0;
  vec<int> load_permutation;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  bool loads_permuted = false;
  vec<gimple> scalar_stmts;
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  stmt_info_for_cost *si;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
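  /* E.g., nunits 4 and group_size 6 give lcm (4, 6) == 12, i.e. an
     unrolling factor of 2: two copies of the group fill three vectors.  */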
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
              STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  ncopies_for_cost = unrolling_factor * group_size / nunits;
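  /* E.g., unrolling_factor 1, group_size 8, nunits 4: the group costs
     ncopies_for_cost == 2 vector stmts.  */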
  load_permutation.create (group_size * group_size);
  loads.create (group_size);
  prologue_cost_vec.create (10);
  body_cost_vec.create (10);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &outside_cost, ncopies_for_cost,
                           &max_nunits, &load_permutation, &loads,
                           vectorization_factor, &loads_permuted,
                           &prologue_cost_vec, &body_cost_vec))
    {
      void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                    : BB_VINFO_TARGET_COST_DATA (bb_vinfo));

      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP");
          vect_free_slp_tree (node);
          body_cost_vec.release ();
          prologue_cost_vec.release ();
          load_permutation.release ();
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = body_cost_vec;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
      SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance, group_size,
                                                  load_permutation))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_slp_instance (new_instance);
              prologue_cost_vec.release ();
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
            = vect_find_first_load_in_slp_instance (new_instance);
        }
      else
        SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();

      /* Record the prologue costs, which were delayed until we were
         sure that SLP was successful.  Unlike the body costs, we know
         the final values now regardless of the loop vectorization factor.  */
      FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
        {
          struct _stmt_vec_info *stmt_info
            = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
          (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                                si->misalign, vect_prologue);
        }

      prologue_cost_vec.release ();

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }
  else
    {
      body_cost_vec.release ();
      prologue_cost_vec.release ();
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  load_permutation.release ();
  loads.release ();

  return false;
}
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */
bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
    ok = true;

  return true;
}
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */
bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
        unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                     "Decided to SLP %d instances. Unrolling factor %d",
                     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}
/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
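/* E.g., if a PURE_SLP addition in the tree also feeds a relevant use that
   is only loop-based vectorized (say a reduction), the addition must be
   emitted both in the SLP instance and in the loop copy, so it is marked
   HYBRID.  */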
static void
vect_detect_hybrid_slp_stmts (slp_tree node)
{
  int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
  gimple stmt = stmts[0];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  slp_void_p child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = NULL;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
  basic_block bb = NULL;

  if (!node)
    return;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);
  else
    bb = BB_VINFO_BB (bb_vinfo);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
        && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
        if (gimple_bb (use_stmt)
            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
                || bb == gimple_bb (use_stmt))
            && (stmt_vinfo = vinfo_for_stmt (use_stmt))
            && !STMT_SLP_TYPE (stmt_vinfo)
            && (STMT_VINFO_RELEVANT (stmt_vinfo)
                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
            && !(gimple_code (use_stmt) == GIMPLE_PHI
                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                    == vect_reduction_def))
          vect_mark_slp_stmts (node, hybrid, i);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_detect_hybrid_slp_stmts ((slp_tree) child);
}
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}
/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
        /* Free stmt_vec_info.  */
        free_stmt_vec_info (stmt);
    }

  free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  free (bb_vinfo);
  bb->aux = NULL;
}
/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_void_p child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, (slp_tree) child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
        return false;
    }

  return true;
}
/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
                                             SLP_INSTANCE_TREE (instance)))
        {
          vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
        }
      else
        i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}
2004 /* Check if vectorization of the basic block is profitable. */
2006 static bool
2007 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2009 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2010 slp_instance instance;
2011 int i, j;
2012 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2013 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2014 unsigned int stmt_cost;
2015 gimple stmt;
2016 gimple_stmt_iterator si;
2017 basic_block bb = BB_VINFO_BB (bb_vinfo);
2018 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2019 stmt_vec_info stmt_info = NULL;
2020 stmt_vector_for_cost body_cost_vec;
2021 stmt_info_for_cost *ci;
2023 /* Calculate vector costs. */
2024 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2026 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2028 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2030 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2031 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2032 stmt_info, ci->misalign, vect_body);
2036 /* Calculate scalar cost. */
2037 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2039 stmt = gsi_stmt (si);
2040 stmt_info = vinfo_for_stmt (stmt);
2042 if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
2043 || !PURE_SLP_STMT (stmt_info))
2044 continue;
2046 if (STMT_VINFO_DATA_REF (stmt_info))
2048 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2049 stmt_cost = vect_get_stmt_cost (scalar_load);
2050 else
2051 stmt_cost = vect_get_stmt_cost (scalar_store);
2053 else
2054 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2056 scalar_cost += stmt_cost;
2059 /* Complete the target-specific cost calculation. */
2060 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2061 &vec_inside_cost, &vec_epilogue_cost);
2063 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2065 if (dump_enabled_p ())
2067 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2068 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2069 vec_inside_cost);
2070 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2071 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2072 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d", scalar_cost);
2075 /* Vectorization is profitable if its cost is less than the cost of the
2076 scalar version. */
2077 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2078 return false;
2080 return true;
2083 /* Check if the basic block can be vectorized. */
2085 static bb_vec_info
2086 vect_slp_analyze_bb_1 (basic_block bb)
2088 bb_vec_info bb_vinfo;
2089 vec<slp_instance> slp_instances;
2090 slp_instance instance;
2091 int i;
2092 int min_vf = 2;
2094 bb_vinfo = new_bb_vec_info (bb);
2095 if (!bb_vinfo)
2096 return NULL;
2098 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102 "not vectorized: unhandled data-ref in basic "
2103 "block.\n");
2105 destroy_bb_vec_info (bb_vinfo);
2106 return NULL;
2109 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2111 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2113 "not vectorized: not enough data-refs in "
2114 "basic block.\n");
2116 destroy_bb_vec_info (bb_vinfo);
2117 return NULL;
2120 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124 "not vectorized: unhandled data access in "
2125 "basic block.\n");
2127 destroy_bb_vec_info (bb_vinfo);
2128 return NULL;
2131 vect_pattern_recog (NULL, bb_vinfo);
2133 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137 "not vectorized: unhandled data dependence "
2138 "in basic block.\n");
2140 destroy_bb_vec_info (bb_vinfo);
2141 return NULL;
2144 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2146 if (dump_enabled_p ())
2147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2148 "not vectorized: bad data alignment in basic "
2149 "block.\n");
2151 destroy_bb_vec_info (bb_vinfo);
2152 return NULL;
2155 /* Check the SLP opportunities in the basic block, analyze and build SLP
2156 trees. */
2157 if (!vect_analyze_slp (NULL, bb_vinfo))
2159 if (dump_enabled_p ())
2160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2161 "not vectorized: failed to find SLP opportunities "
2162 "in basic block.\n");
2164 destroy_bb_vec_info (bb_vinfo);
2165 return NULL;
2168 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2170 /* Mark all the statements that we want to vectorize as pure SLP and
2171 relevant. */
2172 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2174 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2175 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2178 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "not vectorized: unsupported alignment in basic "
2183 "block.\n");
2184 destroy_bb_vec_info (bb_vinfo);
2185 return NULL;
2188 if (!vect_slp_analyze_operations (bb_vinfo))
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "not vectorized: bad operation in basic block.\n");
2194 destroy_bb_vec_info (bb_vinfo);
2195 return NULL;
2198 /* Cost model: check if the vectorization is worthwhile. */
2199 if (flag_vect_cost_model
2200 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "not vectorized: vectorization is not "
2205 "profitable.\n");
2207 destroy_bb_vec_info (bb_vinfo);
2208 return NULL;
2211 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "Basic block will be vectorized using SLP\n");
2215 return bb_vinfo;
2219 bb_vec_info
2220 vect_slp_analyze_bb (basic_block bb)
2222 bb_vec_info bb_vinfo;
2223 int insns = 0;
2224 gimple_stmt_iterator gsi;
2225 unsigned int vector_sizes;
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2230 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2232 gimple stmt = gsi_stmt (gsi);
2233 if (!is_gimple_debug (stmt)
2234 && !gimple_nop_p (stmt)
2235 && gimple_code (stmt) != GIMPLE_LABEL)
2236 insns++;
2239 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "not vectorized: too many instructions in "
2244 "basic block.\n");
2246 return NULL;
2249 /* Autodetect first vector size we try. */
2250 current_vector_size = 0;
2251 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2253 while (1)
2255 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2256 if (bb_vinfo)
2257 return bb_vinfo;
2259 destroy_bb_vec_info (bb_vinfo);
2261 vector_sizes &= ~current_vector_size;
2262 if (vector_sizes == 0
2263 || current_vector_size == 0)
2264 return NULL;
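/* Illustration with assumed target values: a target advertising 32- and
   16-byte vectors reports vector_sizes == 0x30. After a failed attempt
   with current_vector_size == 32 the mask shrinks to 0x10, and the
   computation below retries the analysis with vector size 16. */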
2266 /* Try the next biggest vector size. */
2267 current_vector_size = 1 << floor_log2 (vector_sizes);
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_NOTE, vect_location,
2270 "***** Re-trying analysis with "
2271 "vector size %d\n", current_vector_size);
2276 /* SLP costs are calculated according to the SLP instance unrolling factor
2277 (i.e., the number of created vector stmts depends on the unrolling factor).
2278 However, the actual number of vector stmts for every SLP node depends on
2279 VF, which is set later in vect_analyze_operations (). Hence, SLP costs
2280 should be updated. In this function we assume that the inside costs
2281 calculated in vect_model_xxx_cost are linear in ncopies. */
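/* A worked example with assumed numbers: an instance with unrolling
   factor 2 in a loop with VF == 8 needs ncopies == 4, so a body cost
   entry recorded with count C during analysis is added to the target
   cost model below with count 4 * C. */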
2283 void
2284 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2286 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2287 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2288 slp_instance instance;
2289 stmt_vector_for_cost body_cost_vec;
2290 stmt_info_for_cost *si;
2291 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "=== vect_update_slp_costs_according_to_vf ===");
2297 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2299 /* We assume that costs are linear in ncopies. */
2300 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2302 /* Record the instance's instructions in the target cost model.
2303 This was delayed until here because the count of instructions
2304 isn't known beforehand. */
2305 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2307 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2308 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2309 vinfo_for_stmt (si->stmt), si->misalign,
2310 vect_body);
2315 /* For constant and loop invariant defs of SLP_NODE this function returns
2316 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2317 OP_NUM determines whether we gather defs for operand 0 or operand 1 of the
2318 RHS of the scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to
2319 create. REDUC_INDEX is the index of the reduction operand in the scalar
2320 statements, or -1 if there is no reduction. */
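/* A minimal illustration (hypothetical stmts): for the group
   { x1 = y1 + 3; x2 = y2 + 7; } with a two-element vector type,
   OP_NUM == 1 collects the invariant operands into the single
   constant vector {3, 7}. */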
2322 static void
2323 vect_get_constant_vectors (tree op, slp_tree slp_node,
2324 vec<tree> *vec_oprnds,
2325 unsigned int op_num, unsigned int number_of_vectors,
2326 int reduc_index)
2328 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2329 gimple stmt = stmts[0];
2330 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2331 unsigned nunits;
2332 tree vec_cst;
2333 tree *elts;
2334 unsigned j, number_of_places_left_in_vector;
2335 tree vector_type;
2336 tree vop;
2337 int group_size = stmts.length ();
2338 unsigned int vec_num, i;
2339 unsigned number_of_copies = 1;
2340 vec<tree> voprnds;
2341 voprnds.create (number_of_vectors);
2342 bool constant_p, is_store;
2343 tree neutral_op = NULL;
2344 enum tree_code code = gimple_expr_code (stmt);
2345 gimple def_stmt;
2346 struct loop *loop;
2347 gimple_seq ctor_seq = NULL;
2349 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2350 && reduc_index != -1)
2352 op_num = reduc_index - 1;
2353 op = gimple_op (stmt, reduc_index);
2354 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2355 we need either neutral operands or the original operands. See
2356 get_initial_def_for_reduction() for details. */
2357 switch (code)
2359 case WIDEN_SUM_EXPR:
2360 case DOT_PROD_EXPR:
2361 case PLUS_EXPR:
2362 case MINUS_EXPR:
2363 case BIT_IOR_EXPR:
2364 case BIT_XOR_EXPR:
2365 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2366 neutral_op = build_real (TREE_TYPE (op), dconst0);
2367 else
2368 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2370 break;
2372 case MULT_EXPR:
2373 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2374 neutral_op = build_real (TREE_TYPE (op), dconst1);
2375 else
2376 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2378 break;
2380 case BIT_AND_EXPR:
2381 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2382 break;
2384 case MAX_EXPR:
2385 case MIN_EXPR:
2386 def_stmt = SSA_NAME_DEF_STMT (op);
2387 loop = (gimple_bb (stmt))->loop_father;
2388 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2389 loop_preheader_edge (loop));
2390 break;
2392 default:
2393 neutral_op = NULL;
2397 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2399 is_store = true;
2400 op = gimple_assign_rhs1 (stmt);
2402 else
2403 is_store = false;
2405 gcc_assert (op);
2407 if (CONSTANT_CLASS_P (op))
2408 constant_p = true;
2409 else
2410 constant_p = false;
2412 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2413 gcc_assert (vector_type);
2414 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2416 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2417 created vectors. It is greater than 1 if unrolling is performed.
2419 For example, we have two scalar operands, s1 and s2 (e.g., group of
2420 strided accesses of size two), while NUNITS is four (i.e., four scalars
2421 of this type can be packed in a vector). The output vector will contain
2422 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2423 will be 2).
2425 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2426 containing the operands.
2428 For example, NUNITS is four as before, and the group size is 8
2429 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2430 {s5, s6, s7, s8}. */
2432 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
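/* E.g., with NUNITS == 4 as in the comment above: GROUP_SIZE == 2 gives
   least_common_multiple (4, 2) / 2 == 2 copies ({s1, s2, s1, s2}), while
   GROUP_SIZE == 8 gives 8 / 8 == 1 copy spread over two vectors. */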
2434 number_of_places_left_in_vector = nunits;
2435 elts = XALLOCAVEC (tree, nunits);
2436 for (j = 0; j < number_of_copies; j++)
2438 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2440 if (is_store)
2441 op = gimple_assign_rhs1 (stmt);
2442 else
2444 switch (code)
2446 case COND_EXPR:
2447 if (op_num == 0 || op_num == 1)
2449 tree cond = gimple_assign_rhs1 (stmt);
2450 op = TREE_OPERAND (cond, op_num);
2452 else
2454 if (op_num == 2)
2455 op = gimple_assign_rhs2 (stmt);
2456 else
2457 op = gimple_assign_rhs3 (stmt);
2459 break;
2461 case CALL_EXPR:
2462 op = gimple_call_arg (stmt, op_num);
2463 break;
2465 case LSHIFT_EXPR:
2466 case RSHIFT_EXPR:
2467 case LROTATE_EXPR:
2468 case RROTATE_EXPR:
2469 op = gimple_op (stmt, op_num + 1);
2470 /* Unlike the other binary operators, shifts/rotates take
2471 an int shift count instead of an operand of the same
2472 type as the lhs, so make sure the scalar has the right
2473 type if we are dealing with vectors of
2474 long long/long/short/char. */
2475 if (op_num == 1 && constant_p)
2476 op = fold_convert (TREE_TYPE (vector_type), op);
2477 break;
2479 default:
2480 op = gimple_op (stmt, op_num + 1);
2481 break;
2485 if (reduc_index != -1)
2487 loop = (gimple_bb (stmt))->loop_father;
2488 def_stmt = SSA_NAME_DEF_STMT (op);
2490 gcc_assert (loop);
2492 /* Get the def before the loop. In reduction chain we have only
2493 one initial value. */
2494 if ((j != (number_of_copies - 1)
2495 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2496 && i != 0))
2497 && neutral_op)
2498 op = neutral_op;
2499 else
2500 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2501 loop_preheader_edge (loop));
2504 /* Create 'vect_ = {op0,op1,...,opn}'. */
2505 number_of_places_left_in_vector--;
2506 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2508 if (constant_p)
2510 op = fold_unary (VIEW_CONVERT_EXPR,
2511 TREE_TYPE (vector_type), op);
2512 gcc_assert (op && CONSTANT_CLASS_P (op));
2514 else
2516 tree new_temp
2517 = make_ssa_name (TREE_TYPE (vector_type), NULL);
2518 gimple init_stmt;
2519 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2520 op);
2521 init_stmt
2522 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2523 new_temp, op, NULL_TREE);
2524 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2525 op = new_temp;
2528 elts[number_of_places_left_in_vector] = op;
2530 if (number_of_places_left_in_vector == 0)
2532 number_of_places_left_in_vector = nunits;
2534 if (constant_p)
2535 vec_cst = build_vector (vector_type, elts);
2536 else
2538 vec<constructor_elt, va_gc> *v;
2539 unsigned k;
2540 vec_alloc (v, nunits);
2541 for (k = 0; k < nunits; ++k)
2542 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2543 vec_cst = build_constructor (vector_type, v);
2545 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2546 vector_type, NULL));
2547 if (ctor_seq != NULL)
2549 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2550 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2551 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2552 GSI_SAME_STMT);
2553 ctor_seq = NULL;
2559 /* Since the vectors are created in reverse order, put them back in the
2560 original order. */
2561 vec_num = voprnds.length ();
2562 for (j = vec_num; j != 0; j--)
2564 vop = voprnds[j - 1];
2565 vec_oprnds->quick_push (vop);
2568 voprnds.release ();
2570 /* If VF is greater than the unrolling factor needed for the SLP group of
2571 stmts, the NUMBER_OF_VECTORS to be created is greater than
2572 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2573 to replicate the vectors created so far. */
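/* Illustration: if two vector defs were created but NUMBER_OF_VECTORS is
   4, two more defs are pushed below - neutral vectors for a reduction
   with a known neutral element, copies of the defs created above
   otherwise. */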
2574 while (number_of_vectors > vec_oprnds->length ())
2576 tree neutral_vec = NULL;
2578 if (neutral_op)
2580 if (!neutral_vec)
2581 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2583 vec_oprnds->quick_push (neutral_vec);
2585 else
2587 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2588 vec_oprnds->quick_push (vop);
2594 /* Get vectorized definitions from SLP_NODE that contains corresponding
2595 vectorized def-stmts. */
2597 static void
2598 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2600 tree vec_oprnd;
2601 gimple vec_def_stmt;
2602 unsigned int i;
2604 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2606 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2608 gcc_assert (vec_def_stmt);
2609 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2610 vec_oprnds->quick_push (vec_oprnd);
2615 /* Get vectorized definitions for SLP_NODE.
2616 If the scalar definitions are loop invariants or constants, collect them and
2617 call vect_get_constant_vectors() to create vector stmts.
2618 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2619 must be stored in the corresponding child of SLP_NODE, and we call
2620 vect_get_slp_vect_defs () to retrieve them. */
2622 void
2623 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2624 vec<vec<tree> > *vec_oprnds, int reduc_index)
2626 gimple first_stmt;
2627 int number_of_vects = 0, i;
2628 unsigned int child_index = 0;
2629 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2630 slp_tree child = NULL;
2631 vec<tree> vec_defs;
2632 tree oprnd;
2633 bool vectorized_defs;
2635 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2636 FOR_EACH_VEC_ELT (ops, i, oprnd)
2638 /* For each operand we check if it has vectorized definitions in a child
2639 node or whether we need to create them (for invariants and constants).
2640 We check if the LHS of the first stmt of the next child matches OPRND.
2641 If it does, we found the correct child. Otherwise, we call
2642 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2643 this child node is checked again for the next operand. */
2644 vectorized_defs = false;
2645 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2647 child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
2649 /* We have to check both pattern and original def, if available. */
2650 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2651 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2653 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2654 || (related
2655 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2657 /* The number of vector defs is determined by the number of
2658 vector statements in the node from which we get those
2659 statements. */
2660 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2661 vectorized_defs = true;
2662 child_index++;
2666 if (!vectorized_defs)
2668 if (i == 0)
2670 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2671 /* Number of vector stmts was calculated according to LHS in
2672 vect_schedule_slp_instance (), fix it by replacing LHS with
2673 RHS, if necessary. See vect_get_smallest_scalar_type () for
2674 details. */
2675 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2676 &rhs_size_unit);
2677 if (rhs_size_unit != lhs_size_unit)
2679 number_of_vects *= rhs_size_unit;
2680 number_of_vects /= lhs_size_unit;
2685 /* Allocate memory for vectorized defs. */
2686 vec_defs = vNULL;
2687 vec_defs.create (number_of_vects);
2689 /* For reduction defs we call vect_get_constant_vectors (), since we are
2690 looking for initial loop invariant values. */
2691 if (vectorized_defs && reduc_index == -1)
2692 /* The defs are already vectorized. */
2693 vect_get_slp_vect_defs (child, &vec_defs);
2694 else
2695 /* Build vectors from scalar defs. */
2696 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2697 number_of_vects, reduc_index);
2699 vec_oprnds->quick_push (vec_defs);
2701 /* For reductions, we only need initial values. */
2702 if (reduc_index != -1)
2703 return;
2708 /* Create NCOPIES permutation statements using the vector mask MASK and two
2709 input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2710 for the first copy, shifting both indexes by STRIDE elements of DR_CHAIN
2711 for every following copy.
2712 (STRIDE is the number of vectorized stmts for NODE divided by the number
2713 of copies.)
2714 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2715 where the created stmts must be inserted. */
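/* An example with assumed counts: with 4 vector stmts in NODE and
   NCOPIES == 2, STRIDE is 2; the first copy permutes the vectors at
   FIRST_VEC_INDX and SECOND_VEC_INDX, the second copy the vectors two
   slots further down DR_CHAIN. */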
2717 static inline void
2718 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2719 tree mask, int first_vec_indx, int second_vec_indx,
2720 gimple_stmt_iterator *gsi, slp_tree node,
2721 tree vectype, vec<tree> dr_chain,
2722 int ncopies, int vect_stmts_counter)
2724 tree perm_dest;
2725 gimple perm_stmt = NULL;
2726 stmt_vec_info next_stmt_info;
2727 int i, stride;
2728 tree first_vec, second_vec, data_ref;
2730 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2732 /* Initialize the vect stmts of NODE to properly insert the generated
2733 stmts later. */
2734 for (i = SLP_TREE_VEC_STMTS (node).length ();
2735 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2736 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2738 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2739 for (i = 0; i < ncopies; i++)
2741 first_vec = dr_chain[first_vec_indx];
2742 second_vec = dr_chain[second_vec_indx];
2744 /* Generate the permute statement. */
2745 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2746 first_vec, second_vec, mask);
2747 data_ref = make_ssa_name (perm_dest, perm_stmt);
2748 gimple_set_lhs (perm_stmt, data_ref);
2749 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2751 /* Store the vector statement in NODE. */
2752 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2754 first_vec_indx += stride;
2755 second_vec_indx += stride;
2758 /* Mark the scalar stmt as vectorized. */
2759 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2760 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2764 /* Given FIRST_MASK_ELEMENT - the mask element in element representation -
2765 return in CURRENT_MASK_ELEMENT its equivalent in the target specific
2766 representation. Check that the mask is valid and return FALSE if not.
2767 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2768 the next vector, i.e., the current first vector is no longer needed. */
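/* E.g. (illustrative values): with MASK_NUNITS == 4, a mask element of 6
   selects element 2 of the second input vector; if only one input vector
   is available, such a mask is rejected below. */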
2770 static bool
2771 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2772 int mask_nunits, bool only_one_vec, int index,
2773 unsigned char *mask, int *current_mask_element,
2774 bool *need_next_vector, int *number_of_mask_fixes,
2775 bool *mask_fixed, bool *needs_first_vector)
2777 int i;
2779 /* Convert to target specific representation. */
2780 *current_mask_element = first_mask_element + m;
2781 /* Adjust the value in case it's a mask for second and third vectors. */
2782 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2784 if (*current_mask_element < mask_nunits)
2785 *needs_first_vector = true;
2787 /* We have only one input vector to permute but the mask accesses values in
2788 the next vector as well. */
2789 if (only_one_vec && *current_mask_element >= mask_nunits)
2791 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2794 "permutation requires at least two vectors ");
2795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2798 return false;
2801 /* The mask requires the next vector. */
2802 if (*current_mask_element >= mask_nunits * 2)
2804 if (*needs_first_vector || *mask_fixed)
2806 /* We either need the first vector too or have already moved to the
2807 next vector. In both cases, this permutation needs three
2808 vectors. */
2809 if (dump_enabled_p ())
2811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2812 "permutation requires at "
2813 "least three vectors ");
2814 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2817 return false;
2820 /* We move to the next vector, dropping the first one and working with
2821 the second and the third - we need to adjust the values of the mask
2822 accordingly. */
2823 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2825 for (i = 0; i < index; i++)
2826 mask[i] -= mask_nunits * *number_of_mask_fixes;
2828 (*number_of_mask_fixes)++;
2829 *mask_fixed = true;
2832 *need_next_vector = *mask_fixed;
2834 /* This was the last element of this mask. Start a new one. */
2835 if (index == mask_nunits - 1)
2837 *number_of_mask_fixes = 1;
2838 *mask_fixed = false;
2839 *needs_first_vector = false;
2842 return true;
2846 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2847 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2848 permute statements for SLP_NODE_INSTANCE. */
2849 bool
2850 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2851 gimple_stmt_iterator *gsi, int vf,
2852 slp_instance slp_node_instance, bool analyze_only)
2854 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2855 tree mask_element_type = NULL_TREE, mask_type;
2856 int i, j, k, nunits, vec_index = 0, scalar_index;
2857 slp_tree node;
2858 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2859 gimple next_scalar_stmt;
2860 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2861 int first_mask_element;
2862 int index, unroll_factor, current_mask_element, ncopies;
2863 unsigned char *mask;
2864 bool only_one_vec = false, need_next_vector = false;
2865 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2866 int number_of_mask_fixes = 1;
2867 bool mask_fixed = false;
2868 bool needs_first_vector = false;
2869 enum machine_mode mode;
2871 mode = TYPE_MODE (vectype);
2873 if (!can_vec_perm_p (mode, false, NULL))
2875 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "no vect permute for ");
2879 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2881 return false;
2884 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2885 same size as the vector element being permuted. */
2886 mask_element_type = lang_hooks.types.type_for_mode
2887 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2888 mask_type = get_vectype_for_scalar_type (mask_element_type);
2889 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2890 mask = XALLOCAVEC (unsigned char, nunits);
2891 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2893 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2894 unrolling factor. */
2895 orig_vec_stmts_num = group_size *
2896 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2897 if (orig_vec_stmts_num == 1)
2898 only_one_vec = true;
2900 /* Number of copies is determined by the final vectorization factor
2901 relatively to SLP_NODE_INSTANCE unrolling factor. */
2902 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
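/* Illustration with assumed values: group_size == 3 and nunits == 4 give
   an SLP unrolling factor of 4 and orig_vec_stmts_num == 3; with vf == 8
   each permutation is then generated ncopies == 2 times. */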
2904 /* Generate permutation masks for every NODE. Number of masks for each NODE
2905 is equal to GROUP_SIZE.
2906 E.g., we have a group of three nodes with three loads from the same
2907 location in each node, and the vector size is 4. I.e., we have a
2908 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2909 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2910 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2913 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2914 The last mask is illegal since we assume two operands for the permute
2915 operation, and mask element values can't index outside those two vectors.
2916 Hence, the last mask must be converted into {2,5,5,5}.
2917 For the first two permutations we need the first and the second input
2918 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2919 we need the second and the third vectors: {b1,c1,a2,b2} and
2920 {c2,a3,b3,c3}. */
2922 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2924 scalar_index = 0;
2925 index = 0;
2926 vect_stmts_counter = 0;
2927 vec_index = 0;
2928 first_vec_index = vec_index++;
2929 if (only_one_vec)
2930 second_vec_index = first_vec_index;
2931 else
2932 second_vec_index = vec_index++;
2934 for (j = 0; j < unroll_factor; j++)
2936 for (k = 0; k < group_size; k++)
2938 first_mask_element = i + j * group_size;
2939 if (!vect_get_mask_element (stmt, first_mask_element, 0,
2940 nunits, only_one_vec, index,
2941 mask, &current_mask_element,
2942 &need_next_vector,
2943 &number_of_mask_fixes, &mask_fixed,
2944 &needs_first_vector))
2945 return false;
2946 mask[index++] = current_mask_element;
2948 if (index == nunits)
2950 tree mask_vec, *mask_elts;
2951 int l;
2953 if (!can_vec_perm_p (mode, false, mask))
2955 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2958 vect_location,
2959 "unsupported vect permute { ");
2960 for (i = 0; i < nunits; ++i)
2961 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2962 mask[i]);
2963 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2965 return false;
2968 mask_elts = XALLOCAVEC (tree, nunits);
2969 for (l = 0; l < nunits; ++l)
2970 mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
2971 mask_vec = build_vector (mask_type, mask_elts);
2972 index = 0;
2974 if (!analyze_only)
2976 if (need_next_vector)
2978 first_vec_index = second_vec_index;
2979 second_vec_index = vec_index;
2982 next_scalar_stmt
2983 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
2985 vect_create_mask_and_perm (stmt, next_scalar_stmt,
2986 mask_vec, first_vec_index, second_vec_index,
2987 gsi, node, vectype, dr_chain,
2988 ncopies, vect_stmts_counter++);
2995 return true;
3000 /* Vectorize SLP instance tree in postorder. */
3002 static bool
3003 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3004 unsigned int vectorization_factor)
3006 gimple stmt;
3007 bool grouped_store, is_store;
3008 gimple_stmt_iterator si;
3009 stmt_vec_info stmt_info;
3010 unsigned int vec_stmts_size, nunits, group_size;
3011 tree vectype;
3012 int i;
3013 slp_tree loads_node;
3014 slp_void_p child;
3016 if (!node)
3017 return false;
3019 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3020 vect_schedule_slp_instance ((slp_tree) child, instance,
3021 vectorization_factor);
3023 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3024 stmt_info = vinfo_for_stmt (stmt);
3026 /* VECTYPE is the type of the destination. */
3027 vectype = STMT_VINFO_VECTYPE (stmt_info);
3028 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3029 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3031 /* For each SLP instance calculate the number of vector stmts to be created
3032 for the scalar stmts in each node of the SLP tree. Number of vector
3033 elements in one vector iteration is the number of scalar elements in
3034 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3035 size. */
3036 vec_stmts_size = (vectorization_factor * group_size) / nunits;
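/* E.g. (illustrative values): vectorization_factor == 4, group_size == 4
   and nunits == 8 give (4 * 4) / 8 == 2 vector stmts per node. */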
3038 /* In case of load permutation we have to allocate vectorized statements for
3039 all the nodes that participate in that permutation. */
3040 if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3042 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
3044 if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
3046 SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
3047 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
3052 if (!SLP_TREE_VEC_STMTS (node).exists ())
3054 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3055 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3058 if (dump_enabled_p ())
3060 dump_printf_loc (MSG_NOTE,vect_location,
3061 "------>vectorizing SLP node starting from: ");
3062 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3065 /* Loads should be inserted before the first load. */
3066 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3067 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3068 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3069 && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3070 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3071 else if (is_pattern_stmt_p (stmt_info))
3072 si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3073 else
3074 si = gsi_for_stmt (stmt);
3076 /* Stores should be inserted just before the last store. */
3077 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3078 && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3080 gimple last_store = vect_find_last_store_in_slp_instance (instance);
3081 if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3082 last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3083 si = gsi_for_stmt (last_store);
3086 /* Mark the first element of the reduction chain as reduction to properly
3087 transform the node. In the analysis phase only the last element of the
3088 chain is marked as reduction. */
3089 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3090 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3092 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3093 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3096 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3097 return is_store;
3100 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3101 For loop vectorization this is done in vectorizable_call, but for SLP
3102 it needs to be deferred until end of vect_schedule_slp, because multiple
3103 SLP instances may refer to the same scalar stmt. */
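/* Illustration (hypothetical call): once x_1 = foo (y_1) has been
   vectorized by every SLP instance referencing it, the scalar call is
   replaced below by x_1 = 0; DCE later removes the assignment if x_1
   has no remaining scalar uses. */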
3105 static void
3106 vect_remove_slp_scalar_calls (slp_tree node)
3108 gimple stmt, new_stmt;
3109 gimple_stmt_iterator gsi;
3110 int i;
3111 slp_void_p child;
3112 tree lhs;
3113 stmt_vec_info stmt_info;
3115 if (!node)
3116 return;
3118 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3119 vect_remove_slp_scalar_calls ((slp_tree) child);
3121 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3123 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3124 continue;
3125 stmt_info = vinfo_for_stmt (stmt);
3126 if (stmt_info == NULL
3127 || is_pattern_stmt_p (stmt_info)
3128 || !PURE_SLP_STMT (stmt_info))
3129 continue;
3130 lhs = gimple_call_lhs (stmt);
3131 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3132 set_vinfo_for_stmt (new_stmt, stmt_info);
3133 set_vinfo_for_stmt (stmt, NULL);
3134 STMT_VINFO_STMT (stmt_info) = new_stmt;
3135 gsi = gsi_for_stmt (stmt);
3136 gsi_replace (&gsi, new_stmt, false);
3137 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3141 /* Generate vector code for all SLP instances in the loop/basic block. */
3143 bool
3144 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3146 vec<slp_instance> slp_instances;
3147 slp_instance instance;
3148 slp_tree loads_node;
3149 unsigned int i, j, vf;
3150 bool is_store = false;
3152 if (loop_vinfo)
3154 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3155 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3157 else
3159 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3160 vf = 1;
3163 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3165 /* Schedule the tree of INSTANCE. */
3166 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3167 instance, vf);
3169 /* Clear STMT_VINFO_VEC_STMT of all loads. Otherwise, with loads shared
3170 between SLP instances, we would fail to properly initialize the
3171 vectorized SLP stmts and would confuse different load permutations. */
3172 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
3173 STMT_VINFO_VEC_STMT
3174 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
3176 if (dump_enabled_p ())
3177 dump_printf_loc (MSG_NOTE, vect_location,
3178 "vectorizing stmts using SLP.");
3181 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3183 slp_tree root = SLP_INSTANCE_TREE (instance);
3184 gimple store;
3185 unsigned int j;
3186 gimple_stmt_iterator gsi;
3188 /* Remove scalar call stmts. Do not do this for basic-block
3189 vectorization as not all uses may be vectorized.
3190 ??? Why should this be necessary? DCE should be able to
3191 remove the stmts itself.
3192 ??? For BB vectorization we can as well remove scalar
3193 stmts starting from the SLP tree root if they have no
3194 uses. */
3195 if (loop_vinfo)
3196 vect_remove_slp_scalar_calls (root);
3198 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3199 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3201 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3202 break;
3204 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3205 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3206 /* Free the attached stmt_vec_info and remove the stmt. */
3207 gsi = gsi_for_stmt (store);
3208 unlink_stmt_vdef (store);
3209 gsi_remove (&gsi, true);
3210 release_defs (store);
3211 free_stmt_vec_info (store);
3215 return is_store;
3219 /* Vectorize the basic block. */
3221 void
3222 vect_slp_transform_bb (basic_block bb)
3224 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3225 gimple_stmt_iterator si;
3227 gcc_assert (bb_vinfo);
3229 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3232 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3234 gimple stmt = gsi_stmt (si);
3235 stmt_vec_info stmt_info;
3237 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 "------>SLPing statement: ");
3241 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3244 stmt_info = vinfo_for_stmt (stmt);
3245 gcc_assert (stmt_info);
3247 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3248 if (STMT_SLP_TYPE (stmt_info))
3250 vect_schedule_slp (NULL, bb_vinfo);
3251 break;
3255 if (dump_enabled_p ())
3256 dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
3258 destroy_bb_vec_info (bb_vinfo);