/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-flow.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"              /* FIXME: for insn_data */
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if found and UNKNOWN_LOC otherwise.  */

LOC
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOC;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOC)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOC;
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);

  return node;
}
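
/* For example, for the conditional assignment
     x_1 = a_2 < b_3 ? c_4 : d_5;
   gimple_num_ops - 1 gives three RHS operands, and the extra slot added
   above makes room for the second operand of the comparison, so four
   child slots are created: one each for a_2, b_3, c_4 and d_5.  */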

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
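
/* For example, in the interleaving chain of loads

     x0 = a[i];  x1 = a[i+1];  x2 = a[i+2];  x3 = a[i+3];

   that starts at the load of a[i], the load of a[i+2] is at place 2,
   while a load that belongs to a different chain yields -1.  */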

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  */

static bool
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int op_idx = 1;
  tree compare_rhs = NULL_TREE;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        number_of_oprnds++;
    }
  else
    return false;

  for (i = 0; i < number_of_oprnds; i++)
    {
      if (compare_rhs)
        {
          oprnd = compare_rhs;
          compare_rhs = NULL_TREE;
        }
      else
        oprnd = gimple_op (stmt, op_idx++);

      oprnd_info = (*oprnds_info)[i];

      if (COMPARISON_CLASS_P (oprnd))
        {
          compare_rhs = TREE_OPERAND (oprnd, 1);
          oprnd = TREE_OPERAND (oprnd, 0);
        }

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
            }

          return false;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                }

              return false;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.");
              return false;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt: ");
              return false;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types ");

              return false;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
            }

          return false;
        }
    }

  return true;
}
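
/* For example, in the group

     a[0] = b[0] + x;
     a[1] = b[1] + 5;

   the second addends x (a loop invariant, vect_external_def) and 5
   (a vect_constant_def) are accepted as matching defs by the checks
   above, whereas mixing either of them with an operand computed inside
   the loop (vect_internal_def) would fail with "different types".  */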

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation, or are of unsupported types of operation.  Otherwise, return
   TRUE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor)
{
  unsigned int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool stop_recursion = false, need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  vec<slp_oprnd_info> oprnds_info;
  unsigned int nops;
  slp_oprnd_info oprnd_info;
  tree cond;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  oprnds_info = vect_create_oprnd_info (nops, group_size);

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
            }

          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (is_gimple_call (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (stmt)
              || gimple_call_tail_p (stmt)
              || gimple_call_noreturn_p (stmt)
              || !gimple_call_nothrow_p (stmt)
              || gimple_call_chain (stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.");
                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.");
                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                                stmt, (i == 0), &oprnds_info))
                {
                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ??? We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  The only exception is complex
                     numbers.  */
                  if (prev_first_load != first_load
                      && rhs_code != REALPART_EXPR
                      && rhs_code != IMAGPART_EXPR)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }

                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }

                      vect_free_oprnd_info (oprnds_info);
                      return false;
                    }
                }

              /* We stop the tree when we reach a group of loads.  */
              stop_recursion = true;
              continue;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_oprnd_info (oprnds_info);
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }

                  vect_free_oprnd_info (oprnds_info);
                  return false;
                }
            }

          /* Find the def-stmts.  */
          if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, stmt,
                                            (i == 0), &oprnds_info))
            {
              vect_free_oprnd_info (oprnds_info);
              return false;
            }
        }
    }

  /* Grouped loads were reached - stop the recursion.  */
  if (stop_recursion)
    {
      loads->safe_push (*node);
      vect_free_oprnd_info (oprnds_info);
      return true;
    }

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child
          || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
                                   max_nunits, loads,
                                   vectorization_factor))
        {
          if (child)
            oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      oprnd_info->def_stmts.create (0);
      SLP_TREE_CHILDREN (*node).quick_push (child);
    }

  vect_free_oprnd_info (oprnds_info);
  return true;
}
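
/* For example, for the group of stores

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
     a[2] = b[2] + c[2];
     a[3] = b[3] + c[3];

   all four statements share the PLUS_EXPR code, so the recursion above
   creates one child node for the loads from b and one for the loads
   from c, and stops once those two load groups are reached.  */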

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}

/* Check if the permutation required by the SLP INSTANCE is supported.
   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */

static bool
vect_supported_slp_permutation_p (slp_instance instance)
{
  slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  vec<slp_tree> sorted_loads = vNULL;
  int index;
  slp_tree *tmp_loads = NULL;
  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
  slp_tree load;

  /* FORNOW: The only supported loads permutation is loads from the same
     location in all the loads in the node, when the data-refs in
     nodes of LOADS constitute an interleaving chain.
     Sort the nodes according to the order of accesses in the chain.  */
  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
  for (i = 0, j = 0;
       SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
       && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
       i += group_size, j++)
    {
      gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
      /* Check that the loads are all in the same interleaving chain.  */
      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data "
                               "permutation ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                scalar_stmt, 0);
            }

          free (tmp_loads);
          return false;
        }

      tmp_loads[index] = load;
    }

  sorted_loads.create (group_size);
  for (i = 0; i < group_size; i++)
    sorted_loads.safe_push (tmp_loads[i]);

  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_LOADS (instance) = sorted_loads;
  free (tmp_loads);

  if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
                                     instance, true))
    return false;

  return true;
}
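
/* For example, with GROUP_SIZE = 2 and LOAD_PERMUTATION {1, 1, 0, 0},
   the loop above places the first load node in slot 1 of TMP_LOADS and
   the second in slot 0, so SLP_INSTANCE_LOADS ends up sorted according
   to the order of accesses in the interleaving chain.  */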

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<int> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
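
/* E.g. with PERMUTATION = {1, 0, 3, 2} the statement at index 0 moves
   to slot 1, index 1 to slot 0, and so on: {s0, s1, s2, s3} becomes
   {s1, s0, s3, s2} in this node and, through the recursion above, in
   all of its children.  */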

/* Check if the required load permutation is supported.
   LOAD_PERMUTATION contains a list of indices of the loads.
   In SLP this permutation is relative to the order of grouped stores that are
   the base of the SLP instance.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                   vec<int> load_permutation)
{
  int i = 0, j, prev = -1, next, k, number_of_groups;
  bool supported, bad_permutation = false;
  sbitmap load_index;
  slp_tree node, other_complex_node;
  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
  unsigned complex_numbers = 0;
  struct data_reference *dr;
  bb_vec_info bb_vinfo;

  /* FORNOW: permutations are only supported in SLP.  */
  if (!slp_instn)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (load_permutation, i, next)
        dump_printf (MSG_NOTE, "%d ", next);
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
        return false;

      stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (stmt)
          && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
              || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
        complex_numbers++;
    }

  /* Complex operands can be swapped as following:
      real_c = real_b + real_a;
      imag_c = imag_a + imag_b;
     i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
     {real_a, imag_a} and {real_b, imag_b}.  We check here that if interleaving
     chains are mixed, they match the above pattern.  */
  if (complex_numbers)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
            {
              if (j == 0)
                first = stmt;
              else
                {
                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
                    {
                      if (complex_numbers != 2)
                        return false;

                      if (i == 0)
                        k = 1;
                      else
                        k = 0;

                      other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
                      other_node_first =
                        SLP_TREE_SCALAR_STMTS (other_complex_node)[0];

                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                          != other_node_first)
                        return false;
                    }
                }
            }
        }
    }

  /* We checked that this case is ok, so there is no need to proceed with
     permutation tests.  */
  if (complex_numbers == 2
      && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
    {
      SLP_INSTANCE_LOADS (slp_instn).release ();
      SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
      return true;
    }

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
     instance, not all the loads belong to the same node or interleaving
     group.  Hence, we need to divide them into groups according to
     GROUP_SIZE.  */
  number_of_groups = load_permutation.length () / group_size;

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      int first_group_load_index;

      /* Compare all the permutation sequences to the first one.  */
      for (i = 1; i < number_of_groups; i++)
        {
          k = 0;
          for (j = i * group_size; j < i * group_size + group_size; j++)
            {
              next = load_permutation[j];
              first_group_load_index = load_permutation[k];

              if (next != first_group_load_index)
                {
                  bad_permutation = true;
                  break;
                }

              k++;
            }

          if (bad_permutation)
            break;
        }

      if (!bad_permutation)
        {
          /* Check that the loads in the first sequence are different and there
             are no gaps between them.  */
          load_index = sbitmap_alloc (group_size);
          bitmap_clear (load_index);
          for (k = 0; k < group_size; k++)
            {
              first_group_load_index = load_permutation[k];
              if (bitmap_bit_p (load_index, first_group_load_index))
                {
                  bad_permutation = true;
                  break;
                }

              bitmap_set_bit (load_index, first_group_load_index);
            }

          if (!bad_permutation)
            for (k = 0; k < group_size; k++)
              if (!bitmap_bit_p (load_index, k))
                {
                  bad_permutation = true;
                  break;
                }

          sbitmap_free (load_index);
        }

      if (!bad_permutation)
        {
          /* This permutation is valid for reduction.  Since the order of the
             statements in the nodes is not important unless they are memory
             accesses, we can rearrange the statements in all the nodes
             according to the order of the loads.  */
          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                    load_permutation);
          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
          return true;
        }
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
  bad_permutation = false;
  /* Check that for every node in the instance the loads form a subchain.  */
  if (bb_vinfo)
    {
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          first_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (!first_load)
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
              else if (first_load
                       != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
                {
                  bad_permutation = true;
                  break;
                }

              if (j != 0 && next_load != load)
                {
                  bad_permutation = true;
                  break;
                }

              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }

          if (bad_permutation)
            break;
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.  */
      if (!bad_permutation)
        {
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
            {
              first_load = SLP_TREE_SCALAR_STMTS (node)[0];
              if (first_load
                  != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
                {
                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
                  if (vect_supportable_dr_alignment (dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "unsupported unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            first_load, 0);
                        }
                      bad_permutation = true;
                      break;
                    }
                }
            }
        }

      if (!bad_permutation)
        {
          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
          return true;
        }
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
  if (load_permutation.length ()
      != (unsigned int) (group_size * group_size))
    return false;

  supported = true;
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  for (j = 0; j < group_size; j++)
    {
      for (i = j * group_size, k = 0;
           load_permutation.iterate (i, &next) && k < group_size;
           i++, k++)
        {
          if (i != j * group_size && next != prev)
            {
              supported = false;
              break;
            }

          prev = next;
        }

      if (bitmap_bit_p (load_index, prev))
        {
          supported = false;
          break;
        }

      bitmap_set_bit (load_index, prev);
    }

  for (j = 0; j < group_size; j++)
    if (!bitmap_bit_p (load_index, j))
      {
        sbitmap_free (load_index);
        return false;
      }

  sbitmap_free (load_index);

  if (supported && i == group_size * group_size
      && vect_supported_slp_permutation_p (slp_instn))
    return true;

  return false;
}
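
/* For instance, with GROUP_SIZE = 2 the accepted form of
   LOAD_PERMUTATION in the final test above is {0, 0, 1, 1}: each
   sequence of GROUP_SIZE indices repeats a single load, the loads of
   the different sequences are distinct, and together they cover every
   index from 0 to GROUP_SIZE - 1.  */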

/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}

/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ??? Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                           instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}
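
/* For example, with NUNITS = 4 and GROUP_SIZE = 6, NCOPIES_FOR_COST is
   least_common_multiple (4, 6) / 4 = 3, i.e. three vector statements
   are costed for each statement kind in the instance.  */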

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
      SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = vNULL;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      vec<int> load_permutation;
      load_permutation.create (group_size * group_size);
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          int j;
          gimple load;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place;
              load_place = vect_get_place_in_interleaving_chain
                             (load, GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)));
              if (load_place != j
                  /* ??? We allow loads from different groups to
                     get to here for a special case handled in
                     the permutation code.  Make sure we get to that.  */
                  || (GROUP_FIRST_ELEMENT
                        (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]))
                      != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load))))
                loads_permuted = true;
              load_permutation.safe_push (load_place);
            }
        }

      if (loads_permuted)
        {
          SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
          if (!vect_supported_load_permutation_p (new_instance, group_size,
                                                  load_permutation))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              vect_free_slp_instance (new_instance);
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
            = vect_find_first_load_in_slp_instance (new_instance);
        }
      else
        load_permutation.release ();

      /* Compute the costs of this SLP instance.  */
      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
                             new_instance, TYPE_VECTOR_SUBPARTS (vectype));

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}
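
/* For example, a group of two stores vectorized with a four-element
   vector type gives UNROLLING_FACTOR = least_common_multiple (4, 2) / 2
   = 2: two copies of the group are needed to fill one vector, which is
   why such an instance is rejected for basic block SLP above.  */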

/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
    ok = true;

  return true;
}

/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ===");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
        unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                     "Decided to SLP %d instances. Unrolling factor %d",
                     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}
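
/* Note that the unrolling factor recorded above is the maximum over all
   instances: e.g. two instances with unrolling factors 1 and 2 force
   the whole loop to be unrolled twice so that both of them can be
   vectorized in each copy.  */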

/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */

static void
vect_detect_hybrid_slp_stmts (slp_tree node)
{
  int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
  gimple stmt = stmts[0];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = NULL;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
  basic_block bb = NULL;

  if (!node)
    return;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);
  else
    bb = BB_VINFO_BB (bb_vinfo);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
        && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
        if (gimple_bb (use_stmt)
            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
                || bb == gimple_bb (use_stmt))
            && (stmt_vinfo = vinfo_for_stmt (use_stmt))
            && !STMT_SLP_TYPE (stmt_vinfo)
            && (STMT_VINFO_RELEVANT (stmt_vinfo)
                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
            && !(gimple_code (use_stmt) == GIMPLE_PHI
                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                    == vect_reduction_def))
          vect_mark_slp_stmts (node, hybrid, i);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_detect_hybrid_slp_stmts (child);
}
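
/* For example, if a pure-SLP statement a_1 = b_2 + c_3 also feeds a
   summation s_4 = s_5 + a_1 that is only handled by loop-based
   vectorization, a_1 must be computed by both schemes, and the walk
   above therefore marks its statement as hybrid.  */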

/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ===");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}

/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
        /* Free stmt_vec_info.  */
        free_stmt_vec_info (stmt);
    }

  free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  free (bb_vinfo);
  bb->aux = NULL;
}

/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
        return false;
    }

  return true;
}

/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
                                             SLP_INSTANCE_TREE (instance)))
        {
          vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
        }
      else
        i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}
1992 /* Check if vectorization of the basic block is profitable. */
1994 static bool
1995 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
1997 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
1998 slp_instance instance;
1999 int i, j;
2000 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2001 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2002 unsigned int stmt_cost;
2003 gimple stmt;
2004 gimple_stmt_iterator si;
2005 basic_block bb = BB_VINFO_BB (bb_vinfo);
2006 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2007 stmt_vec_info stmt_info = NULL;
2008 stmt_vector_for_cost body_cost_vec;
2009 stmt_info_for_cost *ci;
2011 /* Calculate vector costs. */
2012 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2014 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2016 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2018 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2019 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2020 stmt_info, ci->misalign, vect_body);
2024 /* Calculate scalar cost. */
2025 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2027 stmt = gsi_stmt (si);
2028 stmt_info = vinfo_for_stmt (stmt);
2030 if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
2031 || !PURE_SLP_STMT (stmt_info))
2032 continue;
2034 if (STMT_VINFO_DATA_REF (stmt_info))
2036 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2037 stmt_cost = vect_get_stmt_cost (scalar_load);
2038 else
2039 stmt_cost = vect_get_stmt_cost (scalar_store);
2041 else
2042 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2044 scalar_cost += stmt_cost;
2047 /* Complete the target-specific cost calculation. */
2048 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2049 &vec_inside_cost, &vec_epilogue_cost);
2051 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2053 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2056 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2057 vec_inside_cost);
2058 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2059 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2060 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2063 /* Vectorization is profitable if its cost is less than the cost of scalar
2064 version. */
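/* A hypothetical example: with vec_prologue_cost == 1, vec_inside_cost == 5,
   vec_epilogue_cost == 0 and scalar_cost == 8, we have 1 + 0 + 5 == 6 < 8
   and the block is considered profitable to vectorize; with scalar_cost == 6
   or less we would give up.  */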
2065 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2066 return false;
2068 return true;
2071 /* Check if the basic block can be vectorized. */
2073 static bb_vec_info
2074 vect_slp_analyze_bb_1 (basic_block bb)
2076 bb_vec_info bb_vinfo;
2077 vec<slp_instance> slp_instances;
2078 slp_instance instance;
2079 int i;
2080 int min_vf = 2;
2082 bb_vinfo = new_bb_vec_info (bb);
2083 if (!bb_vinfo)
2084 return NULL;
2086 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2090 "not vectorized: unhandled data-ref in basic "
2091 "block.\n");
2093 destroy_bb_vec_info (bb_vinfo);
2094 return NULL;
2097 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101 "not vectorized: not enough data-refs in "
2102 "basic block.\n");
2104 destroy_bb_vec_info (bb_vinfo);
2105 return NULL;
2108 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "not vectorized: unhandled data access in "
2113 "basic block.\n");
2115 destroy_bb_vec_info (bb_vinfo);
2116 return NULL;
2119 vect_pattern_recog (NULL, bb_vinfo);
2121 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125 "not vectorized: unhandled data dependence "
2126 "in basic block.\n");
2128 destroy_bb_vec_info (bb_vinfo);
2129 return NULL;
2132 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "not vectorized: bad data alignment in basic "
2137 "block.\n");
2139 destroy_bb_vec_info (bb_vinfo);
2140 return NULL;
2143 /* Check the SLP opportunities in the basic block, analyze and build SLP
2144 trees. */
2145 if (!vect_analyze_slp (NULL, bb_vinfo))
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "not vectorized: failed to find SLP opportunities "
2150 "in basic block.\n");
2152 destroy_bb_vec_info (bb_vinfo);
2153 return NULL;
2156 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2158 /* Mark all the statements that we want to vectorize as pure SLP and
2159 relevant. */
2160 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2162 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2163 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2166 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "not vectorized: unsupported alignment in basic "
2171 "block.\n");
2172 destroy_bb_vec_info (bb_vinfo);
2173 return NULL;
2176 if (!vect_slp_analyze_operations (bb_vinfo))
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "not vectorized: bad operation in basic block.\n");
2182 destroy_bb_vec_info (bb_vinfo);
2183 return NULL;
2186 /* Cost model: check if the vectorization is worthwhile. */
2187 if (flag_vect_cost_model
2188 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "not vectorized: vectorization is not "
2193 "profitable.\n");
2195 destroy_bb_vec_info (bb_vinfo);
2196 return NULL;
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "Basic block will be vectorized using SLP\n");
2203 return bb_vinfo;
2207 bb_vec_info
2208 vect_slp_analyze_bb (basic_block bb)
2210 bb_vec_info bb_vinfo;
2211 int insns = 0;
2212 gimple_stmt_iterator gsi;
2213 unsigned int vector_sizes;
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2218 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2220 gimple stmt = gsi_stmt (gsi);
2221 if (!is_gimple_debug (stmt)
2222 && !gimple_nop_p (stmt)
2223 && gimple_code (stmt) != GIMPLE_LABEL)
2224 insns++;
2227 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "not vectorized: too many instructions in "
2232 "basic block.\n");
2234 return NULL;
2237 /* Autodetect first vector size we try. */
2238 current_vector_size = 0;
2239 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2241 while (1)
2243 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2244 if (bb_vinfo)
2245 return bb_vinfo;
2247 destroy_bb_vec_info (bb_vinfo);
2249 vector_sizes &= ~current_vector_size;
2250 if (vector_sizes == 0
2251 || current_vector_size == 0)
2252 return NULL;
2254 /* Try the next biggest vector size. */
2255 current_vector_size = 1 << floor_log2 (vector_sizes);
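/* For instance (hypothetical target): if autovectorize_vector_sizes
   returned 48 (32-byte and 16-byte vectors) and the 32-byte attempt
   failed, the mask is now 16 and we retry with 16-byte vectors.  */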
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "***** Re-trying analysis with "
2259 "vector size %d\n", current_vector_size);
2264 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2265 the number of created vector stmts depends on the unrolling factor).
2266 However, the actual number of vector stmts for every SLP node depends on
2267 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2268 should be updated. In this function we assume that the inside costs
2269 calculated in vect_model_xxx_cost are linear in ncopies. */
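/* For instance (hypothetical numbers): with VF == 8 and an instance
   unrolling factor of 2, ncopies == 4 below, so a body cost entry recorded
   with count 3 is re-added to the target cost model with count 12.  */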
2271 void
2272 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2274 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2275 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2276 slp_instance instance;
2277 stmt_vector_for_cost body_cost_vec;
2278 stmt_info_for_cost *si;
2279 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2281 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_NOTE, vect_location,
2283 "=== vect_update_slp_costs_according_to_vf ===");
2285 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2287 /* We assume that costs are linear in ncopies. */
2288 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2290 /* Record the instance's instructions in the target cost model.
2291 This was delayed until here because the count of instructions
2292 isn't known beforehand. */
2293 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2295 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2296 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2297 vinfo_for_stmt (si->stmt), si->misalign,
2298 vect_body);
2303 /* For constant and loop invariant defs of SLP_NODE this function returns
2304 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2305 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2306 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2307 REDUC_INDEX is the index of the reduction operand in the statements, or -1
2308 if there is no reduction operand. */
2310 static void
2311 vect_get_constant_vectors (tree op, slp_tree slp_node,
2312 vec<tree> *vec_oprnds,
2313 unsigned int op_num, unsigned int number_of_vectors,
2314 int reduc_index)
2316 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2317 gimple stmt = stmts[0];
2318 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2319 unsigned nunits;
2320 tree vec_cst;
2321 tree *elts;
2322 unsigned j, number_of_places_left_in_vector;
2323 tree vector_type;
2324 tree vop;
2325 int group_size = stmts.length ();
2326 unsigned int vec_num, i;
2327 unsigned number_of_copies = 1;
2328 vec<tree> voprnds;
2329 voprnds.create (number_of_vectors);
2330 bool constant_p, is_store;
2331 tree neutral_op = NULL;
2332 enum tree_code code = gimple_expr_code (stmt);
2333 gimple def_stmt;
2334 struct loop *loop;
2335 gimple_seq ctor_seq = NULL;
2337 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2338 && reduc_index != -1)
2340 op_num = reduc_index - 1;
2341 op = gimple_op (stmt, reduc_index);
2342 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2343 we need either neutral operands or the original operands. See
2344 get_initial_def_for_reduction() for details. */
2345 switch (code)
2347 case WIDEN_SUM_EXPR:
2348 case DOT_PROD_EXPR:
2349 case PLUS_EXPR:
2350 case MINUS_EXPR:
2351 case BIT_IOR_EXPR:
2352 case BIT_XOR_EXPR:
2353 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2354 neutral_op = build_real (TREE_TYPE (op), dconst0);
2355 else
2356 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2358 break;
2360 case MULT_EXPR:
2361 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2362 neutral_op = build_real (TREE_TYPE (op), dconst1);
2363 else
2364 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2366 break;
2368 case BIT_AND_EXPR:
2369 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2370 break;
2372 case MAX_EXPR:
2373 case MIN_EXPR:
2374 def_stmt = SSA_NAME_DEF_STMT (op);
2375 loop = (gimple_bb (stmt))->loop_father;
2376 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2377 loop_preheader_edge (loop));
2378 break;
2380 default:
2381 neutral_op = NULL;
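/* The neutral operand is the identity of the reduction operation, e.g.
   0 for PLUS_EXPR and 1 for MULT_EXPR, so padding extra vector slots
   with it below leaves the final reduction result unchanged.  */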
2385 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2387 is_store = true;
2388 op = gimple_assign_rhs1 (stmt);
2390 else
2391 is_store = false;
2393 gcc_assert (op);
2395 if (CONSTANT_CLASS_P (op))
2396 constant_p = true;
2397 else
2398 constant_p = false;
2400 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2401 gcc_assert (vector_type);
2402 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2404 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2405 created vectors. It is greater than 1 if unrolling is performed.
2407 For example, we have two scalar operands, s1 and s2 (e.g., group of
2408 strided accesses of size two), while NUNITS is four (i.e., four scalars
2409 of this type can be packed in a vector). The output vector will contain
2410 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2411 will be 2).
2413 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2414 containing the operands.
2416 For example, NUNITS is four as before, and the group size is 8
2417 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2418 {s5, s6, s7, s8}. */
2420 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
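/* E.g., for the examples above: nunits == 4 and group_size == 2 give
   lcm (4, 2) / 2 == 2 copies, while nunits == 4 and group_size == 8 give
   lcm (4, 8) / 8 == 1, i.e. every scalar operand is used exactly once.  */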
2422 number_of_places_left_in_vector = nunits;
2423 elts = XALLOCAVEC (tree, nunits);
2424 for (j = 0; j < number_of_copies; j++)
2426 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2428 if (is_store)
2429 op = gimple_assign_rhs1 (stmt);
2430 else
2432 switch (code)
2434 case COND_EXPR:
2435 if (op_num == 0 || op_num == 1)
2437 tree cond = gimple_assign_rhs1 (stmt);
2438 op = TREE_OPERAND (cond, op_num);
2440 else
2442 if (op_num == 2)
2443 op = gimple_assign_rhs2 (stmt);
2444 else
2445 op = gimple_assign_rhs3 (stmt);
2447 break;
2449 case CALL_EXPR:
2450 op = gimple_call_arg (stmt, op_num);
2451 break;
2453 case LSHIFT_EXPR:
2454 case RSHIFT_EXPR:
2455 case LROTATE_EXPR:
2456 case RROTATE_EXPR:
2457 op = gimple_op (stmt, op_num + 1);
2458 /* Unlike the other binary operators, shifts/rotates take an
2459 int shift count rather than an operand of the same type as
2460 the lhs, so make sure the scalar has the right type when we
2461 are dealing with vectors of long long/long/short/char. */
2463 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2464 op = fold_convert (TREE_TYPE (vector_type), op);
2465 break;
2467 default:
2468 op = gimple_op (stmt, op_num + 1);
2469 break;
2473 if (reduc_index != -1)
2475 loop = (gimple_bb (stmt))->loop_father;
2476 def_stmt = SSA_NAME_DEF_STMT (op);
2478 gcc_assert (loop);
2480 /* Get the def before the loop. In reduction chain we have only
2481 one initial value. */
2482 if ((j != (number_of_copies - 1)
2483 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2484 && i != 0))
2485 && neutral_op)
2486 op = neutral_op;
2487 else
2488 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2489 loop_preheader_edge (loop));
2492 /* Create 'vect_ = {op0,op1,...,opn}'. */
2493 number_of_places_left_in_vector--;
2494 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2496 if (CONSTANT_CLASS_P (op))
2498 op = fold_unary (VIEW_CONVERT_EXPR,
2499 TREE_TYPE (vector_type), op);
2500 gcc_assert (op && CONSTANT_CLASS_P (op));
2502 else
2504 tree new_temp
2505 = make_ssa_name (TREE_TYPE (vector_type), NULL);
2506 gimple init_stmt;
2507 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
2508 op);
2509 init_stmt
2510 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
2511 new_temp, op, NULL_TREE);
2512 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2513 op = new_temp;
2516 elts[number_of_places_left_in_vector] = op;
2517 if (!CONSTANT_CLASS_P (op))
2518 constant_p = false;
2520 if (number_of_places_left_in_vector == 0)
2522 number_of_places_left_in_vector = nunits;
2524 if (constant_p)
2525 vec_cst = build_vector (vector_type, elts);
2526 else
2528 vec<constructor_elt, va_gc> *v;
2529 unsigned k;
2530 vec_alloc (v, nunits);
2531 for (k = 0; k < nunits; ++k)
2532 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2533 vec_cst = build_constructor (vector_type, v);
2535 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2536 vector_type, NULL));
2537 if (ctor_seq != NULL)
2539 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2540 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2541 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2542 GSI_SAME_STMT);
2543 ctor_seq = NULL;
2549 /* Since the vectors are created in the reverse order, we should invert
2550 them. */
2551 vec_num = voprnds.length ();
2552 for (j = vec_num; j != 0; j--)
2554 vop = voprnds[j - 1];
2555 vec_oprnds->quick_push (vop);
2558 voprnds.release ();
2560 /* In case that VF is greater than the unrolling factor needed for the SLP
2561 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2562 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2563 to replicate the vectors. */
/* The neutral vector, built at most once and reused below. */
tree neutral_vec = NULL;
2564 while (number_of_vectors > vec_oprnds->length ())
2568 if (neutral_op)
2570 if (!neutral_vec)
2571 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2573 vec_oprnds->quick_push (neutral_vec);
2575 else
2577 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2578 vec_oprnds->quick_push (vop);
2584 /* Get vectorized definitions from SLP_NODE that contains corresponding
2585 vectorized def-stmts. */
2587 static void
2588 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2590 tree vec_oprnd;
2591 gimple vec_def_stmt;
2592 unsigned int i;
2594 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2596 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2598 gcc_assert (vec_def_stmt);
2599 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2600 vec_oprnds->quick_push (vec_oprnd);
2605 /* Get vectorized definitions for SLP_NODE.
2606 If the scalar definitions are loop invariants or constants, collect them and
2607 call vect_get_constant_vectors() to create vector stmts.
2608 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2609 must be stored in the corresponding child of SLP_NODE, and we call
2610 vect_get_slp_vect_defs () to retrieve them. */
2612 void
2613 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2614 vec<vec<tree> > *vec_oprnds, int reduc_index)
2616 gimple first_stmt;
2617 int number_of_vects = 0, i;
2618 unsigned int child_index = 0;
2619 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2620 slp_tree child = NULL;
2621 vec<tree> vec_defs;
2622 tree oprnd;
2623 bool vectorized_defs;
2625 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2626 FOR_EACH_VEC_ELT (ops, i, oprnd)
2628 /* For each operand we check if it has vectorized definitions in a child
2629 node or we need to create them (for invariants and constants). We
2630 check if the LHS of the first stmt of the next child matches OPRND.
2631 If it does, we found the correct child. Otherwise, we call
2632 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2633 this child node is checked again for the next operand. */
2634 vectorized_defs = false;
2635 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2637 child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
2639 /* We have to check both pattern and original def, if available. */
2640 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2641 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2643 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2644 || (related
2645 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2647 /* The number of vector defs is determined by the number of
2648 vector statements in the node from which we get those
2649 statements. */
2650 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2651 vectorized_defs = true;
2652 child_index++;
2656 if (!vectorized_defs)
2658 if (i == 0)
2660 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2661 /* Number of vector stmts was calculated according to LHS in
2662 vect_schedule_slp_instance (), fix it by replacing LHS with
2663 RHS, if necessary. See vect_get_smallest_scalar_type () for
2664 details. */
2665 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2666 &rhs_size_unit);
2667 if (rhs_size_unit != lhs_size_unit)
2669 number_of_vects *= rhs_size_unit;
2670 number_of_vects /= lhs_size_unit;
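/* E.g. (hypothetical types): if the stmts compute a char lhs from int
   operands, rhs_size_unit == 4 and lhs_size_unit == 1, so four times as
   many operand vectors are needed as result vectors.  */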
2675 /* Allocate memory for vectorized defs. */
2676 vec_defs = vNULL;
2677 vec_defs.create (number_of_vects);
2679 /* For reduction defs we call vect_get_constant_vectors (), since we are
2680 looking for initial loop invariant values. */
2681 if (vectorized_defs && reduc_index == -1)
2682 /* The defs are already vectorized. */
2683 vect_get_slp_vect_defs (child, &vec_defs);
2684 else
2685 /* Build vectors from scalar defs. */
2686 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2687 number_of_vects, reduc_index);
2689 vec_oprnds->quick_push (vec_defs);
2691 /* For reductions, we only need initial values. */
2692 if (reduc_index != -1)
2693 return;
2698 /* Create NCOPIES permutation statements using the vector mask MASK and two
2699 input vectors placed in
2700 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2701 shifting by STRIDE elements of DR_CHAIN for every copy.
2702 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2703 copies).
2704 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2705 at which the created stmts must be stored. */
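/* E.g. (hypothetical counts): with 8 vector stmts in NODE and ncopies == 2,
   STRIDE == 4, so copy 0 permutes dr_chain[first_vec_indx] with
   dr_chain[second_vec_indx] and copy 1 uses the vectors four slots later.  */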
2707 static inline void
2708 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2709 tree mask, int first_vec_indx, int second_vec_indx,
2710 gimple_stmt_iterator *gsi, slp_tree node,
2711 tree vectype, vec<tree> dr_chain,
2712 int ncopies, int vect_stmts_counter)
2714 tree perm_dest;
2715 gimple perm_stmt = NULL;
2716 stmt_vec_info next_stmt_info;
2717 int i, stride;
2718 tree first_vec, second_vec, data_ref;
2720 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2722 /* Initialize the vect stmts of NODE to properly insert the generated
2723 stmts later. */
2724 for (i = SLP_TREE_VEC_STMTS (node).length ();
2725 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2726 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2728 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2729 for (i = 0; i < ncopies; i++)
2731 first_vec = dr_chain[first_vec_indx];
2732 second_vec = dr_chain[second_vec_indx];
2734 /* Generate the permute statement. */
2735 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
2736 first_vec, second_vec, mask);
2737 data_ref = make_ssa_name (perm_dest, perm_stmt);
2738 gimple_set_lhs (perm_stmt, data_ref);
2739 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2741 /* Store the vector statement in NODE. */
2742 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2744 first_vec_indx += stride;
2745 second_vec_indx += stride;
2748 /* Mark the scalar stmt as vectorized. */
2749 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2750 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2754 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2755 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2756 representation. Check that the mask is valid and return FALSE if not.
2757 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2758 the next vector, i.e., the current first vector is not needed. */
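/* A small worked example (hypothetical values): with mask_nunits == 4,
   first_mask_element == 6 and m == 0, the element selects lane 2 of the
   second input vector; only elements of 8 or more (a third vector) force
   the fix-up handled below.  */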
2760 static bool
2761 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2762 int mask_nunits, bool only_one_vec, int index,
2763 unsigned char *mask, int *current_mask_element,
2764 bool *need_next_vector, int *number_of_mask_fixes,
2765 bool *mask_fixed, bool *needs_first_vector)
2767 int i;
2769 /* Convert to target specific representation. */
2770 *current_mask_element = first_mask_element + m;
2771 /* Adjust the value in case it's a mask for second and third vectors. */
2772 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2774 if (*current_mask_element < mask_nunits)
2775 *needs_first_vector = true;
2777 /* We have only one input vector to permute but the mask accesses values in
2778 the next vector as well. */
2779 if (only_one_vec && *current_mask_element >= mask_nunits)
2781 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2784 "permutation requires at least two vectors ");
2785 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2788 return false;
2791 /* The mask requires the next vector. */
2792 if (*current_mask_element >= mask_nunits * 2)
2794 if (*needs_first_vector || *mask_fixed)
2796 /* We either need the first vector too or have already moved to the
2797 next vector. In both cases, this permutation needs three
2798 vectors. */
2799 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2802 "permutation requires at "
2803 "least three vectors ");
2804 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2807 return false;
2810 /* We move to the next vector, dropping the first one and working with
2811 the second and the third - we need to adjust the values of the mask
2812 accordingly. */
2813 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
2815 for (i = 0; i < index; i++)
2816 mask[i] -= mask_nunits * *number_of_mask_fixes;
2818 (*number_of_mask_fixes)++;
2819 *mask_fixed = true;
2822 *need_next_vector = *mask_fixed;
2824 /* This was the last element of this mask. Start a new one. */
2825 if (index == mask_nunits - 1)
2827 *number_of_mask_fixes = 1;
2828 *mask_fixed = false;
2829 *needs_first_vector = false;
2832 return true;
2836 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2837 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2838 permute statements for SLP_NODE_INSTANCE. */
2839 bool
2840 vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
2841 gimple_stmt_iterator *gsi, int vf,
2842 slp_instance slp_node_instance, bool analyze_only)
2844 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2845 tree mask_element_type = NULL_TREE, mask_type;
2846 int i, j, k, nunits, vec_index = 0, scalar_index;
2847 slp_tree node;
2848 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2849 gimple next_scalar_stmt;
2850 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
2851 int first_mask_element;
2852 int index, unroll_factor, current_mask_element, ncopies;
2853 unsigned char *mask;
2854 bool only_one_vec = false, need_next_vector = false;
2855 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
2856 int number_of_mask_fixes = 1;
2857 bool mask_fixed = false;
2858 bool needs_first_vector = false;
2859 enum machine_mode mode;
2861 mode = TYPE_MODE (vectype);
2863 if (!can_vec_perm_p (mode, false, NULL))
2865 if (dump_enabled_p ())
2867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868 "no vect permute for ");
2869 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2871 return false;
2874 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2875 same size as the vector element being permuted. */
2876 mask_element_type = lang_hooks.types.type_for_mode
2877 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
2878 mask_type = get_vectype_for_scalar_type (mask_element_type);
2879 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2880 mask = XALLOCAVEC (unsigned char, nunits);
2881 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2883 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2884 unrolling factor. */
2885 orig_vec_stmts_num = group_size *
2886 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
2887 if (orig_vec_stmts_num == 1)
2888 only_one_vec = true;
2890 /* Number of copies is determined by the final vectorization factor
2891 relatively to SLP_NODE_INSTANCE unrolling factor. */
2892 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
2894 /* Generate permutation masks for every NODE. Number of masks for each NODE
2895 is equal to GROUP_SIZE.
2896 E.g., we have a group of three nodes with three loads from the same
2897 location in each node, and the vector size is 4. I.e., we have an
2898 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2899 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2900 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2903 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2904 The last mask is illegal since we assume two operands for the permute
2905 operation, so mask elements must be smaller than 2 * NUNITS (8 here).
2906 Hence, the last mask must be converted into {2,5,5,5}.
2907 For the first two permutations we need the first and the second input
2908 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2909 we need the second and the third vectors: {b1,c1,a2,b2} and
2910 {c2,a3,b3,c3}. */
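/* In the example above, for the a's node we have i == 0, group_size == 3,
   nunits == 4 and an unrolling factor of 4, so FIRST_MASK_ELEMENT takes the
   values 0, 3, 6 and 9, each pushed GROUP_SIZE times, packing into
   {0,0,0,3} {3,3,6,6} {6,9,9,9} before the conversion described above.  */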
2912 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
2914 scalar_index = 0;
2915 index = 0;
2916 vect_stmts_counter = 0;
2917 vec_index = 0;
2918 first_vec_index = vec_index++;
2919 if (only_one_vec)
2920 second_vec_index = first_vec_index;
2921 else
2922 second_vec_index = vec_index++;
2924 for (j = 0; j < unroll_factor; j++)
2926 for (k = 0; k < group_size; k++)
2928 first_mask_element = i + j * group_size;
2929 if (!vect_get_mask_element (stmt, first_mask_element, 0,
2930 nunits, only_one_vec, index,
2931 mask, &current_mask_element,
2932 &need_next_vector,
2933 &number_of_mask_fixes, &mask_fixed,
2934 &needs_first_vector))
2935 return false;
2936 mask[index++] = current_mask_element;
2938 if (index == nunits)
2940 tree mask_vec, *mask_elts;
2941 int l;
2943 if (!can_vec_perm_p (mode, false, mask))
2945 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2948 vect_location,
2949 "unsupported vect permute { ");
2950 for (i = 0; i < nunits; ++i)
2951 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
2952 mask[i]);
2953 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
2955 return false;
2958 mask_elts = XALLOCAVEC (tree, nunits);
2959 for (l = 0; l < nunits; ++l)
2960 mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
2961 mask_vec = build_vector (mask_type, mask_elts);
2962 index = 0;
2964 if (!analyze_only)
2966 if (need_next_vector)
2968 first_vec_index = second_vec_index;
2969 second_vec_index = vec_index;
2972 next_scalar_stmt
2973 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
2975 vect_create_mask_and_perm (stmt, next_scalar_stmt,
2976 mask_vec, first_vec_index, second_vec_index,
2977 gsi, node, vectype, dr_chain,
2978 ncopies, vect_stmts_counter++);
2985 return true;
2990 /* Vectorize SLP instance tree in postorder. */
2992 static bool
2993 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
2994 unsigned int vectorization_factor)
2996 gimple stmt;
2997 bool grouped_store, is_store;
2998 gimple_stmt_iterator si;
2999 stmt_vec_info stmt_info;
3000 unsigned int vec_stmts_size, nunits, group_size;
3001 tree vectype;
3002 int i;
3003 slp_tree loads_node;
3004 slp_tree child;
3006 if (!node)
3007 return false;
3009 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3010 vect_schedule_slp_instance (child, instance, vectorization_factor);
3012 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3013 stmt_info = vinfo_for_stmt (stmt);
3015 /* VECTYPE is the type of the destination. */
3016 vectype = STMT_VINFO_VECTYPE (stmt_info);
3017 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3018 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3020 /* For each SLP instance calculate number of vector stmts to be created
3021 for the scalar stmts in each node of the SLP tree. Number of vector
3022 elements in one vector iteration is the number of scalar elements in
3023 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3024 size. */
3025 vec_stmts_size = (vectorization_factor * group_size) / nunits;
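/* E.g. (hypothetical numbers): VF == 4, group_size == 2 and nunits == 4
   give vec_stmts_size == 2, since the unrolled group produces
   VF * group_size == 8 scalars, filling two 4-element vectors.  */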
3027 /* In case of load permutation we have to allocate vectorized statements for
3028 all the nodes that participate in that permutation. */
3029 if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3031 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
3033 if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
3035 SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
3036 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
3041 if (!SLP_TREE_VEC_STMTS (node).exists ())
3043 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3044 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3047 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_NOTE, vect_location,
3050 "------>vectorizing SLP node starting from: ");
3051 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3054 /* Loads should be inserted before the first load. */
3055 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3056 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3057 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3058 && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
3059 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3060 else if (is_pattern_stmt_p (stmt_info))
3061 si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3062 else
3063 si = gsi_for_stmt (stmt);
3065 /* Stores should be inserted just before the last store. */
3066 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3067 && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3069 gimple last_store = vect_find_last_store_in_slp_instance (instance);
3070 if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3071 last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3072 si = gsi_for_stmt (last_store);
3075 /* Mark the first element of the reduction chain as reduction to properly
3076 transform the node. In the analysis phase only the last element of the
3077 chain is marked as reduction. */
3078 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3079 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3081 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3082 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3085 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3086 return is_store;
3089 /* Replace each scalar call from SLP node NODE with an assignment of zero to its lhs.
3090 For loop vectorization this is done in vectorizable_call, but for SLP
3091 it needs to be deferred until end of vect_schedule_slp, because multiple
3092 SLP instances may refer to the same scalar stmt. */
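/* E.g. (hypothetical stmt): a vectorized scalar call such as
   _7 = foo (x_3) is replaced by _7 = 0.0 (build_zero_cst of the lhs type),
   keeping the SSA definition alive until DCE deletes it rather than
   removing the call outright here.  */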
3094 static void
3095 vect_remove_slp_scalar_calls (slp_tree node)
3097 gimple stmt, new_stmt;
3098 gimple_stmt_iterator gsi;
3099 int i;
3100 slp_tree child;
3101 tree lhs;
3102 stmt_vec_info stmt_info;
3104 if (!node)
3105 return;
3107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3108 vect_remove_slp_scalar_calls (child);
3110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3112 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3113 continue;
3114 stmt_info = vinfo_for_stmt (stmt);
3115 if (stmt_info == NULL
3116 || is_pattern_stmt_p (stmt_info)
3117 || !PURE_SLP_STMT (stmt_info))
3118 continue;
3119 lhs = gimple_call_lhs (stmt);
3120 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3121 set_vinfo_for_stmt (new_stmt, stmt_info);
3122 set_vinfo_for_stmt (stmt, NULL);
3123 STMT_VINFO_STMT (stmt_info) = new_stmt;
3124 gsi = gsi_for_stmt (stmt);
3125 gsi_replace (&gsi, new_stmt, false);
3126 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3130 /* Generate vector code for all SLP instances in the loop/basic block. */
3132 bool
3133 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3135 vec<slp_instance> slp_instances;
3136 slp_instance instance;
3137 slp_tree loads_node;
3138 unsigned int i, j, vf;
3139 bool is_store = false;
3141 if (loop_vinfo)
3143 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3144 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3146 else
3148 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3149 vf = 1;
3152 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3154 /* Schedule the tree of INSTANCE. */
3155 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3156 instance, vf);
3158 /* Clear STMT_VINFO_VEC_STMT of all loads. Otherwise, with loads
3159 shared between SLP instances, we would fail to properly initialize
3160 the vectorized SLP stmts and confuse different load permutations. */
3161 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
3162 STMT_VINFO_VEC_STMT
3163 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_NOTE, vect_location,
3167 "vectorizing stmts using SLP.");
3170 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3172 slp_tree root = SLP_INSTANCE_TREE (instance);
3173 gimple store;
3174 unsigned int j;
3175 gimple_stmt_iterator gsi;
3177 /* Remove scalar call stmts. Do not do this for basic-block
3178 vectorization as not all uses may be vectorized.
3179 ??? Why should this be necessary? DCE should be able to
3180 remove the stmts itself.
3181 ??? For BB vectorization we can as well remove scalar
3182 stmts starting from the SLP tree root if they have no
3183 uses. */
3184 if (loop_vinfo)
3185 vect_remove_slp_scalar_calls (root);
3187 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3188 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3190 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3191 break;
3193 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3194 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3195 /* Free the attached stmt_vec_info and remove the stmt. */
3196 gsi = gsi_for_stmt (store);
3197 unlink_stmt_vdef (store);
3198 gsi_remove (&gsi, true);
3199 release_defs (store);
3200 free_stmt_vec_info (store);
3204 return is_store;
3208 /* Vectorize the basic block. */
3210 void
3211 vect_slp_transform_bb (basic_block bb)
3213 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3214 gimple_stmt_iterator si;
3216 gcc_assert (bb_vinfo);
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3221 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3223 gimple stmt = gsi_stmt (si);
3224 stmt_vec_info stmt_info;
3226 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_NOTE, vect_location,
3229 "------>SLPing statement: ");
3230 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3233 stmt_info = vinfo_for_stmt (stmt);
3234 gcc_assert (stmt_info);
3236 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3237 if (STMT_SLP_TYPE (stmt_info))
3239 vect_schedule_slp (NULL, bb_vinfo);
3240 break;
3244 if (dump_enabled_p ())
3245 dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
3247 destroy_bb_vec_info (bb_vinfo);