/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"	/* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if one is found and
   UNKNOWN_LOCATION otherwise.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;

  return node;
}

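/* Illustrative sketch (not part of the upstream code): for a group of
   four isomorphic scalar assignments such as

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
     a[2] = b[2] + c[2];
     a[3] = b[3] + c[3];

   vect_create_new_slp_node is handed the four GIMPLE_ASSIGNs as
   SCALAR_STMTS and sets up a node with NOPS == 2 child slots, one for
   the node covering the 'b' operands and one for the 'c' operands.  */
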
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}

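/* Illustrative sketch (not part of the upstream code): for a chain
   built from the stores

     a[0] = x;  a[1] = y;  a[3] = z;

   the store to a[1] carries a GROUP_GAP of 1 and the store to a[3] a
   GROUP_GAP of 2, so the function above returns the places 0, 1 and 3
   respectively.  */
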
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1, if the error could be corrected by swapping operands of the
   operation return 1, if everything is ok return 0.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}

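/* Illustrative sketch (not part of the upstream code): the swapping
   logic above lets a group like

     x = a + 5.0f;
     y = 5.0f + b;

   still match.  Operand 0 of the second stmt is a constant where the
   first stmt had an internal def; since PLUS_EXPR is commutative the
   second stmt is retried with its rhs operands swapped, and on success
   swap_ssa_operands canonicalizes the GIMPLE IL to match.  */
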
/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation and are not of unsupported operation types.  Return true
   if so, otherwise return false and indicate in *MATCHES which stmts are
   not isomorphic to the first one.  If MATCHES[0] is false this indicates
   that the comparison could not be carried out or that the stmts will
   never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple first_stmt = stmts[0], stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (bb_vinfo
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.  */
              if (unrolling_factor > 1
                  && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                      /* If the group is split up then GROUP_GAP
                         isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                      || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

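/* Illustrative sketch (not part of the upstream code): the two-operator
   support above covers groups mixing PLUS_EXPR and MINUS_EXPR, e.g.

     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];

   Both a vector add and a vector subtract are emitted and the lanes are
   blended with a permute, so can_vec_perm_p must agree that a selector
   picking lane i from the add result for PLUS lanes and lane i + nunits
   from the subtract result for MINUS lanes is supported.  */
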
/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return TRUE, recording in *MATCHES which stmts failed to match.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches,
                              &two_operators))
    return false;
  SLP_TREE_TWO_OPERATORS (*node) = two_operators;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ())
            {
              unsigned int j;
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (grandchild != NULL)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  *max_nunits = old_max_nunits;
                  loads->truncate (old_nloads);
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  vect_free_slp_tree (child);
                  SLP_TREE_CHILDREN (*node).quick_push (NULL);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ???  This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (bb_vinfo
          && !matches[0]
          /* ???  Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && !SLP_TREE_TWO_OPERATORS (*node)
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

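/* Illustrative sketch (not part of the upstream code): starting from a
   root node for the two stores

     a[0] = b[0] * x;
     a[1] = b[1] * x;

   the recursion above creates one child node for the {b[0], b[1]} loads
   (where the recursion terminates) while the external def 'x' is only
   recorded in oprnds_info: no child is built for it because its
   first_dt is not vect_internal_def.  */
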
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

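/* Illustrative sketch (not part of the upstream code): with GROUP_SIZE 4
   and PERMUTATION {2, 0, 3, 1}, a node holding {s0, s1, s2, s3} is
   rewritten to hold {s1, s3, s0, s2}, because each stmt i is stored at
   tmp_stmts[permutation[i]].  */
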
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}

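/* Illustrative sketch (not part of the upstream code): in basic-block
   SLP a node loading {a[1], a[2]} out of an interleaving chain
   a[0]..a[3] forms a subchain (consecutive chain elements), so its load
   permutation is simply dropped; a node loading {a[2], a[0]} instead
   has to pass the vect_transform_slp_perm_load dry-run above.  */
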
/* Find the last scalar stmt in the SLP tree rooted at NODE; for the root
   node of an instance built from grouped stores this is the last store.  */

static gimple
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple last = NULL, stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
{
  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
        {
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                            stmt_info, 0, vect_body);
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                            stmt_info, 0, vect_body);
        }
    }

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
                              STMT_VINFO_BB_VINFO (stmt_info),
                              &def_stmt, &def, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ???  We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
        {
        case DOT_PROD_EXPR:
        case SAD_EXPR:
          nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
                        (TREE_TYPE (gimple_assign_rhs1 (stmt))));
          break;
        default:;
        }
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, &body_cost_vec,
                           ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

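/* Illustrative sketch (not part of the upstream code) of the
   ncopies_for_cost computation above: for a basic-block group of 6
   stores of 32-bit ints with V4SI vectors, nunits is 4 and group_size
   stays 6, so least_common_multiple (4, 6) / 4 == 3 vector stmts are
   costed per node.  */
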
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, matches, &npermutes, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_LOADS (new_instance) = loads;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted)
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }
        }

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}

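/* Illustrative sketch (not part of the upstream code): a basic block
   containing

     out[0] = in[0] + 1;
     out[1] = in[1] + 1;
     out[2] = in[2] + 1;
     out[3] = in[3] + 1;

   reaches this function with the first store as STMT and GROUP_SIZE 4;
   with V4SI vectors the unrolling factor is 1 and a single SLP instance
   rooted at the store group is pushed to BB_VINFO_SLP_INSTANCES.  */
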
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                  unsigned max_tree_size)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                   max_tree_size))
      ok = true;

  if (reduc_chains.length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                       max_tree_size))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (reductions.length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
                                    max_tree_size))
    ok = true;

  return true;
}

1994 /* For each possible SLP instance decide whether to SLP it and calculate overall
1995 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1996 least one instance. */
1998 bool
1999 vect_make_slp_decision (loop_vec_info loop_vinfo)
2001 unsigned int i, unrolling_factor = 1;
2002 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2003 slp_instance instance;
2004 int decided_to_slp = 0;
2006 if (dump_enabled_p ())
2007 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2008 "\n");
2010 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2012 /* FORNOW: SLP if you can. */
2013 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
2014 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
2016 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2017 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2018 loop-based vectorization. Such stmts will be marked as HYBRID. */
2019 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2020 decided_to_slp++;
2023 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2025 if (decided_to_slp && dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Decided to SLP %d instances. Unrolling factor %d\n",
2028 decided_to_slp, unrolling_factor);
2030 return (decided_to_slp > 0);
2034 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2035 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
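/* For example, a pure SLP stmt whose def also feeds a relevant non-SLP
stmt in the loop must become HYBRID so that it is also vectorized by
the loop-based scheme. */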
2037 static void
2038 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2040 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2041 imm_use_iterator imm_iter;
2042 gimple use_stmt;
2043 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2044 slp_tree child;
2045 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2047 int j;
2049 /* Propagate hybrid down the SLP tree. */
2050 if (stype == hybrid)
2052 else if (HYBRID_SLP_STMT (stmt_vinfo))
2053 stype = hybrid;
2054 else
2056 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2057 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2058 /* We always get the pattern stmt here, but for immediate
2059 uses we have to use the LHS of the original stmt. */
2060 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2061 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
2062 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2063 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2064 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2066 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2067 continue;
2068 use_vinfo = vinfo_for_stmt (use_stmt);
2069 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2070 && STMT_VINFO_RELATED_STMT (use_vinfo))
2071 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2072 if (!STMT_SLP_TYPE (use_vinfo)
2073 && (STMT_VINFO_RELEVANT (use_vinfo)
2074 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2075 && !(gimple_code (use_stmt) == GIMPLE_PHI
2076 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2077 stype = hybrid;
2081 if (stype == hybrid)
2083 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2086 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2088 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2092 if (child)
2093 vect_detect_hybrid_slp_stmts (child, i, stype);
2096 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2098 static tree
2099 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2101 walk_stmt_info *wi = (walk_stmt_info *)data;
2102 struct loop *loopp = (struct loop *)wi->info;
2104 if (wi->is_lhs)
2105 return NULL_TREE;
2107 if (TREE_CODE (*tp) == SSA_NAME
2108 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2110 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2111 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2112 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2114 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2117 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2119 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2123 return NULL_TREE;
2126 static tree
2127 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2128 walk_stmt_info *)
2130 /* If the stmt is in a SLP instance then this isn't a reason
2131 to mark use definitions in other SLP instances as hybrid. */
2132 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2133 *handled = true;
2134 return NULL_TREE;
2137 /* Find stmts that must be both vectorized and SLPed. */
2139 void
2140 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2142 unsigned int i;
2143 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2144 slp_instance instance;
2146 if (dump_enabled_p ())
2147 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2148 "\n");
2150 /* First walk all pattern stmts in the loop and mark the defs of their
2151 uses as hybrid, because immediate uses in pattern stmts are not recorded. */
2152 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2154 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2155 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2156 gsi_next (&gsi))
2158 gimple stmt = gsi_stmt (gsi);
2159 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2160 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2162 walk_stmt_info wi;
2163 memset (&wi, 0, sizeof (wi));
2164 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2165 gimple_stmt_iterator gsi2
2166 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2167 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2168 vect_detect_hybrid_slp_1, &wi);
2169 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2170 vect_detect_hybrid_slp_2,
2171 vect_detect_hybrid_slp_1, &wi);
2176 /* Then walk the SLP instance trees marking stmts with uses in
2177 non-SLP stmts as hybrid, also propagating hybrid down the
2178 SLP tree, collecting the above info on-the-fly. */
2179 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2181 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2182 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2183 i, pure_slp);
2188 /* Create and initialize a new bb_vec_info struct for BB, as well as
2189 stmt_vec_info structs for all the stmts in it. */
2191 static bb_vec_info
2192 new_bb_vec_info (basic_block bb)
2194 bb_vec_info res = NULL;
2195 gimple_stmt_iterator gsi;
2197 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2198 BB_VINFO_BB (res) = bb;
2200 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2202 gimple stmt = gsi_stmt (gsi);
2203 gimple_set_uid (stmt, 0);
2204 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2207 BB_VINFO_GROUPED_STORES (res).create (10);
2208 BB_VINFO_SLP_INSTANCES (res).create (2);
2209 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2211 bb->aux = res;
2212 return res;
2216 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2217 stmts in the basic block. */
2219 static void
2220 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2222 vec<slp_instance> slp_instances;
2223 slp_instance instance;
2224 basic_block bb;
2225 gimple_stmt_iterator si;
2226 unsigned i;
2228 if (!bb_vinfo)
2229 return;
2231 bb = BB_VINFO_BB (bb_vinfo);
2233 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2235 gimple stmt = gsi_stmt (si);
2236 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2238 if (stmt_info)
2239 /* Free stmt_vec_info. */
2240 free_stmt_vec_info (stmt);
2243 vect_destroy_datarefs (NULL, bb_vinfo);
2244 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2245 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2246 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2247 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2248 vect_free_slp_instance (instance);
2249 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2250 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2251 free (bb_vinfo);
2252 bb->aux = NULL;
2256 /* Analyze statements contained in SLP tree node after recursively analyzing
2257 the subtree. Return TRUE if the operations are supported. */
2259 static bool
2260 vect_slp_analyze_node_operations (slp_tree node)
2262 bool dummy;
2263 int i;
2264 gimple stmt;
2265 slp_tree child;
2267 if (!node)
2268 return true;
2270 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2271 if (!vect_slp_analyze_node_operations (child))
2272 return false;
2274 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2277 gcc_assert (stmt_info);
2278 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2280 if (!vect_analyze_stmt (stmt, &dummy, node))
2281 return false;
2284 return true;
2288 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2289 operations are supported. */
2291 bool
2292 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2294 slp_instance instance;
2295 int i;
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_NOTE, vect_location,
2299 "=== vect_slp_analyze_operations ===\n");
2301 for (i = 0; slp_instances.iterate (i, &instance); )
2303 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "removing SLP instance operations starting from: ");
2307 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2308 SLP_TREE_SCALAR_STMTS
2309 (SLP_INSTANCE_TREE (instance))[0], 0);
2310 vect_free_slp_instance (instance);
2311 slp_instances.ordered_remove (i);
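/* Note that I is intentionally not incremented here: ordered_remove
has shifted the next instance into slot I. */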
2313 else
2315 /* Compute the costs of the SLP instance. */
2316 vect_analyze_slp_cost (instance, data);
2317 i++;
2321 if (!slp_instances.length ())
2322 return false;
2324 return true;
2328 /* Compute the scalar cost of the SLP node NODE and its children
2329 and return it. Do not account for defs that are marked in LIFE, and
2330 update LIFE according to the uses of NODE. */
2332 static unsigned
2333 vect_bb_slp_scalar_cost (basic_block bb,
2334 slp_tree node, vec<bool, va_heap> *life)
2336 unsigned scalar_cost = 0;
2337 unsigned i;
2338 gimple stmt;
2339 slp_tree child;
2341 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2343 unsigned stmt_cost;
2344 ssa_op_iter op_iter;
2345 def_operand_p def_p;
2346 stmt_vec_info stmt_info;
2348 if ((*life)[i])
2349 continue;
2351 /* If there is a non-vectorized use of the defs then the scalar
2352 stmt is kept live, in which case we do not account for it or any
2353 required defs in the SLP children in the scalar cost. This
2354 way we make the vectorization appear more costly when compared to
2355 the scalar cost. */
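/* E.g., a use in a PHI, a use in another basic block, or a use in a
non-vectorizable stmt all keep the scalar stmt live. */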
2356 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2358 imm_use_iterator use_iter;
2359 gimple use_stmt;
2360 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2361 if (!is_gimple_debug (use_stmt)
2362 && (gimple_code (use_stmt) == GIMPLE_PHI
2363 || gimple_bb (use_stmt) != bb
2364 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2366 (*life)[i] = true;
2367 BREAK_FROM_IMM_USE_STMT (use_iter);
2370 if ((*life)[i])
2371 continue;
2373 stmt_info = vinfo_for_stmt (stmt);
2374 if (STMT_VINFO_DATA_REF (stmt_info))
2376 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2377 stmt_cost = vect_get_stmt_cost (scalar_load);
2378 else
2379 stmt_cost = vect_get_stmt_cost (scalar_store);
2381 else
2382 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2384 scalar_cost += stmt_cost;
2387 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2388 if (child)
2389 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2391 return scalar_cost;
2394 /* Check if vectorization of the basic block is profitable. */
2396 static bool
2397 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2399 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2400 slp_instance instance;
2401 int i;
2402 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2403 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2405 /* Calculate scalar cost. */
2406 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2408 auto_vec<bool, 20> life;
2409 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2410 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2411 SLP_INSTANCE_TREE (instance),
2412 &life);
2415 /* Complete the target-specific cost calculation. */
2416 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2417 &vec_inside_cost, &vec_epilogue_cost);
2419 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2421 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2424 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2425 vec_inside_cost);
2426 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2427 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2428 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2431 /* Vectorization is profitable if its cost is less than the cost of the
2432 scalar version. */
2433 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2434 return false;
2436 return true;
2439 /* Check if the basic block can be vectorized. */
2441 static bb_vec_info
2442 vect_slp_analyze_bb_1 (basic_block bb)
2444 bb_vec_info bb_vinfo;
2445 vec<slp_instance> slp_instances;
2446 slp_instance instance;
2447 int i;
2448 int min_vf = 2;
2449 unsigned n_stmts = 0;
2451 bb_vinfo = new_bb_vec_info (bb);
2452 if (!bb_vinfo)
2453 return NULL;
2455 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2457 if (dump_enabled_p ())
2458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2459 "not vectorized: unhandled data-ref in basic "
2460 "block.\n");
2462 destroy_bb_vec_info (bb_vinfo);
2463 return NULL;
2466 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2468 if (dump_enabled_p ())
2469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2470 "not vectorized: not enough data-refs in "
2471 "basic block.\n");
2473 destroy_bb_vec_info (bb_vinfo);
2474 return NULL;
2477 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481 "not vectorized: unhandled data access in "
2482 "basic block.\n");
2484 destroy_bb_vec_info (bb_vinfo);
2485 return NULL;
2488 vect_pattern_recog (NULL, bb_vinfo);
2490 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "not vectorized: bad data alignment in basic "
2495 "block.\n");
2497 destroy_bb_vec_info (bb_vinfo);
2498 return NULL;
2501 /* Check the SLP opportunities in the basic block, analyze and build SLP
2502 trees. */
2503 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2505 if (dump_enabled_p ())
2507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2508 "Failed to SLP the basic block.\n");
2509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2510 "not vectorized: failed to find SLP opportunities "
2511 "in basic block.\n");
2514 destroy_bb_vec_info (bb_vinfo);
2515 return NULL;
2518 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2520 /* Mark all the statements that we want to vectorize as pure SLP and
2521 relevant. */
2522 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2524 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2525 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2528 /* Mark all the statements that we do not want to vectorize. */
2529 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2530 !gsi_end_p (gsi); gsi_next (&gsi))
2532 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2533 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2534 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2537 /* Analyze dependences. At this point all stmts not participating in
2538 vectorization have to be marked. Dependence analysis assumes
2539 that we either vectorize all SLP instances or none at all. */
2540 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2542 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2544 "not vectorized: unhandled data dependence "
2545 "in basic block.\n");
2547 destroy_bb_vec_info (bb_vinfo);
2548 return NULL;
2551 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "not vectorized: unsupported alignment in basic "
2556 "block.\n");
2557 destroy_bb_vec_info (bb_vinfo);
2558 return NULL;
2561 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2562 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2566 "not vectorized: bad operation in basic block.\n");
2568 destroy_bb_vec_info (bb_vinfo);
2569 return NULL;
2572 /* Cost model: check if the vectorization is worthwhile. */
2573 if (!unlimited_cost_model (NULL)
2574 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2576 if (dump_enabled_p ())
2577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2578 "not vectorized: vectorization is not "
2579 "profitable.\n");
2581 destroy_bb_vec_info (bb_vinfo);
2582 return NULL;
2585 if (dump_enabled_p ())
2586 dump_printf_loc (MSG_NOTE, vect_location,
2587 "Basic block will be vectorized using SLP\n");
2589 return bb_vinfo;
2593 bb_vec_info
2594 vect_slp_analyze_bb (basic_block bb)
2596 bb_vec_info bb_vinfo;
2597 int insns = 0;
2598 gimple_stmt_iterator gsi;
2599 unsigned int vector_sizes;
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2604 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2606 gimple stmt = gsi_stmt (gsi);
2607 if (!is_gimple_debug (stmt)
2608 && !gimple_nop_p (stmt)
2609 && gimple_code (stmt) != GIMPLE_LABEL)
2610 insns++;
2613 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "not vectorized: too many instructions in "
2618 "basic block.\n");
2620 return NULL;
2623 /* Autodetect the first vector size we try. */
2624 current_vector_size = 0;
2625 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
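/* VECTOR_SIZES is a bitmask of the vector sizes the target supports; on
each failure the size just tried is cleared below and the analysis is
retried with the largest remaining size, 1 << floor_log2 (vector_sizes). */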
2627 while (1)
2629 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2630 if (bb_vinfo)
2631 return bb_vinfo;
2633 destroy_bb_vec_info (bb_vinfo);
2635 vector_sizes &= ~current_vector_size;
2636 if (vector_sizes == 0
2637 || current_vector_size == 0)
2638 return NULL;
2640 /* Try the next biggest vector size. */
2641 current_vector_size = 1 << floor_log2 (vector_sizes);
2642 if (dump_enabled_p ())
2643 dump_printf_loc (MSG_NOTE, vect_location,
2644 "***** Re-trying analysis with "
2645 "vector size %d\n", current_vector_size);
2650 /* For constant and loop invariant defs of SLP_NODE this function returns
2651 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2652 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2653 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2654 REDUC_INDEX is the index of the reduction operand in the statements, unless
2655 it is -1. */
2657 static void
2658 vect_get_constant_vectors (tree op, slp_tree slp_node,
2659 vec<tree> *vec_oprnds,
2660 unsigned int op_num, unsigned int number_of_vectors,
2661 int reduc_index)
2663 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2664 gimple stmt = stmts[0];
2665 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2666 unsigned nunits;
2667 tree vec_cst;
2668 tree *elts;
2669 unsigned j, number_of_places_left_in_vector;
2670 tree vector_type;
2671 tree vop;
2672 int group_size = stmts.length ();
2673 unsigned int vec_num, i;
2674 unsigned number_of_copies = 1;
2675 vec<tree> voprnds;
2676 voprnds.create (number_of_vectors);
2677 bool constant_p, is_store;
2678 tree neutral_op = NULL;
2679 enum tree_code code = gimple_expr_code (stmt);
2680 gimple def_stmt;
2681 struct loop *loop;
2682 gimple_seq ctor_seq = NULL;
2684 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2685 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2687 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2688 && reduc_index != -1)
2690 op_num = reduc_index;
2691 op = gimple_op (stmt, op_num + 1);
2692 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2693 we need either neutral operands or the original operands. See
2694 get_initial_def_for_reduction() for details. */
2695 switch (code)
2697 case WIDEN_SUM_EXPR:
2698 case DOT_PROD_EXPR:
2699 case SAD_EXPR:
2700 case PLUS_EXPR:
2701 case MINUS_EXPR:
2702 case BIT_IOR_EXPR:
2703 case BIT_XOR_EXPR:
2704 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2705 neutral_op = build_real (TREE_TYPE (op), dconst0);
2706 else
2707 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2709 break;
2711 case MULT_EXPR:
2712 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2713 neutral_op = build_real (TREE_TYPE (op), dconst1);
2714 else
2715 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2717 break;
2719 case BIT_AND_EXPR:
2720 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2721 break;
2723 /* For MIN/MAX we don't have an easy neutral operand but
2724 the initial values can be used fine here. Only for
2725 a reduction chain do we have to force a neutral element. */
2726 case MAX_EXPR:
2727 case MIN_EXPR:
2728 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2729 neutral_op = NULL;
2730 else
2732 def_stmt = SSA_NAME_DEF_STMT (op);
2733 loop = (gimple_bb (stmt))->loop_father;
2734 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2735 loop_preheader_edge (loop));
2737 break;
2739 default:
2740 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2741 neutral_op = NULL;
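/* Thus, e.g., for a PLUS_EXPR reduction over floats the neutral element
is 0.0, for MULT_EXPR over integers it is 1, and for BIT_AND_EXPR it is
all-ones (-1). */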
2745 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2747 is_store = true;
2748 op = gimple_assign_rhs1 (stmt);
2750 else
2751 is_store = false;
2753 gcc_assert (op);
2755 if (CONSTANT_CLASS_P (op))
2756 constant_p = true;
2757 else
2758 constant_p = false;
2760 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2761 the created vectors. It is greater than 1 if unrolling is performed.
2763 For example, we have two scalar operands, s1 and s2 (e.g., group of
2764 strided accesses of size two), while NUNITS is four (i.e., four scalars
2765 of this type can be packed in a vector). The output vector will contain
2766 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2767 will be 2).
2769 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2770 containing the operands.
2772 For example, NUNITS is four as before, and the group size is 8
2773 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2774 {s5, s6, s7, s8}. */
2776 number_of_copies = nunits * number_of_vectors / group_size;
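/* E.g., in the first example above NUNITS == 4, GROUP_SIZE == 2 and a
single vector is needed, so NUMBER_OF_COPIES == 4 * 1 / 2 == 2; in the
second example NUMBER_OF_COPIES == 4 * 2 / 8 == 1. */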
2778 number_of_places_left_in_vector = nunits;
2779 elts = XALLOCAVEC (tree, nunits);
2780 bool place_after_defs = false;
2781 for (j = 0; j < number_of_copies; j++)
2783 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2785 if (is_store)
2786 op = gimple_assign_rhs1 (stmt);
2787 else
2789 switch (code)
2791 case COND_EXPR:
2792 if (op_num == 0 || op_num == 1)
2794 tree cond = gimple_assign_rhs1 (stmt);
2795 op = TREE_OPERAND (cond, op_num);
2797 else
2799 if (op_num == 2)
2800 op = gimple_assign_rhs2 (stmt);
2801 else
2802 op = gimple_assign_rhs3 (stmt);
2804 break;
2806 case CALL_EXPR:
2807 op = gimple_call_arg (stmt, op_num);
2808 break;
2810 case LSHIFT_EXPR:
2811 case RSHIFT_EXPR:
2812 case LROTATE_EXPR:
2813 case RROTATE_EXPR:
2814 op = gimple_op (stmt, op_num + 1);
2815 /* Unlike the other binary operators, shifts/rotates have
2816 the shift count as an int, instead of the same type as
2817 the lhs, so make sure the scalar is of the right type if
2818 we are dealing with vectors of
2819 long long/long/short/char. */
2820 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2821 op = fold_convert (TREE_TYPE (vector_type), op);
2822 break;
2824 default:
2825 op = gimple_op (stmt, op_num + 1);
2826 break;
2830 if (reduc_index != -1)
2832 loop = (gimple_bb (stmt))->loop_father;
2833 def_stmt = SSA_NAME_DEF_STMT (op);
2835 gcc_assert (loop);
2837 /* Get the def before the loop. In a reduction chain we have only
2838 one initial value. */
2839 if ((j != (number_of_copies - 1)
2840 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2841 && i != 0))
2842 && neutral_op)
2843 op = neutral_op;
2844 else
2845 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2846 loop_preheader_edge (loop));
2849 /* Create 'vect_ = {op0,op1,...,opn}'. */
2850 number_of_places_left_in_vector--;
2851 tree orig_op = op;
2852 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2854 if (CONSTANT_CLASS_P (op))
2856 op = fold_unary (VIEW_CONVERT_EXPR,
2857 TREE_TYPE (vector_type), op);
2858 gcc_assert (op && CONSTANT_CLASS_P (op));
2860 else
2862 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2863 gimple init_stmt;
2864 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2865 init_stmt
2866 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2867 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2868 op = new_temp;
2871 elts[number_of_places_left_in_vector] = op;
2872 if (!CONSTANT_CLASS_P (op))
2873 constant_p = false;
2874 if (TREE_CODE (orig_op) == SSA_NAME
2875 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2876 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2877 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2878 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2879 place_after_defs = true;
2881 if (number_of_places_left_in_vector == 0)
2883 number_of_places_left_in_vector = nunits;
2885 if (constant_p)
2886 vec_cst = build_vector (vector_type, elts);
2887 else
2889 vec<constructor_elt, va_gc> *v;
2890 unsigned k;
2891 vec_alloc (v, nunits);
2892 for (k = 0; k < nunits; ++k)
2893 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2894 vec_cst = build_constructor (vector_type, v);
2896 tree init;
2897 gimple_stmt_iterator gsi;
2898 if (place_after_defs)
2900 gsi = gsi_for_stmt
2901 (vect_find_last_scalar_stmt_in_slp (slp_node));
2902 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2904 else
2905 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2906 if (ctor_seq != NULL)
2908 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2909 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2910 GSI_SAME_STMT);
2911 ctor_seq = NULL;
2913 voprnds.quick_push (init);
2914 place_after_defs = false;
2919 /* Since the vectors were created in reverse order, invert that order
2920 here while copying them into VEC_OPRNDS. */
2921 vec_num = voprnds.length ();
2922 for (j = vec_num; j != 0; j--)
2924 vop = voprnds[j - 1];
2925 vec_oprnds->quick_push (vop);
2928 voprnds.release ();
2930 /* In case VF is greater than the unrolling factor needed for the SLP
2931 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
2932 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2933 to replicate the vectors. */
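/* When a neutral element exists (the reduction case) we pad with
{neutral_op, ..., neutral_op} vectors; otherwise the first VEC_NUM
vectors are pushed again. */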
2934 while (number_of_vectors > vec_oprnds->length ())
2936 tree neutral_vec = NULL;
2938 if (neutral_op)
2940 if (!neutral_vec)
2941 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2943 vec_oprnds->quick_push (neutral_vec);
2945 else
2947 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2948 vec_oprnds->quick_push (vop);
2954 /* Get vectorized definitions from SLP_NODE, which contains the
2955 corresponding vectorized def-stmts. */
2957 static void
2958 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2960 tree vec_oprnd;
2961 gimple vec_def_stmt;
2962 unsigned int i;
2964 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2966 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2968 gcc_assert (vec_def_stmt);
2969 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2970 vec_oprnds->quick_push (vec_oprnd);
2975 /* Get vectorized definitions for SLP_NODE.
2976 If the scalar definitions are loop invariants or constants, collect them and
2977 call vect_get_constant_vectors () to create vector stmts.
2978 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
2979 must be stored in the corresponding child of SLP_NODE, and we call
2980 vect_get_slp_vect_defs () to retrieve them. */
2982 void
2983 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2984 vec<vec<tree> > *vec_oprnds, int reduc_index)
2986 gimple first_stmt;
2987 int number_of_vects = 0, i;
2988 unsigned int child_index = 0;
2989 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2990 slp_tree child = NULL;
2991 vec<tree> vec_defs;
2992 tree oprnd;
2993 bool vectorized_defs;
2995 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2996 FOR_EACH_VEC_ELT (ops, i, oprnd)
2998 /* For each operand we check if it has vectorized definitions in a child
2999 node or whether we need to create them (for invariants and constants). We
3000 check if the LHS of the first stmt of the next child matches OPRND.
3001 If it does, we found the correct child. Otherwise, we call
3002 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
3003 to check this same child node for the next operand. */
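/* E.g., for scalar stmts a = b + c where the b's are defined by another
SLP'ed group and c is loop-invariant: operand b matches the LHS of the
child's first scalar stmt and its vectorized defs are reused, while c
matches no child and is built by vect_get_constant_vectors (). */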
3004 vectorized_defs = false;
3005 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3007 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3009 /* We have to check both pattern and original def, if available. */
3010 if (child)
3012 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3013 gimple related
3014 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3016 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
3017 || (related
3018 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3020 /* The number of vector defs is determined by the number of
3021 vector statements in the node from which we get those
3022 statements. */
3023 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3024 vectorized_defs = true;
3025 child_index++;
3028 else
3029 child_index++;
3032 if (!vectorized_defs)
3034 if (i == 0)
3036 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3037 /* Number of vector stmts was calculated according to LHS in
3038 vect_schedule_slp_instance (), fix it by replacing LHS with
3039 RHS, if necessary. See vect_get_smallest_scalar_type () for
3040 details. */
3041 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3042 &rhs_size_unit);
3043 if (rhs_size_unit != lhs_size_unit)
3045 number_of_vects *= rhs_size_unit;
3046 number_of_vects /= lhs_size_unit;
3051 /* Allocate memory for vectorized defs. */
3052 vec_defs = vNULL;
3053 vec_defs.create (number_of_vects);
3055 /* For reduction defs we call vect_get_constant_vectors (), since we are
3056 looking for initial loop invariant values. */
3057 if (vectorized_defs && reduc_index == -1)
3058 /* The defs are already vectorized. */
3059 vect_get_slp_vect_defs (child, &vec_defs);
3060 else
3061 /* Build vectors from scalar defs. */
3062 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3063 number_of_vects, reduc_index);
3065 vec_oprnds->quick_push (vec_defs);
3067 /* For reductions, we only need initial values. */
3068 if (reduc_index != -1)
3069 return;
3074 /* Create NCOPIES permutation statements using the vector mask MASK and
3075 two input vectors placed in
3076 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
3077 shifting by STRIDE elements of DR_CHAIN for every copy.
3078 (STRIDE is the number of vectorized stmts for NODE divided by the number of
3079 copies.)
3080 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE where
3081 the created stmts must be stored. */
3083 static inline void
3084 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3085 tree mask, int first_vec_indx, int second_vec_indx,
3086 gimple_stmt_iterator *gsi, slp_tree node,
3087 tree vectype, vec<tree> dr_chain,
3088 int ncopies, int vect_stmts_counter)
3090 tree perm_dest;
3091 gimple perm_stmt = NULL;
3092 stmt_vec_info next_stmt_info;
3093 int i, stride;
3094 tree first_vec, second_vec, data_ref;
3096 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
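/* For example, with 4 vector stmts for NODE and NCOPIES == 2, STRIDE == 2:
copy I stores its permute stmt in slot STRIDE * I + VECT_STMTS_COUNTER
and advances both DR_CHAIN indices by STRIDE. */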
3098 /* Initialize the vect stmts of NODE to properly insert the generated
3099 stmts later. */
3100 for (i = SLP_TREE_VEC_STMTS (node).length ();
3101 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3102 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3104 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3105 for (i = 0; i < ncopies; i++)
3107 first_vec = dr_chain[first_vec_indx];
3108 second_vec = dr_chain[second_vec_indx];
3110 /* Generate the permute statement. */
3111 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3112 first_vec, second_vec, mask);
3113 data_ref = make_ssa_name (perm_dest, perm_stmt);
3114 gimple_set_lhs (perm_stmt, data_ref);
3115 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3117 /* Store the vector statement in NODE. */
3118 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3120 first_vec_indx += stride;
3121 second_vec_indx += stride;
3124 /* Mark the scalar stmt as vectorized. */
3125 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3126 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3130 /* Given FIRST_MASK_ELEMENT - the mask element in element representation -
3131 return in CURRENT_MASK_ELEMENT its equivalent in target specific
3132 representation. Check that the mask is valid and return FALSE if not.
3133 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3134 the next vector, i.e., the current first vector is not needed. */
3136 static bool
3137 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3138 int mask_nunits, bool only_one_vec, int index,
3139 unsigned char *mask, int *current_mask_element,
3140 bool *need_next_vector, int *number_of_mask_fixes,
3141 bool *mask_fixed, bool *needs_first_vector)
3143 int i;
3145 /* Convert to target specific representation. */
3146 *current_mask_element = first_mask_element + m;
3147 /* Adjust the value in case it's a mask for the second and third vectors. */
3148 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3150 if (*current_mask_element < 0)
3152 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3155 "permutation requires past vector ");
3156 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3157 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3159 return false;
3162 if (*current_mask_element < mask_nunits)
3163 *needs_first_vector = true;
3165 /* We have only one input vector to permute but the mask accesses values in
3166 the next vector as well. */
3167 if (only_one_vec && *current_mask_element >= mask_nunits)
3169 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3172 "permutation requires at least two vectors ");
3173 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3174 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3177 return false;
3180 /* The mask requires the next vector. */
3181 while (*current_mask_element >= mask_nunits * 2)
3183 if (*needs_first_vector || *mask_fixed)
3185 /* We either need the first vector too or have already moved to the
3186 next vector. In both cases, this permutation needs three
3187 vectors. */
3188 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3191 "permutation requires at "
3192 "least three vectors ");
3193 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3194 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3197 return false;
3200 /* We move to the next vector, dropping the first one and working with
3201 the second and the third - we need to adjust the values of the mask
3202 accordingly. */
3203 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3205 for (i = 0; i < index; i++)
3206 mask[i] -= mask_nunits * *number_of_mask_fixes;
3208 (*number_of_mask_fixes)++;
3209 *mask_fixed = true;
3212 *need_next_vector = *mask_fixed;
3214 /* This was the last element of this mask. Start a new one. */
3215 if (index == mask_nunits - 1)
3217 *number_of_mask_fixes = 1;
3218 *mask_fixed = false;
3219 *needs_first_vector = false;
3222 return true;
3226 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3227 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3228 permute statements for the SLP node NODE of the SLP instance
3229 SLP_NODE_INSTANCE. */
3231 bool
3232 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3233 gimple_stmt_iterator *gsi, int vf,
3234 slp_instance slp_node_instance, bool analyze_only)
3236 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3237 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3238 tree mask_element_type = NULL_TREE, mask_type;
3239 int i, j, k, nunits, vec_index = 0, scalar_index;
3240 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3241 gimple next_scalar_stmt;
3242 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3243 int first_mask_element;
3244 int index, unroll_factor, current_mask_element, ncopies;
3245 unsigned char *mask;
3246 bool only_one_vec = false, need_next_vector = false;
3247 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3248 int number_of_mask_fixes = 1;
3249 bool mask_fixed = false;
3250 bool needs_first_vector = false;
3251 machine_mode mode;
3253 mode = TYPE_MODE (vectype);
3255 if (!can_vec_perm_p (mode, false, NULL))
3257 if (dump_enabled_p ())
3259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3260 "no vect permute for ");
3261 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3262 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3264 return false;
3267 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3268 same size as the vector element being permuted. */
3269 mask_element_type = lang_hooks.types.type_for_mode
3270 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3271 mask_type = get_vectype_for_scalar_type (mask_element_type);
3272 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3273 mask = XALLOCAVEC (unsigned char, nunits);
3274 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3276 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
3277 unrolling factor. */
3278 orig_vec_stmts_num = group_size *
3279 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3280 if (orig_vec_stmts_num == 1)
3281 only_one_vec = true;
3283 /* The number of copies is determined by the final vectorization factor
3284 relative to the SLP_NODE_INSTANCE unrolling factor. */
3285 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3287 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3288 return false;
3290 /* Generate permutation masks for every NODE. Number of masks for each NODE
3291 is equal to GROUP_SIZE.
3292 E.g., we have a group of three nodes with three loads from the same
3293 location in each node, and the vector size is 4. I.e., we have an
3294 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3295 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3296 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3299 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3300 The last mask is illegal since we assume two operands for a permute
3301 operation, and mask element values can't index outside those two vectors.
3302 Hence, the last mask must be converted into {2,5,5,5}.
3303 For the first two permutations we need the first and the second input
3304 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3305 we need the second and the third vectors: {b1,c1,a2,b2} and
3306 {c2,a3,b3,c3}. */
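/* As a smaller example: with SLP_TREE_LOAD_PERMUTATION {1, 0},
GROUP_SIZE == 2, NUNITS == 4 and unrolling factor 2, FIRST_MASK_ELEMENT
takes the values 1, 0, 3, 2, producing the single mask {1, 0, 3, 2}
which swaps adjacent pairs of elements. */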
3309 scalar_index = 0;
3310 index = 0;
3311 vect_stmts_counter = 0;
3312 vec_index = 0;
3313 first_vec_index = vec_index++;
3314 if (only_one_vec)
3315 second_vec_index = first_vec_index;
3316 else
3317 second_vec_index = vec_index++;
3319 for (j = 0; j < unroll_factor; j++)
3321 for (k = 0; k < group_size; k++)
3323 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3324 first_mask_element = i + j * group_size;
3325 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3326 nunits, only_one_vec, index,
3327 mask, &current_mask_element,
3328 &need_next_vector,
3329 &number_of_mask_fixes, &mask_fixed,
3330 &needs_first_vector))
3331 return false;
3332 gcc_assert (current_mask_element >= 0
3333 && current_mask_element < 2 * nunits);
3334 mask[index++] = current_mask_element;
3336 if (index == nunits)
3338 index = 0;
3339 if (!can_vec_perm_p (mode, false, mask))
3341 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3344 vect_location,
3345 "unsupported vect permute { ");
3346 for (i = 0; i < nunits; ++i)
3347 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3348 mask[i]);
3349 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3351 return false;
3354 if (!analyze_only)
3356 int l;
3357 tree mask_vec, *mask_elts;
3358 mask_elts = XALLOCAVEC (tree, nunits);
3359 for (l = 0; l < nunits; ++l)
3360 mask_elts[l] = build_int_cst (mask_element_type,
3361 mask[l]);
3362 mask_vec = build_vector (mask_type, mask_elts);
3364 if (need_next_vector)
3366 first_vec_index = second_vec_index;
3367 second_vec_index = vec_index;
3370 next_scalar_stmt
3371 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3373 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3374 mask_vec, first_vec_index, second_vec_index,
3375 gsi, node, vectype, dr_chain,
3376 ncopies, vect_stmts_counter++);
3383 return true;
3388 /* Vectorize SLP instance tree in postorder. */
3390 static bool
3391 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3392 unsigned int vectorization_factor)
3394 gimple stmt;
3395 bool grouped_store, is_store;
3396 gimple_stmt_iterator si;
3397 stmt_vec_info stmt_info;
3398 unsigned int vec_stmts_size, nunits, group_size;
3399 tree vectype;
3400 int i;
3401 slp_tree child;
3403 if (!node)
3404 return false;
3406 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3407 vect_schedule_slp_instance (child, instance, vectorization_factor);
3409 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3410 stmt_info = vinfo_for_stmt (stmt);
3412 /* VECTYPE is the type of the destination. */
3413 vectype = STMT_VINFO_VECTYPE (stmt_info);
3414 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3415 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3417 /* For each SLP instance calculate the number of vector stmts to be created
3418 for the scalar stmts in each node of the SLP tree. The number of vector
3419 elements in one vector iteration is the number of scalar elements in
3420 one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
3421 size.
3422 Unless this is an SLP reduction, in which case the number of vector
3423 stmts is equal to the number of vector stmts of the children. */
3424 if (GROUP_FIRST_ELEMENT (stmt_info)
3425 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3426 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3427 else
3428 vec_stmts_size = (vectorization_factor * group_size) / nunits;
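/* E.g., with VECTORIZATION_FACTOR == 4, GROUP_SIZE == 2 and NUNITS == 4,
VEC_STMTS_SIZE == 4 * 2 / 4 == 2. */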
3430 if (!SLP_TREE_VEC_STMTS (node).exists ())
3432 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3433 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3436 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_NOTE,vect_location,
3439 "------>vectorizing SLP node starting from: ");
3440 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3441 dump_printf (MSG_NOTE, "\n");
3444 /* Vectorized stmts go before the last scalar stmt, which is where
3445 all uses are ready. */
3446 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3448 /* Mark the first element of the reduction chain as a reduction to properly
3449 transform the node. In the analysis phase only the last element of the
3450 chain is marked as a reduction. */
3451 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3452 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3454 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3455 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3458 /* Handle two-operation SLP nodes by vectorizing the group with
3459 both operations and then performing a merge. */
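/* E.g., for the group {a0 + b0, a1 - b1} with NUNITS == 4: MASK == {0, 1},
V0 holds the all-PLUS vectors and V1 the all-MINUS ones, and the merge
mask becomes {0, 5, 2, 7}, selecting even elements from V0 and odd
elements from V1. */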
3460 if (SLP_TREE_TWO_OPERATORS (node))
3462 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3463 enum tree_code ocode;
3464 gimple ostmt;
3465 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3466 bool allsame = true;
3467 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3468 if (gimple_assign_rhs_code (ostmt) != code0)
3470 mask[i] = 1;
3471 allsame = false;
3472 ocode = gimple_assign_rhs_code (ostmt);
3474 else
3475 mask[i] = 0;
3476 if (!allsame)
3478 vec<gimple> v0;
3479 vec<gimple> v1;
3480 unsigned j;
3481 tree tmask = NULL_TREE;
3482 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3483 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3484 SLP_TREE_VEC_STMTS (node).truncate (0);
3485 gimple_assign_set_rhs_code (stmt, ocode);
3486 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3487 gimple_assign_set_rhs_code (stmt, code0);
3488 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3489 SLP_TREE_VEC_STMTS (node).truncate (0);
3490 tree meltype = build_nonstandard_integer_type
3491 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3492 tree mvectype = get_same_sized_vectype (meltype, vectype);
3493 unsigned k = 0, l;
3494 for (j = 0; j < v0.length (); ++j)
3496 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3497 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3499 if (k >= group_size)
3500 k = 0;
3501 melts[l] = build_int_cst
3502 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3504 tmask = build_vector (mvectype, melts);
3506 /* ??? Not all targets support a VEC_PERM_EXPR with a
3507 constant mask that would translate to a vec_merge RTX
3508 (with their vec_perm_const_ok). We can either not
3509 vectorize in that case or let veclower do its job.
3510 Unfortunately that isn't too great and at least for
3511 plus/minus we'd eventually like to match targets
3512 vector addsub instructions. */
3513 gimple vstmt;
3514 vstmt = gimple_build_assign (make_ssa_name (vectype),
3515 VEC_PERM_EXPR,
3516 gimple_assign_lhs (v0[j]),
3517 gimple_assign_lhs (v1[j]), tmask);
3518 vect_finish_stmt_generation (stmt, vstmt, &si);
3519 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3521 v0.release ();
3522 v1.release ();
3523 return false;
3526 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3527 return is_store;
3530 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3531 For loop vectorization this is done in vectorizable_call, but for SLP
3532 it needs to be deferred until the end of vect_schedule_slp, because multiple
3533 SLP instances may refer to the same scalar stmt. */
3535 static void
3536 vect_remove_slp_scalar_calls (slp_tree node)
3538 gimple stmt, new_stmt;
3539 gimple_stmt_iterator gsi;
3540 int i;
3541 slp_tree child;
3542 tree lhs;
3543 stmt_vec_info stmt_info;
3545 if (!node)
3546 return;
3548 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3549 vect_remove_slp_scalar_calls (child);
3551 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3553 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3554 continue;
3555 stmt_info = vinfo_for_stmt (stmt);
3556 if (stmt_info == NULL
3557 || is_pattern_stmt_p (stmt_info)
3558 || !PURE_SLP_STMT (stmt_info))
3559 continue;
3560 lhs = gimple_call_lhs (stmt);
3561 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3562 set_vinfo_for_stmt (new_stmt, stmt_info);
3563 set_vinfo_for_stmt (stmt, NULL);
3564 STMT_VINFO_STMT (stmt_info) = new_stmt;
3565 gsi = gsi_for_stmt (stmt);
3566 gsi_replace (&gsi, new_stmt, false);
3567 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3571 /* Generate vector code for all SLP instances in the loop/basic block. */
3573 bool
3574 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3576 vec<slp_instance> slp_instances;
3577 slp_instance instance;
3578 unsigned int i, vf;
3579 bool is_store = false;
3581 if (loop_vinfo)
3583 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3584 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3586 else
3588 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3589 vf = 1;
3592 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3594 /* Schedule the tree of INSTANCE. */
3595 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3596 instance, vf);
3597 if (dump_enabled_p ())
3598 dump_printf_loc (MSG_NOTE, vect_location,
3599 "vectorizing stmts using SLP.\n");
3602 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3604 slp_tree root = SLP_INSTANCE_TREE (instance);
3605 gimple store;
3606 unsigned int j;
3607 gimple_stmt_iterator gsi;
3609 /* Remove scalar call stmts. Do not do this for basic-block
3610 vectorization as not all uses may be vectorized.
3611 ??? Why should this be necessary? DCE should be able to
3612 remove the stmts itself.
3613 ??? For BB vectorization we can as well remove scalar
3614 stmts starting from the SLP tree root if they have no
3615 uses. */
3616 if (loop_vinfo)
3617 vect_remove_slp_scalar_calls (root);
3619 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3620 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3622 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3623 break;
3625 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3626 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3627 /* Free the attached stmt_vec_info and remove the stmt. */
3628 gsi = gsi_for_stmt (store);
3629 unlink_stmt_vdef (store);
3630 gsi_remove (&gsi, true);
3631 release_defs (store);
3632 free_stmt_vec_info (store);
3636 return is_store;
3640 /* Vectorize the basic block. */
3642 void
3643 vect_slp_transform_bb (basic_block bb)
3645 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3646 gimple_stmt_iterator si;
3648 gcc_assert (bb_vinfo);
3650 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3653 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3655 gimple stmt = gsi_stmt (si);
3656 stmt_vec_info stmt_info;
3658 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 "------>SLPing statement: ");
3662 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3663 dump_printf (MSG_NOTE, "\n");
3666 stmt_info = vinfo_for_stmt (stmt);
3667 gcc_assert (stmt_info);
3669 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3670 if (STMT_SLP_TYPE (stmt_info))
3672 vect_schedule_slp (NULL, bb_vinfo);
3673 break;
3677 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_NOTE, vect_location,
3679 "BASIC BLOCK VECTORIZED\n");
3681 destroy_bb_vec_info (bb_vinfo);