c++: retval dtor on rethrow [PR112301]
[official-gcc.git] / gcc / tree-vect-slp.cc
blob43d742e3c92e766a14370cfd7d816a0d7f08a332
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
194 /* Return a location suitable for dumpings related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-exsting vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg1_map[] = { 1, 1 };
509 static const int arg2_map[] = { 1, 2 };
510 static const int arg1_arg4_map[] = { 2, 1, 4 };
511 static const int arg3_arg2_map[] = { 2, 3, 2 };
512 static const int op1_op0_map[] = { 2, 1, 0 };
513 static const int off_map[] = { 1, -3 };
514 static const int off_op0_map[] = { 2, -3, 0 };
515 static const int off_arg2_map[] = { 2, -3, 2 };
516 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
517 static const int mask_call_maps[6][7] = {
518 { 1, 1, },
519 { 2, 1, 2, },
520 { 3, 1, 2, 3, },
521 { 4, 1, 2, 3, 4, },
522 { 5, 1, 2, 3, 4, 5, },
523 { 6, 1, 2, 3, 4, 5, 6 },
526 /* For most SLP statements, there is a one-to-one mapping between
527 gimple arguments and child nodes. If that is not true for STMT,
528 return an array that contains:
530 - the number of child nodes, followed by
531 - for each child node, the index of the argument associated with that node.
532 The special index -1 is the first operand of an embedded comparison and
533 the special index -2 is the second operand of an embedded comparison.
534 The special indes -3 is the offset of a gather as analyzed by
535 vect_check_gather_scatter.
537 SWAP is as for vect_get_and_check_slp_defs. */
539 static const int *
540 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
541 unsigned char swap = 0)
543 if (auto assign = dyn_cast<const gassign *> (stmt))
545 if (gimple_assign_rhs_code (assign) == COND_EXPR
546 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
547 return cond_expr_maps[swap];
548 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
549 && swap)
550 return op1_op0_map;
551 if (gather_scatter_p)
552 return gimple_vdef (stmt) ? off_op0_map : off_map;
554 gcc_assert (!swap);
555 if (auto call = dyn_cast<const gcall *> (stmt))
557 if (gimple_call_internal_p (call))
558 switch (gimple_call_internal_fn (call))
560 case IFN_MASK_LOAD:
561 return gather_scatter_p ? off_arg2_map : arg2_map;
563 case IFN_GATHER_LOAD:
564 return arg1_map;
566 case IFN_MASK_GATHER_LOAD:
567 case IFN_MASK_LEN_GATHER_LOAD:
568 return arg1_arg4_map;
570 case IFN_MASK_STORE:
571 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
573 case IFN_MASK_CALL:
575 unsigned nargs = gimple_call_num_args (call);
576 if (nargs >= 2 && nargs <= 7)
577 return mask_call_maps[nargs-2];
578 else
579 return nullptr;
582 default:
583 break;
586 return nullptr;
589 /* Return the SLP node child index for operand OP of STMT. */
592 vect_slp_child_index_for_operand (const gimple *stmt, int op)
594 const int *opmap = vect_get_operand_map (stmt);
595 if (!opmap)
596 return op;
597 for (int i = 1; i < 1 + opmap[0]; ++i)
598 if (opmap[i] == op)
599 return i - 1;
600 gcc_unreachable ();
603 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
604 they are of a valid type and that they match the defs of the first stmt of
605 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
606 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
607 indicates swap is required for cond_expr stmts. Specifically, SWAP
608 is 1 if STMT is cond and operands of comparison need to be swapped;
609 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
611 If there was a fatal error return -1; if the error could be corrected by
612 swapping operands of father node of this one, return 1; if everything is
613 ok return 0. */
614 static int
615 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
616 bool *skip_args,
617 vec<stmt_vec_info> stmts, unsigned stmt_num,
618 vec<slp_oprnd_info> *oprnds_info)
620 stmt_vec_info stmt_info = stmts[stmt_num];
621 tree oprnd;
622 unsigned int i, number_of_oprnds;
623 enum vect_def_type dt = vect_uninitialized_def;
624 slp_oprnd_info oprnd_info;
625 gather_scatter_info gs_info;
626 unsigned int gs_op = -1u;
627 unsigned int commutative_op = -1U;
628 bool first = stmt_num == 0;
630 if (!is_a<gcall *> (stmt_info->stmt)
631 && !is_a<gassign *> (stmt_info->stmt)
632 && !is_a<gphi *> (stmt_info->stmt))
633 return -1;
635 number_of_oprnds = gimple_num_args (stmt_info->stmt);
636 const int *map
637 = vect_get_operand_map (stmt_info->stmt,
638 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
639 if (map)
640 number_of_oprnds = *map++;
641 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
643 if (gimple_call_internal_p (stmt))
645 internal_fn ifn = gimple_call_internal_fn (stmt);
646 commutative_op = first_commutative_argument (ifn);
649 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
651 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
652 commutative_op = 0;
655 bool swapped = (swap != 0);
656 bool backedge = false;
657 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
658 for (i = 0; i < number_of_oprnds; i++)
660 oprnd_info = (*oprnds_info)[i];
661 int opno = map ? map[i] : int (i);
662 if (opno == -3)
664 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
665 if (!is_a <loop_vec_info> (vinfo)
666 || !vect_check_gather_scatter (stmt_info,
667 as_a <loop_vec_info> (vinfo),
668 first ? &oprnd_info->first_gs_info
669 : &gs_info))
670 return -1;
672 if (first)
674 oprnd_info->first_gs_p = true;
675 oprnd = oprnd_info->first_gs_info.offset;
677 else
679 gs_op = i;
680 oprnd = gs_info.offset;
683 else if (opno < 0)
684 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
685 else
687 oprnd = gimple_arg (stmt_info->stmt, opno);
688 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
690 edge e = gimple_phi_arg_edge (stmt, opno);
691 backedge = (is_a <bb_vec_info> (vinfo)
692 ? e->flags & EDGE_DFS_BACK
693 : dominated_by_p (CDI_DOMINATORS, e->src,
694 gimple_bb (stmt_info->stmt)));
697 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
698 oprnd = TREE_OPERAND (oprnd, 0);
700 stmt_vec_info def_stmt_info;
701 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
703 if (dump_enabled_p ())
704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
705 "Build SLP failed: can't analyze def for %T\n",
706 oprnd);
708 return -1;
711 if (skip_args[i])
713 oprnd_info->def_stmts.quick_push (NULL);
714 oprnd_info->ops.quick_push (NULL_TREE);
715 oprnd_info->first_dt = vect_uninitialized_def;
716 continue;
719 oprnd_info->def_stmts.quick_push (def_stmt_info);
720 oprnd_info->ops.quick_push (oprnd);
722 if (def_stmt_info
723 && is_pattern_stmt_p (def_stmt_info))
725 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
726 != def_stmt_info)
727 oprnd_info->any_pattern = true;
728 else
729 /* If we promote this to external use the original stmt def. */
730 oprnd_info->ops.last ()
731 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
734 /* If there's a extern def on a backedge make sure we can
735 code-generate at the region start.
736 ??? This is another case that could be fixed by adjusting
737 how we split the function but at the moment we'd have conflicting
738 goals there. */
739 if (backedge
740 && dts[i] == vect_external_def
741 && is_a <bb_vec_info> (vinfo)
742 && TREE_CODE (oprnd) == SSA_NAME
743 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
744 && !dominated_by_p (CDI_DOMINATORS,
745 as_a <bb_vec_info> (vinfo)->bbs[0],
746 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "Build SLP failed: extern def %T only defined "
751 "on backedge\n", oprnd);
752 return -1;
755 if (first)
757 tree type = TREE_TYPE (oprnd);
758 dt = dts[i];
759 if ((dt == vect_constant_def
760 || dt == vect_external_def)
761 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
762 && (TREE_CODE (type) == BOOLEAN_TYPE
763 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
764 type)))
766 if (dump_enabled_p ())
767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
768 "Build SLP failed: invalid type of def "
769 "for variable-length SLP %T\n", oprnd);
770 return -1;
773 /* For the swapping logic below force vect_reduction_def
774 for the reduction op in a SLP reduction group. */
775 if (!STMT_VINFO_DATA_REF (stmt_info)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
777 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
778 && def_stmt_info)
779 dts[i] = dt = vect_reduction_def;
781 /* Check the types of the definition. */
782 switch (dt)
784 case vect_external_def:
785 case vect_constant_def:
786 case vect_internal_def:
787 case vect_reduction_def:
788 case vect_induction_def:
789 case vect_nested_cycle:
790 case vect_first_order_recurrence:
791 break;
793 default:
794 /* FORNOW: Not supported. */
795 if (dump_enabled_p ())
796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
797 "Build SLP failed: illegal type of def %T\n",
798 oprnd);
799 return -1;
802 oprnd_info->first_dt = dt;
803 oprnd_info->first_op_type = type;
806 if (first)
807 return 0;
809 /* Now match the operand definition types to that of the first stmt. */
810 for (i = 0; i < number_of_oprnds;)
812 if (skip_args[i])
814 ++i;
815 continue;
818 oprnd_info = (*oprnds_info)[i];
819 dt = dts[i];
820 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
821 oprnd = oprnd_info->ops[stmt_num];
822 tree type = TREE_TYPE (oprnd);
824 if (!types_compatible_p (oprnd_info->first_op_type, type))
826 if (dump_enabled_p ())
827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
828 "Build SLP failed: different operand types\n");
829 return 1;
832 if ((gs_op == i) != oprnd_info->first_gs_p)
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
836 "Build SLP failed: mixed gather and non-gather\n");
837 return 1;
839 else if (gs_op == i)
841 if (!operand_equal_p (oprnd_info->first_gs_info.base,
842 gs_info.base))
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
846 "Build SLP failed: different gather base\n");
847 return 1;
849 if (oprnd_info->first_gs_info.scale != gs_info.scale)
851 if (dump_enabled_p ())
852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
853 "Build SLP failed: different gather scale\n");
854 return 1;
858 /* Not first stmt of the group, check that the def-stmt/s match
859 the def-stmt/s of the first stmt. Allow different definition
860 types for reduction chains: the first stmt must be a
861 vect_reduction_def (a phi node), and the rest
862 end in the reduction chain. */
863 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
864 && !(oprnd_info->first_dt == vect_reduction_def
865 && !STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && def_stmt_info
868 && !STMT_VINFO_DATA_REF (def_stmt_info)
869 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
871 || (!STMT_VINFO_DATA_REF (stmt_info)
872 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
873 && ((!def_stmt_info
874 || STMT_VINFO_DATA_REF (def_stmt_info)
875 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
876 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
877 != (oprnd_info->first_dt != vect_reduction_def))))
879 /* Try swapping operands if we got a mismatch. For BB
880 vectorization only in case it will clearly improve things. */
881 if (i == commutative_op && !swapped
882 && (!is_a <bb_vec_info> (vinfo)
883 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
884 dts[i+1])
885 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
886 || vect_def_types_match
887 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_NOTE, vect_location,
891 "trying swapped operands\n");
892 std::swap (dts[i], dts[i+1]);
893 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
894 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
895 std::swap ((*oprnds_info)[i]->ops[stmt_num],
896 (*oprnds_info)[i+1]->ops[stmt_num]);
897 swapped = true;
898 continue;
901 if (is_a <bb_vec_info> (vinfo)
902 && !oprnd_info->any_pattern)
904 /* Now for commutative ops we should see whether we can
905 make the other operand matching. */
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "treating operand as external\n");
909 oprnd_info->first_dt = dt = vect_external_def;
911 else
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: different types\n");
916 return 1;
920 /* Make sure to demote the overall operand to external. */
921 if (dt == vect_external_def)
922 oprnd_info->first_dt = vect_external_def;
923 /* For a SLP reduction chain we want to duplicate the reduction to
924 each of the chain members. That gets us a sane SLP graph (still
925 the stmts are not 100% correct wrt the initial values). */
926 else if ((dt == vect_internal_def
927 || dt == vect_reduction_def)
928 && oprnd_info->first_dt == vect_reduction_def
929 && !STMT_VINFO_DATA_REF (stmt_info)
930 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
931 && !STMT_VINFO_DATA_REF (def_stmt_info)
932 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
933 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
935 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
936 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
939 ++i;
942 /* Swap operands. */
943 if (swapped)
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "swapped operands to match def types in %G",
948 stmt_info->stmt);
951 return 0;
954 /* Return true if call statements CALL1 and CALL2 are similar enough
955 to be combined into the same SLP group. */
957 bool
958 compatible_calls_p (gcall *call1, gcall *call2)
960 unsigned int nargs = gimple_call_num_args (call1);
961 if (nargs != gimple_call_num_args (call2))
962 return false;
964 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
965 return false;
967 if (gimple_call_internal_p (call1))
969 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
970 TREE_TYPE (gimple_call_lhs (call2))))
971 return false;
972 for (unsigned int i = 0; i < nargs; ++i)
973 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
974 TREE_TYPE (gimple_call_arg (call2, i))))
975 return false;
977 else
979 if (!operand_equal_p (gimple_call_fn (call1),
980 gimple_call_fn (call2), 0))
981 return false;
983 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
984 return false;
987 /* Check that any unvectorized arguments are equal. */
988 if (const int *map = vect_get_operand_map (call1))
990 unsigned int nkept = *map++;
991 unsigned int mapi = 0;
992 for (unsigned int i = 0; i < nargs; ++i)
993 if (mapi < nkept && map[mapi] == int (i))
994 mapi += 1;
995 else if (!operand_equal_p (gimple_call_arg (call1, i),
996 gimple_call_arg (call2, i)))
997 return false;
1000 return true;
1003 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1004 caller's attempt to find the vector type in STMT_INFO with the narrowest
1005 element type. Return true if VECTYPE is nonnull and if it is valid
1006 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1007 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1008 vect_build_slp_tree. */
1010 static bool
1011 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1012 unsigned int group_size,
1013 tree vectype, poly_uint64 *max_nunits)
1015 if (!vectype)
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported data-type in %G\n",
1020 stmt_info->stmt);
1021 /* Fatal mismatch. */
1022 return false;
1025 /* If populating the vector type requires unrolling then fail
1026 before adjusting *max_nunits for basic-block vectorization. */
1027 if (is_a <bb_vec_info> (vinfo)
1028 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "Build SLP failed: unrolling required "
1033 "in basic block SLP\n");
1034 /* Fatal mismatch. */
1035 return false;
1038 /* In case of multiple types we need to detect the smallest type. */
1039 vect_update_max_nunits (max_nunits, vectype);
1040 return true;
1043 /* Verify if the scalar stmts STMTS are isomorphic, require data
1044 permutation or are of unsupported types of operation. Return
1045 true if they are, otherwise return false and indicate in *MATCHES
1046 which stmts are not isomorphic to the first one. If MATCHES[0]
1047 is false then this indicates the comparison could not be
1048 carried out or the stmts will never be vectorized by SLP.
1050 Note COND_EXPR is possibly isomorphic to another one after swapping its
1051 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1052 the first stmt by swapping the two operands of comparison; set SWAP[i]
1053 to 2 if stmt I is isormorphic to the first stmt by inverting the code
1054 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1055 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1057 static bool
1058 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1059 vec<stmt_vec_info> stmts, unsigned int group_size,
1060 poly_uint64 *max_nunits, bool *matches,
1061 bool *two_operators, tree *node_vectype)
1063 unsigned int i;
1064 stmt_vec_info first_stmt_info = stmts[0];
1065 code_helper first_stmt_code = ERROR_MARK;
1066 code_helper alt_stmt_code = ERROR_MARK;
1067 code_helper rhs_code = ERROR_MARK;
1068 code_helper first_cond_code = ERROR_MARK;
1069 tree lhs;
1070 bool need_same_oprnds = false;
1071 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1072 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1073 bool first_stmt_ldst_p = false, ldst_p = false;
1074 bool first_stmt_phi_p = false, phi_p = false;
1075 bool maybe_soft_fail = false;
1076 tree soft_fail_nunits_vectype = NULL_TREE;
1078 /* For every stmt in NODE find its def stmt/s. */
1079 stmt_vec_info stmt_info;
1080 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1082 gimple *stmt = stmt_info->stmt;
1083 swap[i] = 0;
1084 matches[i] = false;
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1089 /* Fail to vectorize statements marked as unvectorizable, throw
1090 or are volatile. */
1091 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1092 || stmt_can_throw_internal (cfun, stmt)
1093 || gimple_has_volatile_ops (stmt))
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "Build SLP failed: unvectorizable statement %G",
1098 stmt);
1099 /* ??? For BB vectorization we want to commutate operands in a way
1100 to shuffle all unvectorizable defs into one operand and have
1101 the other still vectorized. The following doesn't reliably
1102 work for this though but it's the easiest we can do here. */
1103 if (is_a <bb_vec_info> (vinfo) && i != 0)
1104 continue;
1105 /* Fatal mismatch. */
1106 matches[0] = false;
1107 return false;
1110 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1111 lhs = gimple_get_lhs (stmt);
1112 if (lhs == NULL_TREE
1113 && (!call_stmt
1114 || !gimple_call_internal_p (stmt)
1115 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Build SLP failed: not GIMPLE_ASSIGN nor "
1120 "GIMPLE_CALL %G", stmt);
1121 if (is_a <bb_vec_info> (vinfo) && i != 0)
1122 continue;
1123 /* Fatal mismatch. */
1124 matches[0] = false;
1125 return false;
1128 tree nunits_vectype;
1129 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1130 &nunits_vectype, group_size))
1132 if (is_a <bb_vec_info> (vinfo) && i != 0)
1133 continue;
1134 /* Fatal mismatch. */
1135 matches[0] = false;
1136 return false;
1138 /* Record nunits required but continue analysis, producing matches[]
1139 as if nunits was not an issue. This allows splitting of groups
1140 to happen. */
1141 if (nunits_vectype
1142 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1143 nunits_vectype, max_nunits))
1145 gcc_assert (is_a <bb_vec_info> (vinfo));
1146 maybe_soft_fail = true;
1147 soft_fail_nunits_vectype = nunits_vectype;
1150 gcc_assert (vectype);
1152 if (call_stmt)
1154 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1155 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1156 rhs_code = cfn;
1157 else
1158 rhs_code = CALL_EXPR;
1160 if (cfn == CFN_MASK_LOAD
1161 || cfn == CFN_GATHER_LOAD
1162 || cfn == CFN_MASK_GATHER_LOAD
1163 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1164 ldst_p = true;
1165 else if (cfn == CFN_MASK_STORE)
1167 ldst_p = true;
1168 rhs_code = CFN_MASK_STORE;
1170 else if ((cfn != CFN_LAST
1171 && cfn != CFN_MASK_CALL
1172 && internal_fn_p (cfn)
1173 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1174 || gimple_call_tail_p (call_stmt)
1175 || gimple_call_noreturn_p (call_stmt)
1176 || gimple_call_chain (call_stmt))
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: unsupported call type %G",
1181 (gimple *) call_stmt);
1182 if (is_a <bb_vec_info> (vinfo) && i != 0)
1183 continue;
1184 /* Fatal mismatch. */
1185 matches[0] = false;
1186 return false;
1189 else if (gimple_code (stmt) == GIMPLE_PHI)
1191 rhs_code = ERROR_MARK;
1192 phi_p = true;
1194 else
1196 rhs_code = gimple_assign_rhs_code (stmt);
1197 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1200 /* Check the operation. */
1201 if (i == 0)
1203 *node_vectype = vectype;
1204 first_stmt_code = rhs_code;
1205 first_stmt_ldst_p = ldst_p;
1206 first_stmt_phi_p = phi_p;
1208 /* Shift arguments should be equal in all the packed stmts for a
1209 vector shift with scalar shift operand. */
1210 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1211 || rhs_code == LROTATE_EXPR
1212 || rhs_code == RROTATE_EXPR)
1214 /* First see if we have a vector/vector shift. */
1215 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1217 /* No vector/vector shift, try for a vector/scalar shift. */
1218 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: "
1223 "op not supported by target.\n");
1224 if (is_a <bb_vec_info> (vinfo) && i != 0)
1225 continue;
1226 /* Fatal mismatch. */
1227 matches[0] = false;
1228 return false;
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1234 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1236 need_same_oprnds = true;
1237 first_op1 = gimple_assign_rhs2 (stmt);
1239 else if (!ldst_p
1240 && rhs_code == BIT_FIELD_REF)
1242 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1243 if (!is_a <bb_vec_info> (vinfo)
1244 || TREE_CODE (vec) != SSA_NAME
1245 /* When the element types are not compatible we pun the
1246 source to the target vectype which requires equal size. */
1247 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1248 || !types_compatible_p (TREE_TYPE (vectype),
1249 TREE_TYPE (TREE_TYPE (vec))))
1250 && !operand_equal_p (TYPE_SIZE (vectype),
1251 TYPE_SIZE (TREE_TYPE (vec)))))
1253 if (dump_enabled_p ())
1254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255 "Build SLP failed: "
1256 "BIT_FIELD_REF not supported\n");
1257 /* Fatal mismatch. */
1258 matches[0] = false;
1259 return false;
1262 else if (rhs_code == CFN_DIV_POW2)
1264 need_same_oprnds = true;
1265 first_op1 = gimple_call_arg (call_stmt, 1);
1268 else
1270 if (first_stmt_code != rhs_code
1271 && alt_stmt_code == ERROR_MARK)
1272 alt_stmt_code = rhs_code;
1273 if ((first_stmt_code != rhs_code
1274 && (first_stmt_code != IMAGPART_EXPR
1275 || rhs_code != REALPART_EXPR)
1276 && (first_stmt_code != REALPART_EXPR
1277 || rhs_code != IMAGPART_EXPR)
1278 /* Handle mismatches in plus/minus by computing both
1279 and merging the results. */
1280 && !((first_stmt_code == PLUS_EXPR
1281 || first_stmt_code == MINUS_EXPR)
1282 && (alt_stmt_code == PLUS_EXPR
1283 || alt_stmt_code == MINUS_EXPR)
1284 && rhs_code == alt_stmt_code)
1285 && !(first_stmt_code.is_tree_code ()
1286 && rhs_code.is_tree_code ()
1287 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1288 == tcc_comparison)
1289 && (swap_tree_comparison (tree_code (first_stmt_code))
1290 == tree_code (rhs_code)))
1291 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1292 && (first_stmt_code == ARRAY_REF
1293 || first_stmt_code == BIT_FIELD_REF
1294 || first_stmt_code == INDIRECT_REF
1295 || first_stmt_code == COMPONENT_REF
1296 || first_stmt_code == MEM_REF)
1297 && (rhs_code == ARRAY_REF
1298 || rhs_code == BIT_FIELD_REF
1299 || rhs_code == INDIRECT_REF
1300 || rhs_code == COMPONENT_REF
1301 || rhs_code == MEM_REF)))
1302 || (ldst_p
1303 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1304 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1305 || (ldst_p
1306 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1307 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1308 || first_stmt_ldst_p != ldst_p
1309 || first_stmt_phi_p != phi_p)
1311 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "Build SLP failed: different operation "
1315 "in stmt %G", stmt);
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "original stmt %G", first_stmt_info->stmt);
1319 /* Mismatch. */
1320 continue;
1323 if (!ldst_p
1324 && first_stmt_code == BIT_FIELD_REF
1325 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1326 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "Build SLP failed: different BIT_FIELD_REF "
1331 "arguments in %G", stmt);
1332 /* Mismatch. */
1333 continue;
1336 if (call_stmt
1337 && first_stmt_code != CFN_MASK_LOAD
1338 && first_stmt_code != CFN_MASK_STORE)
1340 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1341 call_stmt))
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: different calls in %G",
1346 stmt);
1347 /* Mismatch. */
1348 continue;
1352 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1353 && (gimple_bb (first_stmt_info->stmt)
1354 != gimple_bb (stmt_info->stmt)))
1356 if (dump_enabled_p ())
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "Build SLP failed: different BB for PHI "
1359 "or possibly trapping operation in %G", stmt);
1360 /* Mismatch. */
1361 continue;
1364 if (need_same_oprnds)
1366 tree other_op1 = gimple_arg (stmt, 1);
1367 if (!operand_equal_p (first_op1, other_op1, 0))
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "Build SLP failed: different shift "
1372 "arguments in %G", stmt);
1373 /* Mismatch. */
1374 continue;
1378 if (!types_compatible_p (vectype, *node_vectype))
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1382 "Build SLP failed: different vector type "
1383 "in %G", stmt);
1384 /* Mismatch. */
1385 continue;
1389 /* Grouped store or load. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1392 gcc_assert (ldst_p);
1393 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1395 /* Store. */
1396 gcc_assert (rhs_code == CFN_MASK_STORE
1397 || REFERENCE_CLASS_P (lhs)
1398 || DECL_P (lhs));
1400 else
1402 /* Load. */
1403 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1404 if (prev_first_load)
1406 /* Check that there are no loads from different interleaving
1407 chains in the same node. */
1408 if (prev_first_load != first_load)
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1412 vect_location,
1413 "Build SLP failed: different "
1414 "interleaving chains in one node %G",
1415 stmt);
1416 /* Mismatch. */
1417 continue;
1420 else
1421 prev_first_load = first_load;
1424 /* Non-grouped store or load. */
1425 else if (ldst_p)
1427 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1428 && rhs_code != CFN_GATHER_LOAD
1429 && rhs_code != CFN_MASK_GATHER_LOAD
1430 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1431 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1432 /* Not grouped loads are handled as externals for BB
1433 vectorization. For loop vectorization we can handle
1434 splats the same we handle single element interleaving. */
1435 && (is_a <bb_vec_info> (vinfo)
1436 || stmt_info != first_stmt_info))
1438 /* Not grouped load. */
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "Build SLP failed: not grouped load %G", stmt);
1443 if (i != 0)
1444 continue;
1445 /* Fatal mismatch. */
1446 matches[0] = false;
1447 return false;
1450 /* Not memory operation. */
1451 else
1453 if (!phi_p
1454 && rhs_code.is_tree_code ()
1455 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1459 && rhs_code != VIEW_CONVERT_EXPR
1460 && rhs_code != CALL_EXPR
1461 && rhs_code != BIT_FIELD_REF)
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "Build SLP failed: operation unsupported %G",
1466 stmt);
1467 if (is_a <bb_vec_info> (vinfo) && i != 0)
1468 continue;
1469 /* Fatal mismatch. */
1470 matches[0] = false;
1471 return false;
1474 if (rhs_code == COND_EXPR)
1476 tree cond_expr = gimple_assign_rhs1 (stmt);
1477 enum tree_code cond_code = TREE_CODE (cond_expr);
1478 enum tree_code swap_code = ERROR_MARK;
1479 enum tree_code invert_code = ERROR_MARK;
1481 if (i == 0)
1482 first_cond_code = TREE_CODE (cond_expr);
1483 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1485 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1486 swap_code = swap_tree_comparison (cond_code);
1487 invert_code = invert_tree_comparison (cond_code, honor_nans);
1490 if (first_cond_code == cond_code)
1492 /* Isomorphic can be achieved by swapping. */
1493 else if (first_cond_code == swap_code)
1494 swap[i] = 1;
1495 /* Isomorphic can be achieved by inverting. */
1496 else if (first_cond_code == invert_code)
1497 swap[i] = 2;
1498 else
1500 if (dump_enabled_p ())
1501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1502 "Build SLP failed: different"
1503 " operation %G", stmt);
1504 /* Mismatch. */
1505 continue;
1509 if (rhs_code.is_tree_code ()
1510 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1511 && (swap_tree_comparison ((tree_code)first_stmt_code)
1512 == (tree_code)rhs_code))
1513 swap[i] = 1;
1516 matches[i] = true;
1519 for (i = 0; i < group_size; ++i)
1520 if (!matches[i])
1521 return false;
1523 /* If we allowed a two-operation SLP node verify the target can cope
1524 with the permute we are going to use. */
1525 if (alt_stmt_code != ERROR_MARK
1526 && (!alt_stmt_code.is_tree_code ()
1527 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1528 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1530 *two_operators = true;
1533 if (maybe_soft_fail)
1535 unsigned HOST_WIDE_INT const_nunits;
1536 if (!TYPE_VECTOR_SUBPARTS
1537 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1538 || const_nunits > group_size)
1539 matches[0] = false;
1540 else
1542 /* With constant vector elements simulate a mismatch at the
1543 point we need to split. */
1544 unsigned tail = group_size & (const_nunits - 1);
1545 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1547 return false;
1550 return true;
1553 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1554 Note we never remove apart from at destruction time so we do not
1555 need a special value for deleted that differs from empty. */
1556 struct bst_traits
1558 typedef vec <stmt_vec_info> value_type;
1559 typedef vec <stmt_vec_info> compare_type;
1560 static inline hashval_t hash (value_type);
1561 static inline bool equal (value_type existing, value_type candidate);
1562 static inline bool is_empty (value_type x) { return !x.exists (); }
1563 static inline bool is_deleted (value_type x) { return !x.exists (); }
1564 static const bool empty_zero_p = true;
1565 static inline void mark_empty (value_type &x) { x.release (); }
1566 static inline void mark_deleted (value_type &x) { x.release (); }
1567 static inline void remove (value_type &x) { x.release (); }
1569 inline hashval_t
1570 bst_traits::hash (value_type x)
1572 inchash::hash h;
1573 for (unsigned i = 0; i < x.length (); ++i)
1574 h.add_int (gimple_uid (x[i]->stmt));
1575 return h.end ();
1577 inline bool
1578 bst_traits::equal (value_type existing, value_type candidate)
1580 if (existing.length () != candidate.length ())
1581 return false;
1582 for (unsigned i = 0; i < existing.length (); ++i)
1583 if (existing[i] != candidate[i])
1584 return false;
1585 return true;
1588 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1589 but then vec::insert does memmove and that's not compatible with
1590 std::pair. */
1591 struct chain_op_t
1593 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1594 : code (code_), dt (dt_), op (op_) {}
1595 tree_code code;
1596 vect_def_type dt;
1597 tree op;
1600 /* Comparator for sorting associatable chains. */
1602 static int
1603 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1605 auto *op1 = (const chain_op_t *) op1_;
1606 auto *op2 = (const chain_op_t *) op2_;
1607 if (op1->dt != op2->dt)
1608 return (int)op1->dt - (int)op2->dt;
1609 return (int)op1->code - (int)op2->code;
1612 /* Linearize the associatable expression chain at START with the
1613 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1614 filling CHAIN with the result and using WORKLIST as intermediate storage.
1615 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1616 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1617 stmts, starting with START. */
1619 static void
1620 vect_slp_linearize_chain (vec_info *vinfo,
1621 vec<std::pair<tree_code, gimple *> > &worklist,
1622 vec<chain_op_t> &chain,
1623 enum tree_code code, gimple *start,
1624 gimple *&code_stmt, gimple *&alt_code_stmt,
1625 vec<gimple *> *chain_stmts)
1627 /* For each lane linearize the addition/subtraction (or other
1628 uniform associatable operation) expression tree. */
1629 worklist.safe_push (std::make_pair (code, start));
1630 while (!worklist.is_empty ())
1632 auto entry = worklist.pop ();
1633 gassign *stmt = as_a <gassign *> (entry.second);
1634 enum tree_code in_code = entry.first;
1635 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1636 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1637 if (!code_stmt
1638 && gimple_assign_rhs_code (stmt) == code)
1639 code_stmt = stmt;
1640 else if (!alt_code_stmt
1641 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1642 alt_code_stmt = stmt;
1643 if (chain_stmts)
1644 chain_stmts->safe_push (stmt);
1645 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1647 tree op = gimple_op (stmt, opnum);
1648 vect_def_type dt;
1649 stmt_vec_info def_stmt_info;
1650 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1651 gcc_assert (res);
1652 if (dt == vect_internal_def
1653 && is_pattern_stmt_p (def_stmt_info))
1654 op = gimple_get_lhs (def_stmt_info->stmt);
1655 gimple *use_stmt;
1656 use_operand_p use_p;
1657 if (dt == vect_internal_def
1658 && single_imm_use (op, &use_p, &use_stmt)
1659 && is_gimple_assign (def_stmt_info->stmt)
1660 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1661 || (code == PLUS_EXPR
1662 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1663 == MINUS_EXPR))))
1665 tree_code op_def_code = this_code;
1666 if (op_def_code == MINUS_EXPR && opnum == 1)
1667 op_def_code = PLUS_EXPR;
1668 if (in_code == MINUS_EXPR)
1669 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1670 worklist.safe_push (std::make_pair (op_def_code,
1671 def_stmt_info->stmt));
1673 else
1675 tree_code op_def_code = this_code;
1676 if (op_def_code == MINUS_EXPR && opnum == 1)
1677 op_def_code = PLUS_EXPR;
1678 if (in_code == MINUS_EXPR)
1679 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1680 chain.safe_push (chain_op_t (op_def_code, dt, op));
1686 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1687 simple_hashmap_traits <bst_traits, slp_tree> >
1688 scalar_stmts_to_slp_tree_map_t;
1690 static slp_tree
1691 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1692 vec<stmt_vec_info> stmts, unsigned int group_size,
1693 poly_uint64 *max_nunits,
1694 bool *matches, unsigned *limit, unsigned *tree_size,
1695 scalar_stmts_to_slp_tree_map_t *bst_map);
1697 static slp_tree
1698 vect_build_slp_tree (vec_info *vinfo,
1699 vec<stmt_vec_info> stmts, unsigned int group_size,
1700 poly_uint64 *max_nunits,
1701 bool *matches, unsigned *limit, unsigned *tree_size,
1702 scalar_stmts_to_slp_tree_map_t *bst_map)
1704 if (slp_tree *leader = bst_map->get (stmts))
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1708 !(*leader)->failed ? "" : "failed ",
1709 (void *) *leader);
1710 if (!(*leader)->failed)
1712 SLP_TREE_REF_COUNT (*leader)++;
1713 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1714 stmts.release ();
1715 return *leader;
1717 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1718 return NULL;
1721 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1722 so we can pick up backedge destinations during discovery. */
1723 slp_tree res = new _slp_tree;
1724 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1725 SLP_TREE_SCALAR_STMTS (res) = stmts;
1726 bst_map->put (stmts.copy (), res);
1728 if (*limit == 0)
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "SLP discovery limit exceeded\n");
1733 /* Mark the node invalid so we can detect those when still in use
1734 as backedge destinations. */
1735 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1736 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1737 res->failed = XNEWVEC (bool, group_size);
1738 memset (res->failed, 0, sizeof (bool) * group_size);
1739 memset (matches, 0, sizeof (bool) * group_size);
1740 return NULL;
1742 --*limit;
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_NOTE, vect_location,
1746 "starting SLP discovery for node %p\n", (void *) res);
1748 poly_uint64 this_max_nunits = 1;
1749 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1750 &this_max_nunits,
1751 matches, limit, tree_size, bst_map);
1752 if (!res_)
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "SLP discovery for node %p failed\n", (void *) res);
1757 /* Mark the node invalid so we can detect those when still in use
1758 as backedge destinations. */
1759 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1760 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1761 res->failed = XNEWVEC (bool, group_size);
1762 if (flag_checking)
1764 unsigned i;
1765 for (i = 0; i < group_size; ++i)
1766 if (!matches[i])
1767 break;
1768 gcc_assert (i < group_size);
1770 memcpy (res->failed, matches, sizeof (bool) * group_size);
1772 else
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "SLP discovery for node %p succeeded\n",
1777 (void *) res);
1778 gcc_assert (res_ == res);
1779 res->max_nunits = this_max_nunits;
1780 vect_update_max_nunits (max_nunits, this_max_nunits);
1781 /* Keep a reference for the bst_map use. */
1782 SLP_TREE_REF_COUNT (res)++;
1784 return res_;
1787 /* Helper for building an associated SLP node chain. */
1789 static void
1790 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1791 slp_tree op0, slp_tree op1,
1792 stmt_vec_info oper1, stmt_vec_info oper2,
1793 vec<std::pair<unsigned, unsigned> > lperm)
1795 unsigned group_size = SLP_TREE_LANES (op1);
1797 slp_tree child1 = new _slp_tree;
1798 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1799 SLP_TREE_VECTYPE (child1) = vectype;
1800 SLP_TREE_LANES (child1) = group_size;
1801 SLP_TREE_CHILDREN (child1).create (2);
1802 SLP_TREE_CHILDREN (child1).quick_push (op0);
1803 SLP_TREE_CHILDREN (child1).quick_push (op1);
1804 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1806 slp_tree child2 = new _slp_tree;
1807 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1808 SLP_TREE_VECTYPE (child2) = vectype;
1809 SLP_TREE_LANES (child2) = group_size;
1810 SLP_TREE_CHILDREN (child2).create (2);
1811 SLP_TREE_CHILDREN (child2).quick_push (op0);
1812 SLP_TREE_REF_COUNT (op0)++;
1813 SLP_TREE_CHILDREN (child2).quick_push (op1);
1814 SLP_TREE_REF_COUNT (op1)++;
1815 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1817 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1818 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1819 SLP_TREE_VECTYPE (perm) = vectype;
1820 SLP_TREE_LANES (perm) = group_size;
1821 /* ??? We should set this NULL but that's not expected. */
1822 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1823 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1824 SLP_TREE_CHILDREN (perm).quick_push (child1);
1825 SLP_TREE_CHILDREN (perm).quick_push (child2);
1828 /* Recursively build an SLP tree starting from NODE.
1829 Fail (and return a value not equal to zero) if def-stmts are not
1830 isomorphic, require data permutation or are of unsupported types of
1831 operation. Otherwise, return 0.
1832 The value returned is the depth in the SLP tree where a mismatch
1833 was found. */
1835 static slp_tree
1836 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1837 vec<stmt_vec_info> stmts, unsigned int group_size,
1838 poly_uint64 *max_nunits,
1839 bool *matches, unsigned *limit, unsigned *tree_size,
1840 scalar_stmts_to_slp_tree_map_t *bst_map)
1842 unsigned nops, i, this_tree_size = 0;
1843 poly_uint64 this_max_nunits = *max_nunits;
1845 matches[0] = false;
1847 stmt_vec_info stmt_info = stmts[0];
1848 if (!is_a<gcall *> (stmt_info->stmt)
1849 && !is_a<gassign *> (stmt_info->stmt)
1850 && !is_a<gphi *> (stmt_info->stmt))
1851 return NULL;
1853 nops = gimple_num_args (stmt_info->stmt);
1854 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1855 STMT_VINFO_GATHER_SCATTER_P
1856 (stmt_info)))
1857 nops = map[0];
1859 /* If the SLP node is a PHI (induction or reduction), terminate
1860 the recursion. */
1861 bool *skip_args = XALLOCAVEC (bool, nops);
1862 memset (skip_args, 0, sizeof (bool) * nops);
1863 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1864 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1866 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1867 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1868 group_size);
1869 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1870 max_nunits))
1871 return NULL;
1873 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1874 if (def_type == vect_induction_def)
1876 /* Induction PHIs are not cycles but walk the initial
1877 value. Only for inner loops though; for outer loops
1878 we need to pick up the value from the actual PHIs
1879 to more easily support peeling and epilogue vectorization. */
1880 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1881 if (!nested_in_vect_loop_p (loop, stmt_info))
1882 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1883 else
1884 loop = loop->inner;
1885 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1887 else if (def_type == vect_reduction_def
1888 || def_type == vect_double_reduction_def
1889 || def_type == vect_nested_cycle
1890 || def_type == vect_first_order_recurrence)
1892 /* Else def types have to match. */
1893 stmt_vec_info other_info;
1894 bool all_same = true;
1895 FOR_EACH_VEC_ELT (stmts, i, other_info)
1897 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1898 return NULL;
1899 if (other_info != stmt_info)
1900 all_same = false;
1902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1903 /* Reduction initial values are not explicitly represented. */
1904 if (def_type != vect_first_order_recurrence
1905 && !nested_in_vect_loop_p (loop, stmt_info))
1906 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1907 /* Reduction chain backedge defs are filled manually.
1908 ??? Need a better way to identify a SLP reduction chain PHI.
1909 Or a better overall way to SLP match those. */
1910 if (all_same && def_type == vect_reduction_def)
1911 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1913 else if (def_type != vect_internal_def)
1914 return NULL;
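/* Verify the scalar stmts in the group are isomorphic (the same operation
   on compatible types), computing the common vector type and whether the
   group mixes two alternating operators (e.g. plus and minus). */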
1918 bool two_operators = false;
1919 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1920 tree vectype = NULL_TREE;
1921 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1922 &this_max_nunits, matches, &two_operators,
1923 &vectype))
1924 return NULL;
1926 /* If the SLP node is a load, terminate the recursion unless masked. */
1927 if (STMT_VINFO_DATA_REF (stmt_info)
1928 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1930 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1931 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1932 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1933 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1934 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1935 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1936 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1937 else
1939 *max_nunits = this_max_nunits;
1940 (*tree_size)++;
1941 node = vect_create_new_slp_node (node, stmts, 0);
1942 SLP_TREE_VECTYPE (node) = vectype;
1943 /* And compute the load permutation. Whether it is actually
1944 a permutation depends on the unrolling factor which is
1945 decided later. */
1946 vec<unsigned> load_permutation;
1947 int j;
1948 stmt_vec_info load_info;
1949 load_permutation.create (group_size);
1950 stmt_vec_info first_stmt_info
1951 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1952 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1954 int load_place;
1955 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1956 load_place = vect_get_place_in_interleaving_chain
1957 (load_info, first_stmt_info);
1958 else
1959 load_place = 0;
1960 gcc_assert (load_place != -1);
1961 load_permutation.safe_push (load_place);
1963 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1964 return node;
1967 else if (gimple_assign_single_p (stmt_info->stmt)
1968 && !gimple_vuse (stmt_info->stmt)
1969 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1971 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1972 the same SSA name vector of a type compatible with vectype. */
1973 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1974 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1975 stmt_vec_info estmt_info;
1976 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1978 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1979 tree bfref = gimple_assign_rhs1 (estmt);
1980 HOST_WIDE_INT lane;
1981 if (!known_eq (bit_field_size (bfref),
1982 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1983 || !constant_multiple_p (bit_field_offset (bfref),
1984 bit_field_size (bfref), &lane))
1986 lperm.release ();
1987 matches[0] = false;
1988 return NULL;
1990 lperm.safe_push (std::make_pair (0, (unsigned)lane));
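/* Represent the vector the BIT_FIELD_REFs extract from as a leaf node
   carrying only vector defs and no scalar stmts. */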
1992 slp_tree vnode = vect_create_new_slp_node (vNULL);
1993 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1994 /* ??? We record vectype here but hide the possibly necessary
1995 punning and instead rely on code generation to materialize
1996 VIEW_CONVERT_EXPRs as necessary. We should instead make
1997 this explicit somehow. */
1998 SLP_TREE_VECTYPE (vnode) = vectype;
1999 else
2001 /* For different size but compatible elements we can still
2002 use VEC_PERM_EXPR without punning. */
2003 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2004 && types_compatible_p (TREE_TYPE (vectype),
2005 TREE_TYPE (TREE_TYPE (vec))));
2006 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2008 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2009 unsigned HOST_WIDE_INT const_nunits;
2010 if (nunits.is_constant (&const_nunits))
2011 SLP_TREE_LANES (vnode) = const_nunits;
2012 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2013 /* We always build a permutation node, even if it is an identity
2014 permute, to shield the rest of the vectorizer from the odd node
2015 representing an actual vector without any scalar ops.
2016 ??? We could hide it completely by making the permute node
2017 external? */
2018 node = vect_create_new_slp_node (node, stmts, 1);
2019 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2020 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2021 SLP_TREE_VECTYPE (node) = vectype;
2022 SLP_TREE_CHILDREN (node).quick_push (vnode);
2023 return node;
2025 /* When discovery reaches an associatable operation see whether we can
2026 improve that to match up lanes in a way superior to the operand
2027 swapping code which at most looks at two defs.
2028 ??? For BB vectorization we cannot do the brute-force search
2029 for matching as we can succeed by means of builds from scalars
2030 and have no good way to "cost" one build against another. */
2031 else if (is_a <loop_vec_info> (vinfo)
2032 /* ??? We don't handle !vect_internal_def defs below. */
2033 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2034 && is_gimple_assign (stmt_info->stmt)
2035 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2036 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2037 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2038 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2039 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2041 /* See if we have a chain of (mixed) adds or subtracts or other
2042 associatable ops. */
2043 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2044 if (code == MINUS_EXPR)
2045 code = PLUS_EXPR;
2046 stmt_vec_info other_op_stmt_info = NULL;
2047 stmt_vec_info op_stmt_info = NULL;
2048 unsigned chain_len = 0;
2049 auto_vec<chain_op_t> chain;
2050 auto_vec<std::pair<tree_code, gimple *> > worklist;
2051 auto_vec<vec<chain_op_t> > chains (group_size);
2052 auto_vec<slp_tree, 4> children;
2053 bool hard_fail = true;
2054 for (unsigned lane = 0; lane < group_size; ++lane)
2056 /* For each lane linearize the addition/subtraction (or other
2057 uniform associatable operation) expression tree. */
2058 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2059 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2060 stmts[lane]->stmt, op_stmt, other_op_stmt,
2061 NULL);
2062 if (!op_stmt_info && op_stmt)
2063 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2064 if (!other_op_stmt_info && other_op_stmt)
2065 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2066 if (chain.length () == 2)
2068 /* In a chain of just two elements resort to the regular
2069 operand swapping scheme. If we run into a length
2070 mismatch still hard-FAIL. */
2071 if (chain_len == 0)
2072 hard_fail = false;
2073 else
2075 matches[lane] = false;
2076 /* ??? We might want to process the other lanes, but
2077 make sure to not give false matching hints to the
2078 caller for lanes we did not process. */
2079 if (lane != group_size - 1)
2080 matches[0] = false;
2082 break;
2084 else if (chain_len == 0)
2085 chain_len = chain.length ();
2086 else if (chain.length () != chain_len)
2088 /* ??? Here we could slip in magic to compensate with
2089 neutral operands. */
2090 matches[lane] = false;
2091 if (lane != group_size - 1)
2092 matches[0] = false;
2093 break;
2095 chains.quick_push (chain.copy ());
2096 chain.truncate (0);
2098 if (chains.length () == group_size)
2100 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2101 if (!op_stmt_info)
2103 hard_fail = false;
2104 goto out;
2106 /* Now we have a set of chains with the same length. */
2107 /* 1. pre-sort according to def_type and operation. */
2108 for (unsigned lane = 0; lane < group_size; ++lane)
2109 chains[lane].stablesort (dt_sort_cmp, vinfo);
2110 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE, vect_location,
2113 "pre-sorted chains of %s\n",
2114 get_tree_code_name (code));
2115 for (unsigned lane = 0; lane < group_size; ++lane)
2117 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2118 dump_printf (MSG_NOTE, "%s %T ",
2119 get_tree_code_name (chains[lane][opnum].code),
2120 chains[lane][opnum].op);
2121 dump_printf (MSG_NOTE, "\n");
2124 /* 2. try to build children nodes, associating as necessary. */
2125 for (unsigned n = 0; n < chain_len; ++n)
2127 vect_def_type dt = chains[0][n].dt;
2128 unsigned lane;
2129 for (lane = 0; lane < group_size; ++lane)
2130 if (chains[lane][n].dt != dt)
2132 if (dt == vect_constant_def
2133 && chains[lane][n].dt == vect_external_def)
2134 dt = vect_external_def;
2135 else if (dt == vect_external_def
2136 && chains[lane][n].dt == vect_constant_def)
2138 else
2139 break;
2141 if (lane != group_size)
2143 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "giving up on chain due to mismatched "
2146 "def types\n");
2147 matches[lane] = false;
2148 if (lane != group_size - 1)
2149 matches[0] = false;
2150 goto out;
2152 if (dt == vect_constant_def
2153 || dt == vect_external_def)
2155 /* Check whether we can build the invariant. If we can't
2156 we never will be able to. */
2157 tree type = TREE_TYPE (chains[0][n].op);
2158 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2159 && (TREE_CODE (type) == BOOLEAN_TYPE
2160 || !can_duplicate_and_interleave_p (vinfo, group_size,
2161 type)))
2163 matches[0] = false;
2164 goto out;
2166 vec<tree> ops;
2167 ops.create (group_size);
2168 for (lane = 0; lane < group_size; ++lane)
2169 ops.quick_push (chains[lane][n].op);
2170 slp_tree child = vect_create_new_slp_node (ops);
2171 SLP_TREE_DEF_TYPE (child) = dt;
2172 children.safe_push (child);
2174 else if (dt != vect_internal_def)
2176 /* Not sure, we might need sth special.
2177 gcc.dg/vect/pr96854.c,
2178 gfortran.dg/vect/fast-math-pr37021.f90
2179 and gfortran.dg/vect/pr61171.f trigger. */
2180 /* Soft-fail for now. */
2181 hard_fail = false;
2182 goto out;
2184 else
2186 vec<stmt_vec_info> op_stmts;
2187 op_stmts.create (group_size);
2188 slp_tree child = NULL;
2189 /* Brute-force our way. We have to consider a lane
2190 failing after fixing an earlier fail up in the
2191 SLP discovery recursion. So track the current
2192 permute per lane. */
2193 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2194 memset (perms, 0, sizeof (unsigned) * group_size);
2197 op_stmts.truncate (0);
2198 for (lane = 0; lane < group_size; ++lane)
2199 op_stmts.quick_push
2200 (vinfo->lookup_def (chains[lane][n].op));
2201 child = vect_build_slp_tree (vinfo, op_stmts,
2202 group_size, &this_max_nunits,
2203 matches, limit,
2204 &this_tree_size, bst_map);
2205 /* ??? We're likely getting too many fatal mismatches
2206 here so maybe we want to ignore them (but then we
2207 have no idea which lanes fatally mismatched). */
2208 if (child || !matches[0])
2209 break;
2210 /* Swap another lane we have not yet matched up into
2211 lanes that did not match. If we run out of
2212 permute possibilities for a lane terminate the
2213 search. */
2214 bool term = false;
2215 for (lane = 1; lane < group_size; ++lane)
2216 if (!matches[lane])
2218 if (n + perms[lane] + 1 == chain_len)
2220 term = true;
2221 break;
2223 std::swap (chains[lane][n],
2224 chains[lane][n + perms[lane] + 1]);
2225 perms[lane]++;
2227 if (term)
2228 break;
2230 while (1);
2231 if (!child)
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "failed to match up op %d\n", n);
2236 op_stmts.release ();
2237 if (lane != group_size - 1)
2238 matches[0] = false;
2239 else
2240 matches[lane] = false;
2241 goto out;
2243 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_NOTE, vect_location,
2246 "matched up op %d to\n", n);
2247 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2249 children.safe_push (child);
2252 /* 3. build SLP nodes to combine the chain. */
2253 for (unsigned lane = 0; lane < group_size; ++lane)
2254 if (chains[lane][0].code != code)
2256 /* See if there's any alternate all-PLUS entry. */
2257 unsigned n;
2258 for (n = 1; n < chain_len; ++n)
2260 for (lane = 0; lane < group_size; ++lane)
2261 if (chains[lane][n].code != code)
2262 break;
2263 if (lane == group_size)
2264 break;
2266 if (n != chain_len)
2268 /* Swap that in at first position. */
2269 std::swap (children[0], children[n]);
2270 for (lane = 0; lane < group_size; ++lane)
2271 std::swap (chains[lane][0], chains[lane][n]);
2273 else
2275 /* ??? When this triggers and we end up with two
2276 vect_constant/external_def up-front things break (ICE)
2277 spectacularly finding an insertion place for the
2278 all-constant op. We should have a fully
2279 vect_internal_def operand though(?) so we can swap
2280 that into first place and then prepend the all-zero
2281 constant. */
2282 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE, vect_location,
2284 "inserting constant zero to compensate "
2285 "for (partially) negated first "
2286 "operand\n");
2287 chain_len++;
2288 for (lane = 0; lane < group_size; ++lane)
2289 chains[lane].safe_insert
2290 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2291 vec<tree> zero_ops;
2292 zero_ops.create (group_size);
2293 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2294 for (lane = 1; lane < group_size; ++lane)
2295 zero_ops.quick_push (zero_ops[0]);
2296 slp_tree zero = vect_create_new_slp_node (zero_ops);
2297 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2298 children.safe_insert (0, zero);
2300 break;
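/* Combine the children left to right with a chain of two-operand nodes;
   the last combination reuses NODE and thus represents the original
   scalar stmts. */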
2302 for (unsigned i = 1; i < children.length (); ++i)
2304 slp_tree op0 = children[i - 1];
2305 slp_tree op1 = children[i];
2306 bool this_two_op = false;
2307 for (unsigned lane = 0; lane < group_size; ++lane)
2308 if (chains[lane][i].code != chains[0][i].code)
2310 this_two_op = true;
2311 break;
2313 slp_tree child;
2314 if (i == children.length () - 1)
2315 child = vect_create_new_slp_node (node, stmts, 2);
2316 else
2317 child = vect_create_new_slp_node (2, ERROR_MARK);
2318 if (this_two_op)
2320 vec<std::pair<unsigned, unsigned> > lperm;
2321 lperm.create (group_size);
2322 for (unsigned lane = 0; lane < group_size; ++lane)
2323 lperm.quick_push (std::make_pair
2324 (chains[lane][i].code != chains[0][i].code, lane));
2325 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2326 (chains[0][i].code == code
2327 ? op_stmt_info
2328 : other_op_stmt_info),
2329 (chains[0][i].code == code
2330 ? other_op_stmt_info
2331 : op_stmt_info),
2332 lperm);
2334 else
2336 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2337 SLP_TREE_VECTYPE (child) = vectype;
2338 SLP_TREE_LANES (child) = group_size;
2339 SLP_TREE_CHILDREN (child).quick_push (op0);
2340 SLP_TREE_CHILDREN (child).quick_push (op1);
2341 SLP_TREE_REPRESENTATIVE (child)
2342 = (chains[0][i].code == code
2343 ? op_stmt_info : other_op_stmt_info);
2345 children[i] = child;
2347 *tree_size += this_tree_size + 1;
2348 *max_nunits = this_max_nunits;
2349 while (!chains.is_empty ())
2350 chains.pop ().release ();
2351 return node;
2353 out:
2354 while (!children.is_empty ())
2355 vect_free_slp_tree (children.pop ());
2356 while (!chains.is_empty ())
2357 chains.pop ().release ();
2358 /* Hard-fail, otherwise we might run into quadratic processing of the
2359 chains by starting discovery again one stmt into the chain. */
2360 if (hard_fail)
2361 return NULL;
2362 /* Fall thru to normal processing. */
2365 /* Get at the operands, verifying they are compatible. */
2366 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2367 slp_oprnd_info oprnd_info;
2368 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2370 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2371 stmts, i, &oprnds_info);
2372 if (res != 0)
2373 matches[(res == -1) ? 0 : i] = false;
2374 if (!matches[0])
2375 break;
2377 for (i = 0; i < group_size; ++i)
2378 if (!matches[i])
2380 vect_free_oprnd_info (oprnds_info);
2381 return NULL;
2383 swap = NULL;
2385 auto_vec<slp_tree, 4> children;
2387 stmt_info = stmts[0];
2389 /* Create SLP_TREE nodes for the definition node/s. */
2390 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2392 slp_tree child;
2393 unsigned int j;
2395 /* We're skipping certain operands from processing, for example
2396 outer loop reduction initial defs. */
2397 if (skip_args[i])
2399 children.safe_push (NULL);
2400 continue;
2403 if (oprnd_info->first_dt == vect_uninitialized_def)
2405 /* COND_EXPRs can end up with one operand too many if the
2406 condition is an SSA name. */
2407 gcc_assert (i == 3 && nops == 4);
2408 continue;
2411 if (is_a <bb_vec_info> (vinfo)
2412 && oprnd_info->first_dt == vect_internal_def
2413 && !oprnd_info->any_pattern)
2415 /* For BB vectorization, if all defs are the same do not
2416 bother to continue the build along the single-lane
2417 graph but use a splat of the scalar value. */
2418 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2419 for (j = 1; j < group_size; ++j)
2420 if (oprnd_info->def_stmts[j] != first_def)
2421 break;
2422 if (j == group_size
2423 /* But avoid doing this for loads where we may be
2424 able to CSE things, unless the stmt is not
2425 vectorizable. */
2426 && (!STMT_VINFO_VECTORIZABLE (first_def)
2427 || !gimple_vuse (first_def->stmt)))
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "Using a splat of the uniform operand %G",
2432 first_def->stmt);
2433 oprnd_info->first_dt = vect_external_def;
2437 if (oprnd_info->first_dt == vect_external_def
2438 || oprnd_info->first_dt == vect_constant_def)
2440 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2441 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2442 oprnd_info->ops = vNULL;
2443 children.safe_push (invnode);
2444 continue;
2447 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2448 group_size, &this_max_nunits,
2449 matches, limit,
2450 &this_tree_size, bst_map)) != NULL)
2452 oprnd_info->def_stmts = vNULL;
2453 children.safe_push (child);
2454 continue;
2457 /* If the SLP build for operand zero failed and operands zero
2458 and one can be commuted, try that for the scalar stmts
2459 that failed the match. */
2460 if (i == 0
2461 /* A first scalar stmt mismatch signals a fatal mismatch. */
2462 && matches[0]
2463 /* ??? For COND_EXPRs we can swap the comparison operands
2464 as well as the arms under some constraints. */
2465 && nops == 2
2466 && oprnds_info[1]->first_dt == vect_internal_def
2467 && is_gimple_assign (stmt_info->stmt)
2468 /* Swapping operands for reductions breaks assumptions later on. */
2469 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2470 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2472 /* See whether we can swap the matching or the non-matching
2473 stmt operands. */
2474 bool swap_not_matching = true;
2477 for (j = 0; j < group_size; ++j)
2479 if (matches[j] != !swap_not_matching)
2480 continue;
2481 stmt_vec_info stmt_info = stmts[j];
2482 /* Verify if we can swap operands of this stmt. */
2483 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2484 if (!stmt
2485 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2487 if (!swap_not_matching)
2488 goto fail;
2489 swap_not_matching = false;
2490 break;
2494 while (j != group_size);
2496 /* Swap mismatched definition stmts. */
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Re-trying with swapped operands of stmts ");
2500 for (j = 0; j < group_size; ++j)
2501 if (matches[j] == !swap_not_matching)
2503 std::swap (oprnds_info[0]->def_stmts[j],
2504 oprnds_info[1]->def_stmts[j]);
2505 std::swap (oprnds_info[0]->ops[j],
2506 oprnds_info[1]->ops[j]);
2507 if (dump_enabled_p ())
2508 dump_printf (MSG_NOTE, "%d ", j);
2510 if (dump_enabled_p ())
2511 dump_printf (MSG_NOTE, "\n");
2512 /* After swapping some operands we lose track of whether an
2513 operand has any pattern defs, so be conservative here. */
2514 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2515 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2516 /* And try again with scratch 'matches' ... */
2517 bool *tem = XALLOCAVEC (bool, group_size);
2518 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2519 group_size, &this_max_nunits,
2520 tem, limit,
2521 &this_tree_size, bst_map)) != NULL)
2523 oprnd_info->def_stmts = vNULL;
2524 children.safe_push (child);
2525 continue;
2528 fail:
2530 /* If the SLP build failed and we are analyzing a basic block,
2531 simply treat nodes we fail to build as externally defined
2532 (and thus build vectors from the scalar defs).
2533 The cost model will reject outright expensive cases.
2534 ??? This doesn't handle cases where permutation ultimately
2535 fails (or we don't try permutation below). Ideally we'd
2536 even compute a permutation that will end up with the maximum
2537 SLP tree size... */
2538 if (is_a <bb_vec_info> (vinfo)
2539 /* ??? Rejecting patterns this way doesn't work. We'd have to
2540 do extra work to cancel the pattern so the uses see the
2541 scalar version. */
2542 && !is_pattern_stmt_p (stmt_info)
2543 && !oprnd_info->any_pattern)
2545 /* But if there's a leading vector sized set of matching stmts
2546 fail here so we can split the group. This matches the condition
2547 vect_analyze_slp_instance uses. */
2548 /* ??? We might want to split here and combine the results to support
2549 multiple vector sizes better. */
2550 for (j = 0; j < group_size; ++j)
2551 if (!matches[j])
2552 break;
2553 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "Building vector operands from scalars\n");
2558 this_tree_size++;
2559 child = vect_create_new_slp_node (oprnd_info->ops);
2560 children.safe_push (child);
2561 oprnd_info->ops = vNULL;
2562 continue;
2566 gcc_assert (child == NULL);
2567 FOR_EACH_VEC_ELT (children, j, child)
2568 if (child)
2569 vect_free_slp_tree (child);
2570 vect_free_oprnd_info (oprnds_info);
2571 return NULL;
2574 vect_free_oprnd_info (oprnds_info);
2576 /* If all children of this node are built up from uniform scalars,
2577 or if building them requires more than one possibly expensive
2578 vector construction, throw the node away so it is built up from
2579 scalars instead. The exception is the SLP node for the vector store. */
2580 if (is_a <bb_vec_info> (vinfo)
2581 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2582 /* ??? Rejecting patterns this way doesn't work. We'd have to
2583 do extra work to cancel the pattern so the uses see the
2584 scalar version. */
2585 && !is_pattern_stmt_p (stmt_info))
2587 slp_tree child;
2588 unsigned j;
2589 bool all_uniform_p = true;
2590 unsigned n_vector_builds = 0;
2591 FOR_EACH_VEC_ELT (children, j, child)
2593 if (!child)
2595 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2596 all_uniform_p = false;
2597 else if (!vect_slp_tree_uniform_p (child))
2599 all_uniform_p = false;
2600 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2601 n_vector_builds++;
2604 if (all_uniform_p
2605 || n_vector_builds > 1
2606 || (n_vector_builds == children.length ()
2607 && is_a <gphi *> (stmt_info->stmt)))
2609 /* Roll back. */
2610 matches[0] = false;
2611 FOR_EACH_VEC_ELT (children, j, child)
2612 if (child)
2613 vect_free_slp_tree (child);
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_NOTE, vect_location,
2617 "Building parent vector operands from "
2618 "scalars instead\n");
2619 return NULL;
2623 *tree_size += this_tree_size + 1;
2624 *max_nunits = this_max_nunits;
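/* For a group mixing two operation codes build one node per code with
   the same children and blend their lanes with a VEC_PERM_EXPR node
   on top. */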
2626 if (two_operators)
2628 /* ??? We'd likely want to either cache in bst_map sth like
2629 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2630 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2631 explicit stmts to put in so the keying on 'stmts' doesn't
2632 work (but we have the same issue with nodes that use 'ops'). */
2633 slp_tree one = new _slp_tree;
2634 slp_tree two = new _slp_tree;
2635 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2636 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2637 SLP_TREE_VECTYPE (one) = vectype;
2638 SLP_TREE_VECTYPE (two) = vectype;
2639 SLP_TREE_CHILDREN (one).safe_splice (children);
2640 SLP_TREE_CHILDREN (two).safe_splice (children);
2641 slp_tree child;
2642 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2643 SLP_TREE_REF_COUNT (child)++;
2645 /* Here we record the original defs since this
2646 node represents the final lane configuration. */
2647 node = vect_create_new_slp_node (node, stmts, 2);
2648 SLP_TREE_VECTYPE (node) = vectype;
2649 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2650 SLP_TREE_CHILDREN (node).quick_push (one);
2651 SLP_TREE_CHILDREN (node).quick_push (two);
2652 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2653 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2654 enum tree_code ocode = ERROR_MARK;
2655 stmt_vec_info ostmt_info;
2656 unsigned j = 0;
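/* Record for each lane whether it uses the first or the alternate
   operation and remember the alternate code together with a lane
   providing a representative stmt for it. */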
2657 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2659 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2660 if (gimple_assign_rhs_code (ostmt) != code0)
2662 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2663 ocode = gimple_assign_rhs_code (ostmt);
2664 j = i;
2666 else
2667 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2669 SLP_TREE_CODE (one) = code0;
2670 SLP_TREE_CODE (two) = ocode;
2671 SLP_TREE_LANES (one) = stmts.length ();
2672 SLP_TREE_LANES (two) = stmts.length ();
2673 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2674 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2675 return node;
2678 node = vect_create_new_slp_node (node, stmts, nops);
2679 SLP_TREE_VECTYPE (node) = vectype;
2680 SLP_TREE_CHILDREN (node).splice (children);
2681 return node;
2684 /* Dump a single SLP tree NODE. */
2686 static void
2687 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2688 slp_tree node)
2690 unsigned i, j;
2691 slp_tree child;
2692 stmt_vec_info stmt_info;
2693 tree op;
2695 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2696 dump_user_location_t user_loc = loc.get_user_location ();
2697 dump_printf_loc (metadata, user_loc,
2698 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2699 ", refcnt=%u)",
2700 SLP_TREE_DEF_TYPE (node) == vect_external_def
2701 ? " (external)"
2702 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2703 ? " (constant)"
2704 : ""), (void *) node,
2705 estimated_poly_value (node->max_nunits),
2706 SLP_TREE_REF_COUNT (node));
2707 if (SLP_TREE_VECTYPE (node))
2708 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2709 dump_printf (metadata, "\n");
2710 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2712 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2713 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2714 else
2715 dump_printf_loc (metadata, user_loc, "op template: %G",
2716 SLP_TREE_REPRESENTATIVE (node)->stmt);
2718 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2719 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2720 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2721 else
2723 dump_printf_loc (metadata, user_loc, "\t{ ");
2724 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2725 dump_printf (metadata, "%T%s ", op,
2726 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2727 dump_printf (metadata, "}\n");
2729 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2731 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2732 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2733 dump_printf (dump_kind, " %u", j);
2734 dump_printf (dump_kind, " }\n");
2736 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2738 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2739 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2740 dump_printf (dump_kind, " %u[%u]",
2741 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2742 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2743 dump_printf (dump_kind, " }\n");
2745 if (SLP_TREE_CHILDREN (node).is_empty ())
2746 return;
2747 dump_printf_loc (metadata, user_loc, "\tchildren");
2748 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2749 dump_printf (dump_kind, " %p", (void *)child);
2750 dump_printf (dump_kind, "\n");
2753 DEBUG_FUNCTION void
2754 debug (slp_tree node)
2756 debug_dump_context ctx;
2757 vect_print_slp_tree (MSG_NOTE,
2758 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2759 node);
2762 /* Recursive helper for the dot producer below. */
2764 static void
2765 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2767 if (visited.add (node))
2768 return;
2770 fprintf (f, "\"%p\" [label=\"", (void *)node);
2771 vect_print_slp_tree (MSG_NOTE,
2772 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2773 node);
2774 fprintf (f, "\"];\n");
2777 for (slp_tree child : SLP_TREE_CHILDREN (node))
2778 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2780 for (slp_tree child : SLP_TREE_CHILDREN (node))
2781 if (child)
2782 dot_slp_tree (f, child, visited);
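/* Produce a GraphViz dot file FNAME with a representation of the SLP
   graph rooted at NODE. */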
2785 DEBUG_FUNCTION void
2786 dot_slp_tree (const char *fname, slp_tree node)
2788 FILE *f = fopen (fname, "w");
2789 fprintf (f, "digraph {\n");
2790 fflush (f);
2792 debug_dump_context ctx (f);
2793 hash_set<slp_tree> visited;
2794 dot_slp_tree (f, node, visited);
2796 fflush (f);
2797 fprintf (f, "}\n");
2798 fclose (f);
2801 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2803 static void
2804 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2805 slp_tree node, hash_set<slp_tree> &visited)
2807 unsigned i;
2808 slp_tree child;
2810 if (visited.add (node))
2811 return;
2813 vect_print_slp_tree (dump_kind, loc, node);
2815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2816 if (child)
2817 vect_print_slp_graph (dump_kind, loc, child, visited);
2820 static void
2821 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2822 slp_tree entry)
2824 hash_set<slp_tree> visited;
2825 vect_print_slp_graph (dump_kind, loc, entry, visited);
2828 /* Mark the tree rooted at NODE with PURE_SLP. */
2830 static void
2831 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2833 int i;
2834 stmt_vec_info stmt_info;
2835 slp_tree child;
2837 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2838 return;
2840 if (visited.add (node))
2841 return;
2843 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2844 STMT_SLP_TYPE (stmt_info) = pure_slp;
2846 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2847 if (child)
2848 vect_mark_slp_stmts (child, visited);
2851 static void
2852 vect_mark_slp_stmts (slp_tree node)
2854 hash_set<slp_tree> visited;
2855 vect_mark_slp_stmts (node, visited);
2858 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2860 static void
2861 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2863 int i;
2864 stmt_vec_info stmt_info;
2865 slp_tree child;
2867 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2868 return;
2870 if (visited.add (node))
2871 return;
2873 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2875 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2876 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2877 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2880 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2881 if (child)
2882 vect_mark_slp_stmts_relevant (child, visited);
2885 static void
2886 vect_mark_slp_stmts_relevant (slp_tree node)
2888 hash_set<slp_tree> visited;
2889 vect_mark_slp_stmts_relevant (node, visited);
2893 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2895 static void
2896 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2897 hash_set<slp_tree> &visited)
2899 if (!node || visited.add (node))
2900 return;
2902 if (SLP_TREE_CHILDREN (node).length () == 0)
2904 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2905 return;
2906 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2907 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2908 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2909 loads.safe_push (node);
2911 else
2913 unsigned i;
2914 slp_tree child;
2915 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2916 vect_gather_slp_loads (loads, child, visited);
2921 /* Find the last scalar stmt in NODE. */
2923 stmt_vec_info
2924 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2926 stmt_vec_info last = NULL;
2927 stmt_vec_info stmt_vinfo;
2929 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2931 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2932 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2935 return last;
2938 /* Find the first stmt in NODE. */
2940 stmt_vec_info
2941 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2943 stmt_vec_info first = NULL;
2944 stmt_vec_info stmt_vinfo;
2946 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2948 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2949 if (!first
2950 || get_later_stmt (stmt_vinfo, first) == first)
2951 first = stmt_vinfo;
2954 return first;
2957 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2958 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2959 (also containing the first GROUP1_SIZE stmts, since stores are
2960 consecutive), the second containing the remainder.
2961 Return the first stmt in the second group. */
2963 static stmt_vec_info
2964 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2966 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2967 gcc_assert (group1_size > 0);
2968 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2969 gcc_assert (group2_size > 0);
2970 DR_GROUP_SIZE (first_vinfo) = group1_size;
2972 stmt_vec_info stmt_info = first_vinfo;
2973 for (unsigned i = group1_size; i > 1; i--)
2975 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2976 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2978 /* STMT is now the last element of the first group. */
2979 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2980 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2982 DR_GROUP_SIZE (group2) = group2_size;
2983 for (stmt_info = group2; stmt_info;
2984 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2986 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2987 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2990 /* For the second group, the DR_GROUP_GAP is that before the original group,
2991 plus skipping over the first vector. */
2992 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2994 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2995 DR_GROUP_GAP (first_vinfo) += group2_size;
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2999 group1_size, group2_size);
3001 return group2;
3004 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3005 statements and a vector of NUNITS elements. */
3007 static poly_uint64
3008 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3010 return exact_div (common_multiple (nunits, group_size), group_size);
3013 /* Helper that checks whether ROOT is a grouped-load SLP node. */
3015 static inline bool
3016 vect_is_slp_load_node (slp_tree root)
3018 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3019 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3020 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3024 /* Helper function of optimize_load_redistribution that performs the operation
3025 recursively. */
3027 static slp_tree
3028 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3029 vec_info *vinfo, unsigned int group_size,
3030 hash_map<slp_tree, slp_tree> *load_map,
3031 slp_tree root)
3033 if (slp_tree *leader = load_map->get (root))
3034 return *leader;
3036 slp_tree node;
3037 unsigned i;
3039 /* For now, we don't know anything about externals so do not do anything. */
3040 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3041 return NULL;
3042 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3044 /* First convert this node into a load node and add it to the leaves
3045 list and flatten the permute from a lane to a load one. If it's
3046 unneeded it will be elided later. */
3047 vec<stmt_vec_info> stmts;
3048 stmts.create (SLP_TREE_LANES (root));
3049 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3050 for (unsigned j = 0; j < lane_perm.length (); j++)
3052 std::pair<unsigned, unsigned> perm = lane_perm[j];
3053 node = SLP_TREE_CHILDREN (root)[perm.first];
3055 if (!vect_is_slp_load_node (node)
3056 || SLP_TREE_CHILDREN (node).exists ())
3058 stmts.release ();
3059 goto next;
3062 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "converting stmts on permute node %p\n",
3068 (void *) root);
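/* Re-run SLP discovery on the gathered load stmts; keying on BST_MAP
   CSEs the result with existing load nodes where possible. */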
3070 bool *matches = XALLOCAVEC (bool, group_size);
3071 poly_uint64 max_nunits = 1;
3072 unsigned tree_size = 0, limit = 1;
3073 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3074 matches, &limit, &tree_size, bst_map);
3075 if (!node)
3076 stmts.release ();
3078 load_map->put (root, node);
3079 return node;
3082 next:
3083 load_map->put (root, NULL);
3085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3087 slp_tree value
3088 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3089 node);
3090 if (value)
3092 SLP_TREE_REF_COUNT (value)++;
3093 SLP_TREE_CHILDREN (root)[i] = value;
3094 /* ??? We know the original leaves of the replaced nodes will
3095 be referenced by bst_map, only the permutes created by
3096 pattern matching are not. */
3097 if (SLP_TREE_REF_COUNT (node) == 1)
3098 load_map->remove (node);
3099 vect_free_slp_tree (node);
3103 return NULL;
3106 /* Temporary workaround for loads not being CSEd during SLP build. This
3107 function will traverse the SLP tree rooted in ROOT and find
3108 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3109 same DR such that the final operation is equal to a permuted load. Such
3110 nodes are then directly converted into loads themselves. The nodes are
3111 CSEd using BST_MAP. */
3113 static void
3114 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3115 vec_info *vinfo, unsigned int group_size,
3116 hash_map<slp_tree, slp_tree> *load_map,
3117 slp_tree root)
3119 slp_tree node;
3120 unsigned i;
3122 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3124 slp_tree value
3125 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3126 node);
3127 if (value)
3129 SLP_TREE_REF_COUNT (value)++;
3130 SLP_TREE_CHILDREN (root)[i] = value;
3131 /* ??? We know the original leaves of the replaced nodes will
3132 be referenced by bst_map, only the permutes created by
3133 pattern matching are not. */
3134 if (SLP_TREE_REF_COUNT (node) == 1)
3135 load_map->remove (node);
3136 vect_free_slp_tree (node);
3141 /* Helper function of vect_match_slp_patterns.
3143 Attempts to match patterns against the slp tree rooted in REF_NODE using
3144 VINFO. Patterns are matched in post-order traversal.
3146 Returns true if any pattern matched; in that case the node referenced
3147 by REF_NODE has been updated in place. */
3149 static bool
3150 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3151 slp_tree_to_load_perm_map_t *perm_cache,
3152 slp_compat_nodes_map_t *compat_cache,
3153 hash_set<slp_tree> *visited)
3155 unsigned i;
3156 slp_tree node = *ref_node;
3157 bool found_p = false;
3158 if (!node || visited->add (node))
3159 return false;
3161 slp_tree child;
3162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3163 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3164 vinfo, perm_cache, compat_cache,
3165 visited);
3167 for (unsigned x = 0; x < num__slp_patterns; x++)
3169 vect_pattern *pattern
3170 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3171 if (pattern)
3173 pattern->build (vinfo);
3174 delete pattern;
3175 found_p = true;
3179 return found_p;
3182 /* Applies pattern matching to the SLP tree of INSTANCE using
3183 vec_info VINFO.
3185 Returns true if any pattern matched. Patterns are tried in order and
3186 multiple patterns may match. */
3188 static bool
3189 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3190 hash_set<slp_tree> *visited,
3191 slp_tree_to_load_perm_map_t *perm_cache,
3192 slp_compat_nodes_map_t *compat_cache)
3194 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3195 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE, vect_location,
3199 "Analyzing SLP tree %p for patterns\n",
3200 (void *) SLP_INSTANCE_TREE (instance));
3202 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3203 visited);
3206 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3207 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3208 Return true if we could use IFN_STORE_LANES instead and if that appears
3209 to be the better approach. */
3211 static bool
3212 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3213 unsigned int group_size,
3214 unsigned int new_group_size)
3216 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3217 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3218 if (!vectype)
3219 return false;
3220 /* Allow the split if one of the two new groups would operate on full
3221 vectors *within* rather than across one scalar loop iteration.
3222 This is purely a heuristic, but it should work well for group
3223 sizes of 3 and 4, where the possible splits are:
3225 3->2+1: OK if the vector has exactly two elements
3226 4->2+2: Likewise
3227 4->3+1: Less clear-cut. */
3228 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3229 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3230 return false;
3231 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3234 /* Analyze an SLP instance starting from a group of grouped stores. Call
3235 vect_build_slp_tree to build a tree of packed stmts if possible.
3236 Return FALSE if it's impossible to SLP any stmt in the loop. */
3238 static bool
3239 vect_analyze_slp_instance (vec_info *vinfo,
3240 scalar_stmts_to_slp_tree_map_t *bst_map,
3241 stmt_vec_info stmt_info, slp_instance_kind kind,
3242 unsigned max_tree_size, unsigned *limit);
3244 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3245 of KIND. Return true if successful. */
3247 static bool
3248 vect_build_slp_instance (vec_info *vinfo,
3249 slp_instance_kind kind,
3250 vec<stmt_vec_info> &scalar_stmts,
3251 vec<stmt_vec_info> &root_stmt_infos,
3252 vec<tree> &remain,
3253 unsigned max_tree_size, unsigned *limit,
3254 scalar_stmts_to_slp_tree_map_t *bst_map,
3255 /* ??? We need stmt_info for group splitting. */
3256 stmt_vec_info stmt_info_)
3258 if (kind == slp_inst_kind_ctor)
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Analyzing vectorizable constructor: %G\n",
3263 root_stmt_infos[0]->stmt);
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Starting SLP discovery for\n");
3270 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3271 dump_printf_loc (MSG_NOTE, vect_location,
3272 " %G", scalar_stmts[i]->stmt);
3275 /* When a BB reduction doesn't have an even number of lanes
3276 strip it down, treating the remaining lane as scalar.
3277 ??? Selecting the optimal set of lanes to vectorize would be nice
3278 but SLP build for all lanes will fail quickly because we think
3279 we're going to need unrolling. */
3280 if (kind == slp_inst_kind_bb_reduc
3281 && (scalar_stmts.length () & 1))
3282 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3284 /* Build the tree for the SLP instance. */
3285 unsigned int group_size = scalar_stmts.length ();
3286 bool *matches = XALLOCAVEC (bool, group_size);
3287 poly_uint64 max_nunits = 1;
3288 unsigned tree_size = 0;
3289 unsigned i;
3290 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3291 &max_nunits, matches, limit,
3292 &tree_size, bst_map);
3293 if (node != NULL)
3295 /* Calculate the unrolling factor based on the smallest type. */
3296 poly_uint64 unrolling_factor
3297 = calculate_unrolling_factor (max_nunits, group_size);
3299 if (maybe_ne (unrolling_factor, 1U)
3300 && is_a <bb_vec_info> (vinfo))
3302 unsigned HOST_WIDE_INT const_max_nunits;
3303 if (!max_nunits.is_constant (&const_max_nunits)
3304 || const_max_nunits > group_size)
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "Build SLP failed: store group "
3309 "size not a multiple of the vector size "
3310 "in basic block SLP\n");
3311 vect_free_slp_tree (node);
3312 return false;
3314 /* Fatal mismatch. */
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "SLP discovery succeeded but node needs "
3318 "splitting\n");
3319 memset (matches, true, group_size);
3320 matches[group_size / const_max_nunits * const_max_nunits] = false;
3321 vect_free_slp_tree (node);
3323 else
3325 /* Create a new SLP instance. */
3326 slp_instance new_instance = XNEW (class _slp_instance);
3327 SLP_INSTANCE_TREE (new_instance) = node;
3328 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3329 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3330 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3331 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3332 SLP_INSTANCE_KIND (new_instance) = kind;
3333 new_instance->reduc_phis = NULL;
3334 new_instance->cost_vec = vNULL;
3335 new_instance->subgraph_entries = vNULL;
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "SLP size %u vs. limit %u.\n",
3340 tree_size, max_tree_size);
3342 /* Fixup SLP reduction chains. */
3343 if (kind == slp_inst_kind_reduc_chain)
3345 /* If this is a reduction chain with a conversion in front
3346 amend the SLP tree with a node for that. */
3347 gimple *scalar_def
3348 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3349 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3351 /* Get at the conversion stmt - we know it's the single use
3352 of the last stmt of the reduction chain. */
3353 use_operand_p use_p;
3354 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3355 &use_p, &scalar_def);
3356 gcc_assert (r);
3357 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3358 next_info = vect_stmt_to_vectorize (next_info);
3359 scalar_stmts = vNULL;
3360 scalar_stmts.create (group_size);
3361 for (unsigned i = 0; i < group_size; ++i)
3362 scalar_stmts.quick_push (next_info);
3363 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3364 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3365 SLP_TREE_CHILDREN (conv).quick_push (node);
3366 SLP_INSTANCE_TREE (new_instance) = conv;
3367 /* We also have to fake this conversion stmt as SLP reduction
3368 group so we don't have to mess with too much code
3369 elsewhere. */
3370 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3371 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3373 /* Fill the backedge child of the PHI SLP node. The
3374 general matching code cannot find it because the
3375 scalar code does not reflect how we vectorize the
3376 reduction. */
3377 use_operand_p use_p;
3378 imm_use_iterator imm_iter;
3379 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3380 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3381 gimple_get_lhs (scalar_def))
3382 /* There are exactly two non-debug uses, the reduction
3383 PHI and the loop-closed PHI node. */
3384 if (!is_gimple_debug (USE_STMT (use_p))
3385 && gimple_bb (USE_STMT (use_p)) == loop->header)
3387 auto_vec<stmt_vec_info, 64> phis (group_size);
3388 stmt_vec_info phi_info
3389 = vinfo->lookup_stmt (USE_STMT (use_p));
3390 for (unsigned i = 0; i < group_size; ++i)
3391 phis.quick_push (phi_info);
3392 slp_tree *phi_node = bst_map->get (phis);
3393 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3394 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3395 = SLP_INSTANCE_TREE (new_instance);
3396 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3400 vinfo->slp_instances.safe_push (new_instance);
3402 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3403 the number of scalar stmts in the root in a few places.
3404 Verify that assumption holds. */
3405 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3406 .length () == group_size);
3408 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_NOTE, vect_location,
3411 "Final SLP tree for instance %p:\n",
3412 (void *) new_instance);
3413 vect_print_slp_graph (MSG_NOTE, vect_location,
3414 SLP_INSTANCE_TREE (new_instance));
3417 return true;
3420 else
3422 /* Failed to SLP. */
3423 /* Free the allocated memory. */
3424 scalar_stmts.release ();
3427 stmt_vec_info stmt_info = stmt_info_;
3428 /* Try to break the group up into pieces. */
3429 if (kind == slp_inst_kind_store)
3431 /* ??? We could delay all the actual splitting of store-groups
3432 until after SLP discovery of the original group completed.
3433 Then we can recurse to vect_build_slp_instance directly. */
3434 for (i = 0; i < group_size; i++)
3435 if (!matches[i])
3436 break;
3438 /* For basic block SLP, try to break the group up into multiples of
3439 a vector size. */
3440 if (is_a <bb_vec_info> (vinfo)
3441 && (i > 1 && i < group_size))
3443 tree scalar_type
3444 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3445 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3446 1 << floor_log2 (i));
3447 unsigned HOST_WIDE_INT const_nunits;
3448 if (vectype
3449 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3451 /* Split into two groups at the first vector boundary. */
3452 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3453 unsigned group1_size = i & ~(const_nunits - 1);
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "Splitting SLP group at stmt %u\n", i);
3458 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3459 group1_size);
3460 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3461 kind, max_tree_size,
3462 limit);
3463 /* Split the rest at the failure point and possibly
3464 re-analyze the remaining matching part if it has
3465 at least two lanes. */
3466 if (group1_size < i
3467 && (i + 1 < group_size
3468 || i - group1_size > 1))
3470 stmt_vec_info rest2 = rest;
3471 rest = vect_split_slp_store_group (rest, i - group1_size);
3472 if (i - group1_size > 1)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3474 kind, max_tree_size,
3475 limit);
3477 /* Re-analyze the non-matching tail if it has at least
3478 two lanes. */
3479 if (i + 1 < group_size)
3480 res |= vect_analyze_slp_instance (vinfo, bst_map,
3481 rest, kind, max_tree_size,
3482 limit);
3483 return res;
3487 /* For loop vectorization split into arbitrary pieces of size > 1. */
3488 if (is_a <loop_vec_info> (vinfo)
3489 && (i > 1 && i < group_size)
3490 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3492 unsigned group1_size = i;
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_NOTE, vect_location,
3496 "Splitting SLP group at stmt %u\n", i);
3498 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3499 group1_size);
3500 /* Loop vectorization cannot handle gaps in stores, make sure
3501 the split group appears as strided. */
3502 STMT_VINFO_STRIDED_P (rest) = 1;
3503 DR_GROUP_GAP (rest) = 0;
3504 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3505 DR_GROUP_GAP (stmt_info) = 0;
3507 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3508 kind, max_tree_size, limit);
3509 if (i + 1 < group_size)
3510 res |= vect_analyze_slp_instance (vinfo, bst_map,
3511 rest, kind, max_tree_size, limit);
3513 return res;
3516 /* Even though the first vector did not all match, we might be able to SLP
3517 (some) of the remainder. FORNOW ignore this possibility. */
3520 /* Failed to SLP. */
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3523 return false;
3527 /* Analyze an SLP instance starting from a group of grouped stores. Call
3528 vect_build_slp_tree to build a tree of packed stmts if possible.
3529 Return FALSE if it's impossible to SLP any stmt in the loop. */
3531 static bool
3532 vect_analyze_slp_instance (vec_info *vinfo,
3533 scalar_stmts_to_slp_tree_map_t *bst_map,
3534 stmt_vec_info stmt_info,
3535 slp_instance_kind kind,
3536 unsigned max_tree_size, unsigned *limit)
3538 unsigned int i;
3539 vec<stmt_vec_info> scalar_stmts;
3541 if (is_a <bb_vec_info> (vinfo))
3542 vect_location = stmt_info->stmt;
3544 stmt_vec_info next_info = stmt_info;
3545 if (kind == slp_inst_kind_store)
3547 /* Collect the stores and store them in scalar_stmts. */
3548 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3549 while (next_info)
3551 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3552 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3555 else if (kind == slp_inst_kind_reduc_chain)
3557 /* Collect the reduction stmts and store them in scalar_stmts. */
3558 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3559 while (next_info)
3561 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3562 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3564 /* Mark the first element of the reduction chain as reduction to properly
3565 transform the node. In the reduction analysis phase only the last
3566 element of the chain is marked as reduction. */
3567 STMT_VINFO_DEF_TYPE (stmt_info)
3568 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3569 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3570 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3572 else if (kind == slp_inst_kind_reduc_group)
3574 /* Collect reduction statements. */
3575 const vec<stmt_vec_info> &reductions
3576 = as_a <loop_vec_info> (vinfo)->reductions;
3577 scalar_stmts.create (reductions.length ());
3578 for (i = 0; reductions.iterate (i, &next_info); i++)
3579 if ((STMT_VINFO_RELEVANT_P (next_info)
3580 || STMT_VINFO_LIVE_P (next_info))
3581 /* ??? Make sure we didn't skip a conversion around a reduction
3582 path. In that case we'd have to reverse engineer that conversion
3583 stmt following the chain using reduc_idx and from the PHI
3584 using reduc_def. */
3585 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3586 scalar_stmts.quick_push (next_info);
3587 /* If less than two were relevant/live there's nothing to SLP. */
3588 if (scalar_stmts.length () < 2)
3589 return false;
3591 else
3592 gcc_unreachable ();
3594 vec<stmt_vec_info> roots = vNULL;
3595 vec<tree> remain = vNULL;
3596 /* Build the tree for the SLP instance. */
3597 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3598 roots, remain,
3599 max_tree_size, limit, bst_map,
3600 kind == slp_inst_kind_store
3601 ? stmt_info : NULL);
3603 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3604 where we should do store group splitting. */
3606 return res;
3609 /* Check if there are stmts in the loop that can be vectorized using SLP.
3610 Build SLP trees of packed scalar stmts if SLP is possible. */
3612 opt_result
3613 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3615 unsigned int i;
3616 stmt_vec_info first_element;
3617 slp_instance instance;
3619 DUMP_VECT_SCOPE ("vect_analyze_slp");
3621 unsigned limit = max_tree_size;
3623 scalar_stmts_to_slp_tree_map_t *bst_map
3624 = new scalar_stmts_to_slp_tree_map_t ();
3626 /* Find SLP sequences starting from groups of grouped stores. */
3627 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3628 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3629 slp_inst_kind_store, max_tree_size, &limit);
3631 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3633 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3635 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3636 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3637 bb_vinfo->roots[i].stmts,
3638 bb_vinfo->roots[i].roots,
3639 bb_vinfo->roots[i].remain,
3640 max_tree_size, &limit, bst_map, NULL))
3642 bb_vinfo->roots[i].stmts = vNULL;
3643 bb_vinfo->roots[i].roots = vNULL;
3644 bb_vinfo->roots[i].remain = vNULL;
3649 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3651 /* Find SLP sequences starting from reduction chains. */
3652 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3653 if (! STMT_VINFO_RELEVANT_P (first_element)
3654 && ! STMT_VINFO_LIVE_P (first_element))
3656 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3657 slp_inst_kind_reduc_chain,
3658 max_tree_size, &limit))
3660 /* Dissolve reduction chain group. */
3661 stmt_vec_info vinfo = first_element;
3662 stmt_vec_info last = NULL;
3663 while (vinfo)
3665 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3666 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3667 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3668 last = vinfo;
3669 vinfo = next;
3671 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3672 /* It can still be vectorized as part of an SLP reduction. */
3673 loop_vinfo->reductions.safe_push (last);
3676 /* Find SLP sequences starting from groups of reductions. */
3677 if (loop_vinfo->reductions.length () > 1)
3678 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3679 slp_inst_kind_reduc_group, max_tree_size,
3680 &limit);
3683 hash_set<slp_tree> visited_patterns;
3684 slp_tree_to_load_perm_map_t perm_cache;
3685 slp_compat_nodes_map_t compat_cache;
3687 /* See if any patterns can be found in the SLP tree. */
3688 bool pattern_found = false;
3689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3690 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3691 &visited_patterns, &perm_cache,
3692 &compat_cache);
3694 /* If any were found optimize permutations of loads. */
3695 if (pattern_found)
3697 hash_map<slp_tree, slp_tree> load_map;
3698 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3700 slp_tree root = SLP_INSTANCE_TREE (instance);
3701 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3702 &load_map, root);
3708 /* The map keeps a reference to the SLP nodes built; release that. */
3709 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3710 it != bst_map->end (); ++it)
3711 if ((*it).second)
3712 vect_free_slp_tree ((*it).second);
3713 delete bst_map;
3715 if (pattern_found && dump_enabled_p ())
3717 dump_printf_loc (MSG_NOTE, vect_location,
3718 "Pattern matched SLP tree\n");
3719 hash_set<slp_tree> visited;
3720 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3721 vect_print_slp_graph (MSG_NOTE, vect_location,
3722 SLP_INSTANCE_TREE (instance), visited);
3725 return opt_result::success ();
3728 /* Estimates the cost of inserting layout changes into the SLP graph.
3729 It can also say that the insertion is impossible. */
3731 struct slpg_layout_cost
3733 slpg_layout_cost () = default;
3734 slpg_layout_cost (sreal, bool);
3736 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3737 bool is_possible () const { return depth != sreal::max (); }
3739 bool operator== (const slpg_layout_cost &) const;
3740 bool operator!= (const slpg_layout_cost &) const;
3742 bool is_better_than (const slpg_layout_cost &, bool) const;
3744 void add_parallel_cost (const slpg_layout_cost &);
3745 void add_serial_cost (const slpg_layout_cost &);
3746 void split (unsigned int);
3748 /* The longest sequence of layout changes needed during any traversal
3749 of the partition dag, weighted by execution frequency.
3751 This is the most important metric when optimizing for speed, since
3752 it helps to ensure that we keep the number of operations on
3753 critical paths to a minimum. */
3754 sreal depth = 0;
3756 /* An estimate of the total number of operations needed. It is weighted by
3757 execution frequency when optimizing for speed but not when optimizing for
3758 size. In order to avoid double-counting, a node with a fanout of N will
3759 distribute 1/N of its total cost to each successor.
3761 This is the most important metric when optimizing for size, since
3762 it helps to keep the total number of operations to a minimum. */
3763 sreal total = 0;
3766 /* Construct costs for a node with weight WEIGHT. A higher weight
3767 indicates more frequent execution. IS_FOR_SIZE is true if we are
3768 optimizing for size rather than speed. */
3770 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3771 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3775 bool
3776 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3778 return depth == other.depth && total == other.total;
3781 bool
3782 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3784 return !operator== (other);
3787 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3788 true if we are optimizing for size rather than speed. */
3790 bool
3791 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3792 bool is_for_size) const
3794 if (is_for_size)
3796 if (total != other.total)
3797 return total < other.total;
3798 return depth < other.depth;
3800 else
3802 if (depth != other.depth)
3803 return depth < other.depth;
3804 return total < other.total;
3808 /* Increase the costs to account for something with cost INPUT_COST
3809 happening in parallel with the current costs. */
3811 void
3812 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3814 depth = std::max (depth, input_cost.depth);
3815 total += input_cost.total;
3818 /* Increase the costs to account for something with cost INPUT_COST
3819 happening in series with the current costs. */
3821 void
3822 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3824 depth += other.depth;
3825 total += other.total;
3828 /* Split the total cost among TIMES successors or predecessors. */
3830 void
3831 slpg_layout_cost::split (unsigned int times)
3833 if (times > 1)
3834 total /= times;
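/* An illustrative worked example, not from the original source: suppose a
partition with out_degree 2 has accumulated cost { depth = 4, total = 6 }.
split (2) charges each successor { depth = 4, total = 3 } so that the total
is not double-counted. A consumer combining two such inputs with
add_parallel_cost gets { depth = 4, total = 6 } (parallel combination takes
the maximum depth), whereas combining them with add_serial_cost would give
{ depth = 8, total = 6 } (serial combination adds the depths); both sum the
totals. */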
3837 /* Information about one node in the SLP graph, for use during
3838 vect_optimize_slp_pass. */
3840 struct slpg_vertex
3842 slpg_vertex (slp_tree node_) : node (node_) {}
3844 /* The node itself. */
3845 slp_tree node;
3847 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3848 partitions are flexible; they can have whichever layout consumers
3849 want them to have. */
3850 int partition = -1;
3852 /* The number of nodes that directly use the result of this one
3853 (i.e. the number of nodes that count this one as a child). */
3854 unsigned int out_degree = 0;
3856 /* The execution frequency of the node. */
3857 sreal weight = 0;
3859 /* The total execution frequency of all nodes that directly use the
3860 result of this one. */
3861 sreal out_weight = 0;
3864 /* Information about one partition of the SLP graph, for use during
3865 vect_optimize_slp_pass. */
3867 struct slpg_partition_info
3869 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3870 of m_partitioned_nodes. */
3871 unsigned int node_begin = 0;
3872 unsigned int node_end = 0;
3874 /* Which layout we've chosen to use for this partition, or -1 if
3875 we haven't picked one yet. */
3876 int layout = -1;
3878 /* The number of predecessors and successors in the partition dag.
3879 The predecessors always have lower partition numbers and the
3880 successors always have higher partition numbers.
3882 Note that the directions of these edges are not necessarily the
3883 same as in the data flow graph. For example, if an SCC has separate
3884 partitions for an inner loop and an outer loop, the inner loop's
3885 partition will have at least two incoming edges from the outer loop's
3886 partition: one for a live-in value and one for a live-out value.
3887 In data flow terms, one of these edges would also be from the outer loop
3888 to the inner loop, but the other would be in the opposite direction. */
3889 unsigned int in_degree = 0;
3890 unsigned int out_degree = 0;
3893 /* Information about the costs of using a particular layout for a
3894 particular partition. It can also say that the combination is
3895 impossible. */
3897 struct slpg_partition_layout_costs
3899 bool is_possible () const { return internal_cost.is_possible (); }
3900 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3902 /* The costs inherited from predecessor partitions. */
3903 slpg_layout_cost in_cost;
3905 /* The inherent cost of the layout within the node itself. For example,
3906 this is nonzero for a load if choosing a particular layout would require
3907 the load to permute the loaded elements. It is nonzero for a
3908 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3909 to full-vector moves. */
3910 slpg_layout_cost internal_cost;
3912 /* The costs inherited from successor partitions. */
3913 slpg_layout_cost out_cost;
3916 /* This class tries to optimize the layout of vectors in order to avoid
3917 unnecessary shuffling. At the moment, the set of possible layouts is
3918 restricted to bijective permutations.
3920 The goal of the pass depends on whether we're optimizing for size or
3921 for speed. When optimizing for size, the goal is to reduce the overall
3922 number of layout changes (including layout changes implied by things
3923 like load permutations). When optimizing for speed, the goal is to
3924 reduce the maximum latency attributable to layout changes on any
3925 non-cyclical path through the data flow graph.
3927 For example, when optimizing a loop nest for speed, we will prefer
3928 to make layout changes outside of a loop rather than inside of a loop,
3929 and will prefer to make layout changes in parallel rather than serially,
3930 even if that increases the overall number of layout changes.
3932 The high-level procedure is:
3934 (1) Build a graph in which edges go from uses (parents) to definitions
3935 (children).
3937 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3939 (3) When optimizing for speed, partition the nodes in each SCC based
3940 on their containing cfg loop. When optimizing for size, treat
3941 each SCC as a single partition.
3943 This gives us a dag of partitions. The goal is now to assign a
3944 layout to each partition.
3946 (4) Construct a set of vector layouts that are worth considering.
3947 Record which nodes must keep their current layout.
3949 (5) Perform a forward walk over the partition dag (from loads to stores)
3950 accumulating the "forward" cost of using each layout. When visiting
3951 each partition, assign a tentative choice of layout to the partition
3952 and use that choice when calculating the cost of using a different
3953 layout in successor partitions.
3955 (6) Perform a backward walk over the partition dag (from stores to loads),
3956 accumulating the "backward" cost of using each layout. When visiting
3957 each partition, make a final choice of layout for that partition based
3958 on the accumulated forward costs (from (5)) and backward costs
3959 (from (6)).
3961 (7) Apply the chosen layouts to the SLP graph.
3963 For example, consider the SLP statements:
3965 S1: a_1 = load
3966 loop:
3967 S2: a_2 = PHI<a_1, a_3>
3968 S3: b_1 = load
3969 S4: a_3 = a_2 + b_1
3970 exit:
3971 S5: a_4 = PHI<a_3>
3972 S6: store a_4
3974 S2 and S4 form an SCC and are part of the same loop. Every other
3975 statement is in a singleton SCC. In this example there is a one-to-one
3976 mapping between SCCs and partitions and the partition dag looks like this:
3978 S1      S3
3979  \      /
3980   S2+S4
3981     |
3982     S5
3983     |
3984     S6
3986 S2, S3 and S4 will have a higher execution frequency than the other
3987 statements, so when optimizing for speed, the goal is to avoid any
3988 layout changes:
3990 - within S3
3991 - within S2+S4
3992 - on the S3->S2+S4 edge
3994 For example, if S3 was originally a reversing load, the goal of the
3995 pass is to make it an unreversed load and change the layout on the
3996 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3997 on S1->S2+S4 and S5->S6 would also be acceptable.)
3999 The difference between SCCs and partitions becomes important if we
4000 add an outer loop:
4002 S1: a_1 = ...
4003 loop1:
4004 S2: a_2 = PHI<a_1, a_6>
4005 S3: b_1 = load
4006 S4: a_3 = a_2 + b_1
4007 loop2:
4008 S5: a_4 = PHI<a_3, a_5>
4009 S6: c_1 = load
4010 S7: a_5 = a_4 + c_1
4011 exit2:
4012 S8: a_6 = PHI<a_5>
4013 S9: store a_6
4014 exit1:
4016 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4017 for speed, we usually do not want restrictions in the outer loop to "infect"
4018 the decision for the inner loop. For example, if an outer-loop node
4019 in the SCC contains a statement with a fixed layout, that should not
4020 prevent the inner loop from using a different layout. Conversely,
4021 the inner loop should not dictate a layout to the outer loop: if the
4022 outer loop does a lot of computation, then it may not be efficient to
4023 do all of that computation in the inner loop's preferred layout.
4025 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4026 and S5+S7 (inner). We also try to arrange partitions so that:
4028 - the partition for an outer loop comes before the partition for
4029 an inner loop
4031 - if a sibling loop A dominates a sibling loop B, A's partition
4032 comes before B's
4034 This gives the following partition dag for the example above:
4036 S1      S3
4037  \      /
4038   S2+S4+S8    S6
4039     |    \\   /
4040     |     S5+S7
4041     |
4042    S9
4044 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4045 one for a reversal of the edge S7->S8.
4047 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4048 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4049 preferred layout against the cost of changing the layout on entry to the
4050 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4052 Although this works well when optimizing for speed, it has the downside
4053 when optimizing for size that the choice of layout for S5+S7 is completely
4054 independent of S9, which lessens the chance of reducing the overall number
4055 of permutations. We therefore do not partition SCCs when optimizing
4056 for size.
4058 To give a concrete example of the difference between optimizing
4059 for size and speed, consider:
4061 a[0] = (b[1] << c[3]) - d[1];
4062 a[1] = (b[0] << c[2]) - d[0];
4063 a[2] = (b[3] << c[1]) - d[3];
4064 a[3] = (b[2] << c[0]) - d[2];
4066 There are three different layouts here: one for a, one for b and d,
4067 and one for c. When optimizing for speed it is better to permute each
4068 of b, c and d into the order required by a, since those permutations
4069 happen in parallel. But when optimizing for size, it is better to:
4071 - permute c into the same order as b
4072 - do the arithmetic
4073 - permute the result into the order required by a
4075 This gives 2 permutations rather than 3. */
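/* To relate the example above to the cost metrics used below (an
illustrative reading, not spelled out in the original comment): permuting
b, c and d independently costs 3 in "total" but only 1 in "depth", because
the three permutations can run in parallel; permuting c to match b and then
permuting the result costs 2 in "total" but 2 in "depth", because those two
permutations are serial. Speed therefore prefers the first option and size
the second. */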
4077 class vect_optimize_slp_pass
4079 public:
4080 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4081 void run ();
4083 private:
4084 /* Graph building. */
4085 struct loop *containing_loop (slp_tree);
4086 bool is_cfg_latch_edge (graph_edge *);
4087 void build_vertices (hash_set<slp_tree> &, slp_tree);
4088 void build_vertices ();
4089 void build_graph ();
4091 /* Partitioning. */
4092 void create_partitions ();
4093 template<typename T> void for_each_partition_edge (unsigned int, T);
4095 /* Layout selection. */
4096 bool is_compatible_layout (slp_tree, unsigned int);
4097 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4098 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4099 unsigned int);
4100 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4101 int, unsigned int);
4102 int internal_node_cost (slp_tree, int, unsigned int);
4103 void start_choosing_layouts ();
4105 /* Cost propagation. */
4106 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4107 unsigned int, unsigned int);
4108 slpg_layout_cost total_in_cost (unsigned int);
4109 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4110 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4111 void forward_pass ();
4112 void backward_pass ();
4114 /* Rematerialization. */
4115 slp_tree get_result_with_layout (slp_tree, unsigned int);
4116 void materialize ();
4118 /* Clean-up. */
4119 void remove_redundant_permutations ();
4121 void dump ();
4123 vec_info *m_vinfo;
4125 /* True if we should optimize the graph for size, false if we should
4126 optimize it for speed. (It wouldn't be easy to make this decision
4127 more locally.) */
4128 bool m_optimize_size;
4130 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4131 In other words, a node's predecessors are its slp_tree parents and
4132 a node's successors are its slp_tree children. */
4133 graph *m_slpg = nullptr;
4135 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4136 auto_vec<slpg_vertex> m_vertices;
4138 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4139 and loads. */
4140 auto_vec<int> m_leafs;
4142 /* This array has one entry for every vector layout that we're considering.
4143 Element 0 is null and indicates "no change". Other entries describe
4144 permutations that are inherent in the current graph and that we would
4145 like to reverse if possible.
4147 For example, a permutation { 1, 2, 3, 0 } means that something has
4148 effectively been permuted in that way, such as a load group
4149 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4150 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4151 in order to put things "back" in order. */
4152 auto_vec<vec<unsigned> > m_perms;
4154 /* A partitioning of the nodes for which a layout must be chosen.
4155 Each partition represents an <SCC, cfg loop> pair; that is,
4156 nodes in different SCCs belong to different partitions, and nodes
4157 within an SCC can be further partitioned according to a containing
4158 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4160 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4161 from leaves (such as loads) to roots (such as stores).
4163 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4164 auto_vec<slpg_partition_info> m_partitions;
4166 /* The list of all nodes for which a layout must be chosen. Nodes for
4167 partition P come before the nodes for partition P+1. Nodes within a
4168 partition are in reverse postorder. */
4169 auto_vec<unsigned int> m_partitioned_nodes;
4171 /* Index P * num-layouts + L contains the cost of using layout L
4172 for partition P. */
4173 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4175 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4176 original output of node N adjusted to have layout L. */
4177 auto_vec<slp_tree> m_node_layouts;
4180 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4181 Also record whether we should optimize anything for speed rather
4182 than size. */
4184 void
4185 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4186 slp_tree node)
4188 unsigned i;
4189 slp_tree child;
4191 if (visited.add (node))
4192 return;
4194 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4196 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4197 if (optimize_bb_for_speed_p (bb))
4198 m_optimize_size = false;
4201 node->vertex = m_vertices.length ();
4202 m_vertices.safe_push (slpg_vertex (node));
4204 bool leaf = true;
4205 bool force_leaf = false;
4206 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4207 if (child)
4209 leaf = false;
4210 build_vertices (visited, child);
4212 else
4213 force_leaf = true;
4214 /* Since SLP discovery works along use-def edges, all cycles have an
4215 entry - but there's the exception of cycles where we do not handle
4216 the entry explicitly (but with a NULL SLP node), like some reductions
4217 and inductions. Force those SLP PHIs to act as leafs to make them
4218 backwards reachable. */
4219 if (leaf || force_leaf)
4220 m_leafs.safe_push (node->vertex);
4223 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4225 void
4226 vect_optimize_slp_pass::build_vertices ()
4228 hash_set<slp_tree> visited;
4229 unsigned i;
4230 slp_instance instance;
4231 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4232 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4235 /* Apply (reverse) bijective PERM to VEC. */
4237 template <class T>
4238 static void
4239 vect_slp_permute (vec<unsigned> perm,
4240 vec<T> &vec, bool reverse)
4242 auto_vec<T, 64> saved;
4243 saved.create (vec.length ());
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 saved.quick_push (vec[i]);
4247 if (reverse)
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[perm[i]] = saved[i];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[perm[i]] == saved[i]);
4254 else
4256 for (unsigned i = 0; i < vec.length (); ++i)
4257 vec[i] = saved[perm[i]];
4258 for (unsigned i = 0; i < vec.length (); ++i)
4259 gcc_assert (vec[i] == saved[perm[i]]);
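/* Worked example (illustrative, not from the original source): with
PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d }, the forward direction
(REVERSE == false) computes vec[i] = saved[perm[i]] and yields
{ b, c, d, a }, while REVERSE == true computes vec[perm[i]] = saved[i]
and yields { d, a, b, c }. Applying one direction and then the other
restores the original order, since the two are inverse permutations. */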
4263 /* Return the cfg loop that contains NODE. */
4265 struct loop *
4266 vect_optimize_slp_pass::containing_loop (slp_tree node)
4268 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4269 if (!rep)
4270 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4271 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4274 /* Return true if UD (an edge from a use to a definition) is associated
4275 with a loop latch edge in the cfg. */
4277 bool
4278 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4280 slp_tree use = m_vertices[ud->src].node;
4281 slp_tree def = m_vertices[ud->dest].node;
4282 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4283 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4284 return false;
4286 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4287 return (is_a<gphi *> (use_rep->stmt)
4288 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4289 && containing_loop (def) == containing_loop (use));
4292 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4293 a nonnull data field. */
4295 void
4296 vect_optimize_slp_pass::build_graph ()
4298 m_optimize_size = true;
4299 build_vertices ();
4301 m_slpg = new_graph (m_vertices.length ());
4302 for (slpg_vertex &v : m_vertices)
4303 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4304 if (child)
4306 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4307 if (is_cfg_latch_edge (ud))
4308 ud->data = this;
4312 /* Return true if E corresponds to a loop latch edge in the cfg. */
4314 static bool
4315 skip_cfg_latch_edges (graph_edge *e)
4317 return e->data;
4320 /* Create the node partitions. */
4322 void
4323 vect_optimize_slp_pass::create_partitions ()
4325 /* Calculate a postorder of the graph, ignoring edges that correspond
4326 to natural latch edges in the cfg. Reading the vector from the end
4327 to the beginning gives the reverse postorder. */
4328 auto_vec<int> initial_rpo;
4329 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4330 false, NULL, skip_cfg_latch_edges);
4331 gcc_assert (initial_rpo.length () == m_vertices.length ());
4333 /* Calculate the strongly connected components of the graph. */
4334 auto_vec<int> scc_grouping;
4335 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4337 /* Create a new index order in which all nodes from the same SCC are
4338 consecutive. Use scc_pos to record the index of the first node in
4339 each SCC. */
4340 auto_vec<unsigned int> scc_pos (num_sccs);
4341 int last_component = -1;
4342 unsigned int node_count = 0;
4343 for (unsigned int node_i : scc_grouping)
4345 if (last_component != m_slpg->vertices[node_i].component)
4347 last_component = m_slpg->vertices[node_i].component;
4348 gcc_assert (last_component == int (scc_pos.length ()));
4349 scc_pos.quick_push (node_count);
4351 node_count += 1;
4353 gcc_assert (node_count == initial_rpo.length ()
4354 && last_component + 1 == int (num_sccs));
4356 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4357 inside each SCC following the RPO we calculated above. The fact that
4358 we ignored natural latch edges when calculating the RPO should ensure
4359 that, for natural loop nests:
4361 - the first node that we encounter in a cfg loop is the loop header phi
4362 - the loop header phis are in dominance order
4364 Arranging for this is an optimization (see below) rather than a
4365 correctness issue. Unnatural loops with a tangled mess of backedges
4366 will still work correctly, but might give poorer results.
4368 Also update scc_pos so that it gives 1 + the index of the last node
4369 in the SCC. */
4370 m_partitioned_nodes.safe_grow (node_count);
4371 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4373 unsigned int node_i = initial_rpo[old_i];
4374 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4375 m_partitioned_nodes[new_i] = node_i;
4378 /* When optimizing for speed, partition each SCC based on the containing
4379 cfg loop. The order we constructed above should ensure that, for natural
4380 cfg loops, we'll create sub-SCC partitions for outer loops before
4381 the corresponding sub-SCC partitions for inner loops. Similarly,
4382 when one sibling loop A dominates another sibling loop B, we should
4383 create a sub-SCC partition for A before a sub-SCC partition for B.
4385 As above, nothing depends for correctness on whether this achieves
4386 a natural nesting, but we should get better results when it does. */
4387 m_partitions.reserve (m_vertices.length ());
4388 unsigned int next_partition_i = 0;
4389 hash_map<struct loop *, int> loop_partitions;
4390 unsigned int rpo_begin = 0;
4391 unsigned int num_partitioned_nodes = 0;
4392 for (unsigned int rpo_end : scc_pos)
4394 loop_partitions.empty ();
4395 unsigned int partition_i = next_partition_i;
4396 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4398 /* Handle externals and constants optimistically throughout.
4399 But treat existing vectors as fixed since we do not handle
4400 permuting them. */
4401 unsigned int node_i = m_partitioned_nodes[rpo_i];
4402 auto &vertex = m_vertices[node_i];
4403 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4404 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4405 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4406 vertex.partition = -1;
4407 else
4409 bool existed;
4410 if (m_optimize_size)
4411 existed = next_partition_i > partition_i;
4412 else
4414 struct loop *loop = containing_loop (vertex.node);
4415 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4416 if (!existed)
4417 entry = next_partition_i;
4418 partition_i = entry;
4420 if (!existed)
4422 m_partitions.quick_push (slpg_partition_info ());
4423 next_partition_i += 1;
4425 vertex.partition = partition_i;
4426 num_partitioned_nodes += 1;
4427 m_partitions[partition_i].node_end += 1;
4430 rpo_begin = rpo_end;
4433 /* Assign ranges of consecutive node indices to each partition,
4434 in partition order. Start with node_end being the same as
4435 node_begin so that the next loop can use it as a counter. */
4436 unsigned int node_begin = 0;
4437 for (auto &partition : m_partitions)
4439 partition.node_begin = node_begin;
4440 node_begin += partition.node_end;
4441 partition.node_end = partition.node_begin;
4443 gcc_assert (node_begin == num_partitioned_nodes);
4445 /* Finally build the list of nodes in partition order. */
4446 m_partitioned_nodes.truncate (num_partitioned_nodes);
4447 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4449 int partition_i = m_vertices[node_i].partition;
4450 if (partition_i >= 0)
4452 unsigned int order_i = m_partitions[partition_i].node_end++;
4453 m_partitioned_nodes[order_i] = node_i;
4458 /* Look for edges from earlier partitions into node NODE_I and edges from
4459 node NODE_I into later partitions. Call:
4461 FN (ud, other_node_i)
4463 for each such use-to-def edge ud, where other_node_i is the node at the
4464 other end of the edge. */
4466 template<typename T>
4467 void
4468 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4470 int partition_i = m_vertices[node_i].partition;
4471 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4472 pred; pred = pred->pred_next)
4474 int src_partition_i = m_vertices[pred->src].partition;
4475 if (src_partition_i >= 0 && src_partition_i != partition_i)
4476 fn (pred, pred->src);
4478 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4479 succ; succ = succ->succ_next)
4481 int dest_partition_i = m_vertices[succ->dest].partition;
4482 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4483 fn (succ, succ->dest);
4487 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4488 that NODE would operate on. This test is independent of NODE's actual
4489 operation. */
4491 bool
4492 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4493 unsigned int layout_i)
4495 if (layout_i == 0)
4496 return true;
4498 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4499 return false;
4501 return true;
4504 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4505 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4506 layouts is incompatible with NODE or if the change is not possible for
4507 some other reason.
4509 The properties taken from NODE include the number of lanes and the
4510 vector type. The actual operation doesn't matter. */
4513 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4514 unsigned int from_layout_i,
4515 unsigned int to_layout_i)
4517 if (!is_compatible_layout (node, from_layout_i)
4518 || !is_compatible_layout (node, to_layout_i))
4519 return -1;
4521 if (from_layout_i == to_layout_i)
4522 return 0;
4524 auto_vec<slp_tree, 1> children (1);
4525 children.quick_push (node);
4526 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4527 if (from_layout_i > 0)
4528 for (unsigned int i : m_perms[from_layout_i])
4529 perm.quick_push ({ 0, i });
4530 else
4531 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4532 perm.quick_push ({ 0, i });
4533 if (to_layout_i > 0)
4534 vect_slp_permute (m_perms[to_layout_i], perm, true);
4535 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4536 children, false);
4537 if (count >= 0)
4538 return MAX (count, 1);
4540 /* ??? In principle we could try changing via layout 0, giving two
4541 layout changes rather than 1. Doing that would require
4542 corresponding support in get_result_with_layout. */
4543 return -1;
4546 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4548 inline slpg_partition_layout_costs &
4549 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4550 unsigned int layout_i)
4552 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4555 /* Change PERM in one of two ways:
4557 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4558 chosen for child I of NODE.
4560 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4562 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4564 void
4565 vect_optimize_slp_pass::
4566 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4567 int in_layout_i, unsigned int out_layout_i)
4569 for (auto &entry : perm)
4571 int this_in_layout_i = in_layout_i;
4572 if (this_in_layout_i < 0)
4574 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4575 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4576 this_in_layout_i = m_partitions[in_partition_i].layout;
4578 if (this_in_layout_i > 0)
4579 entry.second = m_perms[this_in_layout_i][entry.second];
4581 if (out_layout_i > 0)
4582 vect_slp_permute (m_perms[out_layout_i], perm, true);
4585 /* Check whether the target allows NODE to be rearranged so that the node's
4586 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4587 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4589 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4590 NODE can adapt to the layout changes that have (perhaps provisionally)
4591 been chosen for NODE's children, so that no extra permutations are
4592 needed on either the input or the output of NODE.
4594 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4595 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4597 IN_LAYOUT_I has no meaning for other types of node.
4599 Keeping the node as-is is always valid. If the target doesn't appear
4600 to support the node as-is, but might realistically support other layouts,
4601 then layout 0 instead has the cost of a worst-case permutation. On the
4602 one hand, this ensures that every node has at least one valid layout,
4603 avoiding what would otherwise be an awkward special case. On the other,
4604 it still encourages the pass to change an invalid pre-existing layout
4605 choice into a valid one. */
4608 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4609 unsigned int out_layout_i)
4611 const int fallback_cost = 1;
4613 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4615 auto_lane_permutation_t tmp_perm;
4616 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4618 /* Check that the child nodes support the chosen layout. Checking
4619 the first child is enough, since any second child would have the
4620 same shape. */
4621 auto first_child = SLP_TREE_CHILDREN (node)[0];
4622 if (in_layout_i > 0
4623 && !is_compatible_layout (first_child, in_layout_i))
4624 return -1;
4626 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4627 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4628 node, tmp_perm,
4629 SLP_TREE_CHILDREN (node),
4630 false);
4631 if (count < 0)
4633 if (in_layout_i == 0 && out_layout_i == 0)
4635 /* Use the fallback cost if the node could in principle support
4636 some nonzero layout for both the inputs and the outputs.
4637 Otherwise assume that the node will be rejected later
4638 and rebuilt from scalars. */
4639 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4640 return fallback_cost;
4641 return 0;
4643 return -1;
4646 /* We currently have no way of telling whether the new layout is cheaper
4647 or more expensive than the old one. But at least in principle,
4648 it should be worth making zero permutations (whole-vector shuffles)
4649 cheaper than real permutations, in case the pass is able to remove
4650 the latter. */
4651 return count == 0 ? 0 : 1;
4654 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4655 if (rep
4656 && STMT_VINFO_DATA_REF (rep)
4657 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4658 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4660 auto_load_permutation_t tmp_perm;
4661 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4662 if (out_layout_i > 0)
4663 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4665 poly_uint64 vf = 1;
4666 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4667 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4668 unsigned int n_perms;
4669 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4670 nullptr, vf, true, false, &n_perms))
4672 auto rep = SLP_TREE_REPRESENTATIVE (node);
4673 if (out_layout_i == 0)
4675 /* Use the fallback cost if the load is an N-to-N permutation.
4676 Otherwise assume that the node will be rejected later
4677 and rebuilt from scalars. */
4678 if (STMT_VINFO_GROUPED_ACCESS (rep)
4679 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4680 == SLP_TREE_LANES (node)))
4681 return fallback_cost;
4682 return 0;
4684 return -1;
4687 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4688 return n_perms == 0 ? 0 : 1;
4691 return 0;
4694 /* Decide which element layouts we should consider using. Calculate the
4695 weights associated with inserting layout changes on partition edges.
4696 Also mark partitions that cannot change layout, by setting their
4697 layout to zero. */
4699 void
4700 vect_optimize_slp_pass::start_choosing_layouts ()
4702 /* Used to assign unique permutation indices. */
4703 using perm_hash = unbounded_hashmap_traits<
4704 vec_free_hash_base<int_hash_base<unsigned>>,
4705 int_hash<int, -1, -2>
4707 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4709 /* Layout 0 is "no change". */
4710 m_perms.safe_push (vNULL);
4712 /* Create layouts from existing permutations. */
4713 auto_load_permutation_t tmp_perm;
4714 for (unsigned int node_i : m_partitioned_nodes)
4716 /* Leafs also double as entries to the reverse graph. Allow the
4717 layout of those to be changed. */
4718 auto &vertex = m_vertices[node_i];
4719 auto &partition = m_partitions[vertex.partition];
4720 if (!m_slpg->vertices[node_i].succ)
4721 partition.layout = 0;
4723 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4724 slp_tree node = vertex.node;
4725 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4726 slp_tree child;
4727 unsigned HOST_WIDE_INT imin, imax = 0;
4728 bool any_permute = false;
4729 tmp_perm.truncate (0);
4730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4732 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4733 unpermuted, record a layout that reverses this permutation.
4735 We would need more work to cope with loads that are internally
4736 permuted and also have inputs (such as masks for
4737 IFN_MASK_LOADs). */
4738 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4739 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4741 partition.layout = -1;
4742 continue;
4744 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4745 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4746 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4748 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4749 && SLP_TREE_CHILDREN (node).length () == 1
4750 && (child = SLP_TREE_CHILDREN (node)[0])
4751 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4752 .is_constant (&imin)))
4754 /* If the child has the same vector size as this node,
4755 reversing the permutation can make the permutation a no-op.
4756 In other cases it can change a true permutation into a
4757 full-vector extract. */
4758 tmp_perm.reserve (SLP_TREE_LANES (node));
4759 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4762 else
4763 continue;
4765 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4767 unsigned idx = tmp_perm[j];
4768 imin = MIN (imin, idx);
4769 imax = MAX (imax, idx);
4770 if (idx - tmp_perm[0] != j)
4771 any_permute = true;
4773 /* If the span doesn't match we'd disrupt VF computation, avoid
4774 that for now. */
4775 if (imax - imin + 1 != SLP_TREE_LANES (node))
4776 continue;
4777 /* If there's no permute no need to split one out. In this case
4778 we can consider turning a load into a permuted load, if that
4779 turns out to be cheaper than alternatives. */
4780 if (!any_permute)
4782 partition.layout = -1;
4783 continue;
4786 /* For now only handle true permutes, like
4787 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4788 when permuting constants and invariants keeping the permute
4789 bijective. */
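/* For instance (illustrative values only): tmp_perm = { 5, 7, 4, 6 } with
imin == 4 covers each offset 0..3 exactly once and is accepted, whereas
tmp_perm = { 4, 4, 5, 7 } leaves offset 2 uncovered, so the bitmap check
below skips it and no layout is recorded for the node. */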
4790 auto_sbitmap load_index (SLP_TREE_LANES (node));
4791 bitmap_clear (load_index);
4792 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4793 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4794 unsigned j;
4795 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4796 if (!bitmap_bit_p (load_index, j))
4797 break;
4798 if (j != SLP_TREE_LANES (node))
4799 continue;
4801 vec<unsigned> perm = vNULL;
4802 perm.safe_grow (SLP_TREE_LANES (node), true);
4803 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4804 perm[j] = tmp_perm[j] - imin;
4806 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4808 /* Continue to use existing layouts, but don't add any more. */
4809 int *entry = layout_ids.get (perm);
4810 partition.layout = entry ? *entry : 0;
4811 perm.release ();
4813 else
4815 bool existed;
4816 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4817 if (existed)
4818 perm.release ();
4819 else
4821 layout_i = m_perms.length ();
4822 m_perms.safe_push (perm);
4824 partition.layout = layout_i;
4828 /* Initially assume that every layout is possible and has zero cost
4829 in every partition. */
4830 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4831 * m_perms.length ());
4833 /* We have to mark outgoing permutations that face non-associating-reduction
4834 graph entries (which are not themselves represented) as needing to be
4835 materialized. slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4836 for (slp_instance instance : m_vinfo->slp_instances)
4837 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4839 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4840 m_partitions[m_vertices[node_i].partition].layout = 0;
4842 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4844 stmt_vec_info stmt_info
4845 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4846 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4847 if (needs_fold_left_reduction_p (TREE_TYPE
4848 (gimple_get_lhs (stmt_info->stmt)),
4849 STMT_VINFO_REDUC_CODE (reduc_info)))
4851 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4852 m_partitions[m_vertices[node_i].partition].layout = 0;
4856 /* Check which layouts each node and partition can handle. Calculate the
4857 weights associated with inserting layout changes on edges. */
4858 for (unsigned int node_i : m_partitioned_nodes)
4860 auto &vertex = m_vertices[node_i];
4861 auto &partition = m_partitions[vertex.partition];
4862 slp_tree node = vertex.node;
4864 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4866 vertex.weight = vect_slp_node_weight (node);
4868 /* We do not handle stores with a permutation, so all
4869 incoming permutations must have been materialized.
4871 We also don't handle masked grouped loads, which lack a
4872 permutation vector. In this case the memory locations
4873 form an implicit second input to the loads, on top of the
4874 explicit mask input, and the memory input's layout cannot
4875 be changed.
4877 On the other hand, we do support permuting gather loads and
4878 masked gather loads, where each scalar load is independent
4879 of the others. This can be useful if the address/index input
4880 benefits from permutation. */
4881 if (STMT_VINFO_DATA_REF (rep)
4882 && STMT_VINFO_GROUPED_ACCESS (rep)
4883 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4884 partition.layout = 0;
4886 /* We cannot change the layout of an operation that does not
4887 operate independently on lanes. Note this is an explicit
4888 negative list since that's much shorter than the respective
4889 positive one but it's critical to keep maintaining it. */
4890 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4891 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4893 case CFN_COMPLEX_ADD_ROT90:
4894 case CFN_COMPLEX_ADD_ROT270:
4895 case CFN_COMPLEX_MUL:
4896 case CFN_COMPLEX_MUL_CONJ:
4897 case CFN_VEC_ADDSUB:
4898 case CFN_VEC_FMADDSUB:
4899 case CFN_VEC_FMSUBADD:
4900 partition.layout = 0;
4901 default:;
4905 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4907 auto &other_vertex = m_vertices[other_node_i];
4909 /* Count the number of edges from earlier partitions and the number
4910 of edges to later partitions. */
4911 if (other_vertex.partition < vertex.partition)
4912 partition.in_degree += 1;
4913 else
4914 partition.out_degree += 1;
4916 /* If the current node uses the result of OTHER_NODE_I, accumulate
4917 the effects of that. */
4918 if (ud->src == int (node_i))
4920 other_vertex.out_weight += vertex.weight;
4921 other_vertex.out_degree += 1;
4924 for_each_partition_edge (node_i, process_edge);
4928 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4929 its current (provisional) choice of layout. The inputs do not necessarily
4930 have the same layout as each other. */
4932 slpg_layout_cost
4933 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4935 auto &vertex = m_vertices[node_i];
4936 slpg_layout_cost cost;
4937 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4939 auto &other_vertex = m_vertices[other_node_i];
4940 if (other_vertex.partition < vertex.partition)
4942 auto &other_partition = m_partitions[other_vertex.partition];
4943 auto &other_costs = partition_layout_costs (other_vertex.partition,
4944 other_partition.layout);
4945 slpg_layout_cost this_cost = other_costs.in_cost;
4946 this_cost.add_serial_cost (other_costs.internal_cost);
4947 this_cost.split (other_partition.out_degree);
4948 cost.add_parallel_cost (this_cost);
4951 for_each_partition_edge (node_i, add_cost);
4952 return cost;
4955 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4956 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4957 slpg_layout_cost::impossible () if the change isn't possible. */
4959 slpg_layout_cost
4960 vect_optimize_slp_pass::
4961 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4962 unsigned int layout2_i)
4964 auto &def_vertex = m_vertices[ud->dest];
4965 auto &use_vertex = m_vertices[ud->src];
4966 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4967 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4968 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4969 use_layout_i);
4970 if (factor < 0)
4971 return slpg_layout_cost::impossible ();
4973 /* We have a choice of putting the layout change at the site of the
4974 definition or at the site of the use. Prefer the former when
4975 optimizing for size or when the execution frequency of the
4976 definition is no greater than the combined execution frequencies of
4977 the uses. When putting the layout change at the site of the definition,
4978 divvy up the cost among all consumers. */
4979 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4981 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4982 cost.split (def_vertex.out_degree);
4983 return cost;
4985 return { use_vertex.weight * factor, m_optimize_size };
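/* A numeric sketch of the trade-off above, under assumed weights that are
not from the original source: if the definition sits in a loop with weight
10 and feeds consumers whose combined out_weight is 2, then when optimizing
for speed the layout change is charged at the use site as
use_vertex.weight * factor. If the definition's weight were instead 1
(no greater than its out_weight), the change would be charged at the
definition and split across its out_degree consumers. */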
4988 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4989 partition; FROM_NODE_I could be the definition node or the use node.
4990 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4991 Return the cost of any necessary fix-ups on edge UD, or return
4992 slpg_layout_cost::impossible () if the change isn't possible.
4994 At this point, FROM_NODE_I's partition has chosen the cheapest
4995 layout based on the information available so far, but this choice
4996 is only provisional. */
4998 slpg_layout_cost
4999 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5000 unsigned int to_layout_i)
5002 auto &from_vertex = m_vertices[from_node_i];
5003 unsigned int from_partition_i = from_vertex.partition;
5004 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5005 gcc_assert (from_partition.layout >= 0);
5007 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5008 with its current layout preference. */
5009 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5010 auto edge_cost = edge_layout_cost (ud, from_node_i,
5011 from_partition.layout, to_layout_i);
5012 if (edge_cost.is_possible ())
5014 auto &from_costs = partition_layout_costs (from_partition_i,
5015 from_partition.layout);
5016 cost = from_costs.in_cost;
5017 cost.add_serial_cost (from_costs.internal_cost);
5018 cost.split (from_partition.out_degree);
5019 cost.add_serial_cost (edge_cost);
5022 /* Take the minimum of that cost and the cost that applies if
5023 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5024 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5025 to_layout_i);
5026 if (direct_layout_costs.is_possible ())
5028 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5029 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5030 direct_cost.split (from_partition.out_degree);
5031 if (!cost.is_possible ()
5032 || direct_cost.is_better_than (cost, m_optimize_size))
5033 cost = direct_cost;
5036 return cost;
5039 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5040 partition; TO_NODE_I could be the definition node or the use node.
5041 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5042 return the cost of any necessary fix-ups on edge UD, or
5043 slpg_layout_cost::impossible () if the choice cannot be made.
5045 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5047 slpg_layout_cost
5048 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5049 unsigned int from_layout_i)
5051 auto &to_vertex = m_vertices[to_node_i];
5052 unsigned int to_partition_i = to_vertex.partition;
5053 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5054 gcc_assert (to_partition.layout >= 0);
5056 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5057 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5058 any other inputs keep their current choice of layout. */
5059 auto &to_costs = partition_layout_costs (to_partition_i,
5060 to_partition.layout);
5061 if (ud->src == int (to_node_i)
5062 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5064 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5065 auto old_layout = from_partition.layout;
5066 from_partition.layout = from_layout_i;
5067 int factor = internal_node_cost (to_vertex.node, -1,
5068 to_partition.layout);
5069 from_partition.layout = old_layout;
5070 if (factor >= 0)
5072 slpg_layout_cost cost = to_costs.out_cost;
5073 cost.add_serial_cost ({ to_vertex.weight * factor,
5074 m_optimize_size });
5075 cost.split (to_partition.in_degree);
5076 return cost;
5080 /* Compute the cost if we insert any necessary layout change on edge UD. */
5081 auto edge_cost = edge_layout_cost (ud, to_node_i,
5082 to_partition.layout, from_layout_i);
5083 if (edge_cost.is_possible ())
5085 slpg_layout_cost cost = to_costs.out_cost;
5086 cost.add_serial_cost (to_costs.internal_cost);
5087 cost.split (to_partition.in_degree);
5088 cost.add_serial_cost (edge_cost);
5089 return cost;
5092 return slpg_layout_cost::impossible ();
5095 /* Make a forward pass through the partitions, accumulating input costs.
5096 Make a tentative (provisional) choice of layout for each partition,
5097 ensuring that this choice still allows later partitions to keep
5098 their original layout. */
5100 void
5101 vect_optimize_slp_pass::forward_pass ()
5103 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5104 ++partition_i)
5106 auto &partition = m_partitions[partition_i];
5108 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5109 the incoming cost that would apply if every predecessor partition
5110 keeps its current layout. This is used within the loop below. */
5111 slpg_layout_cost in_cost;
5112 slp_tree single_node = nullptr;
5113 if (partition.node_end == partition.node_begin + 1)
5115 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5116 single_node = m_vertices[node_i].node;
5117 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5118 in_cost = total_in_cost (node_i);
5121 /* Go through the possible layouts. Decide which ones are valid
5122 for this partition and record which of the valid layouts has
5123 the lowest cost. */
5124 unsigned int min_layout_i = 0;
5125 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5126 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5128 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5129 if (!layout_costs.is_possible ())
5130 continue;
5132 /* If the recorded layout is already 0 then the layout cannot
5133 change. */
5134 if (partition.layout == 0 && layout_i != 0)
5136 layout_costs.mark_impossible ();
5137 continue;
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5147 /* Reject the layout if it is individually incompatible
5148 with any node in the partition. */
5149 if (!is_compatible_layout (vertex.node, layout_i))
5151 is_possible = false;
5152 break;
5155 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5157 auto &other_vertex = m_vertices[other_node_i];
5158 if (other_vertex.partition < vertex.partition)
5160 /* Accumulate the incoming costs from earlier
5161 partitions, plus the cost of any layout changes
5162 on UD itself. */
5163 auto cost = forward_cost (ud, other_node_i, layout_i);
5164 if (!cost.is_possible ())
5165 is_possible = false;
5166 else
5167 layout_costs.in_cost.add_parallel_cost (cost);
5169 else
5170 /* Reject the layout if it would make layout 0 impossible
5171 for later partitions. This amounts to testing that the
5172 target supports reversing the layout change on edges
5173 to later partitions.
5175 In principle, it might be possible to push a layout
5176 change all the way down a graph, so that it never
5177 needs to be reversed and so that the target doesn't
5178 need to support the reverse operation. But it would
5179 be awkward to bail out if we hit a partition that
5180 does not support the new layout, especially since
5181 we are not dealing with a lattice. */
5182 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5183 layout_i).is_possible ();
5185 for_each_partition_edge (node_i, add_cost);
5187 /* Accumulate the cost of using LAYOUT_I within NODE,
5188 both for the inputs and the outputs. */
5189 int factor = internal_node_cost (vertex.node, layout_i,
5190 layout_i);
5191 if (factor < 0)
5193 is_possible = false;
5194 break;
5196 else if (factor)
5197 layout_costs.internal_cost.add_serial_cost
5198 ({ vertex.weight * factor, m_optimize_size });
5200 if (!is_possible)
5202 layout_costs.mark_impossible ();
5203 continue;
5206 /* Combine the incoming and partition-internal costs. */
5207 slpg_layout_cost combined_cost = layout_costs.in_cost;
5208 combined_cost.add_serial_cost (layout_costs.internal_cost);
5210 /* If this partition consists of a single VEC_PERM_EXPR, see
5211 if the VEC_PERM_EXPR can be changed to support output layout
5212 LAYOUT_I while keeping all the provisional choices of input
5213 layout. */
5214 if (single_node
5215 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5217 int factor = internal_node_cost (single_node, -1, layout_i);
5218 if (factor >= 0)
5220 auto weight = m_vertices[single_node->vertex].weight;
5221 slpg_layout_cost internal_cost
5222 = { weight * factor, m_optimize_size };
5224 slpg_layout_cost alt_cost = in_cost;
5225 alt_cost.add_serial_cost (internal_cost);
5226 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5228 combined_cost = alt_cost;
5229 layout_costs.in_cost = in_cost;
5230 layout_costs.internal_cost = internal_cost;
5235 /* Record the layout with the lowest cost. Prefer layout 0 in
5236 the event of a tie between it and another layout. */
5237 if (!min_layout_cost.is_possible ()
5238 || combined_cost.is_better_than (min_layout_cost,
5239 m_optimize_size))
5241 min_layout_i = layout_i;
5242 min_layout_cost = combined_cost;
5246 /* This loop's handling of earlier partitions should ensure that
5247 choosing the original layout for the current partition is no
5248 less valid than it was in the original graph, even with the
5249 provisional layout choices for those earlier partitions. */
5250 gcc_assert (min_layout_cost.is_possible ());
5251 partition.layout = min_layout_i;
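/* For intuition only (an illustration, not something the code relies on
   beyond what the comments above state): a "layout" is an entry of
   m_perms, i.e. a lane permutation such as { 2, 3, 0, 1 } for a four-lane
   partition, with layout 0 always denoting the original lane order.  The
   forward pass above therefore only needs, per partition and candidate
   layout, the cost of edges crossing into earlier partitions plus the
   internal cost of the partition's own nodes under that layout.  */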
5255 /* Make a backward pass through the partitions, accumulating output costs.
5256 Make a final choice of layout for each partition. */
5258 void
5259 vect_optimize_slp_pass::backward_pass ()
5261 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5263 auto &partition = m_partitions[partition_i];
5265 unsigned int min_layout_i = 0;
5266 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5267 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5269 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5270 if (!layout_costs.is_possible ())
5271 continue;
5273 /* Accumulate the costs from successor partitions. */
5274 bool is_possible = true;
5275 for (unsigned int order_i = partition.node_begin;
5276 order_i < partition.node_end; ++order_i)
5278 unsigned int node_i = m_partitioned_nodes[order_i];
5279 auto &vertex = m_vertices[node_i];
5280 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5282 auto &other_vertex = m_vertices[other_node_i];
5283 auto &other_partition = m_partitions[other_vertex.partition];
5284 if (other_vertex.partition > vertex.partition)
5286 /* Accumulate the incoming costs from later
5287 partitions, plus the cost of any layout changes
5288 on UD itself. */
5289 auto cost = backward_cost (ud, other_node_i, layout_i);
5290 if (!cost.is_possible ())
5291 is_possible = false;
5292 else
5293 layout_costs.out_cost.add_parallel_cost (cost);
5295 else
5296 /* Make sure that earlier partitions can (if necessary
5297 or beneficial) keep the layout that they chose in
5298 the forward pass. This ensures that there is at
5299 least one valid choice of layout. */
5300 is_possible &= edge_layout_cost (ud, other_node_i,
5301 other_partition.layout,
5302 layout_i).is_possible ();
5304 for_each_partition_edge (node_i, add_cost);
5306 if (!is_possible)
5308 layout_costs.mark_impossible ();
5309 continue;
5312 /* Locally combine the costs from the forward and backward passes.
5313 (This combined cost is not passed on, since that would lead
5314 to double counting.) */
5315 slpg_layout_cost combined_cost = layout_costs.in_cost;
5316 combined_cost.add_serial_cost (layout_costs.internal_cost);
5317 combined_cost.add_serial_cost (layout_costs.out_cost);
5319 /* Record the layout with the lowest cost. Prefer layout 0 in
5320 the event of a tie between it and another layout. */
5321 if (!min_layout_cost.is_possible ()
5322 || combined_cost.is_better_than (min_layout_cost,
5323 m_optimize_size))
5325 min_layout_i = layout_i;
5326 min_layout_cost = combined_cost;
5330 gcc_assert (min_layout_cost.is_possible ());
5331 partition.layout = min_layout_i;
5335 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5336 NODE already has the layout that was selected for its partition. */
5338 slp_tree
5339 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5340 unsigned int to_layout_i)
5342 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5343 slp_tree result = m_node_layouts[result_i];
5344 if (result)
5345 return result;
5347 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5348 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5349 /* We can't permute vector defs in place. */
5350 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5352 /* If the vector is uniform or unchanged, there's nothing to do. */
5353 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5354 result = node;
5355 else
5357 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5358 result = vect_create_new_slp_node (scalar_ops);
5359 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5362 else
5364 unsigned int partition_i = m_vertices[node->vertex].partition;
5365 unsigned int from_layout_i = m_partitions[partition_i].layout;
5366 if (from_layout_i == to_layout_i)
5367 return node;
5369 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5370 permutation instead of a serial one. Leave the new permutation
5371 in TMP_PERM on success. */
5372 auto_lane_permutation_t tmp_perm;
5373 unsigned int num_inputs = 1;
5374 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5376 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5377 if (from_layout_i != 0)
5378 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5379 if (to_layout_i != 0)
5380 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5381 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5382 tmp_perm,
5383 SLP_TREE_CHILDREN (node),
5384 false) >= 0)
5385 num_inputs = SLP_TREE_CHILDREN (node).length ();
5386 else
5387 tmp_perm.truncate (0);
5390 if (dump_enabled_p ())
5392 if (tmp_perm.length () > 0)
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "duplicating permutation node %p with"
5395 " layout %d\n",
5396 (void *) node, to_layout_i);
5397 else
5398 dump_printf_loc (MSG_NOTE, vect_location,
5399 "inserting permutation node in place of %p\n",
5400 (void *) node);
5403 unsigned int num_lanes = SLP_TREE_LANES (node);
5404 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5405 if (SLP_TREE_SCALAR_STMTS (node).length ())
5407 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5408 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5409 if (from_layout_i != 0)
5410 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5411 if (to_layout_i != 0)
5412 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5414 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5415 SLP_TREE_LANES (result) = num_lanes;
5416 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5417 result->vertex = -1;
5419 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5420 if (tmp_perm.length ())
5422 lane_perm.safe_splice (tmp_perm);
5423 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5425 else
5427 lane_perm.create (num_lanes);
5428 for (unsigned j = 0; j < num_lanes; ++j)
5429 lane_perm.quick_push ({ 0, j });
5430 if (from_layout_i != 0)
5431 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5432 if (to_layout_i != 0)
5433 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5434 SLP_TREE_CHILDREN (result).safe_push (node);
5436 for (slp_tree child : SLP_TREE_CHILDREN (result))
5437 child->refcnt++;
5439 m_node_layouts[result_i] = result;
5440 return result;
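/* Illustrative example for the function above (hypothetical layouts, not
   taken from a testcase): if NODE's partition chose layout 1 == { 1, 0 }
   but a user needs layout 0, the function returns either a copy of NODE
   with the lane swap folded into its existing lane permutation (when NODE
   is itself a VEC_PERM_EXPR and the target can handle the combined
   permute) or a new single-input VEC_PERM_EXPR wrapping NODE that
   performs the swap.  The result is cached in m_node_layouts so that
   multiple users asking for the same layout share one node.  */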
5443 /* Apply the chosen vector layouts to the SLP graph. */
5445 void
5446 vect_optimize_slp_pass::materialize ()
5448 /* We no longer need the costs, so avoid having two O(N * P) arrays
5449 live at the same time. */
5450 m_partition_layout_costs.release ();
5451 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5453 auto_sbitmap fully_folded (m_vertices.length ());
5454 bitmap_clear (fully_folded);
5455 for (unsigned int node_i : m_partitioned_nodes)
5457 auto &vertex = m_vertices[node_i];
5458 slp_tree node = vertex.node;
5459 int layout_i = m_partitions[vertex.partition].layout;
5460 gcc_assert (layout_i >= 0);
5462 /* Rearrange the scalar statements to match the chosen layout. */
5463 if (layout_i > 0)
5464 vect_slp_permute (m_perms[layout_i],
5465 SLP_TREE_SCALAR_STMTS (node), true);
5467 /* Update load and lane permutations. */
5468 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5470 /* First try to absorb the input vector layouts. If that fails,
5471 force the inputs to have layout LAYOUT_I too. We checked that
5472 that was possible before deciding to use nonzero output layouts.
5473 (Note that at this stage we don't really have any guarantee that
5474 the target supports the original VEC_PERM_EXPR.) */
5475 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5476 auto_lane_permutation_t tmp_perm;
5477 tmp_perm.safe_splice (perm);
5478 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5479 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5480 tmp_perm,
5481 SLP_TREE_CHILDREN (node),
5482 false) >= 0)
5484 if (dump_enabled_p ()
5485 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5486 perm.begin ()))
5487 dump_printf_loc (MSG_NOTE, vect_location,
5488 "absorbing input layouts into %p\n",
5489 (void *) node);
5490 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5491 bitmap_set_bit (fully_folded, node_i);
5493 else
5495 /* Not MSG_MISSED because it would make no sense to users. */
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "failed to absorb input layouts into %p\n",
5499 (void *) node);
5500 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5503 else
5505 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5506 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5507 if (layout_i > 0)
5508 /* ??? When we handle non-bijective permutes the idea
5509 is that we can force the load-permutation to be
5510 { min, min + 1, min + 2, ... max }. But then the
5511 scalar defs might no longer match the lane content
5512 which means wrong-code with live lane vectorization.
5513 So we possibly have to have NULL entries for those. */
5514 vect_slp_permute (m_perms[layout_i], load_perm, true);
5518 /* Do this before any nodes disappear, since it involves a walk
5519 over the leaves. */
5520 remove_redundant_permutations ();
5522 /* Replace each child with a correctly laid-out version. */
5523 for (unsigned int node_i : m_partitioned_nodes)
5525 /* Skip nodes that have already been handled above. */
5526 if (bitmap_bit_p (fully_folded, node_i))
5527 continue;
5529 auto &vertex = m_vertices[node_i];
5530 int in_layout_i = m_partitions[vertex.partition].layout;
5531 gcc_assert (in_layout_i >= 0);
5533 unsigned j;
5534 slp_tree child;
5535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5537 if (!child)
5538 continue;
5540 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5541 if (new_child != child)
5543 vect_free_slp_tree (child);
5544 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5545 new_child->refcnt += 1;
5551 /* Elide load permutations that are not necessary. Such permutations might
5552 be pre-existing, rather than created by the layout optimizations. */
5554 void
5555 vect_optimize_slp_pass::remove_redundant_permutations ()
5557 for (unsigned int node_i : m_leafs)
5559 slp_tree node = m_vertices[node_i].node;
5560 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5561 continue;
5563 /* In basic block vectorization we allow any subchain of an interleaving
5564 chain.
5565 FORNOW: not in loop SLP because of realignment complications. */
5566 if (is_a <bb_vec_info> (m_vinfo))
5568 bool subchain_p = true;
5569 stmt_vec_info next_load_info = NULL;
5570 stmt_vec_info load_info;
5571 unsigned j;
5572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5574 if (j != 0
5575 && (next_load_info != load_info
5576 || DR_GROUP_GAP (load_info) != 1))
5578 subchain_p = false;
5579 break;
5581 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5583 if (subchain_p)
5585 SLP_TREE_LOAD_PERMUTATION (node).release ();
5586 continue;
5589 else
5591 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5592 stmt_vec_info load_info;
5593 bool this_load_permuted = false;
5594 unsigned j;
5595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5596 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5598 this_load_permuted = true;
5599 break;
5601 /* When this isn't a grouped access we know it's single element
5602 and contiguous. */
5603 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5605 if (!this_load_permuted
5606 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5607 || SLP_TREE_LANES (node) == 1))
5608 SLP_TREE_LOAD_PERMUTATION (node).release ();
5609 continue;
5611 stmt_vec_info first_stmt_info
5612 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5613 if (!this_load_permuted
5614 /* The load requires permutation when unrolling exposes
5615 a gap either because the group is larger than the SLP
5616 group-size or because there is a gap between the groups. */
5617 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5618 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5619 && DR_GROUP_GAP (first_stmt_info) == 0)))
5621 SLP_TREE_LOAD_PERMUTATION (node).release ();
5622 continue;
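/* Illustrative example of the subchain case above: given an interleaving
   group a[0], a[1], a[2], a[3], a BB SLP node loading just { a[2], a[3] }
   carries load permutation { 2, 3 }.  Because the two loads are adjacent
   members of the group (gap 1), the permutation is released and the node
   is treated as a plain consecutive access.  */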
5628 /* Print the partition graph and layout information to the dump file. */
5630 void
5631 vect_optimize_slp_pass::dump ()
5633 dump_printf_loc (MSG_NOTE, vect_location,
5634 "SLP optimize permutations:\n");
5635 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5637 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5638 const char *sep = "";
5639 for (unsigned int idx : m_perms[layout_i])
5641 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5642 sep = ", ";
5644 dump_printf (MSG_NOTE, " }\n");
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "SLP optimize partitions:\n");
5648 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5649 ++partition_i)
5651 auto &partition = m_partitions[partition_i];
5652 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5653 dump_printf_loc (MSG_NOTE, vect_location,
5654 " partition %d (layout %d):\n",
5655 partition_i, partition.layout);
5656 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5657 for (unsigned int order_i = partition.node_begin;
5658 order_i < partition.node_end; ++order_i)
5660 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5661 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5662 (void *) vertex.node);
5663 dump_printf_loc (MSG_NOTE, vect_location,
5664 " weight: %f\n",
5665 vertex.weight.to_double ());
5666 if (vertex.out_degree)
5667 dump_printf_loc (MSG_NOTE, vect_location,
5668 " out weight: %f (degree %d)\n",
5669 vertex.out_weight.to_double (),
5670 vertex.out_degree);
5671 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 " op: VEC_PERM_EXPR\n");
5674 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 " op template: %G", rep->stmt);
5678 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5679 for (unsigned int order_i = partition.node_begin;
5680 order_i < partition.node_end; ++order_i)
5682 unsigned int node_i = m_partitioned_nodes[order_i];
5683 auto &vertex = m_vertices[node_i];
5684 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5686 auto &other_vertex = m_vertices[other_node_i];
5687 if (other_vertex.partition < vertex.partition)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " - %p [%d] --> %p\n",
5690 (void *) other_vertex.node,
5691 other_vertex.partition,
5692 (void *) vertex.node);
5693 else
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 " - %p --> [%d] %p\n",
5696 (void *) vertex.node,
5697 other_vertex.partition,
5698 (void *) other_vertex.node);
5700 for_each_partition_edge (node_i, print_edge);
5703 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5705 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5706 if (layout_costs.is_possible ())
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 " layout %d:%s\n", layout_i,
5710 partition.layout == int (layout_i)
5711 ? " (*)" : "");
5712 slpg_layout_cost combined_cost = layout_costs.in_cost;
5713 combined_cost.add_serial_cost (layout_costs.internal_cost);
5714 combined_cost.add_serial_cost (layout_costs.out_cost);
5715 #define TEMPLATE "{depth: %f, total: %f}"
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 " " TEMPLATE "\n",
5718 layout_costs.in_cost.depth.to_double (),
5719 layout_costs.in_cost.total.to_double ());
5720 dump_printf_loc (MSG_NOTE, vect_location,
5721 " + " TEMPLATE "\n",
5722 layout_costs.internal_cost.depth.to_double (),
5723 layout_costs.internal_cost.total.to_double ());
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " + " TEMPLATE "\n",
5726 layout_costs.out_cost.depth.to_double (),
5727 layout_costs.out_cost.total.to_double ());
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " = " TEMPLATE "\n",
5730 combined_cost.depth.to_double (),
5731 combined_cost.total.to_double ());
5732 #undef TEMPLATE
5734 else
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " layout %d: rejected\n", layout_i);
5741 /* Main entry point for the SLP graph optimization pass. */
5743 void
5744 vect_optimize_slp_pass::run ()
5746 build_graph ();
5747 create_partitions ();
5748 start_choosing_layouts ();
5749 if (m_perms.length () > 1)
5751 forward_pass ();
5752 backward_pass ();
5753 if (dump_enabled_p ())
5754 dump ();
5755 materialize ();
5756 while (!m_perms.is_empty ())
5757 m_perms.pop ().release ();
5759 else
5760 remove_redundant_permutations ();
5761 free_graph (m_slpg);
5764 /* Optimize the SLP graph of VINFO. */
5766 void
5767 vect_optimize_slp (vec_info *vinfo)
5769 if (vinfo->slp_instances.is_empty ())
5770 return;
5771 vect_optimize_slp_pass (vinfo).run ();
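/* Minimal usage sketch (an assumption about the caller rather than
   something enforced here): the pass is run once per vec_info after the
   SLP instances have been discovered, e.g.

     vect_optimize_slp (vinfo);

   and is a no-op when there are no SLP instances.  */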
5774 /* Gather loads reachable from the individual SLP graph entries. */
5776 void
5777 vect_gather_slp_loads (vec_info *vinfo)
5779 unsigned i;
5780 slp_instance instance;
5781 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5783 hash_set<slp_tree> visited;
5784 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5785 SLP_INSTANCE_TREE (instance), visited);
5790 /* For each possible SLP instance decide whether to SLP it and calculate
5791 the overall unrolling factor needed to SLP the loop. Return TRUE if we
5792 decided to SLP at least one instance. */
5794 bool
5795 vect_make_slp_decision (loop_vec_info loop_vinfo)
5797 unsigned int i;
5798 poly_uint64 unrolling_factor = 1;
5799 const vec<slp_instance> &slp_instances
5800 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5801 slp_instance instance;
5802 int decided_to_slp = 0;
5804 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5806 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5808 /* FORNOW: SLP if you can. */
5809 /* All unroll factors have the form:
5811 GET_MODE_SIZE (vinfo->vector_mode) * X
5813 for some rational X, so they must have a common multiple. */
5814 unrolling_factor
5815 = force_common_multiple (unrolling_factor,
5816 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5818 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5819 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5820 loop-based vectorization. Such stmts will be marked as HYBRID. */
5821 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5822 decided_to_slp++;
5825 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5827 if (decided_to_slp && dump_enabled_p ())
5829 dump_printf_loc (MSG_NOTE, vect_location,
5830 "Decided to SLP %d instances. Unrolling factor ",
5831 decided_to_slp);
5832 dump_dec (MSG_NOTE, unrolling_factor);
5833 dump_printf (MSG_NOTE, "\n");
5836 return (decided_to_slp > 0);
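/* Worked example for the unrolling factor above (illustrative numbers):
   instances with unrolling factors 2 and 8 give an overall factor of 8,
   while factors 2 and 3 combine to 6; force_common_multiple computes the
   least common multiple, which is guaranteed to exist because every
   factor is a rational multiple of the vector mode size.  */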
5839 /* Private data for vect_detect_hybrid_slp. */
5840 struct vdhs_data
5842 loop_vec_info loop_vinfo;
5843 vec<stmt_vec_info> *worklist;
5846 /* Walker for walk_gimple_op. */
5848 static tree
5849 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5851 walk_stmt_info *wi = (walk_stmt_info *)data;
5852 vdhs_data *dat = (vdhs_data *)wi->info;
5854 if (wi->is_lhs)
5855 return NULL_TREE;
5857 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5858 if (!def_stmt_info)
5859 return NULL_TREE;
5860 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5861 if (PURE_SLP_STMT (def_stmt_info))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5865 def_stmt_info->stmt);
5866 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5867 dat->worklist->safe_push (def_stmt_info);
5870 return NULL_TREE;
5873 /* Look whether STMT_INFO is consumed by SLP indirectly and mark it
5874 pure_slp if so; otherwise push it to WORKLIST. */
5876 static void
5877 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5878 vec<stmt_vec_info> &worklist,
5879 stmt_vec_info stmt_info)
5881 if (dump_enabled_p ())
5882 dump_printf_loc (MSG_NOTE, vect_location,
5883 "Processing hybrid candidate : %G", stmt_info->stmt);
5884 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5885 imm_use_iterator iter2;
5886 ssa_op_iter iter1;
5887 use_operand_p use_p;
5888 def_operand_p def_p;
5889 bool any_def = false;
5890 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5892 any_def = true;
5893 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5895 if (is_gimple_debug (USE_STMT (use_p)))
5896 continue;
5897 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5898 /* An out-of-loop use means this is a loop_vect sink. */
5899 if (!use_info)
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Found loop_vect sink: %G", stmt_info->stmt);
5904 worklist.safe_push (stmt_info);
5905 return;
5907 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5909 if (dump_enabled_p ())
5910 dump_printf_loc (MSG_NOTE, vect_location,
5911 "Found loop_vect use: %G", use_info->stmt);
5912 worklist.safe_push (stmt_info);
5913 return;
5917 /* No def means this is a loop_vect sink. */
5918 if (!any_def)
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_NOTE, vect_location,
5922 "Found loop_vect sink: %G", stmt_info->stmt);
5923 worklist.safe_push (stmt_info);
5924 return;
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_NOTE, vect_location,
5928 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5929 STMT_SLP_TYPE (stmt_info) = pure_slp;
5932 /* Find stmts that must be both vectorized and SLPed. */
5934 void
5935 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5937 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5939 /* All stmts participating in SLP are marked pure_slp, all other
5940 stmts are loop_vect.
5941 First collect all loop_vect stmts into a worklist.
5942 SLP patterns cause not all original scalar stmts to appear in
5943 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5944 Rectify this here and do a backward walk over the IL only considering
5945 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5946 mark them as pure_slp. */
5947 auto_vec<stmt_vec_info> worklist;
5948 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5950 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5951 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5952 gsi_next (&gsi))
5954 gphi *phi = gsi.phi ();
5955 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5956 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5957 maybe_push_to_hybrid_worklist (loop_vinfo,
5958 worklist, stmt_info);
5960 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5961 gsi_prev (&gsi))
5963 gimple *stmt = gsi_stmt (gsi);
5964 if (is_gimple_debug (stmt))
5965 continue;
5966 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5969 for (gimple_stmt_iterator gsi2
5970 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5971 !gsi_end_p (gsi2); gsi_next (&gsi2))
5973 stmt_vec_info patt_info
5974 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5975 if (!STMT_SLP_TYPE (patt_info)
5976 && STMT_VINFO_RELEVANT (patt_info))
5977 maybe_push_to_hybrid_worklist (loop_vinfo,
5978 worklist, patt_info);
5980 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5982 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5983 maybe_push_to_hybrid_worklist (loop_vinfo,
5984 worklist, stmt_info);
5988 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5989 mark any SLP vectorized stmt as hybrid.
5990 ??? We're visiting def stmts N times (once for each non-SLP and
5991 once for each hybrid-SLP use). */
5992 walk_stmt_info wi;
5993 vdhs_data dat;
5994 dat.worklist = &worklist;
5995 dat.loop_vinfo = loop_vinfo;
5996 memset (&wi, 0, sizeof (wi));
5997 wi.info = (void *)&dat;
5998 while (!worklist.is_empty ())
6000 stmt_vec_info stmt_info = worklist.pop ();
6001 /* Since SSA operands are not set up for pattern stmts we need
6002 to use walk_gimple_op. */
6003 wi.is_lhs = 0;
6004 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6005 /* For gather/scatter make sure to walk the offset operand, which
6006 can be a scaling and conversion away. */
6007 gather_scatter_info gs_info;
6008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6009 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6011 int dummy;
6012 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
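/* Illustrative example of hybrid detection (hypothetical gimple, not from
   a testcase):

     a_1 = x_2 + y_3;        <-- covered by an SLP instance, pure_slp
     sum_4 = sum_5 + a_1;    <-- not covered by SLP, loop_vect

   The backward walk above pushes the loop_vect statement onto the
   worklist; the operand walk then finds that a_1 is defined by a pure_slp
   statement and marks that definition hybrid, so it is vectorized both as
   part of the SLP instance and by loop-based vectorization.  */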
6018 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6020 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6021 : vec_info (vec_info::bb, shared),
6022 bbs (_bbs),
6023 roots (vNULL)
6025 for (unsigned i = 0; i < bbs.length (); ++i)
6027 if (i != 0)
6028 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6029 gsi_next (&si))
6031 gphi *phi = si.phi ();
6032 gimple_set_uid (phi, 0);
6033 add_stmt (phi);
6035 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6036 !gsi_end_p (gsi); gsi_next (&gsi))
6038 gimple *stmt = gsi_stmt (gsi);
6039 gimple_set_uid (stmt, 0);
6040 if (is_gimple_debug (stmt))
6041 continue;
6042 add_stmt (stmt);
6048 /* Free the BB_VINFO struct, as well as all the stmt_vec_info structs of
6049 all the stmts in the basic blocks. */
6051 _bb_vec_info::~_bb_vec_info ()
6053 /* Reset region marker. */
6054 for (unsigned i = 0; i < bbs.length (); ++i)
6056 if (i != 0)
6057 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6058 gsi_next (&si))
6060 gphi *phi = si.phi ();
6061 gimple_set_uid (phi, -1);
6063 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6064 !gsi_end_p (gsi); gsi_next (&gsi))
6066 gimple *stmt = gsi_stmt (gsi);
6067 gimple_set_uid (stmt, -1);
6071 for (unsigned i = 0; i < roots.length (); ++i)
6073 roots[i].stmts.release ();
6074 roots[i].roots.release ();
6075 roots[i].remain.release ();
6077 roots.release ();
6080 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6081 given that its child nodes have already been processed, and that
6082 their def types currently match their SLP node's def type. */
6084 static bool
6085 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6086 slp_instance node_instance,
6087 stmt_vector_for_cost *cost_vec)
6089 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6091 /* Calculate the number of vector statements to be created for the
6092 scalar stmts in this node. For SLP reductions it is equal to the
6093 number of vector statements in the children (which has already been
6094 calculated by the recursive call). Otherwise it is the number of
6095 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6096 VF divided by the number of elements in a vector. */
6097 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6098 && !STMT_VINFO_DATA_REF (stmt_info)
6099 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6101 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6102 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6104 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6105 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6106 break;
6109 else
6111 poly_uint64 vf;
6112 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6113 vf = loop_vinfo->vectorization_factor;
6114 else
6115 vf = 1;
6116 unsigned int group_size = SLP_TREE_LANES (node);
6117 tree vectype = SLP_TREE_VECTYPE (node);
6118 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6119 = vect_get_num_vectors (vf * group_size, vectype);
6122 /* Handle purely internal nodes. */
6123 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6125 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6126 return false;
6128 stmt_vec_info slp_stmt_info;
6129 unsigned int i;
6130 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6132 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6133 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6134 node_instance, i,
6135 false, cost_vec))
6136 return false;
6138 return true;
6141 bool dummy;
6142 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6143 node, node_instance, cost_vec);
6146 /* Try to build NODE from scalars, returning true on success.
6147 NODE_INSTANCE is the SLP instance that contains NODE. */
6149 static bool
6150 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance)
6153 stmt_vec_info stmt_info;
6154 unsigned int i;
6156 if (!is_a <bb_vec_info> (vinfo)
6157 || node == SLP_INSTANCE_TREE (node_instance)
6158 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6159 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6160 /* Force the mask use to be built from scalars instead. */
6161 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6162 return false;
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "Building vector operands of %p from scalars instead\n",
6167 (void *) node);
6169 /* Don't remove and free the child nodes here, since they could be
6170 referenced by other structures. The analysis and scheduling phases
6171 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6172 unsigned int group_size = SLP_TREE_LANES (node);
6173 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6174 /* Invariants get their vector type from the uses. */
6175 SLP_TREE_VECTYPE (node) = NULL_TREE;
6176 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6177 SLP_TREE_LOAD_PERMUTATION (node).release ();
6178 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6180 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6181 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6183 return true;
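/* Illustrative example (hypothetical stmts): if a two-lane node computing

     a_1 = b_2 * 3;
     c_3 = d_4 * 5;

   cannot be vectorized, the conversion above demotes it to an external
   def with SLP_TREE_SCALAR_OPS { a_1, c_3 }.  Users of the node then
   build their vector operand from those scalar results while a_1 and c_3
   continue to be computed by the scalar statements.  */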
6186 /* Return true if all elements of the slice are the same. */
6187 bool
6188 vect_scalar_ops_slice::all_same_p () const
6190 for (unsigned int i = 1; i < length; ++i)
6191 if (!operand_equal_p (op (0), op (i)))
6192 return false;
6193 return true;
6196 hashval_t
6197 vect_scalar_ops_slice_hash::hash (const value_type &s)
6199 hashval_t hash = 0;
6200 for (unsigned i = 0; i < s.length; ++i)
6201 hash = iterative_hash_expr (s.op (i), hash);
6202 return hash;
6205 bool
6206 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6207 const compare_type &s2)
6209 if (s1.length != s2.length)
6210 return false;
6211 for (unsigned i = 0; i < s1.length; ++i)
6212 if (!operand_equal_p (s1.op (i), s2.op (i)))
6213 return false;
6214 return true;
6217 /* Compute the prologue cost for invariant or constant operands represented
6218 by NODE. */
6220 static void
6221 vect_prologue_cost_for_slp (slp_tree node,
6222 stmt_vector_for_cost *cost_vec)
6224 /* There's a special case of an existing vector def, which costs nothing. */
6225 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6226 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6227 return;
6228 /* Without looking at the actual initializer a vector of
6229 constants can be implemented as a load from the constant pool.
6230 When all elements are the same we can use a splat. */
6231 tree vectype = SLP_TREE_VECTYPE (node);
6232 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6233 unsigned HOST_WIDE_INT const_nunits;
6234 unsigned nelt_limit;
6235 auto ops = &SLP_TREE_SCALAR_OPS (node);
6236 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6237 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6238 && ! multiple_p (const_nunits, group_size))
6240 nelt_limit = const_nunits;
6241 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6242 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6243 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6244 starts.quick_push (i * const_nunits);
6246 else
6248 /* If either the vector has variable length or the vectors
6249 are composed of repeated whole groups we only need to
6250 cost construction once. All vectors will be the same. */
6251 nelt_limit = group_size;
6252 starts.quick_push (0);
6254 /* ??? We're just tracking whether vectors in a single node are the same.
6255 Ideally we'd do something more global. */
6256 bool passed = false;
6257 for (unsigned int start : starts)
6259 vect_cost_for_stmt kind;
6260 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6261 kind = vector_load;
6262 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6263 kind = scalar_to_vec;
6264 else
6265 kind = vec_construct;
6266 /* The target cost hook has no idea which part of the SLP node
6267 we are costing so avoid passing it down more than once. Pass
6268 it to the first vec_construct or scalar_to_vec part since for those
6269 the x86 backend tries to account for GPR to XMM register moves. */
6270 record_stmt_cost (cost_vec, 1, kind,
6271 (kind != vector_load && !passed) ? node : nullptr,
6272 vectype, 0, vect_prologue);
6273 if (kind != vector_load)
6274 passed = true;
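/* Worked example for the costing above (illustrative numbers, for an
   external rather than constant node): with group_size == 8,
   const_nunits == 4 and two vector stmts, the scalar ops are considered
   as two slices of four elements.  If the slices compare equal -- e.g.
   ops { a, b, c, d, a, b, c, d } -- only one construction is costed;
   a uniform slice such as { a, a, a, a } is costed as a scalar_to_vec
   splat instead of a vec_construct.  Constant nodes are costed as vector
   loads from the constant pool instead.  */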
6278 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6279 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6281 Return true if the operations are supported. */
6283 static bool
6284 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6285 slp_instance node_instance,
6286 hash_set<slp_tree> &visited_set,
6287 vec<slp_tree> &visited_vec,
6288 stmt_vector_for_cost *cost_vec)
6290 int i, j;
6291 slp_tree child;
6293 /* Assume we can code-generate all invariants. */
6294 if (!node
6295 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6296 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6297 return true;
6299 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_NOTE, vect_location,
6303 "Failed cyclic SLP reference in %p\n", (void *) node);
6304 return false;
6306 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6308 /* If we already analyzed the exact same set of scalar stmts we're done.
6309 We share the generated vector stmts for those. */
6310 if (visited_set.add (node))
6311 return true;
6312 visited_vec.safe_push (node);
6314 bool res = true;
6315 unsigned visited_rec_start = visited_vec.length ();
6316 unsigned cost_vec_rec_start = cost_vec->length ();
6317 bool seen_non_constant_child = false;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6320 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6321 visited_set, visited_vec,
6322 cost_vec);
6323 if (!res)
6324 break;
6325 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6326 seen_non_constant_child = true;
6328 /* We're having difficulties scheduling nodes with just constant
6329 operands and no scalar stmts since we then cannot compute a stmt
6330 insertion place. */
6331 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6333 if (dump_enabled_p ())
6334 dump_printf_loc (MSG_NOTE, vect_location,
6335 "Cannot vectorize all-constant op node %p\n",
6336 (void *) node);
6337 res = false;
6340 if (res)
6341 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6342 cost_vec);
6343 /* If analysis failed we have to pop all recursive visited nodes
6344 plus ourselves. */
6345 if (!res)
6347 while (visited_vec.length () >= visited_rec_start)
6348 visited_set.remove (visited_vec.pop ());
6349 cost_vec->truncate (cost_vec_rec_start);
6352 /* When the node can be vectorized, cost the invariant nodes it references.
6353 This is not done in DFS order to allow the referring node's
6354 vectorizable_* calls to nail down the invariant nodes' vector type
6355 and possibly unshare it if it needs a different vector type than
6356 other referrers. */
6357 if (res)
6358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6359 if (child
6360 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6361 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6362 /* Perform usual caching, note code-generation still
6363 code-gens these nodes multiple times but we expect
6364 to CSE them later. */
6365 && !visited_set.add (child))
6367 visited_vec.safe_push (child);
6368 /* ??? After auditing more code paths make a "default"
6369 and push the vector type from NODE to all children
6370 if it is not already set. */
6371 /* Compute the number of vectors to be generated. */
6372 tree vector_type = SLP_TREE_VECTYPE (child);
6373 if (!vector_type)
6375 /* For shifts with a scalar argument we don't need
6376 to cost or code-generate anything.
6377 ??? Represent this more explicitly. */
6378 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6379 == shift_vec_info_type)
6380 && j == 1);
6381 continue;
6383 unsigned group_size = SLP_TREE_LANES (child);
6384 poly_uint64 vf = 1;
6385 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6386 vf = loop_vinfo->vectorization_factor;
6387 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6388 = vect_get_num_vectors (vf * group_size, vector_type);
6389 /* And cost them. */
6390 vect_prologue_cost_for_slp (child, cost_vec);
6393 /* If this node or any of its children can't be vectorized, try pruning
6394 the tree here rather than felling the whole thing. */
6395 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6397 /* We'll need to revisit this for invariant costing and number
6398 of vectorized stmt setting. */
6399 res = true;
6402 return res;
6405 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6406 region and that can be vectorized using vectorizable_live_operation
6407 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6408 the scalar code computing them to be retained. */
6410 static void
6411 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6412 slp_instance instance,
6413 stmt_vector_for_cost *cost_vec,
6414 hash_set<stmt_vec_info> &svisited,
6415 hash_set<slp_tree> &visited)
6417 if (visited.add (node))
6418 return;
6420 unsigned i;
6421 stmt_vec_info stmt_info;
6422 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6423 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6425 if (svisited.contains (stmt_info))
6426 continue;
6427 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6428 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6429 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6430 /* Only the pattern root stmt computes the original scalar value. */
6431 continue;
6432 bool mark_visited = true;
6433 gimple *orig_stmt = orig_stmt_info->stmt;
6434 ssa_op_iter op_iter;
6435 def_operand_p def_p;
6436 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6438 imm_use_iterator use_iter;
6439 gimple *use_stmt;
6440 stmt_vec_info use_stmt_info;
6441 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6442 if (!is_gimple_debug (use_stmt))
6444 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6445 if (!use_stmt_info
6446 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6448 STMT_VINFO_LIVE_P (stmt_info) = true;
6449 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6450 node, instance, i,
6451 false, cost_vec))
6452 /* ??? So we know we can vectorize the live stmt
6453 from one SLP node. If we cannot do so from all
6454 or none consistently we'd have to record which
6455 SLP node (and lane) we want to use for the live
6456 operation. So make sure we can code-generate
6457 from all nodes. */
6458 mark_visited = false;
6459 else
6460 STMT_VINFO_LIVE_P (stmt_info) = false;
6461 break;
6464 /* We have to verify whether we can insert the lane extract
6465 before all uses. The following is a conservative approximation.
6466 We cannot put this into vectorizable_live_operation because
6467 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6468 doesn't work.
6469 Note that while the fact that we emit code for loads at the
6470 first load should make this a non-problem, leaves we construct
6471 from scalars are vectorized after the last scalar def.
6472 ??? If we'd actually compute the insert location during
6473 analysis we could use sth less conservative than the last
6474 scalar stmt in the node for the dominance check. */
6475 /* ??? What remains is "live" uses in vector CTORs in the same
6476 SLP graph which is where those uses can end up code-generated
6477 right after their definition instead of close to their original
6478 use. But that would restrict us to code-generate lane-extracts
6479 from the latest stmt in a node. So we compensate for this
6480 during code-generation, simply not replacing uses for those
6481 hopefully rare cases. */
6482 if (STMT_VINFO_LIVE_P (stmt_info))
6483 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6484 if (!is_gimple_debug (use_stmt)
6485 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6486 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6487 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "Cannot determine insertion place for "
6492 "lane extract\n");
6493 STMT_VINFO_LIVE_P (stmt_info) = false;
6494 mark_visited = true;
6497 if (mark_visited)
6498 svisited.add (stmt_info);
6501 slp_tree child;
6502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6503 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6504 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6505 cost_vec, svisited, visited);
6508 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6510 static bool
6511 vectorizable_bb_reduc_epilogue (slp_instance instance,
6512 stmt_vector_for_cost *cost_vec)
6514 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6515 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6516 if (reduc_code == MINUS_EXPR)
6517 reduc_code = PLUS_EXPR;
6518 internal_fn reduc_fn;
6519 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6520 if (!vectype
6521 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6522 || reduc_fn == IFN_LAST
6523 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6524 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6525 TREE_TYPE (vectype)))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "not vectorized: basic block reduction epilogue "
6530 "operation unsupported.\n");
6531 return false;
6534 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6535 cost log2 vector operations plus shuffles and one extraction. */
6536 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6537 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6538 vectype, 0, vect_body);
6539 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6540 vectype, 0, vect_body);
6541 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6542 vectype, 0, vect_body);
6544 /* Since we replace all stmts of a possibly longer scalar reduction
6545 chain, account for the extra scalar stmts for that. */
6546 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6547 instance->root_stmts[0], 0, vect_body);
6548 return true;
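/* Worked cost example for the epilogue above (illustrative): for a
   four-element vector type vect_nunits_for_cost gives 4, so
   floor_log2 (4) == 2 and we record two vector_stmt and two vec_perm
   operations plus a single vec_to_scalar extraction, approximating a
   log2-step shuffle-and-reduce sequence.  */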
6551 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6552 and recurse to children. */
6554 static void
6555 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6556 hash_set<slp_tree> &visited)
6558 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6559 || visited.add (node))
6560 return;
6562 stmt_vec_info stmt;
6563 unsigned i;
6564 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6565 roots.remove (vect_orig_stmt (stmt));
6567 slp_tree child;
6568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6569 if (child)
6570 vect_slp_prune_covered_roots (child, roots, visited);
6573 /* Analyze statements in SLP instances of VINFO. Return true if the
6574 operations are supported. */
6576 bool
6577 vect_slp_analyze_operations (vec_info *vinfo)
6579 slp_instance instance;
6580 int i;
6582 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6584 hash_set<slp_tree> visited;
6585 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6587 auto_vec<slp_tree> visited_vec;
6588 stmt_vector_for_cost cost_vec;
6589 cost_vec.create (2);
6590 if (is_a <bb_vec_info> (vinfo))
6591 vect_location = instance->location ();
6592 if (!vect_slp_analyze_node_operations (vinfo,
6593 SLP_INSTANCE_TREE (instance),
6594 instance, visited, visited_vec,
6595 &cost_vec)
6596 /* CTOR instances require vectorized defs for the SLP tree root. */
6597 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6598 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6599 != vect_internal_def
6600 /* Make sure we vectorized with the expected type. */
6601 || !useless_type_conversion_p
6602 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6603 (instance->root_stmts[0]->stmt))),
6604 TREE_TYPE (SLP_TREE_VECTYPE
6605 (SLP_INSTANCE_TREE (instance))))))
6606 /* Check we can vectorize the reduction. */
6607 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6608 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6610 slp_tree node = SLP_INSTANCE_TREE (instance);
6611 stmt_vec_info stmt_info;
6612 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6613 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6614 else
6615 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6616 if (dump_enabled_p ())
6617 dump_printf_loc (MSG_NOTE, vect_location,
6618 "removing SLP instance operations starting from: %G",
6619 stmt_info->stmt);
6620 vect_free_slp_instance (instance);
6621 vinfo->slp_instances.ordered_remove (i);
6622 cost_vec.release ();
6623 while (!visited_vec.is_empty ())
6624 visited.remove (visited_vec.pop ());
6626 else
6628 i++;
6629 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6631 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6632 cost_vec.release ();
6634 else
6635 /* For BB vectorization remember the SLP graph entry
6636 cost for later. */
6637 instance->cost_vec = cost_vec;
6641 /* Now look for SLP instances with a root that are covered by other
6642 instances and remove them. */
6643 hash_set<stmt_vec_info> roots;
6644 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6645 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6646 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6647 if (!roots.is_empty ())
6649 visited.empty ();
6650 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6651 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6652 visited);
6653 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6654 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6655 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6657 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_NOTE, vect_location,
6660 "removing SLP instance operations starting "
6661 "from: %G", root->stmt);
6662 vect_free_slp_instance (instance);
6663 vinfo->slp_instances.ordered_remove (i);
6665 else
6666 ++i;
6669 /* Compute vectorizable live stmts. */
6670 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6672 hash_set<stmt_vec_info> svisited;
6673 hash_set<slp_tree> visited;
6674 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6676 vect_location = instance->location ();
6677 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6678 instance, &instance->cost_vec, svisited,
6679 visited);
6683 return !vinfo->slp_instances.is_empty ();
6686 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
6687 closing any chain of leaders. */
6689 static slp_instance
6690 get_ultimate_leader (slp_instance instance,
6691 hash_map<slp_instance, slp_instance> &instance_leader)
6693 auto_vec<slp_instance *, 8> chain;
6694 slp_instance *tem;
6695 while (*(tem = instance_leader.get (instance)) != instance)
6697 chain.safe_push (tem);
6698 instance = *tem;
6700 while (!chain.is_empty ())
6701 *chain.pop () = instance;
6702 return instance;
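/* Example of the leader chasing above (hypothetical instances A, B, C):
   if INSTANCE_LEADER maps A -> B, B -> C and C -> C, then
   get_ultimate_leader (A, ...) returns C and compresses the chain so
   that A and B afterwards map directly to C.  */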
6705 namespace {
6706 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6707 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6708 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6710 INSTANCE_LEADER is as for get_ultimate_leader. */
6712 template<typename T>
6713 bool
6714 vect_map_to_instance (slp_instance instance, T key,
6715 hash_map<T, slp_instance> &key_to_instance,
6716 hash_map<slp_instance, slp_instance> &instance_leader)
6718 bool existed_p;
6719 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6720 if (!existed_p)
6722 else if (key_instance != instance)
6724 /* If we run into a previously marked key, make the current instance
6725 the leader of that key's ultimate leader. This keeps the
6726 leader chain acyclic and works even when the current instance
6727 connects two previously independent graph parts. */
6728 slp_instance key_leader
6729 = get_ultimate_leader (key_instance, instance_leader);
6730 if (key_leader != instance)
6731 instance_leader.put (key_leader, instance);
6733 key_instance = instance;
6734 return existed_p;
6738 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6740 static void
6741 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6742 slp_instance instance, slp_tree node,
6743 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6744 hash_map<slp_tree, slp_instance> &node_to_instance,
6745 hash_map<slp_instance, slp_instance> &instance_leader)
6747 stmt_vec_info stmt_info;
6748 unsigned i;
6750 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6751 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6752 instance_leader);
6754 if (vect_map_to_instance (instance, node, node_to_instance,
6755 instance_leader))
6756 return;
6758 slp_tree child;
6759 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6760 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6761 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6762 node_to_instance, instance_leader);
6765 /* Partition the SLP graph into pieces that can be costed independently. */
6767 static void
6768 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6770 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6772 /* First walk the SLP graph assigning each involved scalar stmt a
6773 corresponding SLP graph entry and upon visiting a previously
6774 marked stmt, make the stmt's leader the current SLP graph entry. */
6775 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6776 hash_map<slp_tree, slp_instance> node_to_instance;
6777 hash_map<slp_instance, slp_instance> instance_leader;
6778 slp_instance instance;
6779 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6781 instance_leader.put (instance, instance);
6782 vect_bb_partition_graph_r (bb_vinfo,
6783 instance, SLP_INSTANCE_TREE (instance),
6784 stmt_to_instance, node_to_instance,
6785 instance_leader);
6788 /* Then collect entries to each independent subgraph. */
6789 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6791 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6792 leader->subgraph_entries.safe_push (instance);
6793 if (dump_enabled_p ()
6794 && leader != instance)
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 "instance %p is leader of %p\n",
6797 (void *) leader, (void *) instance);
6801 /* Compute the set of scalar stmts participating in internal and external
6802 nodes. */
6804 static void
6805 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6806 hash_set<slp_tree> &visited,
6807 hash_set<stmt_vec_info> &vstmts,
6808 hash_set<stmt_vec_info> &estmts)
6810 int i;
6811 stmt_vec_info stmt_info;
6812 slp_tree child;
6814 if (visited.add (node))
6815 return;
6817 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6819 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6820 vstmts.add (stmt_info);
6822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6823 if (child)
6824 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6825 vstmts, estmts);
6827 else
6828 for (tree def : SLP_TREE_SCALAR_OPS (node))
6830 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6831 if (def_stmt)
6832 estmts.add (def_stmt);
6837 /* Compute the scalar cost of the SLP node NODE and its children
6838 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6839 update LIFE according to uses of NODE. */
6841 static void
6842 vect_bb_slp_scalar_cost (vec_info *vinfo,
6843 slp_tree node, vec<bool, va_heap> *life,
6844 stmt_vector_for_cost *cost_vec,
6845 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6846 hash_set<slp_tree> &visited)
6848 unsigned i;
6849 stmt_vec_info stmt_info;
6850 slp_tree child;
6852 if (visited.add (node))
6853 return;
6855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6857 ssa_op_iter op_iter;
6858 def_operand_p def_p;
6860 if ((*life)[i])
6861 continue;
6863 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6864 gimple *orig_stmt = orig_stmt_info->stmt;
6866 /* If there is a non-vectorized use of the defs then the scalar
6867 stmt is kept live, in which case we do not account it or any
6868 required defs in the SLP children in the scalar cost. This
6869 way we make the vectorization more costly when compared to
6870 the scalar cost. */
6871 if (!STMT_VINFO_LIVE_P (stmt_info))
6873 auto_vec<gimple *, 8> worklist;
6874 hash_set<gimple *> *worklist_visited = NULL;
6875 worklist.quick_push (orig_stmt);
6878 gimple *work_stmt = worklist.pop ();
6879 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6881 imm_use_iterator use_iter;
6882 gimple *use_stmt;
6883 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6884 DEF_FROM_PTR (def_p))
6885 if (!is_gimple_debug (use_stmt))
6887 stmt_vec_info use_stmt_info
6888 = vinfo->lookup_stmt (use_stmt);
6889 if (!use_stmt_info
6890 || !vectorized_scalar_stmts.contains (use_stmt_info))
6892 if (use_stmt_info
6893 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6895 /* For stmts participating in patterns we have
6896 to check their uses recursively. */
6897 if (!worklist_visited)
6898 worklist_visited = new hash_set<gimple *> ();
6899 if (!worklist_visited->add (use_stmt))
6900 worklist.safe_push (use_stmt);
6901 continue;
6903 (*life)[i] = true;
6904 goto next_lane;
6909 while (!worklist.is_empty ());
6910 next_lane:
6911 if (worklist_visited)
6912 delete worklist_visited;
6913 if ((*life)[i])
6914 continue;
6917 /* Count scalar stmts only once. */
6918 if (gimple_visited_p (orig_stmt))
6919 continue;
6920 gimple_set_visited (orig_stmt, true);
6922 vect_cost_for_stmt kind;
6923 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6925 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6926 kind = scalar_load;
6927 else
6928 kind = scalar_store;
6930 else if (vect_nop_conversion_p (orig_stmt_info))
6931 continue;
6932 /* For single-argument PHIs assume coalescing which means zero cost
6933 for the scalar and the vector PHIs. This avoids artificially
6934 favoring the vector path (but may pessimize it in some cases). */
6935 else if (is_a <gphi *> (orig_stmt_info->stmt)
6936 && gimple_phi_num_args
6937 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6938 continue;
6939 else
6940 kind = scalar_stmt;
6941 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6942 SLP_TREE_VECTYPE (node), 0, vect_body);
6945 auto_vec<bool, 20> subtree_life;
6946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6948 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6950 /* Do not directly pass LIFE to the recursive call, copy it to
6951 confine changes in the callee to the current child/subtree. */
6952 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6954 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6955 for (unsigned j = 0;
6956 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6958 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6959 if (perm.first == i)
6960 subtree_life[perm.second] = (*life)[j];
6963 else
6965 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6966 subtree_life.safe_splice (*life);
6968 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6969 vectorized_scalar_stmts, visited);
6970 subtree_life.truncate (0);
6975 /* Comparator for the loop-index sorted cost vectors. */
6977 static int
6978 li_cost_vec_cmp (const void *a_, const void *b_)
6980 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6981 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6982 if (a->first < b->first)
6983 return -1;
6984 else if (a->first == b->first)
6985 return 0;
6986 return 1;
6989 /* Check if vectorization of the basic block is profitable for the
6990 subgraph denoted by SLP_INSTANCES. */
6992 static bool
6993 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6994 vec<slp_instance> slp_instances,
6995 loop_p orig_loop)
6997 slp_instance instance;
6998 int i;
6999 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7000 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7002 if (dump_enabled_p ())
7004 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7005 hash_set<slp_tree> visited;
7006 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7007 vect_print_slp_graph (MSG_NOTE, vect_location,
7008 SLP_INSTANCE_TREE (instance), visited);
7011 /* Compute the set of scalar stmts we know will go away 'locally' when
7012 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7013 not accurate for nodes promoted extern late or for scalar stmts that
7014 are used both in extern defs and in vectorized defs. */
7015 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7016 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7017 hash_set<slp_tree> visited;
7018 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7020 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7021 SLP_INSTANCE_TREE (instance),
7022 visited,
7023 vectorized_scalar_stmts,
7024 scalar_stmts_in_externs);
7025 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7026 vectorized_scalar_stmts.add (rstmt);
7028 /* Scalar stmts used as defs in external nodes need to be preserved, so
7029 remove them from vectorized_scalar_stmts. */
7030 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7031 vectorized_scalar_stmts.remove (stmt);
7033 /* Calculate scalar cost and sum the cost for the vector stmts
7034 previously collected. */
7035 stmt_vector_for_cost scalar_costs = vNULL;
7036 stmt_vector_for_cost vector_costs = vNULL;
7037 visited.empty ();
7038 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7040 auto_vec<bool, 20> life;
7041 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7042 true);
7043 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7044 record_stmt_cost (&scalar_costs,
7045 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7046 scalar_stmt,
7047 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7048 vect_bb_slp_scalar_cost (bb_vinfo,
7049 SLP_INSTANCE_TREE (instance),
7050 &life, &scalar_costs, vectorized_scalar_stmts,
7051 visited);
7052 vector_costs.safe_splice (instance->cost_vec);
7053 instance->cost_vec.release ();
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7059 /* When costing non-loop vectorization we need to consider each covered
7060 loop independently and make sure vectorization is profitable. For
7061 now we assume a loop may not be entered or may be executed an arbitrary
7062 number of iterations (??? static information can provide more
7063 precise info here), which means we can simply cost the stmts of each
7064 containing loop separately. */
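/* For illustration (hypothetical numbers): a region spanning loops 1 and 2
   might produce scalar cost entries {(1, c0), (1, c1), (2, c2)} and vector
   cost entries {(1, v0), (2, v1)}; after sorting, the loop-1 and loop-2
   portions are summed up and compared separately by the code below.  */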
7066 /* First produce cost vectors sorted by loop index. */
7067 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7068 li_scalar_costs (scalar_costs.length ());
7069 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7070 li_vector_costs (vector_costs.length ());
7071 stmt_info_for_cost *cost;
7072 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7074 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7075 li_scalar_costs.quick_push (std::make_pair (l, cost));
7077 /* Use an arbitrary loop used in the region as fallback in case the first
7078 vector_costs entry does not have a stmt_info associated with it. */
7079 unsigned l = li_scalar_costs[0].first;
7080 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7082 /* We inherit the loop index from the previous COST; invariants, externals
7083 and extracts immediately follow the cost for the related stmt. */
7084 if (cost->stmt_info)
7085 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7086 li_vector_costs.quick_push (std::make_pair (l, cost));
7088 li_scalar_costs.qsort (li_cost_vec_cmp);
7089 li_vector_costs.qsort (li_cost_vec_cmp);
7091 /* Now cost the portions individually. */
7092 unsigned vi = 0;
7093 unsigned si = 0;
7094 bool profitable = true;
7095 while (si < li_scalar_costs.length ()
7096 && vi < li_vector_costs.length ())
7098 unsigned sl = li_scalar_costs[si].first;
7099 unsigned vl = li_vector_costs[vi].first;
7100 if (sl != vl)
7102 if (dump_enabled_p ())
7103 dump_printf_loc (MSG_NOTE, vect_location,
7104 "Scalar %d and vector %d loop part do not "
7105 "match up, skipping scalar part\n", sl, vl);
7106 /* Skip the scalar part, assuming zero cost on the vector side. */
7109 si++;
7111 while (si < li_scalar_costs.length ()
7112 && li_scalar_costs[si].first == sl);
7113 continue;
7116 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7119 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7120 si++;
7122 while (si < li_scalar_costs.length ()
7123 && li_scalar_costs[si].first == sl);
7124 unsigned dummy;
7125 finish_cost (scalar_target_cost_data, nullptr,
7126 &dummy, &scalar_cost, &dummy);
7128 /* Complete the target-specific vector cost calculation. */
7129 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7132 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7133 vi++;
7135 while (vi < li_vector_costs.length ()
7136 && li_vector_costs[vi].first == vl);
7137 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7138 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7139 delete scalar_target_cost_data;
7140 delete vect_target_cost_data;
7142 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7144 if (dump_enabled_p ())
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 "Cost model analysis for part in loop %d:\n", sl);
7148 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7149 vec_inside_cost + vec_outside_cost);
7150 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7153 /* Vectorization is profitable if its cost is not more than the cost of
7154 the scalar version. Note that we err on the vector side for equal cost
7155 because the cost estimate is otherwise quite pessimistic (constant uses
7156 are free on the scalar side but cost a load on the vector side for
7157 example). */
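/* Purely illustrative numbers: with vec_prologue_cost == 2,
   vec_inside_cost == 12, vec_epilogue_cost == 0 and scalar_cost == 14 the
   comparison below is 14 > 14, which is false, so this part remains
   profitable; ties go to the vector side.  */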
7158 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7160 profitable = false;
7161 break;
7164 if (profitable && vi < li_vector_costs.length ())
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "Excess vector cost for part in loop %d:\n",
7169 li_vector_costs[vi].first);
7170 profitable = false;
7173 /* Unset visited flag. This is delayed when the subgraph is profitable
7174 and we process the loop for remaining unvectorized if-converted code. */
7175 if (!orig_loop || !profitable)
7176 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7177 gimple_set_visited (cost->stmt_info->stmt, false);
7179 scalar_costs.release ();
7180 vector_costs.release ();
7182 return profitable;
7185 /* qsort comparator for lane defs. */
7187 static int
7188 vld_cmp (const void *a_, const void *b_)
7190 auto *a = (const std::pair<unsigned, tree> *)a_;
7191 auto *b = (const std::pair<unsigned, tree> *)b_;
7192 return a->first - b->first;
7195 /* Return true if USE_STMT is a vector lane insert into VEC and set
7196 *THIS_LANE to the lane number that is set. */
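/* For example (purely illustrative), with a V4SI destination an insert
   v_2 = BIT_INSERT_EXPR <v_1, s_3, 64> yields *THIS_LANE == 2 because the
   bit position 64 is twice the 32-bit element size.  */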
7198 static bool
7199 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7201 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7202 if (!use_ass
7203 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7204 || (vec
7205 ? gimple_assign_rhs1 (use_ass) != vec
7206 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7207 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7208 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7209 || !constant_multiple_p
7210 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7211 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7212 this_lane))
7213 return false;
7214 return true;
7217 /* Find vectorizable constructors, vector lane-insert chains and associative
7218 reduction chains in the region and record them in BB_VINFO->roots. */
7220 static void
7221 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7223 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7225 !gsi_end_p (gsi); gsi_next (&gsi))
7227 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7228 if (!assign)
7229 continue;
7231 tree rhs = gimple_assign_rhs1 (assign);
7232 enum tree_code code = gimple_assign_rhs_code (assign);
7233 use_operand_p use_p;
7234 gimple *use_stmt;
7235 if (code == CONSTRUCTOR)
7237 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7238 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7239 CONSTRUCTOR_NELTS (rhs))
7240 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7241 || uniform_vector_p (rhs))
7242 continue;
7244 unsigned j;
7245 tree val;
7246 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7247 if (TREE_CODE (val) != SSA_NAME
7248 || !bb_vinfo->lookup_def (val))
7249 break;
7250 if (j != CONSTRUCTOR_NELTS (rhs))
7251 continue;
7253 vec<stmt_vec_info> roots = vNULL;
7254 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7255 vec<stmt_vec_info> stmts;
7256 stmts.create (CONSTRUCTOR_NELTS (rhs));
7257 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7258 stmts.quick_push
7259 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7260 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7261 stmts, roots));
7263 else if (code == BIT_INSERT_EXPR
7264 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7265 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7266 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7267 && integer_zerop (gimple_assign_rhs3 (assign))
7268 && useless_type_conversion_p
7269 (TREE_TYPE (TREE_TYPE (rhs)),
7270 TREE_TYPE (gimple_assign_rhs2 (assign)))
7271 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7273 /* We start matching on an insert to lane zero, but since the
7274 inserts need not be ordered we have to search both
7275 the def and the use chains. */
7276 tree vectype = TREE_TYPE (rhs);
7277 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7278 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7279 auto_sbitmap lanes (nlanes);
7280 bitmap_clear (lanes);
7281 bitmap_set_bit (lanes, 0);
7282 tree def = gimple_assign_lhs (assign);
7283 lane_defs.quick_push
7284 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7285 unsigned lanes_found = 1;
7286 /* Start with the use chains, the last stmt will be the root. */
7287 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7288 vec<stmt_vec_info> roots = vNULL;
7289 roots.safe_push (last);
7292 use_operand_p use_p;
7293 gimple *use_stmt;
7294 if (!single_imm_use (def, &use_p, &use_stmt))
7295 break;
7296 unsigned this_lane;
7297 if (!bb_vinfo->lookup_stmt (use_stmt)
7298 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7299 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7300 break;
7301 if (bitmap_bit_p (lanes, this_lane))
7302 break;
7303 lanes_found++;
7304 bitmap_set_bit (lanes, this_lane);
7305 gassign *use_ass = as_a <gassign *> (use_stmt);
7306 lane_defs.quick_push (std::make_pair
7307 (this_lane, gimple_assign_rhs2 (use_ass)));
7308 last = bb_vinfo->lookup_stmt (use_ass);
7309 roots.safe_push (last);
7310 def = gimple_assign_lhs (use_ass);
7312 while (lanes_found < nlanes);
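/* ROOTS was filled in use-chain order, so the final insert - the stmt
   whose result is the fully populated vector - comes last.  Swap it to
   index zero since that is the position the instance root stmt is
   expected in.  */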
7313 if (roots.length () > 1)
7314 std::swap(roots[0], roots[roots.length () - 1]);
7315 if (lanes_found < nlanes)
7317 /* Now search the def chain. */
7318 def = gimple_assign_rhs1 (assign);
7321 if (TREE_CODE (def) != SSA_NAME
7322 || !has_single_use (def))
7323 break;
7324 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7325 unsigned this_lane;
7326 if (!bb_vinfo->lookup_stmt (def_stmt)
7327 || !vect_slp_is_lane_insert (def_stmt,
7328 NULL_TREE, &this_lane)
7329 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7330 break;
7331 if (bitmap_bit_p (lanes, this_lane))
7332 break;
7333 lanes_found++;
7334 bitmap_set_bit (lanes, this_lane);
7335 lane_defs.quick_push (std::make_pair
7336 (this_lane,
7337 gimple_assign_rhs2 (def_stmt)));
7338 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7339 def = gimple_assign_rhs1 (def_stmt);
7341 while (lanes_found < nlanes);
7343 if (lanes_found == nlanes)
7345 /* Sort lane_defs by lane index and register the root. */
7346 lane_defs.qsort (vld_cmp);
7347 vec<stmt_vec_info> stmts;
7348 stmts.create (nlanes);
7349 for (unsigned i = 0; i < nlanes; ++i)
7350 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7351 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7352 stmts, roots));
7354 else
7355 roots.release ();
7357 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7358 && (associative_tree_code (code) || code == MINUS_EXPR)
7359 /* ??? This pessimizes a two-element reduction. PR54400.
7360 ??? In-order reduction could be handled if we only
7361 traverse one operand chain in vect_slp_linearize_chain. */
7362 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7363 /* Ops with constants at the tail can be stripped here. */
7364 && TREE_CODE (rhs) == SSA_NAME
7365 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7366 /* Should be the chain end. */
7367 && (!single_imm_use (gimple_assign_lhs (assign),
7368 &use_p, &use_stmt)
7369 || !is_gimple_assign (use_stmt)
7370 || (gimple_assign_rhs_code (use_stmt) != code
7371 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7372 || (gimple_assign_rhs_code (use_stmt)
7373 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7375 /* We start the match at the end of a possible association
7376 chain. */
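/* E.g. for t1 = a + b; t2 = t1 + c; x = t2 + d (names purely illustrative)
   we arrive here at the stmt computing x; vect_slp_linearize_chain below
   then collects the leaf operands {a, b, c, d}.  */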
7377 auto_vec<chain_op_t> chain;
7378 auto_vec<std::pair<tree_code, gimple *> > worklist;
7379 auto_vec<gimple *> chain_stmts;
7380 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7381 if (code == MINUS_EXPR)
7382 code = PLUS_EXPR;
7383 internal_fn reduc_fn;
7384 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7385 || reduc_fn == IFN_LAST)
7386 continue;
7387 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7388 /* ??? */
7389 code_stmt, alt_code_stmt, &chain_stmts);
7390 if (chain.length () > 1)
7392 /* Sort the chain according to def_type and operation. */
7393 chain.sort (dt_sort_cmp, bb_vinfo);
7394 /* ??? Now we'd want to strip externals and constants
7395 but record those to be handled in the epilogue. */
7396 /* ??? For now do not allow mixing ops or externs/constants. */
7397 bool invalid = false;
7398 unsigned remain_cnt = 0;
7399 for (unsigned i = 0; i < chain.length (); ++i)
7401 if (chain[i].code != code)
7403 invalid = true;
7404 break;
7406 if (chain[i].dt != vect_internal_def)
7407 remain_cnt++;
7409 if (!invalid && chain.length () - remain_cnt > 1)
7411 vec<stmt_vec_info> stmts;
7412 vec<tree> remain = vNULL;
7413 stmts.create (chain.length ());
7414 if (remain_cnt > 0)
7415 remain.create (remain_cnt);
7416 for (unsigned i = 0; i < chain.length (); ++i)
7418 if (chain[i].dt == vect_internal_def)
7419 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7420 else
7421 remain.quick_push (chain[i].op);
7423 vec<stmt_vec_info> roots;
7424 roots.create (chain_stmts.length ());
7425 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7426 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7427 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7428 stmts, roots, remain));
7435 /* Walk the grouped store chains and replace entries with their
7436 pattern variant if any. */
7438 static void
7439 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7441 stmt_vec_info first_element;
7442 unsigned i;
7444 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7446 /* We also have CTORs in this array. */
7447 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7448 continue;
7449 if (STMT_VINFO_IN_PATTERN_P (first_element))
7451 stmt_vec_info orig = first_element;
7452 first_element = STMT_VINFO_RELATED_STMT (first_element);
7453 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7454 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7455 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7456 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7457 vinfo->grouped_stores[i] = first_element;
7459 stmt_vec_info prev = first_element;
7460 while (DR_GROUP_NEXT_ELEMENT (prev))
7462 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7463 if (STMT_VINFO_IN_PATTERN_P (elt))
7465 stmt_vec_info orig = elt;
7466 elt = STMT_VINFO_RELATED_STMT (elt);
7467 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7468 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7469 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7471 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7472 prev = elt;
7477 /* Check if the region described by BB_VINFO can be vectorized, returning
7478 true if so. When returning false, set FATAL to true if the same failure
7479 would prevent vectorization at other vector sizes, false if it is still
7480 worth trying other sizes. N_STMTS is the number of statements in the
7481 region. */
7483 static bool
7484 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7485 vec<int> *dataref_groups)
7487 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7489 slp_instance instance;
7490 int i;
7491 poly_uint64 min_vf = 2;
7493 /* The first group of checks is independent of the vector size. */
7494 fatal = true;
7496 /* Analyze the data references. */
7498 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7500 if (dump_enabled_p ())
7501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7502 "not vectorized: unhandled data-ref in basic "
7503 "block.\n");
7504 return false;
7507 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7509 if (dump_enabled_p ())
7510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 "not vectorized: unhandled data access in "
7512 "basic block.\n");
7513 return false;
7516 vect_slp_check_for_roots (bb_vinfo);
7518 /* If there are no grouped stores and no constructors in the region
7519 there is no need to continue with pattern recog as vect_analyze_slp
7520 will fail anyway. */
7521 if (bb_vinfo->grouped_stores.is_empty ()
7522 && bb_vinfo->roots.is_empty ())
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "not vectorized: no grouped stores in "
7527 "basic block.\n");
7528 return false;
7531 /* The rest of the analysis below depends on the vector size, so failures are no longer necessarily fatal. */
7532 fatal = false;
7534 vect_pattern_recog (bb_vinfo);
7536 /* Update store groups from pattern processing. */
7537 vect_fixup_store_groups_with_patterns (bb_vinfo);
7539 /* Check the SLP opportunities in the basic block, analyze and build SLP
7540 trees. */
7541 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7543 if (dump_enabled_p ())
7545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 "Failed to SLP the basic block.\n");
7547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7548 "not vectorized: failed to find SLP opportunities "
7549 "in basic block.\n");
7551 return false;
7554 /* Optimize permutations. */
7555 vect_optimize_slp (bb_vinfo);
7557 /* Gather the loads reachable from the SLP graph entries. */
7558 vect_gather_slp_loads (bb_vinfo);
7560 vect_record_base_alignments (bb_vinfo);
7562 /* Analyze and verify the alignment of data references and the
7563 dependence in the SLP instances. */
7564 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7566 vect_location = instance->location ();
7567 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7568 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7570 slp_tree node = SLP_INSTANCE_TREE (instance);
7571 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_NOTE, vect_location,
7574 "removing SLP instance operations starting from: %G",
7575 stmt_info->stmt);
7576 vect_free_slp_instance (instance);
7577 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7578 continue;
7581 /* Mark all the statements that we want to vectorize as pure SLP and
7582 relevant. */
7583 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7584 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7585 unsigned j;
7586 stmt_vec_info root;
7587 /* Likewise consider instance root stmts as vectorized. */
7588 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7589 STMT_SLP_TYPE (root) = pure_slp;
7591 i++;
7593 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7594 return false;
7596 if (!vect_slp_analyze_operations (bb_vinfo))
7598 if (dump_enabled_p ())
7599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600 "not vectorized: bad operation in basic block.\n");
7601 return false;
7604 vect_bb_partition_graph (bb_vinfo);
7606 return true;
7609 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7610 basic blocks in BBS, returning true on success.
7611 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7613 static bool
7614 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7615 vec<int> *dataref_groups, unsigned int n_stmts,
7616 loop_p orig_loop)
7618 bb_vec_info bb_vinfo;
7619 auto_vector_modes vector_modes;
7621 /* Autodetect first vector size we try. */
7622 machine_mode next_vector_mode = VOIDmode;
7623 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7624 unsigned int mode_i = 0;
7626 vec_info_shared shared;
7628 machine_mode autodetected_vector_mode = VOIDmode;
7629 while (1)
7631 bool vectorized = false;
7632 bool fatal = false;
7633 bb_vinfo = new _bb_vec_info (bbs, &shared);
7635 bool first_time_p = shared.datarefs.is_empty ();
7636 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7637 if (first_time_p)
7638 bb_vinfo->shared->save_datarefs ();
7639 else
7640 bb_vinfo->shared->check_datarefs ();
7641 bb_vinfo->vector_mode = next_vector_mode;
7643 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7645 if (dump_enabled_p ())
7647 dump_printf_loc (MSG_NOTE, vect_location,
7648 "***** Analysis succeeded with vector mode"
7649 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7650 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7653 bb_vinfo->shared->check_datarefs ();
7655 auto_vec<slp_instance> profitable_subgraphs;
7656 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7658 if (instance->subgraph_entries.is_empty ())
7659 continue;
7661 dump_user_location_t saved_vect_location = vect_location;
7662 vect_location = instance->location ();
7663 if (!unlimited_cost_model (NULL)
7664 && !vect_bb_vectorization_profitable_p
7665 (bb_vinfo, instance->subgraph_entries, orig_loop))
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7669 "not vectorized: vectorization is not "
7670 "profitable.\n");
7671 vect_location = saved_vect_location;
7672 continue;
7675 vect_location = saved_vect_location;
7676 if (!dbg_cnt (vect_slp))
7677 continue;
7679 profitable_subgraphs.safe_push (instance);
7682 /* When we're vectorizing an if-converted loop body make sure
7683 we vectorized all if-converted code. */
7684 if (!profitable_subgraphs.is_empty ()
7685 && orig_loop)
7687 gcc_assert (bb_vinfo->bbs.length () == 1);
7688 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7689 !gsi_end_p (gsi); gsi_next (&gsi))
7691 /* The costing above left us with DCEable vectorized scalar
7692 stmts having the visited flag set on profitable
7693 subgraphs. Do the delayed clearing of the flag here. */
7694 if (gimple_visited_p (gsi_stmt (gsi)))
7696 gimple_set_visited (gsi_stmt (gsi), false);
7697 continue;
7699 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7700 continue;
7702 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7703 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7705 if (!profitable_subgraphs.is_empty ()
7706 && dump_enabled_p ())
7707 dump_printf_loc (MSG_NOTE, vect_location,
7708 "not profitable because of "
7709 "unprofitable if-converted scalar "
7710 "code\n");
7711 profitable_subgraphs.truncate (0);
7716 /* Finally schedule the profitable subgraphs. */
7717 for (slp_instance instance : profitable_subgraphs)
7719 if (!vectorized && dump_enabled_p ())
7720 dump_printf_loc (MSG_NOTE, vect_location,
7721 "Basic block will be vectorized "
7722 "using SLP\n");
7723 vectorized = true;
7725 /* Dump before scheduling as store vectorization will remove
7726 the original stores and mess with the instance tree
7727 so querying its location will eventually ICE. */
7728 if (flag_checking)
7729 for (slp_instance sub : instance->subgraph_entries)
7730 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7731 unsigned HOST_WIDE_INT bytes;
7732 if (dump_enabled_p ())
7733 for (slp_instance sub : instance->subgraph_entries)
7735 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7736 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7737 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7738 sub->location (),
7739 "basic block part vectorized using %wu "
7740 "byte vectors\n", bytes);
7741 else
7742 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7743 sub->location (),
7744 "basic block part vectorized using "
7745 "variable length vectors\n");
7748 dump_user_location_t saved_vect_location = vect_location;
7749 vect_location = instance->location ();
7751 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7753 vect_location = saved_vect_location;
7756 else
7758 if (dump_enabled_p ())
7759 dump_printf_loc (MSG_NOTE, vect_location,
7760 "***** Analysis failed with vector mode %s\n",
7761 GET_MODE_NAME (bb_vinfo->vector_mode));
7764 if (mode_i == 0)
7765 autodetected_vector_mode = bb_vinfo->vector_mode;
7767 if (!fatal)
7768 while (mode_i < vector_modes.length ()
7769 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "***** The result for vector mode %s would"
7774 " be the same\n",
7775 GET_MODE_NAME (vector_modes[mode_i]));
7776 mode_i += 1;
7779 delete bb_vinfo;
7781 if (mode_i < vector_modes.length ()
7782 && VECTOR_MODE_P (autodetected_vector_mode)
7783 && (related_vector_mode (vector_modes[mode_i],
7784 GET_MODE_INNER (autodetected_vector_mode))
7785 == autodetected_vector_mode)
7786 && (related_vector_mode (autodetected_vector_mode,
7787 GET_MODE_INNER (vector_modes[mode_i]))
7788 == vector_modes[mode_i]))
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_NOTE, vect_location,
7792 "***** Skipping vector mode %s, which would"
7793 " repeat the analysis for %s\n",
7794 GET_MODE_NAME (vector_modes[mode_i]),
7795 GET_MODE_NAME (autodetected_vector_mode));
7796 mode_i += 1;
7799 if (vectorized
7800 || mode_i == vector_modes.length ()
7801 || autodetected_vector_mode == VOIDmode
7802 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7803 vector sizes will fail do not bother iterating. */
7804 || fatal)
7805 return vectorized;
7807 /* Try the next biggest vector size. */
7808 next_vector_mode = vector_modes[mode_i++];
7809 if (dump_enabled_p ())
7810 dump_printf_loc (MSG_NOTE, vect_location,
7811 "***** Re-trying analysis with vector mode %s\n",
7812 GET_MODE_NAME (next_vector_mode));
7817 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7818 true if anything in the basic-block was vectorized. */
7820 static bool
7821 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7823 vec<data_reference_p> datarefs = vNULL;
7824 auto_vec<int> dataref_groups;
7825 int insns = 0;
7826 int current_group = 0;
7828 for (unsigned i = 0; i < bbs.length (); i++)
7830 basic_block bb = bbs[i];
7831 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7832 gsi_next (&gsi))
7834 gimple *stmt = gsi_stmt (gsi);
7835 if (is_gimple_debug (stmt))
7836 continue;
7838 insns++;
7840 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7841 vect_location = stmt;
7843 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7844 &dataref_groups, current_group))
7845 ++current_group;
7847 /* New BBs always start a new DR group. */
7848 ++current_group;
7851 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7854 /* Special entry for the BB vectorizer. Analyze and transform a single
7855 if-converted BB with ORIG_LOOPs body being the not if-converted
7856 representation. Returns true if anything in the basic-block was
7857 vectorized. */
7859 bool
7860 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7862 auto_vec<basic_block> bbs;
7863 bbs.safe_push (bb);
7864 return vect_slp_bbs (bbs, orig_loop);
7867 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
7868 of FUN, returning true if anything was vectorized. */
7870 bool
7871 vect_slp_function (function *fun)
7873 bool r = false;
7874 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7875 auto_bitmap exit_bbs;
7876 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7877 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7878 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7879 true, rpo, NULL);
7881 /* For the moment split the function into pieces to avoid making
7882 the iteration on the vector mode moot. Split at points we know
7883 to not handle well which is CFG merges (SLP discovery doesn't
7884 handle non-loop-header PHIs) and loop exits. Since pattern
7885 recog requires reverse iteration to visit uses before defs
7886 simply chop RPO into pieces. */
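/* Concretely, the loop below starts a new region (1) at dominance
   boundaries, (2) when leaving the loop of the region's first block,
   (3) at the header of a dont-vectorize loop and (4) after a
   control-altering stmt that defines a value.  */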
7887 auto_vec<basic_block> bbs;
7888 for (unsigned i = 0; i < n; i++)
7890 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7891 bool split = false;
7893 /* Split when a BB is not dominated by the first block. */
7894 if (!bbs.is_empty ()
7895 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "splitting region at dominance boundary bb%d\n",
7900 bb->index);
7901 split = true;
7903 /* Split when the loop determined by the first block
7904 is exited. This is because we eventually insert
7905 invariants at region begin. */
7906 else if (!bbs.is_empty ()
7907 && bbs[0]->loop_father != bb->loop_father
7908 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7910 if (dump_enabled_p ())
7911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7912 "splitting region at loop %d exit at bb%d\n",
7913 bbs[0]->loop_father->num, bb->index);
7914 split = true;
7916 else if (!bbs.is_empty ()
7917 && bb->loop_father->header == bb
7918 && bb->loop_father->dont_vectorize)
7920 if (dump_enabled_p ())
7921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7922 "splitting region at dont-vectorize loop %d "
7923 "entry at bb%d\n",
7924 bb->loop_father->num, bb->index);
7925 split = true;
7928 if (split && !bbs.is_empty ())
7930 r |= vect_slp_bbs (bbs, NULL);
7931 bbs.truncate (0);
7934 if (bbs.is_empty ())
7936 /* We need to be able to insert at the head of the region, which
7937 we cannot do for a region starting with a returns-twice call. */
7938 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7939 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "skipping bb%d as start of region as it "
7944 "starts with returns-twice call\n",
7945 bb->index);
7946 continue;
7948 /* If the loop this BB belongs to is marked as not to be vectorized
7949 honor that also for BB vectorization. */
7950 if (bb->loop_father->dont_vectorize)
7951 continue;
7954 bbs.safe_push (bb);
7956 /* When the stmt ending this block defines a value, inserting a
7957 vector containing its definition after it would require inserting
7958 on edges. Avoid this for now. */
7959 if (gimple *last = *gsi_last_bb (bb))
7960 if (gimple_get_lhs (last)
7961 && is_ctrl_altering_stmt (last))
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "splitting region at control altering "
7966 "definition %G", last);
7967 r |= vect_slp_bbs (bbs, NULL);
7968 bbs.truncate (0);
7972 if (!bbs.is_empty ())
7973 r |= vect_slp_bbs (bbs, NULL);
7975 free (rpo);
7977 return r;
7980 /* Build a variable-length vector in which the elements in ELTS are repeated
7981 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7982 RESULTS and add any new instructions to SEQ.
7984 The approach we use is:
7986 (1) Find a vector mode VM with integer elements of mode IM.
7988 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7989 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7990 from small vectors to IM.
7992 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7994 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7995 correct byte contents.
7997 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7999 We try to find the largest IM for which this sequence works, in order
8000 to cut down on the number of interleaves. */
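/* As an illustration (assuming a target with variable-length vectors where
   this strategy applies): for ELTS = {a, b} with 32-bit elements we may find
   IM == DImode, so {a, b} is view-converted to a single DImode value (2),
   that value is duplicated into a vector of DImode elements (3) and the
   result is view-converted back to VECTOR_TYPE (5); no interleaving (4) is
   needed because NVECTORS is 1.  */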
8002 void
8003 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8004 const vec<tree> &elts, unsigned int nresults,
8005 vec<tree> &results)
8007 unsigned int nelts = elts.length ();
8008 tree element_type = TREE_TYPE (vector_type);
8010 /* (1) Find a vector mode VM with integer elements of mode IM. */
8011 unsigned int nvectors = 1;
8012 tree new_vector_type;
8013 tree permutes[2];
8014 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8015 &nvectors, &new_vector_type,
8016 permutes))
8017 gcc_unreachable ();
8019 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8020 unsigned int partial_nelts = nelts / nvectors;
8021 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8023 tree_vector_builder partial_elts;
8024 auto_vec<tree, 32> pieces (nvectors * 2);
8025 pieces.quick_grow_cleared (nvectors * 2);
8026 for (unsigned int i = 0; i < nvectors; ++i)
8028 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8029 ELTS' has mode IM. */
8030 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8031 for (unsigned int j = 0; j < partial_nelts; ++j)
8032 partial_elts.quick_push (elts[i * partial_nelts + j]);
8033 tree t = gimple_build_vector (seq, &partial_elts);
8034 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8035 TREE_TYPE (new_vector_type), t);
8037 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8038 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8041 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8042 correct byte contents.
8044 Conceptually, we need to repeat the following operation log2(nvectors)
8045 times, where hi_start = nvectors / 2:
8047 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8048 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8050 However, if each input repeats every N elements and the VF is
8051 a multiple of N * 2, the HI result is the same as the LO result.
8052 This will be true for the first N1 iterations of the outer loop,
8053 followed by N2 iterations for which both the LO and HI results
8054 are needed. I.e.:
8056 N1 + N2 = log2(nvectors)
8058 Each "N1 iteration" doubles the number of redundant vectors and the
8059 effect of the process as a whole is to have a sequence of nvectors/2**N1
8060 vectors that repeats 2**N1 times. Rather than generate these redundant
8061 vectors, we halve the number of vectors for each N1 iteration. */
8062 unsigned int in_start = 0;
8063 unsigned int out_start = nvectors;
8064 unsigned int new_nvectors = nvectors;
8065 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8067 unsigned int hi_start = new_nvectors / 2;
8068 unsigned int out_i = 0;
8069 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8071 if ((in_i & 1) != 0
8072 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8073 2 * in_repeat))
8074 continue;
8076 tree output = make_ssa_name (new_vector_type);
8077 tree input1 = pieces[in_start + (in_i / 2)];
8078 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8079 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8080 input1, input2,
8081 permutes[in_i & 1]);
8082 gimple_seq_add_stmt (seq, stmt);
8083 pieces[out_start + out_i] = output;
8084 out_i += 1;
8086 std::swap (in_start, out_start);
8087 new_nvectors = out_i;
8090 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8091 results.reserve (nresults);
8092 for (unsigned int i = 0; i < nresults; ++i)
8093 if (i < new_nvectors)
8094 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8095 pieces[in_start + i]));
8096 else
8097 results.quick_push (results[i - new_nvectors]);
8101 /* For constant and loop invariant defs in OP_NODE this function creates
8102 vector defs that will be used in the vectorized stmts and stores them
8103 to SLP_TREE_VEC_DEFS of OP_NODE. */
8105 static void
8106 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8108 unsigned HOST_WIDE_INT nunits;
8109 tree vec_cst;
8110 unsigned j, number_of_places_left_in_vector;
8111 tree vector_type;
8112 tree vop;
8113 int group_size = op_node->ops.length ();
8114 unsigned int vec_num, i;
8115 unsigned number_of_copies = 1;
8116 bool constant_p;
8117 gimple_seq ctor_seq = NULL;
8118 auto_vec<tree, 16> permute_results;
8120 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8121 vector_type = SLP_TREE_VECTYPE (op_node);
8123 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8124 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8125 auto_vec<tree> voprnds (number_of_vectors);
8127 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8128 created vectors. It is greater than 1 if unrolling is performed.
8130 For example, we have two scalar operands, s1 and s2 (e.g., group of
8131 strided accesses of size two), while NUNITS is four (i.e., four scalars
8132 of this type can be packed in a vector). The output vector will contain
8133 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8134 will be 2).
8136 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8137 containing the operands.
8139 For example, NUNITS is four as before, and the group size is 8
8140 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8141 {s5, s6, s7, s8}. */
8143 /* When using duplicate_and_interleave, we just need one element for
8144 each scalar statement. */
8145 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8146 nunits = group_size;
8148 number_of_copies = nunits * number_of_vectors / group_size;
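/* With the first example above - NUNITS == 4, GROUP_SIZE == 2 and a single
   vector to create - this gives NUMBER_OF_COPIES == 2, i.e. {s1, s2, s1, s2};
   with the second example - NUNITS == 4, GROUP_SIZE == 8 and two vectors -
   it gives NUMBER_OF_COPIES == 1.  */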
8150 number_of_places_left_in_vector = nunits;
8151 constant_p = true;
8152 tree_vector_builder elts (vector_type, nunits, 1);
8153 elts.quick_grow (nunits);
8154 stmt_vec_info insert_after = NULL;
8155 for (j = 0; j < number_of_copies; j++)
8157 tree op;
8158 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8160 /* Create 'vect_ = {op0,op1,...,opn}'. */
8161 number_of_places_left_in_vector--;
8162 tree orig_op = op;
8163 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8165 if (CONSTANT_CLASS_P (op))
8167 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8169 /* Can't use VIEW_CONVERT_EXPR for booleans because
8170 of possibly different sizes of scalar value and
8171 vector element. */
8172 if (integer_zerop (op))
8173 op = build_int_cst (TREE_TYPE (vector_type), 0);
8174 else if (integer_onep (op))
8175 op = build_all_ones_cst (TREE_TYPE (vector_type));
8176 else
8177 gcc_unreachable ();
8179 else
8180 op = fold_unary (VIEW_CONVERT_EXPR,
8181 TREE_TYPE (vector_type), op);
8182 gcc_assert (op && CONSTANT_CLASS_P (op));
8184 else
8186 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8187 gimple *init_stmt;
8188 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8190 tree true_val
8191 = build_all_ones_cst (TREE_TYPE (vector_type));
8192 tree false_val
8193 = build_zero_cst (TREE_TYPE (vector_type));
8194 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8195 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8196 op, true_val,
8197 false_val);
8199 else
8201 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8202 op);
8203 init_stmt
8204 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8205 op);
8207 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8208 op = new_temp;
8211 elts[number_of_places_left_in_vector] = op;
8212 if (!CONSTANT_CLASS_P (op))
8213 constant_p = false;
8214 /* For BB vectorization we have to compute an insert location
8215 when a def is inside the analyzed region since we cannot
8216 simply insert at the BB start in this case. */
8217 stmt_vec_info opdef;
8218 if (TREE_CODE (orig_op) == SSA_NAME
8219 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8220 && is_a <bb_vec_info> (vinfo)
8221 && (opdef = vinfo->lookup_def (orig_op)))
8223 if (!insert_after)
8224 insert_after = opdef;
8225 else
8226 insert_after = get_later_stmt (insert_after, opdef);
8229 if (number_of_places_left_in_vector == 0)
8231 if (constant_p
8232 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8233 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8234 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8235 else
8237 if (permute_results.is_empty ())
8238 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8239 elts, number_of_vectors,
8240 permute_results);
8241 vec_cst = permute_results[number_of_vectors - j - 1];
8243 if (!gimple_seq_empty_p (ctor_seq))
8245 if (insert_after)
8247 gimple_stmt_iterator gsi;
8248 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8250 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8251 gsi_insert_seq_before (&gsi, ctor_seq,
8252 GSI_CONTINUE_LINKING);
8254 else if (!stmt_ends_bb_p (insert_after->stmt))
8256 gsi = gsi_for_stmt (insert_after->stmt);
8257 gsi_insert_seq_after (&gsi, ctor_seq,
8258 GSI_CONTINUE_LINKING);
8260 else
8262 /* When we want to insert after a def whose
8263 defining stmt throws, insert on the fallthru
8264 edge instead. */
8265 edge e = find_fallthru_edge
8266 (gimple_bb (insert_after->stmt)->succs);
8267 basic_block new_bb
8268 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8269 gcc_assert (!new_bb);
8272 else
8273 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8274 ctor_seq = NULL;
8276 voprnds.quick_push (vec_cst);
8277 insert_after = NULL;
8278 number_of_places_left_in_vector = nunits;
8279 constant_p = true;
8280 elts.new_vector (vector_type, nunits, 1);
8281 elts.quick_grow (nunits);
8286 /* Since the vectors were created in reverse order, reverse them again
8287 while copying to SLP_TREE_VEC_DEFS. */
8288 vec_num = voprnds.length ();
8289 for (j = vec_num; j != 0; j--)
8291 vop = voprnds[j - 1];
8292 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8295 /* In case that VF is greater than the unrolling factor needed for the SLP
8296 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8297 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8298 to replicate the vectors. */
8299 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8300 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8301 i++)
8302 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8305 /* Get the Ith vectorized definition from SLP_NODE. */
8307 tree
8308 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8310 return SLP_TREE_VEC_DEFS (slp_node)[i];
8313 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8315 void
8316 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8318 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8319 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8322 /* Get N vectorized definitions for SLP_NODE. */
8324 void
8325 vect_get_slp_defs (vec_info *,
8326 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8328 if (n == -1U)
8329 n = SLP_TREE_CHILDREN (slp_node).length ();
8331 for (unsigned i = 0; i < n; ++i)
8333 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8334 vec<tree> vec_defs = vNULL;
8335 vect_get_slp_defs (child, &vec_defs);
8336 vec_oprnds->quick_push (vec_defs);
8340 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8341 - PERM gives the permutation that the caller wants to use for NODE,
8342 which might be different from SLP_LOAD_PERMUTATION.
8343 - DUMP_P controls whether the function dumps information. */
8345 static bool
8346 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8347 load_permutation_t &perm,
8348 const vec<tree> &dr_chain,
8349 gimple_stmt_iterator *gsi, poly_uint64 vf,
8350 bool analyze_only, bool dump_p,
8351 unsigned *n_perms, unsigned int *n_loads,
8352 bool dce_chain)
8354 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8355 int vec_index = 0;
8356 tree vectype = SLP_TREE_VECTYPE (node);
8357 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8358 unsigned int mask_element;
8359 unsigned dr_group_size;
8360 machine_mode mode;
8362 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8363 dr_group_size = 1;
8364 else
8366 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8367 dr_group_size = DR_GROUP_SIZE (stmt_info);
8370 mode = TYPE_MODE (vectype);
8371 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8372 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8374 /* Initialize the vect stmts of NODE to properly insert the generated
8375 stmts later. */
8376 if (! analyze_only)
8377 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8378 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8380 /* Generate permutation masks for every NODE. Number of masks for each NODE
8381 is equal to GROUP_SIZE.
8382 E.g., we have a group of three nodes with three loads from the same
8383 location in each node, and the vector size is 4. I.e., we have an
8384 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8385 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8386 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8389 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8390 The last mask is illegal since we assume two operands for permute
8391 operation, and the mask element values can't be outside that range.
8392 Hence, the last mask must be converted into {2,5,5,5}.
8393 For the first two permutations we need the first and the second input
8394 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8395 we need the second and the third vectors: {b1,c1,a2,b2} and
8396 {c2,a3,b3,c3}. */
8398 int vect_stmts_counter = 0;
8399 unsigned int index = 0;
8400 int first_vec_index = -1;
8401 int second_vec_index = -1;
8402 bool noop_p = true;
8403 *n_perms = 0;
8405 vec_perm_builder mask;
8406 unsigned int nelts_to_build;
8407 unsigned int nvectors_per_build;
8408 unsigned int in_nlanes;
8409 bool repeating_p = (group_size == dr_group_size
8410 && multiple_p (nunits, group_size));
8411 if (repeating_p)
8413 /* A single vector contains a whole number of copies of the node, so:
8414 (a) all permutes can use the same mask; and
8415 (b) the permutes only need a single vector input. */
8416 mask.new_vector (nunits, group_size, 3);
8417 nelts_to_build = mask.encoded_nelts ();
8418 /* It's possible to obtain zero nstmts during analyze_only, so make
8419 it at least one to ensure the later computation for n_perms
8420 proceeds. */
8421 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8422 in_nlanes = dr_group_size * 3;
8424 else
8426 /* We need to construct a separate mask for each vector statement. */
8427 unsigned HOST_WIDE_INT const_nunits, const_vf;
8428 if (!nunits.is_constant (&const_nunits)
8429 || !vf.is_constant (&const_vf))
8430 return false;
8431 mask.new_vector (const_nunits, const_nunits, 1);
8432 nelts_to_build = const_vf * group_size;
8433 nvectors_per_build = 1;
8434 in_nlanes = const_vf * dr_group_size;
8436 auto_sbitmap used_in_lanes (in_nlanes);
8437 bitmap_clear (used_in_lanes);
8438 auto_bitmap used_defs;
8440 unsigned int count = mask.encoded_nelts ();
8441 mask.quick_grow (count);
8442 vec_perm_indices indices;
8444 for (unsigned int j = 0; j < nelts_to_build; j++)
8446 unsigned int iter_num = j / group_size;
8447 unsigned int stmt_num = j % group_size;
8448 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8449 bitmap_set_bit (used_in_lanes, i);
8450 if (repeating_p)
8452 first_vec_index = 0;
8453 mask_element = i;
8455 else
8457 /* Enforced before the loop when !repeating_p. */
8458 unsigned int const_nunits = nunits.to_constant ();
8459 vec_index = i / const_nunits;
8460 mask_element = i % const_nunits;
8461 if (vec_index == first_vec_index
8462 || first_vec_index == -1)
8464 first_vec_index = vec_index;
8466 else if (vec_index == second_vec_index
8467 || second_vec_index == -1)
8469 second_vec_index = vec_index;
8470 mask_element += const_nunits;
8472 else
8474 if (dump_p)
8475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8476 "permutation requires at "
8477 "least three vectors %G",
8478 stmt_info->stmt);
8479 gcc_assert (analyze_only);
8480 return false;
8483 gcc_assert (mask_element < 2 * const_nunits);
8486 if (mask_element != index)
8487 noop_p = false;
8488 mask[index++] = mask_element;
8490 if (index == count)
8492 if (!noop_p)
8494 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8495 if (!can_vec_perm_const_p (mode, mode, indices))
8497 if (dump_p)
8499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8500 "unsupported vect permute { ");
8501 for (i = 0; i < count; ++i)
8503 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8504 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8506 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8508 gcc_assert (analyze_only);
8509 return false;
8512 tree mask_vec = NULL_TREE;
8513 if (!analyze_only)
8514 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8516 if (second_vec_index == -1)
8517 second_vec_index = first_vec_index;
8519 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8521 ++*n_perms;
8522 if (analyze_only)
8523 continue;
8524 /* Generate the permute statement if necessary. */
8525 tree first_vec = dr_chain[first_vec_index + ri];
8526 tree second_vec = dr_chain[second_vec_index + ri];
8527 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8528 tree perm_dest
8529 = vect_create_destination_var (gimple_assign_lhs (stmt),
8530 vectype);
8531 perm_dest = make_ssa_name (perm_dest);
8532 gimple *perm_stmt
8533 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8534 second_vec, mask_vec);
8535 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8536 gsi);
8537 if (dce_chain)
8539 bitmap_set_bit (used_defs, first_vec_index + ri);
8540 bitmap_set_bit (used_defs, second_vec_index + ri);
8543 /* Store the vector statement in NODE. */
8544 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8547 else if (!analyze_only)
8549 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8551 tree first_vec = dr_chain[first_vec_index + ri];
8552 /* If mask was NULL_TREE generate the requested
8553 identity transform. */
8554 if (dce_chain)
8555 bitmap_set_bit (used_defs, first_vec_index + ri);
8557 /* Store the vector statement in NODE. */
8558 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8562 index = 0;
8563 first_vec_index = -1;
8564 second_vec_index = -1;
8565 noop_p = true;
8569 if (n_loads)
8571 if (repeating_p)
8572 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8573 else
8575 /* Enforced above when !repeating_p. */
8576 unsigned int const_nunits = nunits.to_constant ();
8577 *n_loads = 0;
8578 bool load_seen = false;
8579 for (unsigned i = 0; i < in_nlanes; ++i)
8581 if (i % const_nunits == 0)
8583 if (load_seen)
8584 *n_loads += 1;
8585 load_seen = false;
8587 if (bitmap_bit_p (used_in_lanes, i))
8588 load_seen = true;
8590 if (load_seen)
8591 *n_loads += 1;
8595 if (dce_chain)
8596 for (unsigned i = 0; i < dr_chain.length (); ++i)
8597 if (!bitmap_bit_p (used_defs, i))
8599 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8600 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8601 gsi_remove (&rgsi, true);
8602 release_defs (stmt);
8605 return true;
8608 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8609 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8610 permute statements for the SLP node NODE. Store the number of vector
8611 permute instructions in *N_PERMS and the number of vector load
8612 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8613 that were not needed. */
8615 bool
8616 vect_transform_slp_perm_load (vec_info *vinfo,
8617 slp_tree node, const vec<tree> &dr_chain,
8618 gimple_stmt_iterator *gsi, poly_uint64 vf,
8619 bool analyze_only, unsigned *n_perms,
8620 unsigned int *n_loads, bool dce_chain)
8622 return vect_transform_slp_perm_load_1 (vinfo, node,
8623 SLP_TREE_LOAD_PERMUTATION (node),
8624 dr_chain, gsi, vf, analyze_only,
8625 dump_enabled_p (), n_perms, n_loads,
8626 dce_chain);
8629 /* Produce the next vector result for SLP permutation NODE by adding a vector
8630 statement at GSI. If MASK_VEC is nonnull, add:
8632 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8634 otherwise add:
8636 <new SSA name> = FIRST_DEF. */
8638 static void
8639 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8640 slp_tree node, tree first_def, tree second_def,
8641 tree mask_vec, poly_uint64 identity_offset)
8643 tree vectype = SLP_TREE_VECTYPE (node);
8645 /* ??? We SLP match existing vector element extracts but
8646 allow punning which we need to re-instantiate at uses
8647 but have no good way of explicitly representing. */
8648 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8649 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8651 gassign *conv_stmt
8652 = gimple_build_assign (make_ssa_name (vectype),
8653 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8654 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8655 first_def = gimple_assign_lhs (conv_stmt);
8657 gassign *perm_stmt;
8658 tree perm_dest = make_ssa_name (vectype);
8659 if (mask_vec)
8661 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8662 TYPE_SIZE (vectype))
8663 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8665 gassign *conv_stmt
8666 = gimple_build_assign (make_ssa_name (vectype),
8667 build1 (VIEW_CONVERT_EXPR,
8668 vectype, second_def));
8669 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8670 second_def = gimple_assign_lhs (conv_stmt);
8672 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8673 first_def, second_def,
8674 mask_vec);
8676 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8678 /* For identity permutes we still need to handle the case
8679 of offsetted extracts or concats. */
8680 unsigned HOST_WIDE_INT c;
8681 auto first_def_nunits
8682 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8683 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8685 unsigned HOST_WIDE_INT elsz
8686 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8687 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8688 TYPE_SIZE (vectype),
8689 bitsize_int (identity_offset * elsz));
8690 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8692 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8693 first_def_nunits, &c) && c == 2)
8695 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8696 NULL_TREE, second_def);
8697 perm_stmt = gimple_build_assign (perm_dest, ctor);
8699 else
8700 gcc_unreachable ();
8702 else
8704 /* We need a copy here in case the def was external. */
8705 perm_stmt = gimple_build_assign (perm_dest, first_def);
8707 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8708 /* Store the vector statement in NODE. */
8709 node->push_vec_def (perm_stmt);
8712 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8713 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8714 If GSI is nonnull, emit the permutation there.
8716 When GSI is null, the only purpose of NODE is to give properties
8717 of the result, such as the vector type and number of SLP lanes.
8718 The node does not need to be a VEC_PERM_EXPR.
8720 If the target supports the operation, return the number of individual
8721 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8722 dump file if DUMP_P is true. */
8724 static int
8725 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8726 slp_tree node, lane_permutation_t &perm,
8727 vec<slp_tree> &children, bool dump_p)
8729 tree vectype = SLP_TREE_VECTYPE (node);
8731 /* ??? We currently only support all same vector input types
8732 while the SLP IL should really do a concat + select and thus accept
8733 arbitrary mismatches. */
8734 slp_tree child;
8735 unsigned i;
8736 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8737 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8738 tree op_vectype = NULL_TREE;
8739 FOR_EACH_VEC_ELT (children, i, child)
8740 if (SLP_TREE_VECTYPE (child))
8742 op_vectype = SLP_TREE_VECTYPE (child);
8743 break;
8745 if (!op_vectype)
8746 op_vectype = vectype;
8747 FOR_EACH_VEC_ELT (children, i, child)
8749 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8750 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8751 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8752 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8754 if (dump_p)
8755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8756 "Unsupported vector types in lane permutation\n");
8757 return -1;
8759 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8760 repeating_p = false;
8763 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8764 if (dump_p)
8766 dump_printf_loc (MSG_NOTE, vect_location,
8767 "vectorizing permutation");
8768 for (unsigned i = 0; i < perm.length (); ++i)
8769 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8770 if (repeating_p)
8771 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8772 dump_printf (MSG_NOTE, "\n");
8775 /* REPEATING_P is true if every output vector is guaranteed to use the
8776 same permute vector. We can handle that case for both variable-length
8777 and constant-length vectors, but we only handle other cases for
8778 constant-length vectors.
8780 Set:
8782 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8783 mask vector that we want to build.
8785 - NCOPIES to the number of copies of PERM that we need in order
8786 to build the necessary permute mask vectors.
8788 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8789 for each permute mask vector. This is only relevant when GSI is
8790 nonnull. */
8791 uint64_t npatterns;
8792 unsigned nelts_per_pattern;
8793 uint64_t ncopies;
8794 unsigned noutputs_per_mask;
8795 if (repeating_p)
8797 /* We need a single permute mask vector that has the form:
8799 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8801 In other words, the original n-element permute in PERM is
8802 "unrolled" to fill a full vector. The stepped vector encoding
8803 that we use for permutes requires 3n elements. */
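/* For example, a two-lane swap PERM = { op0[1], op0[0] } is encoded
   with NPATTERNS = 2 and NELTS_PER_PATTERN = 3 as
   { 1, 0, 3, 2, 5, 4 }, i.e. the unrolled mask swaps each adjacent
   pair of elements whatever the actual vector length.  */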
8804 npatterns = SLP_TREE_LANES (node);
8805 nelts_per_pattern = ncopies = 3;
8806 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8808 else
8810 /* Calculate every element of every permute mask vector explicitly,
8811 instead of relying on the pattern described above. */
8812 if (!nunits.is_constant (&npatterns))
8813 return -1;
8814 nelts_per_pattern = ncopies = 1;
8815 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8816 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8817 return -1;
8818 noutputs_per_mask = 1;
8820 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8821 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8823 /* Compute the { { SLP operand, vector index }, lane } permutation
8824 sequence from the { SLP operand, scalar lane } permutation as recorded
8825 in the SLP node as an intermediate step. This part should already work
8826 for SLP children with an arbitrary number of lanes. */
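/* For example, an eight-lane blend of two eight-lane children with
   V4SI vectors, PERM = { op0[0], op1[1], op0[2], op1[3], op0[4],
   op1[5], op0[6], op1[7] } and NCOPIES = 1, becomes
     vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3],
     vops0[1][0] vops1[1][1] vops0[1][2] vops1[1][3]
   where the middle index selects the vector within the operand and the
   last index the lane within that vector.  */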
8827 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8828 auto_vec<unsigned> active_lane;
8829 vperm.create (olanes);
8830 active_lane.safe_grow_cleared (children.length (), true);
8831 for (unsigned i = 0; i < ncopies; ++i)
8833 for (unsigned pi = 0; pi < perm.length (); ++pi)
8835 std::pair<unsigned, unsigned> p = perm[pi];
8836 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8837 if (repeating_p)
8838 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8839 else
8841 /* We checked above that the vectors are constant-length. */
8842 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8843 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8844 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8845 vperm.quick_push ({{p.first, vi}, vl});
8848 /* Advance to the next group. */
8849 for (unsigned j = 0; j < children.length (); ++j)
8850 active_lane[j] += SLP_TREE_LANES (children[j]);
8853 if (dump_p)
8855 dump_printf_loc (MSG_NOTE, vect_location,
8856 "vectorizing permutation");
8857 for (unsigned i = 0; i < perm.length (); ++i)
8858 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8859 if (repeating_p)
8860 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8861 dump_printf (MSG_NOTE, "\n");
8862 dump_printf_loc (MSG_NOTE, vect_location, "as");
8863 for (unsigned i = 0; i < vperm.length (); ++i)
8865 if (i != 0
8866 && (repeating_p
8867 ? multiple_p (i, npatterns)
8868 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8869 dump_printf (MSG_NOTE, ",");
8870 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8871 vperm[i].first.first, vperm[i].first.second,
8872 vperm[i].second);
8874 dump_printf (MSG_NOTE, "\n");
8877 /* We can only handle two-vector permutes; everything else should
8878 be lowered on the SLP level. The following is closely inspired
8879 by vect_transform_slp_perm_load and is supposed to eventually
8880 replace it.
8881 ??? As intermediate step do code-gen in the SLP tree representation
8882 somehow? */
8883 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8884 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8885 unsigned int index = 0;
8886 poly_uint64 mask_element;
8887 vec_perm_builder mask;
8888 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8889 unsigned int count = mask.encoded_nelts ();
8890 mask.quick_grow (count);
8891 vec_perm_indices indices;
8892 unsigned nperms = 0;
8893 for (unsigned i = 0; i < vperm.length (); ++i)
8895 mask_element = vperm[i].second;
8896 if (first_vec.first == -1U
8897 || first_vec == vperm[i].first)
8898 first_vec = vperm[i].first;
8899 else if (second_vec.first == -1U
8900 || second_vec == vperm[i].first)
8902 second_vec = vperm[i].first;
8903 mask_element += nunits;
8905 else
8907 if (dump_p)
8908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8909 "permutation requires at "
8910 "least three vectors\n");
8911 gcc_assert (!gsi);
8912 return -1;
8915 mask[index++] = mask_element;
8917 if (index == count)
8919 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8920 TYPE_VECTOR_SUBPARTS (op_vectype));
8921 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8922 && constant_multiple_p (mask[0], nunits));
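/* E.g. with NUNITS 4 a mask of { 4, 5, 6, 7 } counts as an identity:
   a linear series starting at a multiple of the vector length, which
   merely selects the second vector-sized chunk of the input.  */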
8923 machine_mode vmode = TYPE_MODE (vectype);
8924 machine_mode op_vmode = TYPE_MODE (op_vectype);
8925 unsigned HOST_WIDE_INT c;
8926 if ((!identity_p
8927 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8928 || (identity_p
8929 && !known_le (nunits,
8930 TYPE_VECTOR_SUBPARTS (op_vectype))
8931 && (!constant_multiple_p (nunits,
8932 TYPE_VECTOR_SUBPARTS (op_vectype),
8933 &c) || c != 2)))
8935 if (dump_p)
8937 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8938 vect_location,
8939 "unsupported vect permute { ");
8940 for (i = 0; i < count; ++i)
8942 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8943 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8945 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8947 gcc_assert (!gsi);
8948 return -1;
8951 if (!identity_p)
8952 nperms++;
8953 if (gsi)
8955 if (second_vec.first == -1U)
8956 second_vec = first_vec;
8958 slp_tree
8959 first_node = children[first_vec.first],
8960 second_node = children[second_vec.first];
8962 tree mask_vec = NULL_TREE;
8963 if (!identity_p)
8964 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8966 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8968 tree first_def
8969 = vect_get_slp_vect_def (first_node,
8970 first_vec.second + vi);
8971 tree second_def
8972 = vect_get_slp_vect_def (second_node,
8973 second_vec.second + vi);
8974 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8975 second_def, mask_vec, mask[0]);
8979 index = 0;
8980 first_vec = std::make_pair (-1U, -1U);
8981 second_vec = std::make_pair (-1U, -1U);
8985 return nperms;
8988 /* Vectorize the SLP permutations in NODE as specified
8989 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8990 child number and lane number.
8991 Interleaving of two two-lane two-child SLP subtrees (not supported):
8992 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8993 A blend of two four-lane two-child SLP subtrees:
8994 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8995 Highpart of a four-lane one-child SLP subtree (not supported):
8996 [ { 0, 2 }, { 0, 3 } ]
8997 Of these, currently only a subset is supported by the code generation below. */
8999 static bool
9000 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9001 slp_tree node, stmt_vector_for_cost *cost_vec)
9003 tree vectype = SLP_TREE_VECTYPE (node);
9004 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9005 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9006 SLP_TREE_CHILDREN (node),
9007 dump_enabled_p ());
9008 if (nperms < 0)
9009 return false;
9011 if (!gsi)
9012 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9014 return true;
9017 /* Vectorize SLP NODE. */
9019 static void
9020 vect_schedule_slp_node (vec_info *vinfo,
9021 slp_tree node, slp_instance instance)
9023 gimple_stmt_iterator si;
9024 int i;
9025 slp_tree child;
9027 /* For existing vectors there's nothing to do. */
9028 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9029 && SLP_TREE_VEC_DEFS (node).exists ())
9030 return;
9032 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9034 /* Vectorize externals and constants. */
9035 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9036 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9038 /* ??? vectorizable_shift can end up using a scalar operand which is
9039 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9040 node in this case. */
9041 if (!SLP_TREE_VECTYPE (node))
9042 return;
9044 vect_create_constant_vectors (vinfo, node);
9045 return;
9048 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9050 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9051 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9053 if (dump_enabled_p ())
9054 dump_printf_loc (MSG_NOTE, vect_location,
9055 "------>vectorizing SLP node starting from: %G",
9056 stmt_info->stmt);
9058 if (STMT_VINFO_DATA_REF (stmt_info)
9059 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9061 /* Vectorized loads go before the first scalar load to make the
9062 result available early; vectorized stores go before the last scalar
9063 stmt, which is where all uses are ready. */
9064 stmt_vec_info last_stmt_info = NULL;
9065 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9066 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9067 else /* DR_IS_WRITE */
9068 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9069 si = gsi_for_stmt (last_stmt_info->stmt);
9071 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9072 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9073 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9074 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9076 /* For PHI node vectorization we do not use the insertion iterator. */
9077 si = gsi_none ();
9079 else
9081 /* Emit other stmts after the children's vectorized defs, which is
9082 the earliest possible insertion point. */
9083 gimple *last_stmt = NULL;
9084 bool seen_vector_def = false;
9085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9086 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9088 /* For fold-left reductions we are retaining the scalar
9089 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9090 set so the representation isn't perfect. Resort to the
9091 last scalar def here. */
9092 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9094 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9095 == cycle_phi_info_type);
9096 gphi *phi = as_a <gphi *>
9097 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9098 if (!last_stmt
9099 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9100 last_stmt = phi;
9102 /* We are emitting all vectorized stmts in the same place, so
9103 the last vector def of a child is also the last one emitted.
9104 ??? Unless we have a load permutation applied and that
9105 happens to re-use an earlier generated load. */
9106 unsigned j;
9107 tree vdef;
9108 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9110 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9111 if (!last_stmt
9112 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9113 last_stmt = vstmt;
9116 else if (!SLP_TREE_VECTYPE (child))
9119 /* For externals left unvectorized we look at all scalar defs. */
9119 unsigned j;
9120 tree def;
9121 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9122 if (TREE_CODE (def) == SSA_NAME
9123 && !SSA_NAME_IS_DEFAULT_DEF (def))
9125 gimple *stmt = SSA_NAME_DEF_STMT (def);
9126 if (!last_stmt
9127 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9128 last_stmt = stmt;
9131 else
9133 /* For externals we have to look at all defs since their
9134 insertion place is decided per vector. But beware
9135 of pre-existing vectors where we need to make sure
9136 we do not insert before the region boundary. */
9137 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9138 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9139 seen_vector_def = true;
9140 else
9142 unsigned j;
9143 tree vdef;
9144 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9145 if (TREE_CODE (vdef) == SSA_NAME
9146 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9148 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9149 if (!last_stmt
9150 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9151 last_stmt = vstmt;
9155 /* This can happen when all children are pre-existing vectors or
9156 constants. */
9157 if (!last_stmt)
9158 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9159 if (!last_stmt)
9161 gcc_assert (seen_vector_def);
9162 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9164 else if (is_ctrl_altering_stmt (last_stmt))
9166 /* We split regions to vectorize at control-altering stmts
9167 with a definition, so this must be an external which
9168 we can insert at the start of the region. */
9169 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9171 else if (is_a <bb_vec_info> (vinfo)
9172 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9173 && gimple_could_trap_p (stmt_info->stmt))
9175 /* We've constrained possibly trapping operations to all come
9176 from the same basic-block; if vectorized defs would allow earlier
9177 scheduling, still force vectorized stmts to the original block.
9178 This is only necessary for BB vectorization: for loop vect all
9179 operations are in a single BB anyway, and scalar-stmt-based
9180 placement doesn't play well with epilogue vectorization. */
9181 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9182 gimple_bb (stmt_info->stmt),
9183 gimple_bb (last_stmt)));
9184 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9186 else if (is_a <gphi *> (last_stmt))
9187 si = gsi_after_labels (gimple_bb (last_stmt));
9188 else
9190 si = gsi_for_stmt (last_stmt);
9191 gsi_next (&si);
9195 /* Handle purely internal nodes. */
9196 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9198 /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
9199 be shared by different SLP nodes (but usually it's the same
9200 operation, apart from the case where the stmt is only there to denote
9201 the actual scalar lane defs ...). So do not call vect_transform_stmt
9202 but open-code it here (partly). */
9203 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9204 gcc_assert (done);
9205 stmt_vec_info slp_stmt_info;
9206 unsigned int i;
9207 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9208 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9210 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9211 instance, i, true, NULL);
9212 gcc_assert (done);
9215 else
9216 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9219 /* Replace the scalar calls in SLP node NODE with assignments setting their lhs to zero.
9220 For loop vectorization this is done in vectorizable_call, but for SLP
9221 it needs to be deferred until the end of vect_schedule_slp, because multiple
9222 SLP instances may refer to the same scalar stmt. */
9224 static void
9225 vect_remove_slp_scalar_calls (vec_info *vinfo,
9226 slp_tree node, hash_set<slp_tree> &visited)
9228 gimple *new_stmt;
9229 gimple_stmt_iterator gsi;
9230 int i;
9231 slp_tree child;
9232 tree lhs;
9233 stmt_vec_info stmt_info;
9235 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9236 return;
9238 if (visited.add (node))
9239 return;
9241 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9242 vect_remove_slp_scalar_calls (vinfo, child, visited);
9244 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9246 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9247 if (!stmt || gimple_bb (stmt) == NULL)
9248 continue;
9249 if (is_pattern_stmt_p (stmt_info)
9250 || !PURE_SLP_STMT (stmt_info))
9251 continue;
9252 lhs = gimple_call_lhs (stmt);
9253 if (lhs)
9254 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9255 else
9257 new_stmt = gimple_build_nop ();
9258 unlink_stmt_vdef (stmt_info->stmt);
9260 gsi = gsi_for_stmt (stmt);
9261 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9262 if (lhs)
9263 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9267 static void
9268 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9270 hash_set<slp_tree> visited;
9271 vect_remove_slp_scalar_calls (vinfo, node, visited);
9274 /* Vectorize the instance root. */
9276 void
9277 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9279 gassign *rstmt = NULL;
9281 if (instance->kind == slp_inst_kind_ctor)
9283 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9285 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9286 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9287 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9288 TREE_TYPE (vect_lhs)))
9289 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9290 vect_lhs);
9291 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9293 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9295 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9296 tree child_def;
9297 int j;
9298 vec<constructor_elt, va_gc> *v;
9299 vec_alloc (v, nelts);
9301 /* A CTOR can handle V16HI composition from VNx8HI so we
9302 do not need to convert vector elements if the types
9303 do not match. */
9304 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9305 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9306 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9307 tree rtype
9308 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9309 tree r_constructor = build_constructor (rtype, v);
9310 rstmt = gimple_build_assign (lhs, r_constructor);
9313 else if (instance->kind == slp_inst_kind_bb_reduc)
9315 /* Largely inspired by reduction chain epilogue handling in
9316 vect_create_epilog_for_reduction. */
9317 vec<tree> vec_defs = vNULL;
9318 vect_get_slp_defs (node, &vec_defs);
9319 enum tree_code reduc_code
9320 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9321 /* ??? We actually have to reflect signs somewhere. */
9322 if (reduc_code == MINUS_EXPR)
9323 reduc_code = PLUS_EXPR;
9324 gimple_seq epilogue = NULL;
9325 /* We may end up with more than one vector result; reduce them
9326 to one vector. */
9327 tree vec_def = vec_defs[0];
9328 tree vectype = TREE_TYPE (vec_def);
9329 tree compute_vectype = vectype;
9330 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9331 && TYPE_OVERFLOW_UNDEFINED (vectype)
9332 && operation_can_overflow (reduc_code));
9333 if (pun_for_overflow_p)
9335 compute_vectype = unsigned_type_for (vectype);
9336 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9337 compute_vectype, vec_def);
9339 for (unsigned i = 1; i < vec_defs.length (); ++i)
9341 tree def = vec_defs[i];
9342 if (pun_for_overflow_p)
9343 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9344 compute_vectype, def);
9345 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9346 vec_def, def);
9348 vec_defs.release ();
9349 /* ??? Support other schemes than direct internal fn. */
9350 internal_fn reduc_fn;
9351 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9352 || reduc_fn == IFN_LAST)
9353 gcc_unreachable ();
9354 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9355 TREE_TYPE (compute_vectype), vec_def);
9356 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9358 tree rem_def = NULL_TREE;
9359 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9361 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9362 if (!rem_def)
9363 rem_def = def;
9364 else
9365 rem_def = gimple_build (&epilogue, reduc_code,
9366 TREE_TYPE (scalar_def),
9367 rem_def, def);
9369 scalar_def = gimple_build (&epilogue, reduc_code,
9370 TREE_TYPE (scalar_def),
9371 scalar_def, rem_def);
9373 scalar_def = gimple_convert (&epilogue,
9374 TREE_TYPE (vectype), scalar_def);
9375 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9376 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9377 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9378 update_stmt (gsi_stmt (rgsi));
9379 return;
9381 else
9382 gcc_unreachable ();
9384 gcc_assert (rstmt);
9386 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9387 gsi_replace (&rgsi, rstmt, true);
9390 struct slp_scc_info
9392 bool on_stack;
9393 int dfs;
9394 int lowlink;
9397 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9399 static void
9400 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9401 hash_map<slp_tree, slp_scc_info> &scc_info,
9402 int &maxdfs, vec<slp_tree> &stack)
9404 bool existed_p;
9405 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9406 gcc_assert (!existed_p);
9407 info->dfs = maxdfs;
9408 info->lowlink = maxdfs;
9409 maxdfs++;
9411 /* Leaf. */
9412 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9414 info->on_stack = false;
9415 vect_schedule_slp_node (vinfo, node, instance);
9416 return;
9419 info->on_stack = true;
9420 stack.safe_push (node);
9422 unsigned i;
9423 slp_tree child;
9424 /* DFS recurse. */
9425 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9427 if (!child)
9428 continue;
9429 slp_scc_info *child_info = scc_info.get (child);
9430 if (!child_info)
9432 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9433 /* Recursion might have re-allocated the node. */
9434 info = scc_info.get (node);
9435 child_info = scc_info.get (child);
9436 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9438 else if (child_info->on_stack)
9439 info->lowlink = MIN (info->lowlink, child_info->dfs);
9441 if (info->lowlink != info->dfs)
9442 return;
9444 auto_vec<slp_tree, 4> phis_to_fixup;
9446 /* Singleton. */
9447 if (stack.last () == node)
9449 stack.pop ();
9450 info->on_stack = false;
9451 vect_schedule_slp_node (vinfo, node, instance);
9452 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9453 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9454 phis_to_fixup.quick_push (node);
9456 else
9458 /* SCC. */
9459 int last_idx = stack.length () - 1;
9460 while (stack[last_idx] != node)
9461 last_idx--;
9462 /* We can break the cycle at PHIs that have at least one child
9463 already code generated. Then we could re-start the DFS walk until
9464 all nodes in the SCC are covered (we might have new entries
9465 for only back-reachable nodes). But it's simpler to just
9466 iterate and schedule those that are ready. */
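/* In the loop below a non-PHI node is ready only when none of its
   children are still on the stack, while a PHI is ready as soon as one
   child is missing (a backedge value) or already code generated; the
   PHIs scheduled this way are what break the cycle.  */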
9467 unsigned todo = stack.length () - last_idx;
9470 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9472 slp_tree entry = stack[idx];
9473 if (!entry)
9474 continue;
9475 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9476 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9477 bool ready = !phi;
9478 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9479 if (!child)
9481 gcc_assert (phi);
9482 ready = true;
9483 break;
9485 else if (scc_info.get (child)->on_stack)
9487 if (!phi)
9489 ready = false;
9490 break;
9493 else
9495 if (phi)
9497 ready = true;
9498 break;
9501 if (ready)
9503 vect_schedule_slp_node (vinfo, entry, instance);
9504 scc_info.get (entry)->on_stack = false;
9505 stack[idx] = NULL;
9506 todo--;
9507 if (phi)
9508 phis_to_fixup.safe_push (entry);
9512 while (todo != 0);
9514 /* Pop the SCC. */
9515 stack.truncate (last_idx);
9518 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9519 slp_tree phi_node;
9520 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9522 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9523 edge_iterator ei;
9524 edge e;
9525 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9527 unsigned dest_idx = e->dest_idx;
9528 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9529 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9530 continue;
9531 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9532 /* Simply fill all args. */
9533 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9534 != vect_first_order_recurrence)
9535 for (unsigned i = 0; i < n; ++i)
9537 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9538 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9539 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9540 e, gimple_phi_arg_location (phi, dest_idx));
9542 else
9544 /* Unless it is a first-order recurrence, which needs
9545 args filled in for both the PHI node and the permutes. */
9546 gimple *perm
9547 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9548 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9549 add_phi_arg (as_a <gphi *> (rphi),
9550 vect_get_slp_vect_def (child, n - 1),
9551 e, gimple_phi_arg_location (phi, dest_idx));
9552 for (unsigned i = 0; i < n; ++i)
9554 gimple *perm
9555 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9556 if (i > 0)
9557 gimple_assign_set_rhs1 (perm,
9558 vect_get_slp_vect_def (child, i - 1));
9559 gimple_assign_set_rhs2 (perm,
9560 vect_get_slp_vect_def (child, i));
9561 update_stmt (perm);
9568 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9570 void
9571 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9573 slp_instance instance;
9574 unsigned int i;
9576 hash_map<slp_tree, slp_scc_info> scc_info;
9577 int maxdfs = 0;
9578 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9580 slp_tree node = SLP_INSTANCE_TREE (instance);
9581 if (dump_enabled_p ())
9583 dump_printf_loc (MSG_NOTE, vect_location,
9584 "Vectorizing SLP tree:\n");
9585 /* ??? Dump all? */
9586 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9587 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9588 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9589 vect_print_slp_graph (MSG_NOTE, vect_location,
9590 SLP_INSTANCE_TREE (instance));
9592 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9593 have a PHI be the node breaking the cycle. */
9594 auto_vec<slp_tree> stack;
9595 if (!scc_info.get (node))
9596 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9598 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9599 vectorize_slp_instance_root_stmt (node, instance);
9601 if (dump_enabled_p ())
9602 dump_printf_loc (MSG_NOTE, vect_location,
9603 "vectorizing stmts using SLP.\n");
9606 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9608 slp_tree root = SLP_INSTANCE_TREE (instance);
9609 stmt_vec_info store_info;
9610 unsigned int j;
9612 /* Remove scalar call stmts. Do not do this for basic-block
9613 vectorization as not all uses may be vectorized.
9614 ??? Why should this be necessary? DCE should be able to
9615 remove the stmts itself.
9616 ??? For BB vectorization we can as well remove scalar
9617 stmts starting from the SLP tree root if they have no
9618 uses. */
9619 if (is_a <loop_vec_info> (vinfo))
9620 vect_remove_slp_scalar_calls (vinfo, root);
9622 /* Remove the vectorized stores' original scalar stmts. */
9623 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9625 if (!STMT_VINFO_DATA_REF (store_info)
9626 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9627 break;
9629 store_info = vect_orig_stmt (store_info);
9630 /* Free the attached stmt_vec_info and remove the stmt. */
9631 vinfo->remove_stmt (store_info);
9633 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9634 to not crash in vect_free_slp_tree later. */
9635 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9636 SLP_TREE_REPRESENTATIVE (root) = NULL;