gcc/tree-vect-slp.cc

   1 /* SLP - Basic Block Vectorization
   2    Copyright (C) 2007-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #define INCLUDE_ALGORITHM
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "insn-config.h"
  35 #include "recog.h"              /* FIXME: for insn_data */
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "gimple-iterator.h"
  39 #include "cfgloop.h"
  40 #include "tree-vectorizer.h"
  41 #include "langhooks.h"
  42 #include "gimple-walk.h"
  43 #include "dbgcnt.h"
  44 #include "tree-vector-builder.h"
  45 #include "vec-perm-indices.h"
  46 #include "gimple-fold.h"
  47 #include "internal-fn.h"
  48 #include "dump-context.h"
  49 #include "cfganal.h"
  50 #include "tree-eh.h"
  51 #include "tree-cfg.h"
  52 #include "alloc-pool.h"
  53 #include "sreal.h"
  54 #include "predict.h"
  55
  56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
  57                                             load_permutation_t &,
  58                                             const vec<tree> &,
  59                                             gimple_stmt_iterator *,
  60                                             poly_uint64, bool, bool,
  61                                             unsigned *,
  62                                             unsigned * = nullptr,
  63                                             bool = false);
  64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
  65                                            slp_tree, lane_permutation_t &,
  66                                            vec<slp_tree> &, bool);
  67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
  68                                           slp_tree, stmt_vector_for_cost *);
  69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
  70
  71 static object_allocator<_slp_tree> *slp_tree_pool;
  72 static slp_tree slp_first_node;
  73
  74 void
  75 vect_slp_init (void)
  76 {
  77   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
  78 }
  79
  80 void
  81 vect_slp_fini (void)
  82 {
  83   while (slp_first_node)
  84     delete slp_first_node;
  85   delete slp_tree_pool;
  86   slp_tree_pool = NULL;
  87 }
  88
  89 void *
  90 _slp_tree::operator new (size_t n)
  91 {
  92   gcc_assert (n == sizeof (_slp_tree));
  93   return slp_tree_pool->allocate_raw ();
  94 }
  95
  96 void
  97 _slp_tree::operator delete (void *node, size_t n)
  98 {
  99   gcc_assert (n == sizeof (_slp_tree));
 100   slp_tree_pool->remove_raw (node);
 101 }
 102
 103
 104 /* Initialize a SLP node.  */
 105
 106 _slp_tree::_slp_tree ()
 107 {
 108   this->prev_node = NULL;
 109   if (slp_first_node)
 110     slp_first_node->prev_node = this;
 111   this->next_node = slp_first_node;
 112   slp_first_node = this;
 113   SLP_TREE_SCALAR_STMTS (this) = vNULL;
 114   SLP_TREE_SCALAR_OPS (this) = vNULL;
 115   SLP_TREE_VEC_DEFS (this) = vNULL;
 116   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
 117   SLP_TREE_CHILDREN (this) = vNULL;
 118   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
 119   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
 120   SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
 121   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
 122   SLP_TREE_CODE (this) = ERROR_MARK;
 123   SLP_TREE_VECTYPE (this) = NULL_TREE;
 124   SLP_TREE_REPRESENTATIVE (this) = NULL;
 125   SLP_TREE_REF_COUNT (this) = 1;
 126   this->failed = NULL;
 127   this->max_nunits = 1;
 128   this->lanes = 0;
 129 }
 130
 131 /* Tear down a SLP node.  */
 132
 133 _slp_tree::~_slp_tree ()
 134 {
 135   if (this->prev_node)
 136     this->prev_node->next_node = this->next_node;
 137   else
 138     slp_first_node = this->next_node;
 139   if (this->next_node)
 140     this->next_node->prev_node = this->prev_node;
 141   SLP_TREE_CHILDREN (this).release ();
 142   SLP_TREE_SCALAR_STMTS (this).release ();
 143   SLP_TREE_SCALAR_OPS (this).release ();
 144   SLP_TREE_VEC_DEFS (this).release ();
 145   SLP_TREE_LOAD_PERMUTATION (this).release ();
 146   SLP_TREE_LANE_PERMUTATION (this).release ();
 147   SLP_TREE_SIMD_CLONE_INFO (this).release ();
 148   if (this->failed)
 149     free (failed);
 150 }
 151
 152 /* Push the single SSA definition in DEF to the vector of vector defs.  */
 153
 154 void
 155 _slp_tree::push_vec_def (gimple *def)
 156 {
 157   if (gphi *phi = dyn_cast <gphi *> (def))
 158     vec_defs.quick_push (gimple_phi_result (phi));
 159   else
 160     {
 161       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
 162       vec_defs.quick_push (get_def_from_ptr (defop));
 163     }
 164 }
 165
 166 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
 167
 168 void
 169 vect_free_slp_tree (slp_tree node)
 170 {
 171   int i;
 172   slp_tree child;
 173
 174   if (--SLP_TREE_REF_COUNT (node) != 0)
 175     return;
 176
 177   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 178     if (child)
 179       vect_free_slp_tree (child);
 180
 181   /* If the node defines any SLP only patterns then those patterns are no
 182      longer valid and should be removed.  */
 183   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
 184   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
 185     {
 186       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
 187       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
 188       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
 189     }
 190
 191   delete node;
 192 }
 193
 194 /* Return a location suitable for dumpings related to the SLP instance.  */
 195
 196 dump_user_location_t
 197 _slp_instance::location () const
 198 {
 199   if (!root_stmts.is_empty ())
 200     return root_stmts[0]->stmt;
 201   else
 202     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
 203 }
 204
 205
 206 /* Free the memory allocated for the SLP instance.  */
 207
 208 void
 209 vect_free_slp_instance (slp_instance instance)
 210 {
 211   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
 212   SLP_INSTANCE_LOADS (instance).release ();
 213   SLP_INSTANCE_ROOT_STMTS (instance).release ();
 214   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
 215   instance->subgraph_entries.release ();
 216   instance->cost_vec.release ();
 217   free (instance);
 218 }
 219
 220
 221 /* Create an SLP node for SCALAR_STMTS.  */
 222
 223 slp_tree
 224 vect_create_new_slp_node (unsigned nops, tree_code code)
 225 {
 226   slp_tree node = new _slp_tree;
 227   SLP_TREE_SCALAR_STMTS (node) = vNULL;
 228   SLP_TREE_CHILDREN (node).create (nops);
 229   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 230   SLP_TREE_CODE (node) = code;
 231   return node;
 232 }
 233 /* Create an SLP node for SCALAR_STMTS.  */
 234
 235 static slp_tree
 236 vect_create_new_slp_node (slp_tree node,
 237                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
 238 {
 239   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
 240   SLP_TREE_CHILDREN (node).create (nops);
 241   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 242   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 243   SLP_TREE_LANES (node) = scalar_stmts.length ();
 244   return node;
 245 }
 246
 247 /* Create an SLP node for SCALAR_STMTS.  */
 248
 249 static slp_tree
 250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 251 {
 252   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
 253 }
 254
 255 /* Create an SLP node for OPS.  */
 256
 257 static slp_tree
 258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
 259 {
 260   SLP_TREE_SCALAR_OPS (node) = ops;
 261   SLP_TREE_DEF_TYPE (node) = vect_external_def;
 262   SLP_TREE_LANES (node) = ops.length ();
 263   return node;
 264 }
 265
 266 /* Create an SLP node for OPS.  */
 267
 268 static slp_tree
 269 vect_create_new_slp_node (vec<tree> ops)
 270 {
 271   return vect_create_new_slp_node (new _slp_tree, ops);
 272 }
 273
 274
 275 /* This structure is used in creation of an SLP tree.  Each instance
 276    corresponds to the same operand in a group of scalar stmts in an SLP
 277    node.  */
 278 typedef struct _slp_oprnd_info
 279 {
 280   /* Def-stmts for the operands.  */
 281   vec<stmt_vec_info> def_stmts;
 282   /* Operands.  */
 283   vec<tree> ops;
 284   /* Information about the first statement, its vector def-type, type, the
 285      operand itself in case it's constant, and an indication if it's a pattern
 286      stmt and gather/scatter info.  */
 287   tree first_op_type;
 288   enum vect_def_type first_dt;
 289   bool any_pattern;
 290   bool first_gs_p;
 291   gather_scatter_info first_gs_info;
 292 } *slp_oprnd_info;
 293
 294
 295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
 296    operand.  */
 297 static vec<slp_oprnd_info>
 298 vect_create_oprnd_info (int nops, int group_size)
 299 {
 300   int i;
 301   slp_oprnd_info oprnd_info;
 302   vec<slp_oprnd_info> oprnds_info;
 303
 304   oprnds_info.create (nops);
 305   for (i = 0; i < nops; i++)
 306     {
 307       oprnd_info = XNEW (struct _slp_oprnd_info);
 308       oprnd_info->def_stmts.create (group_size);
 309       oprnd_info->ops.create (group_size);
 310       oprnd_info->first_dt = vect_uninitialized_def;
 311       oprnd_info->first_op_type = NULL_TREE;
 312       oprnd_info->any_pattern = false;
 313       oprnd_info->first_gs_p = false;
 314       oprnds_info.quick_push (oprnd_info);
 315     }
 316
 317   return oprnds_info;
 318 }
 319
 320
 321 /* Free operands info.  */
 322
 323 static void
 324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 325 {
 326   int i;
 327   slp_oprnd_info oprnd_info;
 328
 329   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
 330     {
 331       oprnd_info->def_stmts.release ();
 332       oprnd_info->ops.release ();
 333       XDELETE (oprnd_info);
 334     }
 335
 336   oprnds_info.release ();
 337 }
 338
 339 /* Return the execution frequency of NODE (so that a higher value indicates
 340    a "more important" node when optimizing for speed).  */
 341
 342 static sreal
 343 vect_slp_node_weight (slp_tree node)
 344 {
 345   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
 346   basic_block bb = gimple_bb (stmt_info->stmt);
 347   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
 348 }
 349
 350 /* Return true if STMTS contains a pattern statement.  */
 351
 352 static bool
 353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
 354 {
 355   stmt_vec_info stmt_info;
 356   unsigned int i;
 357   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 358     if (is_pattern_stmt_p (stmt_info))
 359       return true;
 360   return false;
 361 }
 362
 363 /* Return true when all lanes in the external or constant NODE have
 364    the same value.  */
 365
 366 static bool
 367 vect_slp_tree_uniform_p (slp_tree node)
 368 {
 369   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 370               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 371
 372   /* Pre-exsting vectors.  */
 373   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
 374     return false;
 375
 376   unsigned i;
 377   tree op, first = NULL_TREE;
 378   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
 379     if (!first)
 380       first = op;
 381     else if (!operand_equal_p (first, op, 0))
 382       return false;
 383
 384   return true;
 385 }
 386
 387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
 388    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
 389    of the chain.  */
 390
 391 int
 392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 393                                       stmt_vec_info first_stmt_info)
 394 {
 395   stmt_vec_info next_stmt_info = first_stmt_info;
 396   int result = 0;
 397
 398   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
 399     return -1;
 400
 401   do
 402     {
 403       if (next_stmt_info == stmt_info)
 404         return result;
 405       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
 406       if (next_stmt_info)
 407         result += DR_GROUP_GAP (next_stmt_info);
 408     }
 409   while (next_stmt_info);
 410
 411   return -1;
 412 }
 413
 414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
 415    using the method implemented by duplicate_and_interleave.  Return true
 416    if so, returning the number of intermediate vectors in *NVECTORS_OUT
 417    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
 418    (if nonnull).  */
 419
 420 bool
 421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
 422                                 tree elt_type, unsigned int *nvectors_out,
 423                                 tree *vector_type_out,
 424                                 tree *permutes)
 425 {
 426   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
 427   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
 428     return false;
 429
 430   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
 431   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
 432   unsigned int nvectors = 1;
 433   for (;;)
 434     {
 435       scalar_int_mode int_mode;
 436       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
 437       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
 438         {
 439           /* Get the natural vector type for this SLP group size.  */
 440           tree int_type = build_nonstandard_integer_type
 441             (GET_MODE_BITSIZE (int_mode), 1);
 442           tree vector_type
 443             = get_vectype_for_scalar_type (vinfo, int_type, count);
 444           poly_int64 half_nelts;
 445           if (vector_type
 446               && VECTOR_MODE_P (TYPE_MODE (vector_type))
 447               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
 448                            GET_MODE_SIZE (base_vector_mode))
 449               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
 450                              2, &half_nelts))
 451             {
 452               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
 453                  together into elements of type INT_TYPE and using the result
 454                  to build NVECTORS vectors.  */
 455               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
 456               vec_perm_builder sel1 (nelts, 2, 3);
 457               vec_perm_builder sel2 (nelts, 2, 3);
 458
 459               for (unsigned int i = 0; i < 3; ++i)
 460                 {
 461                   sel1.quick_push (i);
 462                   sel1.quick_push (i + nelts);
 463                   sel2.quick_push (half_nelts + i);
 464                   sel2.quick_push (half_nelts + i + nelts);
 465                 }
 466               vec_perm_indices indices1 (sel1, 2, nelts);
 467               vec_perm_indices indices2 (sel2, 2, nelts);
 468               machine_mode vmode = TYPE_MODE (vector_type);
 469               if (can_vec_perm_const_p (vmode, vmode, indices1)
 470                   && can_vec_perm_const_p (vmode, vmode, indices2))
 471                 {
 472                   if (nvectors_out)
 473                     *nvectors_out = nvectors;
 474                   if (vector_type_out)
 475                     *vector_type_out = vector_type;
 476                   if (permutes)
 477                     {
 478                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
 479                                                                 indices1);
 480                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
 481                                                                 indices2);
 482                     }
 483                   return true;
 484                 }
 485             }
 486         }
 487       if (!multiple_p (elt_bytes, 2, &elt_bytes))
 488         return false;
 489       nvectors *= 2;
 490     }
 491 }
 492
 493 /* Return true if DTA and DTB match.  */
 494
 495 static bool
 496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
 497 {
 498   return (dta == dtb
 499           || ((dta == vect_external_def || dta == vect_constant_def)
 500               && (dtb == vect_external_def || dtb == vect_constant_def)));
 501 }
 502
 503 static const int cond_expr_maps[3][5] = {
 504   { 4, -1, -2, 1, 2 },
 505   { 4, -2, -1, 1, 2 },
 506   { 4, -1, -2, 2, 1 }
 507 };
 508 static const int arg0_map[] = { 1, 0 };
 509 static const int arg1_map[] = { 1, 1 };
 510 static const int arg2_map[] = { 1, 2 };
 511 static const int arg1_arg4_map[] = { 2, 1, 4 };
 512 static const int arg3_arg2_map[] = { 2, 3, 2 };
 513 static const int op1_op0_map[] = { 2, 1, 0 };
 514 static const int off_map[] = { 1, -3 };
 515 static const int off_op0_map[] = { 2, -3, 0 };
 516 static const int off_arg2_map[] = { 2, -3, 2 };
 517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
 518 static const int mask_call_maps[6][7] = {
 519   { 1, 1, },
 520   { 2, 1, 2, },
 521   { 3, 1, 2, 3, },
 522   { 4, 1, 2, 3, 4, },
 523   { 5, 1, 2, 3, 4, 5, },
 524   { 6, 1, 2, 3, 4, 5, 6 },
 525 };
 526
 527 /* For most SLP statements, there is a one-to-one mapping between
 528    gimple arguments and child nodes.  If that is not true for STMT,
 529    return an array that contains:
 530
 531    - the number of child nodes, followed by
 532    - for each child node, the index of the argument associated with that node.
 533      The special index -1 is the first operand of an embedded comparison and
 534      the special index -2 is the second operand of an embedded comparison.
 535      The special indes -3 is the offset of a gather as analyzed by
 536      vect_check_gather_scatter.
 537
 538    SWAP is as for vect_get_and_check_slp_defs.  */
 539
 540 static const int *
 541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
 542                       unsigned char swap = 0)
 543 {
 544   if (auto assign = dyn_cast<const gassign *> (stmt))
 545     {
 546       if (gimple_assign_rhs_code (assign) == COND_EXPR
 547           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
 548         return cond_expr_maps[swap];
 549       if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
 550           && swap)
 551         return op1_op0_map;
 552       if (gather_scatter_p)
 553         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
 554                 ? off_op0_map : off_map);
 555     }
 556   gcc_assert (!swap);
 557   if (auto call = dyn_cast<const gcall *> (stmt))
 558     {
 559       if (gimple_call_internal_p (call))
 560         switch (gimple_call_internal_fn (call))
 561           {
 562           case IFN_MASK_LOAD:
 563             return gather_scatter_p ? off_arg2_map : arg2_map;
 564
 565           case IFN_GATHER_LOAD:
 566             return arg1_map;
 567
 568           case IFN_MASK_GATHER_LOAD:
 569           case IFN_MASK_LEN_GATHER_LOAD:
 570             return arg1_arg4_map;
 571
 572           case IFN_MASK_STORE:
 573             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
 574
 575           case IFN_MASK_CALL:
 576             {
 577               unsigned nargs = gimple_call_num_args (call);
 578               if (nargs >= 2 && nargs <= 7)
 579                 return mask_call_maps[nargs-2];
 580               else
 581                 return nullptr;
 582             }
 583
 584           case IFN_CLZ:
 585           case IFN_CTZ:
 586             return arg0_map;
 587
 588           default:
 589             break;
 590           }
 591     }
 592   return nullptr;
 593 }
 594
 595 /* Return the SLP node child index for operand OP of STMT.  */
 596
 597 int
 598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
 599                                   bool gather_scatter_p)
 600 {
 601   const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
 602   if (!opmap)
 603     return op;
 604   for (int i = 1; i < 1 + opmap[0]; ++i)
 605     if (opmap[i] == op)
 606       return i - 1;
 607   gcc_unreachable ();
 608 }
 609
 610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
 611    they are of a valid type and that they match the defs of the first stmt of
 612    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
 613    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
 614    indicates swap is required for cond_expr stmts.  Specifically, SWAP
 615    is 1 if STMT is cond and operands of comparison need to be swapped;
 616    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
 617
 618    If there was a fatal error return -1; if the error could be corrected by
 619    swapping operands of father node of this one, return 1; if everything is
 620    ok return 0.  */
 621 static int
 622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 623                              bool *skip_args,
 624                              vec<stmt_vec_info> stmts, unsigned stmt_num,
 625                              vec<slp_oprnd_info> *oprnds_info)
 626 {
 627   stmt_vec_info stmt_info = stmts[stmt_num];
 628   tree oprnd;
 629   unsigned int i, number_of_oprnds;
 630   enum vect_def_type dt = vect_uninitialized_def;
 631   slp_oprnd_info oprnd_info;
 632   gather_scatter_info gs_info;
 633   unsigned int gs_op = -1u;
 634   unsigned int commutative_op = -1U;
 635   bool first = stmt_num == 0;
 636
 637   if (!is_a<gcall *> (stmt_info->stmt)
 638       && !is_a<gassign *> (stmt_info->stmt)
 639       && !is_a<gphi *> (stmt_info->stmt))
 640     return -1;
 641
 642   number_of_oprnds = gimple_num_args (stmt_info->stmt);
 643   const int *map
 644     = vect_get_operand_map (stmt_info->stmt,
 645                             STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
 646   if (map)
 647     number_of_oprnds = *map++;
 648   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 649     {
 650       if (gimple_call_internal_p (stmt))
 651         {
 652           internal_fn ifn = gimple_call_internal_fn (stmt);
 653           commutative_op = first_commutative_argument (ifn);
 654         }
 655     }
 656   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
 657     {
 658       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
 659         commutative_op = 0;
 660     }
 661
 662   bool swapped = (swap != 0);
 663   bool backedge = false;
 664   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
 665   for (i = 0; i < number_of_oprnds; i++)
 666     {
 667       oprnd_info = (*oprnds_info)[i];
 668       int opno = map ? map[i] : int (i);
 669       if (opno == -3)
 670         {
 671           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 672           if (!is_a <loop_vec_info> (vinfo)
 673               || !vect_check_gather_scatter (stmt_info,
 674                                              as_a <loop_vec_info> (vinfo),
 675                                              first ? &oprnd_info->first_gs_info
 676                                              : &gs_info))
 677             return -1;
 678
 679           if (first)
 680             {
 681               oprnd_info->first_gs_p = true;
 682               oprnd = oprnd_info->first_gs_info.offset;
 683             }
 684           else
 685             {
 686               gs_op = i;
 687               oprnd = gs_info.offset;
 688             }
 689         }
 690       else if (opno < 0)
 691         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
 692       else
 693         {
 694           oprnd = gimple_arg (stmt_info->stmt, opno);
 695           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
 696             {
 697               edge e = gimple_phi_arg_edge (stmt, opno);
 698               backedge = (is_a <bb_vec_info> (vinfo)
 699                           ? e->flags & EDGE_DFS_BACK
 700                           : dominated_by_p (CDI_DOMINATORS, e->src,
 701                                             gimple_bb (stmt_info->stmt)));
 702             }
 703         }
 704       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
 705         oprnd = TREE_OPERAND (oprnd, 0);
 706
 707       stmt_vec_info def_stmt_info;
 708       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
 709         {
 710           if (dump_enabled_p ())
 711             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 712                              "Build SLP failed: can't analyze def for %T\n",
 713                              oprnd);
 714
 715           return -1;
 716         }
 717
 718       if (skip_args[i])
 719         {
 720           oprnd_info->def_stmts.quick_push (NULL);
 721           oprnd_info->ops.quick_push (NULL_TREE);
 722           oprnd_info->first_dt = vect_uninitialized_def;
 723           continue;
 724         }
 725
 726       oprnd_info->def_stmts.quick_push (def_stmt_info);
 727       oprnd_info->ops.quick_push (oprnd);
 728
 729       if (def_stmt_info
 730           && is_pattern_stmt_p (def_stmt_info))
 731         {
 732           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
 733               != def_stmt_info)
 734             oprnd_info->any_pattern = true;
 735           else
 736             /* If we promote this to external use the original stmt def.  */
 737             oprnd_info->ops.last ()
 738               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 739         }
 740
 741       /* If there's a extern def on a backedge make sure we can
 742          code-generate at the region start.
 743          ???  This is another case that could be fixed by adjusting
 744          how we split the function but at the moment we'd have conflicting
 745          goals there.  */
 746       if (backedge
 747           && dts[i] == vect_external_def
 748           && is_a <bb_vec_info> (vinfo)
 749           && TREE_CODE (oprnd) == SSA_NAME
 750           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
 751           && !dominated_by_p (CDI_DOMINATORS,
 752                               as_a <bb_vec_info> (vinfo)->bbs[0],
 753                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
 754         {
 755           if (dump_enabled_p ())
 756             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                              "Build SLP failed: extern def %T only defined "
 758                              "on backedge\n", oprnd);
 759           return -1;
 760         }
 761
 762       if (first)
 763         {
 764           tree type = TREE_TYPE (oprnd);
 765           dt = dts[i];
 766
 767           /* For the swapping logic below force vect_reduction_def
 768              for the reduction op in a SLP reduction group.  */
 769           if (!STMT_VINFO_DATA_REF (stmt_info)
 770               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 771               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
 772               && def_stmt_info)
 773             dts[i] = dt = vect_reduction_def;
 774
 775           /* Check the types of the definition.  */
 776           switch (dt)
 777             {
 778             case vect_external_def:
 779             case vect_constant_def:
 780             case vect_internal_def:
 781             case vect_reduction_def:
 782             case vect_induction_def:
 783             case vect_nested_cycle:
 784             case vect_first_order_recurrence:
 785               break;
 786
 787             default:
 788               /* FORNOW: Not supported.  */
 789               if (dump_enabled_p ())
 790                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 791                                  "Build SLP failed: illegal type of def %T\n",
 792                                  oprnd);
 793               return -1;
 794             }
 795
 796           oprnd_info->first_dt = dt;
 797           oprnd_info->first_op_type = type;
 798         }
 799     }
 800   if (first)
 801     return 0;
 802
 803   /* Now match the operand definition types to that of the first stmt.  */
 804   for (i = 0; i < number_of_oprnds;)
 805     {
 806       if (skip_args[i])
 807         {
 808           ++i;
 809           continue;
 810         }
 811
 812       oprnd_info = (*oprnds_info)[i];
 813       dt = dts[i];
 814       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
 815       oprnd = oprnd_info->ops[stmt_num];
 816       tree type = TREE_TYPE (oprnd);
 817
 818       if (!types_compatible_p (oprnd_info->first_op_type, type))
 819         {
 820           if (dump_enabled_p ())
 821             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 822                              "Build SLP failed: different operand types\n");
 823           return 1;
 824         }
 825
 826       if ((gs_op == i) != oprnd_info->first_gs_p)
 827         {
 828           if (dump_enabled_p ())
 829             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 830                              "Build SLP failed: mixed gather and non-gather\n");
 831           return 1;
 832         }
 833       else if (gs_op == i)
 834         {
 835           if (!operand_equal_p (oprnd_info->first_gs_info.base,
 836                                 gs_info.base))
 837             {
 838               if (dump_enabled_p ())
 839                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 840                                  "Build SLP failed: different gather base\n");
 841               return 1;
 842             }
 843           if (oprnd_info->first_gs_info.scale != gs_info.scale)
 844             {
 845               if (dump_enabled_p ())
 846                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 847                                  "Build SLP failed: different gather scale\n");
 848               return 1;
 849             }
 850         }
 851
 852       /* Not first stmt of the group, check that the def-stmt/s match
 853          the def-stmt/s of the first stmt.  Allow different definition
 854          types for reduction chains: the first stmt must be a
 855          vect_reduction_def (a phi node), and the rest
 856          end in the reduction chain.  */
 857       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
 858            && !(oprnd_info->first_dt == vect_reduction_def
 859                 && !STMT_VINFO_DATA_REF (stmt_info)
 860                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 861                 && def_stmt_info
 862                 && !STMT_VINFO_DATA_REF (def_stmt_info)
 863                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 864                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
 865           || (!STMT_VINFO_DATA_REF (stmt_info)
 866               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 867               && ((!def_stmt_info
 868                    || STMT_VINFO_DATA_REF (def_stmt_info)
 869                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 870                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 871                   != (oprnd_info->first_dt != vect_reduction_def))))
 872         {
 873           /* Try swapping operands if we got a mismatch.  For BB
 874              vectorization only in case it will clearly improve things.  */
 875           if (i == commutative_op && !swapped
 876               && (!is_a <bb_vec_info> (vinfo)
 877                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
 878                                              dts[i+1])
 879                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
 880                           || vect_def_types_match
 881                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
 882             {
 883               if (dump_enabled_p ())
 884                 dump_printf_loc (MSG_NOTE, vect_location,
 885                                  "trying swapped operands\n");
 886               std::swap (dts[i], dts[i+1]);
 887               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
 888                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
 889               std::swap ((*oprnds_info)[i]->ops[stmt_num],
 890                          (*oprnds_info)[i+1]->ops[stmt_num]);
 891               swapped = true;
 892               continue;
 893             }
 894
 895           if (is_a <bb_vec_info> (vinfo)
 896               && !oprnd_info->any_pattern)
 897             {
 898               /* Now for commutative ops we should see whether we can
 899                  make the other operand matching.  */
 900               if (dump_enabled_p ())
 901                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 902                                  "treating operand as external\n");
 903               oprnd_info->first_dt = dt = vect_external_def;
 904             }
 905           else
 906             {
 907               if (dump_enabled_p ())
 908                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 909                                  "Build SLP failed: different types\n");
 910               return 1;
 911             }
 912         }
 913
 914       /* Make sure to demote the overall operand to external.  */
 915       if (dt == vect_external_def)
 916         oprnd_info->first_dt = vect_external_def;
 917       /* For a SLP reduction chain we want to duplicate the reduction to
 918          each of the chain members.  That gets us a sane SLP graph (still
 919          the stmts are not 100% correct wrt the initial values).  */
 920       else if ((dt == vect_internal_def
 921                 || dt == vect_reduction_def)
 922                && oprnd_info->first_dt == vect_reduction_def
 923                && !STMT_VINFO_DATA_REF (stmt_info)
 924                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 925                && !STMT_VINFO_DATA_REF (def_stmt_info)
 926                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 927                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 928         {
 929           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
 930           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
 931         }
 932
 933       ++i;
 934     }
 935
 936   /* Swap operands.  */
 937   if (swapped)
 938     {
 939       if (dump_enabled_p ())
 940         dump_printf_loc (MSG_NOTE, vect_location,
 941                          "swapped operands to match def types in %G",
 942                          stmt_info->stmt);
 943     }
 944
 945   return 0;
 946 }
 947
 948 /* Return true if call statements CALL1 and CALL2 are similar enough
 949    to be combined into the same SLP group.  */
 950
 951 bool
 952 compatible_calls_p (gcall *call1, gcall *call2)
 953 {
 954   unsigned int nargs = gimple_call_num_args (call1);
 955   if (nargs != gimple_call_num_args (call2))
 956     return false;
 957
 958   if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
 959     return false;
 960
 961   if (gimple_call_internal_p (call1))
 962     {
 963       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
 964                                TREE_TYPE (gimple_call_lhs (call2))))
 965         return false;
 966       for (unsigned int i = 0; i < nargs; ++i)
 967         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
 968                                  TREE_TYPE (gimple_call_arg (call2, i))))
 969           return false;
 970     }
 971   else
 972     {
 973       if (!operand_equal_p (gimple_call_fn (call1),
 974                             gimple_call_fn (call2), 0))
 975         return false;
 976
 977       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
 978         return false;
 979     }
 980
 981   /* Check that any unvectorized arguments are equal.  */
 982   if (const int *map = vect_get_operand_map (call1))
 983     {
 984       unsigned int nkept = *map++;
 985       unsigned int mapi = 0;
 986       for (unsigned int i = 0; i < nargs; ++i)
 987         if (mapi < nkept && map[mapi] == int (i))
 988           mapi += 1;
 989         else if (!operand_equal_p (gimple_call_arg (call1, i),
 990                                    gimple_call_arg (call2, i)))
 991           return false;
 992     }
 993
 994   return true;
 995 }
 996
 997 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
 998    caller's attempt to find the vector type in STMT_INFO with the narrowest
 999    element type.  Return true if VECTYPE is nonnull and if it is valid
1000    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
1001    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
1002    vect_build_slp_tree.  */
1003
1004 static bool
1005 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006                         unsigned int group_size,
1007                         tree vectype, poly_uint64 *max_nunits)
1008 {
1009   if (!vectype)
1010     {
1011       if (dump_enabled_p ())
1012         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013                          "Build SLP failed: unsupported data-type in %G\n",
1014                          stmt_info->stmt);
1015       /* Fatal mismatch.  */
1016       return false;
1017     }
1018
1019   /* If populating the vector type requires unrolling then fail
1020      before adjusting *max_nunits for basic-block vectorization.  */
1021   if (is_a <bb_vec_info> (vinfo)
1022       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1023     {
1024       if (dump_enabled_p ())
1025         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026                          "Build SLP failed: unrolling required "
1027                          "in basic block SLP\n");
1028       /* Fatal mismatch.  */
1029       return false;
1030     }
1031
1032   /* In case of multiple types we need to detect the smallest type.  */
1033   vect_update_max_nunits (max_nunits, vectype);
1034   return true;
1035 }
1036
1037 /* Verify if the scalar stmts STMTS are isomorphic, require data
1038    permutation or are of unsupported types of operation.  Return
1039    true if they are, otherwise return false and indicate in *MATCHES
1040    which stmts are not isomorphic to the first one.  If MATCHES[0]
1041    is false then this indicates the comparison could not be
1042    carried out or the stmts will never be vectorized by SLP.
1043
1044    Note COND_EXPR is possibly isomorphic to another one after swapping its
1045    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1046    the first stmt by swapping the two operands of comparison; set SWAP[i]
1047    to 2 if stmt I is isormorphic to the first stmt by inverting the code
1048    of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1049    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
1050
1051 static bool
1052 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1053                        vec<stmt_vec_info> stmts, unsigned int group_size,
1054                        poly_uint64 *max_nunits, bool *matches,
1055                        bool *two_operators, tree *node_vectype)
1056 {
1057   unsigned int i;
1058   stmt_vec_info first_stmt_info = stmts[0];
1059   code_helper first_stmt_code = ERROR_MARK;
1060   code_helper alt_stmt_code = ERROR_MARK;
1061   code_helper rhs_code = ERROR_MARK;
1062   code_helper first_cond_code = ERROR_MARK;
1063   tree lhs;
1064   bool need_same_oprnds = false;
1065   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1066   stmt_vec_info first_load = NULL, prev_first_load = NULL;
1067   bool first_stmt_ldst_p = false, ldst_p = false;
1068   bool first_stmt_phi_p = false, phi_p = false;
1069   bool maybe_soft_fail = false;
1070   tree soft_fail_nunits_vectype = NULL_TREE;
1071
1072   /* For every stmt in NODE find its def stmt/s.  */
1073   stmt_vec_info stmt_info;
1074   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1075     {
1076       gimple *stmt = stmt_info->stmt;
1077       swap[i] = 0;
1078       matches[i] = false;
1079
1080       if (dump_enabled_p ())
1081         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1082
1083       /* Fail to vectorize statements marked as unvectorizable, throw
1084          or are volatile.  */
1085       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1086           || stmt_can_throw_internal (cfun, stmt)
1087           || gimple_has_volatile_ops (stmt))
1088         {
1089           if (dump_enabled_p ())
1090             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091                              "Build SLP failed: unvectorizable statement %G",
1092                              stmt);
1093           /* ???  For BB vectorization we want to commutate operands in a way
1094              to shuffle all unvectorizable defs into one operand and have
1095              the other still vectorized.  The following doesn't reliably
1096              work for this though but it's the easiest we can do here.  */
1097           if (is_a <bb_vec_info> (vinfo) && i != 0)
1098             continue;
1099           /* Fatal mismatch.  */
1100           matches[0] = false;
1101           return false;
1102         }
1103
1104       gcall *call_stmt = dyn_cast <gcall *> (stmt);
1105       lhs = gimple_get_lhs (stmt);
1106       if (lhs == NULL_TREE
1107           && (!call_stmt
1108               || !gimple_call_internal_p (stmt)
1109               || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1110         {
1111           if (dump_enabled_p ())
1112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113                              "Build SLP failed: not GIMPLE_ASSIGN nor "
1114                              "GIMPLE_CALL %G", stmt);
1115           if (is_a <bb_vec_info> (vinfo) && i != 0)
1116             continue;
1117           /* Fatal mismatch.  */
1118           matches[0] = false;
1119           return false;
1120         }
1121
1122       tree nunits_vectype;
1123       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1124                                            &nunits_vectype, group_size))
1125         {
1126           if (is_a <bb_vec_info> (vinfo) && i != 0)
1127             continue;
1128           /* Fatal mismatch.  */
1129           matches[0] = false;
1130           return false;
1131         }
1132       /* Record nunits required but continue analysis, producing matches[]
1133          as if nunits was not an issue.  This allows splitting of groups
1134          to happen.  */
1135       if (nunits_vectype
1136           && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1137                                       nunits_vectype, max_nunits))
1138         {
1139           gcc_assert (is_a <bb_vec_info> (vinfo));
1140           maybe_soft_fail = true;
1141           soft_fail_nunits_vectype = nunits_vectype;
1142         }
1143
1144       gcc_assert (vectype);
1145
1146       if (call_stmt)
1147         {
1148           combined_fn cfn = gimple_call_combined_fn (call_stmt);
1149           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1150             rhs_code = cfn;
1151           else
1152             rhs_code = CALL_EXPR;
1153
1154           if (cfn == CFN_MASK_LOAD
1155               || cfn == CFN_GATHER_LOAD
1156               || cfn == CFN_MASK_GATHER_LOAD
1157               || cfn == CFN_MASK_LEN_GATHER_LOAD)
1158             ldst_p = true;
1159           else if (cfn == CFN_MASK_STORE)
1160             {
1161               ldst_p = true;
1162               rhs_code = CFN_MASK_STORE;
1163             }
1164           else if ((cfn != CFN_LAST
1165                     && cfn != CFN_MASK_CALL
1166                     && internal_fn_p (cfn)
1167                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1168                    || gimple_call_tail_p (call_stmt)
1169                    || gimple_call_noreturn_p (call_stmt)
1170                    || gimple_call_chain (call_stmt))
1171             {
1172               if (dump_enabled_p ())
1173                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174                                  "Build SLP failed: unsupported call type %G",
1175                                  (gimple *) call_stmt);
1176               if (is_a <bb_vec_info> (vinfo) && i != 0)
1177                 continue;
1178               /* Fatal mismatch.  */
1179               matches[0] = false;
1180               return false;
1181             }
1182         }
1183       else if (gimple_code (stmt) == GIMPLE_PHI)
1184         {
1185           rhs_code = ERROR_MARK;
1186           phi_p = true;
1187         }
1188       else
1189         {
1190           rhs_code = gimple_assign_rhs_code (stmt);
1191           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1192         }
1193
1194       /* Check the operation.  */
1195       if (i == 0)
1196         {
1197           *node_vectype = vectype;
1198           first_stmt_code = rhs_code;
1199           first_stmt_ldst_p = ldst_p;
1200           first_stmt_phi_p = phi_p;
1201
1202           /* Shift arguments should be equal in all the packed stmts for a
1203              vector shift with scalar shift operand.  */
1204           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1205               || rhs_code == LROTATE_EXPR
1206               || rhs_code == RROTATE_EXPR)
1207             {
1208               /* First see if we have a vector/vector shift.  */
1209               if (!directly_supported_p (rhs_code, vectype, optab_vector))
1210                 {
1211                   /* No vector/vector shift, try for a vector/scalar shift.  */
1212                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1213                     {
1214                       if (dump_enabled_p ())
1215                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216                                          "Build SLP failed: "
1217                                          "op not supported by target.\n");
1218                       if (is_a <bb_vec_info> (vinfo) && i != 0)
1219                         continue;
1220                       /* Fatal mismatch.  */
1221                       matches[0] = false;
1222                       return false;
1223                     }
1224                   need_same_oprnds = true;
1225                   first_op1 = gimple_assign_rhs2 (stmt);
1226                 }
1227             }
1228           else if (rhs_code == WIDEN_LSHIFT_EXPR)
1229             {
1230               need_same_oprnds = true;
1231               first_op1 = gimple_assign_rhs2 (stmt);
1232             }
1233           else if (!ldst_p
1234                    && rhs_code == BIT_FIELD_REF)
1235             {
1236               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1237               if (!is_a <bb_vec_info> (vinfo)
1238                   || TREE_CODE (vec) != SSA_NAME
1239                   /* When the element types are not compatible we pun the
1240                      source to the target vectype which requires equal size.  */
1241                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1242                        || !types_compatible_p (TREE_TYPE (vectype),
1243                                                TREE_TYPE (TREE_TYPE (vec))))
1244                       && !operand_equal_p (TYPE_SIZE (vectype),
1245                                            TYPE_SIZE (TREE_TYPE (vec)))))
1246                 {
1247                   if (dump_enabled_p ())
1248                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                                      "Build SLP failed: "
1250                                      "BIT_FIELD_REF not supported\n");
1251                   /* Fatal mismatch.  */
1252                   matches[0] = false;
1253                   return false;
1254                 }
1255             }
1256           else if (rhs_code == CFN_DIV_POW2)
1257             {
1258               need_same_oprnds = true;
1259               first_op1 = gimple_call_arg (call_stmt, 1);
1260             }
1261         }
1262       else
1263         {
1264           if (first_stmt_code != rhs_code
1265               && alt_stmt_code == ERROR_MARK)
1266             alt_stmt_code = rhs_code;
1267           if ((first_stmt_code != rhs_code
1268                && (first_stmt_code != IMAGPART_EXPR
1269                    || rhs_code != REALPART_EXPR)
1270                && (first_stmt_code != REALPART_EXPR
1271                    || rhs_code != IMAGPART_EXPR)
1272                /* Handle mismatches in plus/minus by computing both
1273                   and merging the results.  */
1274                && !((first_stmt_code == PLUS_EXPR
1275                      || first_stmt_code == MINUS_EXPR)
1276                     && (alt_stmt_code == PLUS_EXPR
1277                         || alt_stmt_code == MINUS_EXPR)
1278                     && rhs_code == alt_stmt_code)
1279                && !(first_stmt_code.is_tree_code ()
1280                     && rhs_code.is_tree_code ()
1281                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1282                         == tcc_comparison)
1283                     && (swap_tree_comparison (tree_code (first_stmt_code))
1284                         == tree_code (rhs_code)))
1285                && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1286                     && (first_stmt_code == ARRAY_REF
1287                         || first_stmt_code == BIT_FIELD_REF
1288                         || first_stmt_code == INDIRECT_REF
1289                         || first_stmt_code == COMPONENT_REF
1290                         || first_stmt_code == MEM_REF)
1291                     && (rhs_code == ARRAY_REF
1292                         || rhs_code == BIT_FIELD_REF
1293                         || rhs_code == INDIRECT_REF
1294                         || rhs_code == COMPONENT_REF
1295                         || rhs_code == MEM_REF)))
1296               || (ldst_p
1297                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1299               || (ldst_p
1300                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1302               || first_stmt_ldst_p != ldst_p
1303               || first_stmt_phi_p != phi_p)
1304             {
1305               if (dump_enabled_p ())
1306                 {
1307                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308                                    "Build SLP failed: different operation "
1309                                    "in stmt %G", stmt);
1310                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311                                    "original stmt %G", first_stmt_info->stmt);
1312                 }
1313               /* Mismatch.  */
1314               continue;
1315             }
1316
1317           if (!ldst_p
1318               && first_stmt_code == BIT_FIELD_REF
1319               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1320                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1321             {
1322               if (dump_enabled_p ())
1323                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324                                  "Build SLP failed: different BIT_FIELD_REF "
1325                                  "arguments in %G", stmt);
1326               /* Mismatch.  */
1327               continue;
1328             }
1329
1330           if (call_stmt
1331               && first_stmt_code != CFN_MASK_LOAD
1332               && first_stmt_code != CFN_MASK_STORE)
1333             {
1334               if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1335                                        call_stmt))
1336                 {
1337                   if (dump_enabled_p ())
1338                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339                                      "Build SLP failed: different calls in %G",
1340                                      stmt);
1341                   /* Mismatch.  */
1342                   continue;
1343                 }
1344             }
1345
1346           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1347               && (gimple_bb (first_stmt_info->stmt)
1348                   != gimple_bb (stmt_info->stmt)))
1349             {
1350               if (dump_enabled_p ())
1351                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352                                  "Build SLP failed: different BB for PHI "
1353                                  "or possibly trapping operation in %G", stmt);
1354               /* Mismatch.  */
1355               continue;
1356             }
1357
1358           if (need_same_oprnds)
1359             {
1360               tree other_op1 = gimple_arg (stmt, 1);
1361               if (!operand_equal_p (first_op1, other_op1, 0))
1362                 {
1363                   if (dump_enabled_p ())
1364                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365                                      "Build SLP failed: different shift "
1366                                      "arguments in %G", stmt);
1367                   /* Mismatch.  */
1368                   continue;
1369                 }
1370             }
1371
1372           if (!types_compatible_p (vectype, *node_vectype))
1373             {
1374               if (dump_enabled_p ())
1375                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376                                  "Build SLP failed: different vector type "
1377                                  "in %G", stmt);
1378               /* Mismatch.  */
1379               continue;
1380             }
1381         }
1382
1383       /* Grouped store or load.  */
1384       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1385         {
1386           gcc_assert (ldst_p);
1387           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1388             {
1389               /* Store.  */
1390               gcc_assert (rhs_code == CFN_MASK_STORE
1391                           || REFERENCE_CLASS_P (lhs)
1392                           || DECL_P (lhs));
1393             }
1394           else
1395             {
1396               /* Load.  */
1397               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1398               if (prev_first_load)
1399                 {
1400                   /* Check that there are no loads from different interleaving
1401                      chains in the same node.  */
1402                   if (prev_first_load != first_load)
1403                     {
1404                       if (dump_enabled_p ())
1405                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406                                          vect_location,
1407                                          "Build SLP failed: different "
1408                                          "interleaving chains in one node %G",
1409                                          stmt);
1410                       /* Mismatch.  */
1411                       continue;
1412                     }
1413                 }
1414               else
1415                 prev_first_load = first_load;
1416            }
1417         }
1418       /* Non-grouped store or load.  */
1419       else if (ldst_p)
1420         {
1421           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1422               && rhs_code != CFN_GATHER_LOAD
1423               && rhs_code != CFN_MASK_GATHER_LOAD
1424               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1425               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426               /* Not grouped loads are handled as externals for BB
1427                  vectorization.  For loop vectorization we can handle
1428                  splats the same we handle single element interleaving.  */
1429               && (is_a <bb_vec_info> (vinfo)
1430                   || stmt_info != first_stmt_info))
1431             {
1432               /* Not grouped load.  */
1433               if (dump_enabled_p ())
1434                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435                                  "Build SLP failed: not grouped load %G", stmt);
1436
1437               if (i != 0)
1438                 continue;
1439               /* Fatal mismatch.  */
1440               matches[0] = false;
1441               return false;
1442             }
1443         }
1444       /* Not memory operation.  */
1445       else
1446         {
1447           if (!phi_p
1448               && rhs_code.is_tree_code ()
1449               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453               && rhs_code != VIEW_CONVERT_EXPR
1454               && rhs_code != CALL_EXPR
1455               && rhs_code != BIT_FIELD_REF)
1456             {
1457               if (dump_enabled_p ())
1458                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                                  "Build SLP failed: operation unsupported %G",
1460                                  stmt);
1461               if (is_a <bb_vec_info> (vinfo) && i != 0)
1462                 continue;
1463               /* Fatal mismatch.  */
1464               matches[0] = false;
1465               return false;
1466             }
1467
1468           if (rhs_code == COND_EXPR)
1469             {
1470               tree cond_expr = gimple_assign_rhs1 (stmt);
1471               enum tree_code cond_code = TREE_CODE (cond_expr);
1472               enum tree_code swap_code = ERROR_MARK;
1473               enum tree_code invert_code = ERROR_MARK;
1474
1475               if (i == 0)
1476                 first_cond_code = TREE_CODE (cond_expr);
1477               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1478                 {
1479                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480                   swap_code = swap_tree_comparison (cond_code);
1481                   invert_code = invert_tree_comparison (cond_code, honor_nans);
1482                 }
1483
1484               if (first_cond_code == cond_code)
1485                 ;
1486               /* Isomorphic can be achieved by swapping.  */
1487               else if (first_cond_code == swap_code)
1488                 swap[i] = 1;
1489               /* Isomorphic can be achieved by inverting.  */
1490               else if (first_cond_code == invert_code)
1491                 swap[i] = 2;
1492               else
1493                 {
1494                   if (dump_enabled_p ())
1495                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496                                      "Build SLP failed: different"
1497                                      " operation %G", stmt);
1498                   /* Mismatch.  */
1499                   continue;
1500                 }
1501             }
1502
1503           if (rhs_code.is_tree_code ()
1504               && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505               && (swap_tree_comparison ((tree_code)first_stmt_code)
1506                   == (tree_code)rhs_code))
1507             swap[i] = 1;
1508         }
1509
1510       matches[i] = true;
1511     }
1512
1513   for (i = 0; i < group_size; ++i)
1514     if (!matches[i])
1515       return false;
1516
1517   /* If we allowed a two-operation SLP node verify the target can cope
1518      with the permute we are going to use.  */
1519   if (alt_stmt_code != ERROR_MARK
1520       && (!alt_stmt_code.is_tree_code ()
1521           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1523     {
1524       *two_operators = true;
1525     }
1526
1527   if (maybe_soft_fail)
1528     {
1529       unsigned HOST_WIDE_INT const_nunits;
1530       if (!TYPE_VECTOR_SUBPARTS
1531             (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532           || const_nunits > group_size)
1533         matches[0] = false;
1534       else
1535         {
1536           /* With constant vector elements simulate a mismatch at the
1537              point we need to split.  */
1538           unsigned tail = group_size & (const_nunits - 1);
1539           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1540         }
1541       return false;
1542     }
1543
1544   return true;
1545 }
1546
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548    Note we never remove apart from at destruction time so we do not
1549    need a special value for deleted that differs from empty.  */
1550 struct bst_traits
1551 {
1552   typedef vec <stmt_vec_info> value_type;
1553   typedef vec <stmt_vec_info> compare_type;
1554   static inline hashval_t hash (value_type);
1555   static inline bool equal (value_type existing, value_type candidate);
1556   static inline bool is_empty (value_type x) { return !x.exists (); }
1557   static inline bool is_deleted (value_type x) { return !x.exists (); }
1558   static const bool empty_zero_p = true;
1559   static inline void mark_empty (value_type &x) { x.release (); }
1560   static inline void mark_deleted (value_type &x) { x.release (); }
1561   static inline void remove (value_type &x) { x.release (); }
1562 };
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1565 {
1566   inchash::hash h;
1567   for (unsigned i = 0; i < x.length (); ++i)
1568     h.add_int (gimple_uid (x[i]->stmt));
1569   return h.end ();
1570 }
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1573 {
1574   if (existing.length () != candidate.length ())
1575     return false;
1576   for (unsigned i = 0; i < existing.length (); ++i)
1577     if (existing[i] != candidate[i])
1578       return false;
1579   return true;
1580 }
1581
1582 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583    but then vec::insert does memmove and that's not compatible with
1584    std::pair.  */
1585 struct chain_op_t
1586 {
1587   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588       : code (code_), dt (dt_), op (op_) {}
1589   tree_code code;
1590   vect_def_type dt;
1591   tree op;
1592 };
1593
1594 /* Comparator for sorting associatable chains.  */
1595
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1598 {
1599   auto *op1 = (const chain_op_t *) op1_;
1600   auto *op2 = (const chain_op_t *) op2_;
1601   if (op1->dt != op2->dt)
1602     return (int)op1->dt - (int)op2->dt;
1603   return (int)op1->code - (int)op2->code;
1604 }
1605
1606 /* Linearize the associatable expression chain at START with the
1607    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608    filling CHAIN with the result and using WORKLIST as intermediate storage.
1609    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
1611    stmts, starting with START.  */
1612
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615                           vec<std::pair<tree_code, gimple *> > &worklist,
1616                           vec<chain_op_t> &chain,
1617                           enum tree_code code, gimple *start,
1618                           gimple *&code_stmt, gimple *&alt_code_stmt,
1619                           vec<gimple *> *chain_stmts)
1620 {
1621   /* For each lane linearize the addition/subtraction (or other
1622      uniform associatable operation) expression tree.  */
1623   worklist.safe_push (std::make_pair (code, start));
1624   while (!worklist.is_empty ())
1625     {
1626       auto entry = worklist.pop ();
1627       gassign *stmt = as_a <gassign *> (entry.second);
1628       enum tree_code in_code = entry.first;
1629       enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1631       if (!code_stmt
1632           && gimple_assign_rhs_code (stmt) == code)
1633         code_stmt = stmt;
1634       else if (!alt_code_stmt
1635                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636         alt_code_stmt = stmt;
1637       if (chain_stmts)
1638         chain_stmts->safe_push (stmt);
1639       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1640         {
1641           tree op = gimple_op (stmt, opnum);
1642           vect_def_type dt;
1643           stmt_vec_info def_stmt_info;
1644           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645           gcc_assert (res);
1646           if (dt == vect_internal_def
1647               && is_pattern_stmt_p (def_stmt_info))
1648             op = gimple_get_lhs (def_stmt_info->stmt);
1649           gimple *use_stmt;
1650           use_operand_p use_p;
1651           if (dt == vect_internal_def
1652               && single_imm_use (op, &use_p, &use_stmt)
1653               && is_gimple_assign (def_stmt_info->stmt)
1654               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655                   || (code == PLUS_EXPR
1656                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657                           == MINUS_EXPR))))
1658             {
1659               tree_code op_def_code = this_code;
1660               if (op_def_code == MINUS_EXPR && opnum == 1)
1661                 op_def_code = PLUS_EXPR;
1662               if (in_code == MINUS_EXPR)
1663                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664               worklist.safe_push (std::make_pair (op_def_code,
1665                                                   def_stmt_info->stmt));
1666             }
1667           else
1668             {
1669               tree_code op_def_code = this_code;
1670               if (op_def_code == MINUS_EXPR && opnum == 1)
1671                 op_def_code = PLUS_EXPR;
1672               if (in_code == MINUS_EXPR)
1673                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674               chain.safe_push (chain_op_t (op_def_code, dt, op));
1675             }
1676         }
1677     }
1678 }
1679
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681                   simple_hashmap_traits <bst_traits, slp_tree> >
1682   scalar_stmts_to_slp_tree_map_t;
1683
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686                        vec<stmt_vec_info> stmts, unsigned int group_size,
1687                        poly_uint64 *max_nunits,
1688                        bool *matches, unsigned *limit, unsigned *tree_size,
1689                        scalar_stmts_to_slp_tree_map_t *bst_map);
1690
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693                      vec<stmt_vec_info> stmts, unsigned int group_size,
1694                      poly_uint64 *max_nunits,
1695                      bool *matches, unsigned *limit, unsigned *tree_size,
1696                      scalar_stmts_to_slp_tree_map_t *bst_map)
1697 {
1698   if (slp_tree *leader = bst_map->get (stmts))
1699     {
1700       if (dump_enabled_p ())
1701         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702                          !(*leader)->failed ? "" : "failed ",
1703                          (void *) *leader);
1704       if (!(*leader)->failed)
1705         {
1706           SLP_TREE_REF_COUNT (*leader)++;
1707           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708           stmts.release ();
1709           return *leader;
1710         }
1711       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712       return NULL;
1713     }
1714
1715   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716      so we can pick up backedge destinations during discovery.  */
1717   slp_tree res = new _slp_tree;
1718   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719   SLP_TREE_SCALAR_STMTS (res) = stmts;
1720   bst_map->put (stmts.copy (), res);
1721
1722   if (*limit == 0)
1723     {
1724       if (dump_enabled_p ())
1725         dump_printf_loc (MSG_NOTE, vect_location,
1726                          "SLP discovery limit exceeded\n");
1727       /* Mark the node invalid so we can detect those when still in use
1728          as backedge destinations.  */
1729       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731       res->failed = XNEWVEC (bool, group_size);
1732       memset (res->failed, 0, sizeof (bool) * group_size);
1733       memset (matches, 0, sizeof (bool) * group_size);
1734       return NULL;
1735     }
1736   --*limit;
1737
1738   if (dump_enabled_p ())
1739     dump_printf_loc (MSG_NOTE, vect_location,
1740                      "starting SLP discovery for node %p\n", (void *) res);
1741
1742   poly_uint64 this_max_nunits = 1;
1743   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744                                         &this_max_nunits,
1745                                         matches, limit, tree_size, bst_map);
1746   if (!res_)
1747     {
1748       if (dump_enabled_p ())
1749         dump_printf_loc (MSG_NOTE, vect_location,
1750                          "SLP discovery for node %p failed\n", (void *) res);
1751       /* Mark the node invalid so we can detect those when still in use
1752          as backedge destinations.  */
1753       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755       res->failed = XNEWVEC (bool, group_size);
1756       if (flag_checking)
1757         {
1758           unsigned i;
1759           for (i = 0; i < group_size; ++i)
1760             if (!matches[i])
1761               break;
1762           gcc_assert (i < group_size);
1763         }
1764       memcpy (res->failed, matches, sizeof (bool) * group_size);
1765     }
1766   else
1767     {
1768       if (dump_enabled_p ())
1769         dump_printf_loc (MSG_NOTE, vect_location,
1770                          "SLP discovery for node %p succeeded\n",
1771                          (void *) res);
1772       gcc_assert (res_ == res);
1773       res->max_nunits = this_max_nunits;
1774       vect_update_max_nunits (max_nunits, this_max_nunits);
1775       /* Keep a reference for the bst_map use.  */
1776       SLP_TREE_REF_COUNT (res)++;
1777     }
1778   return res_;
1779 }
1780
1781 /* Helper for building an associated SLP node chain.  */
1782
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785                                    slp_tree op0, slp_tree op1,
1786                                    stmt_vec_info oper1, stmt_vec_info oper2,
1787                                    vec<std::pair<unsigned, unsigned> > lperm)
1788 {
1789   unsigned group_size = SLP_TREE_LANES (op1);
1790
1791   slp_tree child1 = new _slp_tree;
1792   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793   SLP_TREE_VECTYPE (child1) = vectype;
1794   SLP_TREE_LANES (child1) = group_size;
1795   SLP_TREE_CHILDREN (child1).create (2);
1796   SLP_TREE_CHILDREN (child1).quick_push (op0);
1797   SLP_TREE_CHILDREN (child1).quick_push (op1);
1798   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1799
1800   slp_tree child2 = new _slp_tree;
1801   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802   SLP_TREE_VECTYPE (child2) = vectype;
1803   SLP_TREE_LANES (child2) = group_size;
1804   SLP_TREE_CHILDREN (child2).create (2);
1805   SLP_TREE_CHILDREN (child2).quick_push (op0);
1806   SLP_TREE_REF_COUNT (op0)++;
1807   SLP_TREE_CHILDREN (child2).quick_push (op1);
1808   SLP_TREE_REF_COUNT (op1)++;
1809   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1810
1811   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813   SLP_TREE_VECTYPE (perm) = vectype;
1814   SLP_TREE_LANES (perm) = group_size;
1815   /* ???  We should set this NULL but that's not expected.  */
1816   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818   SLP_TREE_CHILDREN (perm).quick_push (child1);
1819   SLP_TREE_CHILDREN (perm).quick_push (child2);
1820 }
1821
1822 /* Recursively build an SLP tree starting from NODE.
1823    Fail (and return a value not equal to zero) if def-stmts are not
1824    isomorphic, require data permutation or are of unsupported types of
1825    operation.  Otherwise, return 0.
1826    The value returned is the depth in the SLP tree where a mismatch
1827    was found.  */
1828
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831                        vec<stmt_vec_info> stmts, unsigned int group_size,
1832                        poly_uint64 *max_nunits,
1833                        bool *matches, unsigned *limit, unsigned *tree_size,
1834                        scalar_stmts_to_slp_tree_map_t *bst_map)
1835 {
1836   unsigned nops, i, this_tree_size = 0;
1837   poly_uint64 this_max_nunits = *max_nunits;
1838
1839   matches[0] = false;
1840
1841   stmt_vec_info stmt_info = stmts[0];
1842   if (!is_a<gcall *> (stmt_info->stmt)
1843       && !is_a<gassign *> (stmt_info->stmt)
1844       && !is_a<gphi *> (stmt_info->stmt))
1845     return NULL;
1846
1847   nops = gimple_num_args (stmt_info->stmt);
1848   if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849                                              STMT_VINFO_GATHER_SCATTER_P
1850                                                (stmt_info)))
1851     nops = map[0];
1852
1853   /* If the SLP node is a PHI (induction or reduction), terminate
1854      the recursion.  */
1855   bool *skip_args = XALLOCAVEC (bool, nops);
1856   memset (skip_args, 0, sizeof (bool) * nops);
1857   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1859       {
1860         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862                                                     group_size);
1863         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864                                      max_nunits))
1865           return NULL;
1866
1867         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868         if (def_type == vect_induction_def)
1869           {
1870             /* Induction PHIs are not cycles but walk the initial
1871                value.  Only for inner loops through, for outer loops
1872                we need to pick up the value from the actual PHIs
1873                to more easily support peeling and epilogue vectorization.  */
1874             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875             if (!nested_in_vect_loop_p (loop, stmt_info))
1876               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877             else
1878               loop = loop->inner;
1879             skip_args[loop_latch_edge (loop)->dest_idx] = true;
1880           }
1881         else if (def_type == vect_reduction_def
1882                  || def_type == vect_double_reduction_def
1883                  || def_type == vect_nested_cycle
1884                  || def_type == vect_first_order_recurrence)
1885           {
1886             /* Else def types have to match.  */
1887             stmt_vec_info other_info;
1888             bool all_same = true;
1889             FOR_EACH_VEC_ELT (stmts, i, other_info)
1890               {
1891                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892                   return NULL;
1893                 if (other_info != stmt_info)
1894                   all_same = false;
1895               }
1896             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897             /* Reduction initial values are not explicitely represented.  */
1898             if (def_type != vect_first_order_recurrence
1899                 && !nested_in_vect_loop_p (loop, stmt_info))
1900               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901             /* Reduction chain backedge defs are filled manually.
1902                ???  Need a better way to identify a SLP reduction chain PHI.
1903                Or a better overall way to SLP match those.  */
1904             if (all_same && def_type == vect_reduction_def)
1905               skip_args[loop_latch_edge (loop)->dest_idx] = true;
1906           }
1907         else if (def_type != vect_internal_def)
1908           return NULL;
1909       }
1910
1911
1912   bool two_operators = false;
1913   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914   tree vectype = NULL_TREE;
1915   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916                               &this_max_nunits, matches, &two_operators,
1917                               &vectype))
1918     return NULL;
1919
1920   /* If the SLP node is a load, terminate the recursion unless masked.  */
1921   if (STMT_VINFO_DATA_REF (stmt_info)
1922       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1923     {
1924       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1925         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1926       else
1927         {
1928           *max_nunits = this_max_nunits;
1929           (*tree_size)++;
1930           node = vect_create_new_slp_node (node, stmts, 0);
1931           SLP_TREE_VECTYPE (node) = vectype;
1932           /* And compute the load permutation.  Whether it is actually
1933              a permutation depends on the unrolling factor which is
1934              decided later.  */
1935           vec<unsigned> load_permutation;
1936           int j;
1937           stmt_vec_info load_info;
1938           load_permutation.create (group_size);
1939           stmt_vec_info first_stmt_info
1940             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1941           bool any_permute = false;
1942           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1943             {
1944               int load_place;
1945               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946                 load_place = vect_get_place_in_interleaving_chain
1947                     (load_info, first_stmt_info);
1948               else
1949                 load_place = 0;
1950               gcc_assert (load_place != -1);
1951               any_permute |= load_place != j;
1952               load_permutation.quick_push (load_place);
1953             }
1954
1955           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1956             {
1957               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1958                           || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1959                           || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1960                           || gimple_call_internal_p (stmt,
1961                                                      IFN_MASK_LEN_GATHER_LOAD));
1962               load_permutation.release ();
1963               /* We cannot handle permuted masked loads, see PR114375.  */
1964               if (any_permute
1965                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1966                       && DR_GROUP_SIZE (first_stmt_info) != group_size)
1967                   || STMT_VINFO_STRIDED_P (stmt_info))
1968                 {
1969                   matches[0] = false;
1970                   return NULL;
1971                 }
1972             }
1973           else
1974             {
1975               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1976               return node;
1977             }
1978         }
1979     }
1980   else if (gimple_assign_single_p (stmt_info->stmt)
1981            && !gimple_vuse (stmt_info->stmt)
1982            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1983     {
1984       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1985          the same SSA name vector of a compatible type to vectype.  */
1986       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1987       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1988       stmt_vec_info estmt_info;
1989       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1990         {
1991           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1992           tree bfref = gimple_assign_rhs1 (estmt);
1993           HOST_WIDE_INT lane;
1994           if (!known_eq (bit_field_size (bfref),
1995                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1996               || !constant_multiple_p (bit_field_offset (bfref),
1997                                        bit_field_size (bfref), &lane))
1998             {
1999               lperm.release ();
2000               matches[0] = false;
2001               return NULL;
2002             }
2003           lperm.safe_push (std::make_pair (0, (unsigned)lane));
2004         }
2005       slp_tree vnode = vect_create_new_slp_node (vNULL);
2006       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2007         /* ???  We record vectype here but we hide eventually necessary
2008            punning and instead rely on code generation to materialize
2009            VIEW_CONVERT_EXPRs as necessary.  We instead should make
2010            this explicit somehow.  */
2011         SLP_TREE_VECTYPE (vnode) = vectype;
2012       else
2013         {
2014           /* For different size but compatible elements we can still
2015              use VEC_PERM_EXPR without punning.  */
2016           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2017                       && types_compatible_p (TREE_TYPE (vectype),
2018                                              TREE_TYPE (TREE_TYPE (vec))));
2019           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2020         }
2021       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2022       unsigned HOST_WIDE_INT const_nunits;
2023       if (nunits.is_constant (&const_nunits))
2024         SLP_TREE_LANES (vnode) = const_nunits;
2025       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2026       /* We are always building a permutation node even if it is an identity
2027          permute to shield the rest of the vectorizer from the odd node
2028          representing an actual vector without any scalar ops.
2029          ???  We could hide it completely with making the permute node
2030          external?  */
2031       node = vect_create_new_slp_node (node, stmts, 1);
2032       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2033       SLP_TREE_LANE_PERMUTATION (node) = lperm;
2034       SLP_TREE_VECTYPE (node) = vectype;
2035       SLP_TREE_CHILDREN (node).quick_push (vnode);
2036       return node;
2037     }
2038   /* When discovery reaches an associatable operation see whether we can
2039      improve that to match up lanes in a way superior to the operand
2040      swapping code which at most looks at two defs.
2041      ???  For BB vectorization we cannot do the brute-force search
2042      for matching as we can succeed by means of builds from scalars
2043      and have no good way to "cost" one build against another.  */
2044   else if (is_a <loop_vec_info> (vinfo)
2045            /* ???  We don't handle !vect_internal_def defs below.  */
2046            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2047            && is_gimple_assign (stmt_info->stmt)
2048            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2049                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2050            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2051                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2052                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2053     {
2054       /* See if we have a chain of (mixed) adds or subtracts or other
2055          associatable ops.  */
2056       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2057       if (code == MINUS_EXPR)
2058         code = PLUS_EXPR;
2059       stmt_vec_info other_op_stmt_info = NULL;
2060       stmt_vec_info op_stmt_info = NULL;
2061       unsigned chain_len = 0;
2062       auto_vec<chain_op_t> chain;
2063       auto_vec<std::pair<tree_code, gimple *> > worklist;
2064       auto_vec<vec<chain_op_t> > chains (group_size);
2065       auto_vec<slp_tree, 4> children;
2066       bool hard_fail = true;
2067       for (unsigned lane = 0; lane < group_size; ++lane)
2068         {
2069           /* For each lane linearize the addition/subtraction (or other
2070              uniform associatable operation) expression tree.  */
2071           gimple *op_stmt = NULL, *other_op_stmt = NULL;
2072           vect_slp_linearize_chain (vinfo, worklist, chain, code,
2073                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
2074                                     NULL);
2075           if (!op_stmt_info && op_stmt)
2076             op_stmt_info = vinfo->lookup_stmt (op_stmt);
2077           if (!other_op_stmt_info && other_op_stmt)
2078             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2079           if (chain.length () == 2)
2080             {
2081               /* In a chain of just two elements resort to the regular
2082                  operand swapping scheme.  If we run into a length
2083                  mismatch still hard-FAIL.  */
2084               if (chain_len == 0)
2085                 hard_fail = false;
2086               else
2087                 {
2088                   matches[lane] = false;
2089                   /* ???  We might want to process the other lanes, but
2090                      make sure to not give false matching hints to the
2091                      caller for lanes we did not process.  */
2092                   if (lane != group_size - 1)
2093                     matches[0] = false;
2094                 }
2095               break;
2096             }
2097           else if (chain_len == 0)
2098             chain_len = chain.length ();
2099           else if (chain.length () != chain_len)
2100             {
2101               /* ???  Here we could slip in magic to compensate with
2102                  neutral operands.  */
2103               matches[lane] = false;
2104               if (lane != group_size - 1)
2105                 matches[0] = false;
2106               break;
2107             }
2108           chains.quick_push (chain.copy ());
2109           chain.truncate (0);
2110         }
2111       if (chains.length () == group_size)
2112         {
2113           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
2114           if (!op_stmt_info)
2115             {
2116               hard_fail = false;
2117               goto out;
2118             }
2119           /* Now we have a set of chains with the same length.  */
2120           /* 1. pre-sort according to def_type and operation.  */
2121           for (unsigned lane = 0; lane < group_size; ++lane)
2122             chains[lane].stablesort (dt_sort_cmp, vinfo);
2123           if (dump_enabled_p ())
2124             {
2125               dump_printf_loc (MSG_NOTE, vect_location,
2126                                "pre-sorted chains of %s\n",
2127                                get_tree_code_name (code));
2128               for (unsigned lane = 0; lane < group_size; ++lane)
2129                 {
2130                   for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2131                     dump_printf (MSG_NOTE, "%s %T ",
2132                                  get_tree_code_name (chains[lane][opnum].code),
2133                                  chains[lane][opnum].op);
2134                   dump_printf (MSG_NOTE, "\n");
2135                 }
2136             }
2137           /* 2. try to build children nodes, associating as necessary.  */
2138           for (unsigned n = 0; n < chain_len; ++n)
2139             {
2140               vect_def_type dt = chains[0][n].dt;
2141               unsigned lane;
2142               for (lane = 0; lane < group_size; ++lane)
2143                 if (chains[lane][n].dt != dt)
2144                   {
2145                     if (dt == vect_constant_def
2146                         && chains[lane][n].dt == vect_external_def)
2147                       dt = vect_external_def;
2148                     else if (dt == vect_external_def
2149                              && chains[lane][n].dt == vect_constant_def)
2150                       ;
2151                     else
2152                       break;
2153                   }
2154               if (lane != group_size)
2155                 {
2156                   if (dump_enabled_p ())
2157                     dump_printf_loc (MSG_NOTE, vect_location,
2158                                      "giving up on chain due to mismatched "
2159                                      "def types\n");
2160                   matches[lane] = false;
2161                   if (lane != group_size - 1)
2162                     matches[0] = false;
2163                   goto out;
2164                 }
2165               if (dt == vect_constant_def
2166                   || dt == vect_external_def)
2167                 {
2168                   /* Check whether we can build the invariant.  If we can't
2169                      we never will be able to.  */
2170                   tree type = TREE_TYPE (chains[0][n].op);
2171                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2172                       && (TREE_CODE (type) == BOOLEAN_TYPE
2173                           || !can_duplicate_and_interleave_p (vinfo, group_size,
2174                                                               type)))
2175                     {
2176                       matches[0] = false;
2177                       goto out;
2178                     }
2179                   vec<tree> ops;
2180                   ops.create (group_size);
2181                   for (lane = 0; lane < group_size; ++lane)
2182                     ops.quick_push (chains[lane][n].op);
2183                   slp_tree child = vect_create_new_slp_node (ops);
2184                   SLP_TREE_DEF_TYPE (child) = dt;
2185                   children.safe_push (child);
2186                 }
2187               else if (dt != vect_internal_def)
2188                 {
2189                   /* Not sure, we might need sth special.
2190                      gcc.dg/vect/pr96854.c,
2191                      gfortran.dg/vect/fast-math-pr37021.f90
2192                      and gfortran.dg/vect/pr61171.f trigger.  */
2193                   /* Soft-fail for now.  */
2194                   hard_fail = false;
2195                   goto out;
2196                 }
2197               else
2198                 {
2199                   vec<stmt_vec_info> op_stmts;
2200                   op_stmts.create (group_size);
2201                   slp_tree child = NULL;
2202                   /* Brute-force our way.  We have to consider a lane
2203                      failing after fixing an earlier fail up in the
2204                      SLP discovery recursion.  So track the current
2205                      permute per lane.  */
2206                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
2207                   memset (perms, 0, sizeof (unsigned) * group_size);
2208                   do
2209                     {
2210                       op_stmts.truncate (0);
2211                       for (lane = 0; lane < group_size; ++lane)
2212                         op_stmts.quick_push
2213                           (vinfo->lookup_def (chains[lane][n].op));
2214                       child = vect_build_slp_tree (vinfo, op_stmts,
2215                                                    group_size, &this_max_nunits,
2216                                                    matches, limit,
2217                                                    &this_tree_size, bst_map);
2218                       /* ???  We're likely getting too many fatal mismatches
2219                          here so maybe we want to ignore them (but then we
2220                          have no idea which lanes fatally mismatched).  */
2221                       if (child || !matches[0])
2222                         break;
2223                       /* Swap another lane we have not yet matched up into
2224                          lanes that did not match.  If we run out of
2225                          permute possibilities for a lane terminate the
2226                          search.  */
2227                       bool term = false;
2228                       for (lane = 1; lane < group_size; ++lane)
2229                         if (!matches[lane])
2230                           {
2231                             if (n + perms[lane] + 1 == chain_len)
2232                               {
2233                                 term = true;
2234                                 break;
2235                               }
2236                             std::swap (chains[lane][n],
2237                                        chains[lane][n + perms[lane] + 1]);
2238                             perms[lane]++;
2239                           }
2240                       if (term)
2241                         break;
2242                     }
2243                   while (1);
2244                   if (!child)
2245                     {
2246                       if (dump_enabled_p ())
2247                         dump_printf_loc (MSG_NOTE, vect_location,
2248                                          "failed to match up op %d\n", n);
2249                       op_stmts.release ();
2250                       if (lane != group_size - 1)
2251                         matches[0] = false;
2252                       else
2253                         matches[lane] = false;
2254                       goto out;
2255                     }
2256                   if (dump_enabled_p ())
2257                     {
2258                       dump_printf_loc (MSG_NOTE, vect_location,
2259                                        "matched up op %d to\n", n);
2260                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
2261                     }
2262                   children.safe_push (child);
2263                 }
2264             }
2265           /* 3. build SLP nodes to combine the chain.  */
2266           for (unsigned lane = 0; lane < group_size; ++lane)
2267             if (chains[lane][0].code != code)
2268               {
2269                 /* See if there's any alternate all-PLUS entry.  */
2270                 unsigned n;
2271                 for (n = 1; n < chain_len; ++n)
2272                   {
2273                     for (lane = 0; lane < group_size; ++lane)
2274                       if (chains[lane][n].code != code)
2275                         break;
2276                     if (lane == group_size)
2277                       break;
2278                   }
2279                 if (n != chain_len)
2280                   {
2281                     /* Swap that in at first position.  */
2282                     std::swap (children[0], children[n]);
2283                     for (lane = 0; lane < group_size; ++lane)
2284                       std::swap (chains[lane][0], chains[lane][n]);
2285                   }
2286                 else
2287                   {
2288                     /* ???  When this triggers and we end up with two
2289                        vect_constant/external_def up-front things break (ICE)
2290                        spectacularly finding an insertion place for the
2291                        all-constant op.  We should have a fully
2292                        vect_internal_def operand though(?) so we can swap
2293                        that into first place and then prepend the all-zero
2294                        constant.  */
2295                     if (dump_enabled_p ())
2296                       dump_printf_loc (MSG_NOTE, vect_location,
2297                                        "inserting constant zero to compensate "
2298                                        "for (partially) negated first "
2299                                        "operand\n");
2300                     chain_len++;
2301                     for (lane = 0; lane < group_size; ++lane)
2302                       chains[lane].safe_insert
2303                         (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2304                     vec<tree> zero_ops;
2305                     zero_ops.create (group_size);
2306                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2307                     for (lane = 1; lane < group_size; ++lane)
2308                       zero_ops.quick_push (zero_ops[0]);
2309                     slp_tree zero = vect_create_new_slp_node (zero_ops);
2310                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2311                     children.safe_insert (0, zero);
2312                   }
2313                 break;
2314               }
2315           for (unsigned i = 1; i < children.length (); ++i)
2316             {
2317               slp_tree op0 = children[i - 1];
2318               slp_tree op1 = children[i];
2319               bool this_two_op = false;
2320               for (unsigned lane = 0; lane < group_size; ++lane)
2321                 if (chains[lane][i].code != chains[0][i].code)
2322                   {
2323                     this_two_op = true;
2324                     break;
2325                   }
2326               slp_tree child;
2327               if (i == children.length () - 1)
2328                 child = vect_create_new_slp_node (node, stmts, 2);
2329               else
2330                 child = vect_create_new_slp_node (2, ERROR_MARK);
2331               if (this_two_op)
2332                 {
2333                   vec<std::pair<unsigned, unsigned> > lperm;
2334                   lperm.create (group_size);
2335                   for (unsigned lane = 0; lane < group_size; ++lane)
2336                     lperm.quick_push (std::make_pair
2337                       (chains[lane][i].code != chains[0][i].code, lane));
2338                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2339                                                      (chains[0][i].code == code
2340                                                       ? op_stmt_info
2341                                                       : other_op_stmt_info),
2342                                                      (chains[0][i].code == code
2343                                                       ? other_op_stmt_info
2344                                                       : op_stmt_info),
2345                                                      lperm);
2346                 }
2347               else
2348                 {
2349                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2350                   SLP_TREE_VECTYPE (child) = vectype;
2351                   SLP_TREE_LANES (child) = group_size;
2352                   SLP_TREE_CHILDREN (child).quick_push (op0);
2353                   SLP_TREE_CHILDREN (child).quick_push (op1);
2354                   SLP_TREE_REPRESENTATIVE (child)
2355                     = (chains[0][i].code == code
2356                        ? op_stmt_info : other_op_stmt_info);
2357                 }
2358               children[i] = child;
2359             }
2360           *tree_size += this_tree_size + 1;
2361           *max_nunits = this_max_nunits;
2362           while (!chains.is_empty ())
2363             chains.pop ().release ();
2364           return node;
2365         }
2366 out:
2367       while (!children.is_empty ())
2368         vect_free_slp_tree (children.pop ());
2369       while (!chains.is_empty ())
2370         chains.pop ().release ();
2371       /* Hard-fail, otherwise we might run into quadratic processing of the
2372          chains starting one stmt into the chain again.  */
2373       if (hard_fail)
2374         return NULL;
2375       /* Fall thru to normal processing.  */
2376     }
2377
2378   /* Get at the operands, verifying they are compatible.  */
2379   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2380   slp_oprnd_info oprnd_info;
2381   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2382     {
2383       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2384                                              stmts, i, &oprnds_info);
2385       if (res != 0)
2386         matches[(res == -1) ? 0 : i] = false;
2387       if (!matches[0])
2388         break;
2389     }
2390   for (i = 0; i < group_size; ++i)
2391     if (!matches[i])
2392       {
2393         vect_free_oprnd_info (oprnds_info);
2394         return NULL;
2395       }
2396   swap = NULL;
2397
2398   auto_vec<slp_tree, 4> children;
2399
2400   stmt_info = stmts[0];
2401
2402   /* Create SLP_TREE nodes for the definition node/s.  */
2403   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2404     {
2405       slp_tree child = nullptr;
2406       unsigned int j;
2407
2408       /* We're skipping certain operands from processing, for example
2409          outer loop reduction initial defs.  */
2410       if (skip_args[i])
2411         {
2412           children.safe_push (NULL);
2413           continue;
2414         }
2415
2416       if (oprnd_info->first_dt == vect_uninitialized_def)
2417         {
2418           /* COND_EXPR have one too many eventually if the condition
2419              is a SSA name.  */
2420           gcc_assert (i == 3 && nops == 4);
2421           continue;
2422         }
2423
2424       if (is_a <bb_vec_info> (vinfo)
2425           && oprnd_info->first_dt == vect_internal_def
2426           && !oprnd_info->any_pattern)
2427         {
2428           /* For BB vectorization, if all defs are the same do not
2429              bother to continue the build along the single-lane
2430              graph but use a splat of the scalar value.  */
2431           stmt_vec_info first_def = oprnd_info->def_stmts[0];
2432           for (j = 1; j < group_size; ++j)
2433             if (oprnd_info->def_stmts[j] != first_def)
2434               break;
2435           if (j == group_size
2436               /* But avoid doing this for loads where we may be
2437                  able to CSE things, unless the stmt is not
2438                  vectorizable.  */
2439               && (!STMT_VINFO_VECTORIZABLE (first_def)
2440                   || !gimple_vuse (first_def->stmt)))
2441             {
2442               if (dump_enabled_p ())
2443                 dump_printf_loc (MSG_NOTE, vect_location,
2444                                  "Using a splat of the uniform operand %G",
2445                                  first_def->stmt);
2446               oprnd_info->first_dt = vect_external_def;
2447             }
2448         }
2449
2450       if (oprnd_info->first_dt == vect_external_def
2451           || oprnd_info->first_dt == vect_constant_def)
2452         {
2453           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2454             {
2455               tree op0;
2456               tree uniform_val = op0 = oprnd_info->ops[0];
2457               for (j = 1; j < oprnd_info->ops.length (); ++j)
2458                 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2459                   {
2460                     uniform_val = NULL_TREE;
2461                     break;
2462                   }
2463               if (!uniform_val
2464                   && !can_duplicate_and_interleave_p (vinfo,
2465                                                       oprnd_info->ops.length (),
2466                                                       TREE_TYPE (op0)))
2467                 {
2468                   matches[j] = false;
2469                   if (dump_enabled_p ())
2470                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471                                      "Build SLP failed: invalid type of def "
2472                                      "for variable-length SLP %T\n", op0);
2473                   goto fail;
2474                 }
2475             }
2476           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2477           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2478           oprnd_info->ops = vNULL;
2479           children.safe_push (invnode);
2480           continue;
2481         }
2482
2483       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2484                                         group_size, &this_max_nunits,
2485                                         matches, limit,
2486                                         &this_tree_size, bst_map)) != NULL)
2487         {
2488           oprnd_info->def_stmts = vNULL;
2489           children.safe_push (child);
2490           continue;
2491         }
2492
2493       /* If the SLP build for operand zero failed and operand zero
2494          and one can be commutated try that for the scalar stmts
2495          that failed the match.  */
2496       if (i == 0
2497           /* A first scalar stmt mismatch signals a fatal mismatch.  */
2498           && matches[0]
2499           /* ???  For COND_EXPRs we can swap the comparison operands
2500              as well as the arms under some constraints.  */
2501           && nops == 2
2502           && oprnds_info[1]->first_dt == vect_internal_def
2503           && is_gimple_assign (stmt_info->stmt)
2504           /* Swapping operands for reductions breaks assumptions later on.  */
2505           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2506           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2507         {
2508           /* See whether we can swap the matching or the non-matching
2509              stmt operands.  */
2510           bool swap_not_matching = true;
2511           do
2512             {
2513               for (j = 0; j < group_size; ++j)
2514                 {
2515                   if (matches[j] != !swap_not_matching)
2516                     continue;
2517                   stmt_vec_info stmt_info = stmts[j];
2518                   /* Verify if we can swap operands of this stmt.  */
2519                   gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2520                   if (!stmt
2521                       || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2522                     {
2523                       if (!swap_not_matching)
2524                         goto fail;
2525                       swap_not_matching = false;
2526                       break;
2527                     }
2528                 }
2529             }
2530           while (j != group_size);
2531
2532           /* Swap mismatched definition stmts.  */
2533           if (dump_enabled_p ())
2534             dump_printf_loc (MSG_NOTE, vect_location,
2535                              "Re-trying with swapped operands of stmts ");
2536           for (j = 0; j < group_size; ++j)
2537             if (matches[j] == !swap_not_matching)
2538               {
2539                 std::swap (oprnds_info[0]->def_stmts[j],
2540                            oprnds_info[1]->def_stmts[j]);
2541                 std::swap (oprnds_info[0]->ops[j],
2542                            oprnds_info[1]->ops[j]);
2543                 if (dump_enabled_p ())
2544                   dump_printf (MSG_NOTE, "%d ", j);
2545               }
2546           if (dump_enabled_p ())
2547             dump_printf (MSG_NOTE, "\n");
2548           /* After swapping some operands we lost track whether an
2549              operand has any pattern defs so be conservative here.  */
2550           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2551             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2552           /* And try again with scratch 'matches' ... */
2553           bool *tem = XALLOCAVEC (bool, group_size);
2554           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2555                                             group_size, &this_max_nunits,
2556                                             tem, limit,
2557                                             &this_tree_size, bst_map)) != NULL)
2558             {
2559               oprnd_info->def_stmts = vNULL;
2560               children.safe_push (child);
2561               continue;
2562             }
2563         }
2564 fail:
2565
2566       /* If the SLP build failed and we analyze a basic-block
2567          simply treat nodes we fail to build as externally defined
2568          (and thus build vectors from the scalar defs).
2569          The cost model will reject outright expensive cases.
2570          ???  This doesn't treat cases where permutation ultimatively
2571          fails (or we don't try permutation below).  Ideally we'd
2572          even compute a permutation that will end up with the maximum
2573          SLP tree size...  */
2574       if (is_a <bb_vec_info> (vinfo)
2575           /* ???  Rejecting patterns this way doesn't work.  We'd have to
2576              do extra work to cancel the pattern so the uses see the
2577              scalar version.  */
2578           && !is_pattern_stmt_p (stmt_info)
2579           && !oprnd_info->any_pattern)
2580         {
2581           /* But if there's a leading vector sized set of matching stmts
2582              fail here so we can split the group.  This matches the condition
2583              vect_analyze_slp_instance uses.  */
2584           /* ???  We might want to split here and combine the results to support
2585              multiple vector sizes better.  */
2586           for (j = 0; j < group_size; ++j)
2587             if (!matches[j])
2588               break;
2589           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2590             {
2591               if (dump_enabled_p ())
2592                 dump_printf_loc (MSG_NOTE, vect_location,
2593                                  "Building vector operands from scalars\n");
2594               this_tree_size++;
2595               child = vect_create_new_slp_node (oprnd_info->ops);
2596               children.safe_push (child);
2597               oprnd_info->ops = vNULL;
2598               continue;
2599             }
2600         }
2601
2602       gcc_assert (child == NULL);
2603       FOR_EACH_VEC_ELT (children, j, child)
2604         if (child)
2605           vect_free_slp_tree (child);
2606       vect_free_oprnd_info (oprnds_info);
2607       return NULL;
2608     }
2609
2610   vect_free_oprnd_info (oprnds_info);
2611
2612   /* If we have all children of a child built up from uniform scalars
2613      or does more than one possibly expensive vector construction then
2614      just throw that away, causing it built up from scalars.
2615      The exception is the SLP node for the vector store.  */
2616   if (is_a <bb_vec_info> (vinfo)
2617       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2618       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2619          do extra work to cancel the pattern so the uses see the
2620          scalar version.  */
2621       && !is_pattern_stmt_p (stmt_info))
2622     {
2623       slp_tree child;
2624       unsigned j;
2625       bool all_uniform_p = true;
2626       unsigned n_vector_builds = 0;
2627       FOR_EACH_VEC_ELT (children, j, child)
2628         {
2629           if (!child)
2630             ;
2631           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2632             all_uniform_p = false;
2633           else if (!vect_slp_tree_uniform_p (child))
2634             {
2635               all_uniform_p = false;
2636               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2637                 n_vector_builds++;
2638             }
2639         }
2640       if (all_uniform_p
2641           || n_vector_builds > 1
2642           || (n_vector_builds == children.length ()
2643               && is_a <gphi *> (stmt_info->stmt)))
2644         {
2645           /* Roll back.  */
2646           matches[0] = false;
2647           FOR_EACH_VEC_ELT (children, j, child)
2648             if (child)
2649               vect_free_slp_tree (child);
2650
2651           if (dump_enabled_p ())
2652             dump_printf_loc (MSG_NOTE, vect_location,
2653                              "Building parent vector operands from "
2654                              "scalars instead\n");
2655           return NULL;
2656         }
2657     }
2658
2659   *tree_size += this_tree_size + 1;
2660   *max_nunits = this_max_nunits;
2661
2662   if (two_operators)
2663     {
2664       /* ???  We'd likely want to either cache in bst_map sth like
2665          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2666          the true { a+b, a+b, a+b, a+b } ... but there we don't have
2667          explicit stmts to put in so the keying on 'stmts' doesn't
2668          work (but we have the same issue with nodes that use 'ops').  */
2669       slp_tree one = new _slp_tree;
2670       slp_tree two = new _slp_tree;
2671       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2672       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2673       SLP_TREE_VECTYPE (one) = vectype;
2674       SLP_TREE_VECTYPE (two) = vectype;
2675       SLP_TREE_CHILDREN (one).safe_splice (children);
2676       SLP_TREE_CHILDREN (two).safe_splice (children);
2677       slp_tree child;
2678       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2679         SLP_TREE_REF_COUNT (child)++;
2680
2681       /* Here we record the original defs since this
2682          node represents the final lane configuration.  */
2683       node = vect_create_new_slp_node (node, stmts, 2);
2684       SLP_TREE_VECTYPE (node) = vectype;
2685       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2686       SLP_TREE_CHILDREN (node).quick_push (one);
2687       SLP_TREE_CHILDREN (node).quick_push (two);
2688       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2689       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2690       enum tree_code ocode = ERROR_MARK;
2691       stmt_vec_info ostmt_info;
2692       unsigned j = 0;
2693       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2694         {
2695           gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2696           if (gimple_assign_rhs_code (ostmt) != code0)
2697             {
2698               SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2699               ocode = gimple_assign_rhs_code (ostmt);
2700               j = i;
2701             }
2702           else
2703             SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2704         }
2705       SLP_TREE_CODE (one) = code0;
2706       SLP_TREE_CODE (two) = ocode;
2707       SLP_TREE_LANES (one) = stmts.length ();
2708       SLP_TREE_LANES (two) = stmts.length ();
2709       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2710       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2711       return node;
2712     }
2713
2714   node = vect_create_new_slp_node (node, stmts, nops);
2715   SLP_TREE_VECTYPE (node) = vectype;
2716   SLP_TREE_CHILDREN (node).splice (children);
2717   return node;
2718 }
2719
2720 /* Dump a single SLP tree NODE.  */
2721
2722 static void
2723 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2724                      slp_tree node)
2725 {
2726   unsigned i, j;
2727   slp_tree child;
2728   stmt_vec_info stmt_info;
2729   tree op;
2730
2731   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2732   dump_user_location_t user_loc = loc.get_user_location ();
2733   dump_printf_loc (metadata, user_loc,
2734                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2735                    ", refcnt=%u)",
2736                    SLP_TREE_DEF_TYPE (node) == vect_external_def
2737                    ? " (external)"
2738                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2739                       ? " (constant)"
2740                       : ""), (void *) node,
2741                    estimated_poly_value (node->max_nunits),
2742                                          SLP_TREE_REF_COUNT (node));
2743   if (SLP_TREE_VECTYPE (node))
2744     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2745   dump_printf (metadata, "\n");
2746   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2747     {
2748       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2749         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2750       else
2751         dump_printf_loc (metadata, user_loc, "op template: %G",
2752                          SLP_TREE_REPRESENTATIVE (node)->stmt);
2753     }
2754   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2755     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2756       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2757   else
2758     {
2759       dump_printf_loc (metadata, user_loc, "\t{ ");
2760       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2761         dump_printf (metadata, "%T%s ", op,
2762                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2763       dump_printf (metadata, "}\n");
2764     }
2765   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2766     {
2767       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2768       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2769         dump_printf (dump_kind, " %u", j);
2770       dump_printf (dump_kind, " }\n");
2771     }
2772   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2773     {
2774       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2775       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2776         dump_printf (dump_kind, " %u[%u]",
2777                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
2778                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
2779       dump_printf (dump_kind, " }\n");
2780     }
2781   if (SLP_TREE_CHILDREN (node).is_empty ())
2782     return;
2783   dump_printf_loc (metadata, user_loc, "\tchildren");
2784   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2785     dump_printf (dump_kind, " %p", (void *)child);
2786   dump_printf (dump_kind, "\n");
2787 }
2788
2789 DEBUG_FUNCTION void
2790 debug (slp_tree node)
2791 {
2792   debug_dump_context ctx;
2793   vect_print_slp_tree (MSG_NOTE,
2794                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2795                        node);
2796 }
2797
2798 /* Recursive helper for the dot producer below.  */
2799
2800 static void
2801 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2802 {
2803   if (visited.add (node))
2804     return;
2805
2806   fprintf (f, "\"%p\" [label=\"", (void *)node);
2807   vect_print_slp_tree (MSG_NOTE,
2808                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2809                        node);
2810   fprintf (f, "\"];\n");
2811
2812
2813   for (slp_tree child : SLP_TREE_CHILDREN (node))
2814     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2815
2816   for (slp_tree child : SLP_TREE_CHILDREN (node))
2817     if (child)
2818       dot_slp_tree (f, child, visited);
2819 }
2820
2821 DEBUG_FUNCTION void
2822 dot_slp_tree (const char *fname, slp_tree node)
2823 {
2824   FILE *f = fopen (fname, "w");
2825   fprintf (f, "digraph {\n");
2826   fflush (f);
2827     {
2828       debug_dump_context ctx (f);
2829       hash_set<slp_tree> visited;
2830       dot_slp_tree (f, node, visited);
2831     }
2832   fflush (f);
2833   fprintf (f, "}\n");
2834   fclose (f);
2835 }
2836
2837 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2838
2839 static void
2840 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2841                       slp_tree node, hash_set<slp_tree> &visited)
2842 {
2843   unsigned i;
2844   slp_tree child;
2845
2846   if (visited.add (node))
2847     return;
2848
2849   vect_print_slp_tree (dump_kind, loc, node);
2850
2851   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2852     if (child)
2853       vect_print_slp_graph (dump_kind, loc, child, visited);
2854 }
2855
2856 static void
2857 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2858                       slp_tree entry)
2859 {
2860   hash_set<slp_tree> visited;
2861   vect_print_slp_graph (dump_kind, loc, entry, visited);
2862 }
2863
2864 /* Mark the tree rooted at NODE with PURE_SLP.  */
2865
2866 static void
2867 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2868 {
2869   int i;
2870   stmt_vec_info stmt_info;
2871   slp_tree child;
2872
2873   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2874     return;
2875
2876   if (visited.add (node))
2877     return;
2878
2879   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2880     STMT_SLP_TYPE (stmt_info) = pure_slp;
2881
2882   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2883     if (child)
2884       vect_mark_slp_stmts (child, visited);
2885 }
2886
2887 static void
2888 vect_mark_slp_stmts (slp_tree node)
2889 {
2890   hash_set<slp_tree> visited;
2891   vect_mark_slp_stmts (node, visited);
2892 }
2893
2894 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2895
2896 static void
2897 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2898 {
2899   int i;
2900   stmt_vec_info stmt_info;
2901   slp_tree child;
2902
2903   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904     return;
2905
2906   if (visited.add (node))
2907     return;
2908
2909   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2910     {
2911       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2912                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2913       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2914     }
2915
2916   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917     if (child)
2918       vect_mark_slp_stmts_relevant (child, visited);
2919 }
2920
2921 static void
2922 vect_mark_slp_stmts_relevant (slp_tree node)
2923 {
2924   hash_set<slp_tree> visited;
2925   vect_mark_slp_stmts_relevant (node, visited);
2926 }
2927
2928
2929 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
2930
2931 static void
2932 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2933                        hash_set<slp_tree> &visited)
2934 {
2935   if (!node || visited.add (node))
2936     return;
2937
2938   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2939     return;
2940
2941   if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2942     {
2943       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2944       if (STMT_VINFO_DATA_REF (stmt_info)
2945           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2946         loads.safe_push (node);
2947     }
2948
2949   unsigned i;
2950   slp_tree child;
2951   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2952     vect_gather_slp_loads (loads, child, visited);
2953 }
2954
2955
2956 /* Find the last store in SLP INSTANCE.  */
2957
2958 stmt_vec_info
2959 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2960 {
2961   stmt_vec_info last = NULL;
2962   stmt_vec_info stmt_vinfo;
2963
2964   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2965     {
2966       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2967       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2968     }
2969
2970   return last;
2971 }
2972
2973 /* Find the first stmt in NODE.  */
2974
2975 stmt_vec_info
2976 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2977 {
2978   stmt_vec_info first = NULL;
2979   stmt_vec_info stmt_vinfo;
2980
2981   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2982     {
2983       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2984       if (!first
2985           || get_later_stmt (stmt_vinfo, first) == first)
2986         first = stmt_vinfo;
2987     }
2988
2989   return first;
2990 }
2991
2992 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2993    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2994    (also containing the first GROUP1_SIZE stmts, since stores are
2995    consecutive), the second containing the remainder.
2996    Return the first stmt in the second group.  */
2997
2998 static stmt_vec_info
2999 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3000 {
3001   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3002   gcc_assert (group1_size > 0);
3003   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3004   gcc_assert (group2_size > 0);
3005   DR_GROUP_SIZE (first_vinfo) = group1_size;
3006
3007   stmt_vec_info stmt_info = first_vinfo;
3008   for (unsigned i = group1_size; i > 1; i--)
3009     {
3010       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3011       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3012     }
3013   /* STMT is now the last element of the first group.  */
3014   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3015   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3016
3017   DR_GROUP_SIZE (group2) = group2_size;
3018   for (stmt_info = group2; stmt_info;
3019        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3020     {
3021       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3022       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3023     }
3024
3025   /* For the second group, the DR_GROUP_GAP is that before the original group,
3026      plus skipping over the first vector.  */
3027   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3028
3029   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
3030   DR_GROUP_GAP (first_vinfo) += group2_size;
3031
3032   if (dump_enabled_p ())
3033     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3034                      group1_size, group2_size);
3035
3036   return group2;
3037 }
3038
3039 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3040    statements and a vector of NUNITS elements.  */
3041
3042 static poly_uint64
3043 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3044 {
3045   return exact_div (common_multiple (nunits, group_size), group_size);
3046 }
3047
3048 /* Helper that checks to see if a node is a load node.  */
3049
3050 static inline bool
3051 vect_is_slp_load_node  (slp_tree root)
3052 {
3053   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3054          && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3055          && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3056 }
3057
3058
3059 /* Helper function of optimize_load_redistribution that performs the operation
3060    recursively.  */
3061
3062 static slp_tree
3063 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3064                                 vec_info *vinfo, unsigned int group_size,
3065                                 hash_map<slp_tree, slp_tree> *load_map,
3066                                 slp_tree root)
3067 {
3068   if (slp_tree *leader = load_map->get (root))
3069     return *leader;
3070
3071   slp_tree node;
3072   unsigned i;
3073
3074   /* For now, we don't know anything about externals so do not do anything.  */
3075   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3076     return NULL;
3077   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3078     {
3079       /* First convert this node into a load node and add it to the leaves
3080          list and flatten the permute from a lane to a load one.  If it's
3081          unneeded it will be elided later.  */
3082       vec<stmt_vec_info> stmts;
3083       stmts.create (SLP_TREE_LANES (root));
3084       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3085       for (unsigned j = 0; j < lane_perm.length (); j++)
3086         {
3087           std::pair<unsigned, unsigned> perm = lane_perm[j];
3088           node = SLP_TREE_CHILDREN (root)[perm.first];
3089
3090           if (!vect_is_slp_load_node (node)
3091               || SLP_TREE_CHILDREN (node).exists ())
3092             {
3093               stmts.release ();
3094               goto next;
3095             }
3096
3097           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3098         }
3099
3100       if (dump_enabled_p ())
3101         dump_printf_loc (MSG_NOTE, vect_location,
3102                          "converting stmts on permute node %p\n",
3103                          (void *) root);
3104
3105       bool *matches = XALLOCAVEC (bool, group_size);
3106       poly_uint64 max_nunits = 1;
3107       unsigned tree_size = 0, limit = 1;
3108       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3109                                   matches, &limit, &tree_size, bst_map);
3110       if (!node)
3111         stmts.release ();
3112
3113       load_map->put (root, node);
3114       return node;
3115     }
3116
3117 next:
3118   load_map->put (root, NULL);
3119
3120   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3121     {
3122       slp_tree value
3123         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3124                                           node);
3125       if (value)
3126         {
3127           SLP_TREE_REF_COUNT (value)++;
3128           SLP_TREE_CHILDREN (root)[i] = value;
3129           /* ???  We know the original leafs of the replaced nodes will
3130              be referenced by bst_map, only the permutes created by
3131              pattern matching are not.  */
3132           if (SLP_TREE_REF_COUNT (node) == 1)
3133             load_map->remove (node);
3134           vect_free_slp_tree (node);
3135         }
3136     }
3137
3138   return NULL;
3139 }
3140
3141 /* Temporary workaround for loads not being CSEd during SLP build.  This
3142    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3143    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3144    same DR such that the final operation is equal to a permuted load.  Such
3145    NODES are then directly converted into LOADS themselves.  The nodes are
3146    CSEd using BST_MAP.  */
3147
3148 static void
3149 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3150                               vec_info *vinfo, unsigned int group_size,
3151                               hash_map<slp_tree, slp_tree> *load_map,
3152                               slp_tree root)
3153 {
3154   slp_tree node;
3155   unsigned i;
3156
3157   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3158     {
3159       slp_tree value
3160         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3161                                           node);
3162       if (value)
3163         {
3164           SLP_TREE_REF_COUNT (value)++;
3165           SLP_TREE_CHILDREN (root)[i] = value;
3166           /* ???  We know the original leafs of the replaced nodes will
3167              be referenced by bst_map, only the permutes created by
3168              pattern matching are not.  */
3169           if (SLP_TREE_REF_COUNT (node) == 1)
3170             load_map->remove (node);
3171           vect_free_slp_tree (node);
3172         }
3173     }
3174 }
3175
3176 /* Helper function of vect_match_slp_patterns.
3177
3178    Attempts to match patterns against the slp tree rooted in REF_NODE using
3179    VINFO.  Patterns are matched in post-order traversal.
3180
3181    If matching is successful the value in REF_NODE is updated and returned, if
3182    not then it is returned unchanged.  */
3183
3184 static bool
3185 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3186                            slp_tree_to_load_perm_map_t *perm_cache,
3187                            slp_compat_nodes_map_t *compat_cache,
3188                            hash_set<slp_tree> *visited)
3189 {
3190   unsigned i;
3191   slp_tree node = *ref_node;
3192   bool found_p = false;
3193   if (!node || visited->add (node))
3194     return false;
3195
3196   slp_tree child;
3197   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3198     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3199                                           vinfo, perm_cache, compat_cache,
3200                                           visited);
3201
3202   for (unsigned x = 0; x < num__slp_patterns; x++)
3203     {
3204       vect_pattern *pattern
3205         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3206       if (pattern)
3207         {
3208           pattern->build (vinfo);
3209           delete pattern;
3210           found_p = true;
3211         }
3212     }
3213
3214   return found_p;
3215 }
3216
3217 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3218    vec_info VINFO.
3219
3220    The modified tree is returned.  Patterns are tried in order and multiple
3221    patterns may match.  */
3222
3223 static bool
3224 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3225                          hash_set<slp_tree> *visited,
3226                          slp_tree_to_load_perm_map_t *perm_cache,
3227                          slp_compat_nodes_map_t *compat_cache)
3228 {
3229   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3230   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3231
3232   if (dump_enabled_p ())
3233     dump_printf_loc (MSG_NOTE, vect_location,
3234                      "Analyzing SLP tree %p for patterns\n",
3235                      (void *) SLP_INSTANCE_TREE (instance));
3236
3237   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3238                                     visited);
3239 }
3240
3241 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3242    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3243    Return true if we could use IFN_STORE_LANES instead and if that appears
3244    to be the better approach.  */
3245
3246 static bool
3247 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3248                                unsigned int group_size,
3249                                unsigned int new_group_size)
3250 {
3251   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3252   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3253   if (!vectype)
3254     return false;
3255   /* Allow the split if one of the two new groups would operate on full
3256      vectors *within* rather than across one scalar loop iteration.
3257      This is purely a heuristic, but it should work well for group
3258      sizes of 3 and 4, where the possible splits are:
3259
3260        3->2+1:  OK if the vector has exactly two elements
3261        4->2+2:  Likewise
3262        4->3+1:  Less clear-cut.  */
3263   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3264       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3265     return false;
3266   return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3267 }
3268
3269 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3270    vect_build_slp_tree to build a tree of packed stmts if possible.
3271    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3272
3273 static bool
3274 vect_analyze_slp_instance (vec_info *vinfo,
3275                            scalar_stmts_to_slp_tree_map_t *bst_map,
3276                            stmt_vec_info stmt_info, slp_instance_kind kind,
3277                            unsigned max_tree_size, unsigned *limit);
3278
3279 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3280    of KIND.  Return true if successful.  */
3281
3282 static bool
3283 vect_build_slp_instance (vec_info *vinfo,
3284                          slp_instance_kind kind,
3285                          vec<stmt_vec_info> &scalar_stmts,
3286                          vec<stmt_vec_info> &root_stmt_infos,
3287                          vec<tree> &remain,
3288                          unsigned max_tree_size, unsigned *limit,
3289                          scalar_stmts_to_slp_tree_map_t *bst_map,
3290                          /* ???  We need stmt_info for group splitting.  */
3291                          stmt_vec_info stmt_info_)
3292 {
3293   if (kind == slp_inst_kind_ctor)
3294     {
3295       if (dump_enabled_p ())
3296         dump_printf_loc (MSG_NOTE, vect_location,
3297                          "Analyzing vectorizable constructor: %G\n",
3298                          root_stmt_infos[0]->stmt);
3299     }
3300
3301   if (dump_enabled_p ())
3302     {
3303       dump_printf_loc (MSG_NOTE, vect_location,
3304                        "Starting SLP discovery for\n");
3305       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3306         dump_printf_loc (MSG_NOTE, vect_location,
3307                          "  %G", scalar_stmts[i]->stmt);
3308     }
3309
3310   /* Build the tree for the SLP instance.  */
3311   unsigned int group_size = scalar_stmts.length ();
3312   bool *matches = XALLOCAVEC (bool, group_size);
3313   poly_uint64 max_nunits = 1;
3314   unsigned tree_size = 0;
3315   unsigned i;
3316   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3317                                        &max_nunits, matches, limit,
3318                                        &tree_size, bst_map);
3319   if (node != NULL)
3320     {
3321       /* Calculate the unrolling factor based on the smallest type.  */
3322       poly_uint64 unrolling_factor
3323         = calculate_unrolling_factor (max_nunits, group_size);
3324
3325       if (maybe_ne (unrolling_factor, 1U)
3326           && is_a <bb_vec_info> (vinfo))
3327         {
3328           unsigned HOST_WIDE_INT const_max_nunits;
3329           if (!max_nunits.is_constant (&const_max_nunits)
3330               || const_max_nunits > group_size)
3331             {
3332               if (dump_enabled_p ())
3333                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3334                                  "Build SLP failed: store group "
3335                                  "size not a multiple of the vector size "
3336                                  "in basic block SLP\n");
3337               vect_free_slp_tree (node);
3338               return false;
3339             }
3340           /* Fatal mismatch.  */
3341           if (dump_enabled_p ())
3342             dump_printf_loc (MSG_NOTE, vect_location,
3343                              "SLP discovery succeeded but node needs "
3344                              "splitting\n");
3345           memset (matches, true, group_size);
3346           matches[group_size / const_max_nunits * const_max_nunits] = false;
3347           vect_free_slp_tree (node);
3348         }
3349       else
3350         {
3351           /* Create a new SLP instance.  */
3352           slp_instance new_instance = XNEW (class _slp_instance);
3353           SLP_INSTANCE_TREE (new_instance) = node;
3354           SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3355           SLP_INSTANCE_LOADS (new_instance) = vNULL;
3356           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3357           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3358           SLP_INSTANCE_KIND (new_instance) = kind;
3359           new_instance->reduc_phis = NULL;
3360           new_instance->cost_vec = vNULL;
3361           new_instance->subgraph_entries = vNULL;
3362
3363           if (dump_enabled_p ())
3364             dump_printf_loc (MSG_NOTE, vect_location,
3365                              "SLP size %u vs. limit %u.\n",
3366                              tree_size, max_tree_size);
3367
3368           /* Fixup SLP reduction chains.  */
3369           if (kind == slp_inst_kind_reduc_chain)
3370             {
3371               /* If this is a reduction chain with a conversion in front
3372                  amend the SLP tree with a node for that.  */
3373               gimple *scalar_def
3374                 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3375               if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3376                 {
3377                   /* Get at the conversion stmt - we know it's the single use
3378                      of the last stmt of the reduction chain.  */
3379                   use_operand_p use_p;
3380                   bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3381                                            &use_p, &scalar_def);
3382                   gcc_assert (r);
3383                   stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3384                   next_info = vect_stmt_to_vectorize (next_info);
3385                   scalar_stmts = vNULL;
3386                   scalar_stmts.create (group_size);
3387                   for (unsigned i = 0; i < group_size; ++i)
3388                     scalar_stmts.quick_push (next_info);
3389                   slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3390                   SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3391                   SLP_TREE_CHILDREN (conv).quick_push (node);
3392                   SLP_INSTANCE_TREE (new_instance) = conv;
3393                   /* We also have to fake this conversion stmt as SLP reduction
3394                      group so we don't have to mess with too much code
3395                      elsewhere.  */
3396                   REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3397                   REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3398                 }
3399               /* Fill the backedge child of the PHI SLP node.  The
3400                  general matching code cannot find it because the
3401                  scalar code does not reflect how we vectorize the
3402                  reduction.  */
3403               use_operand_p use_p;
3404               imm_use_iterator imm_iter;
3405               class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3406               FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3407                                      gimple_get_lhs (scalar_def))
3408                 /* There are exactly two non-debug uses, the reduction
3409                    PHI and the loop-closed PHI node.  */
3410                 if (!is_gimple_debug (USE_STMT (use_p))
3411                     && gimple_bb (USE_STMT (use_p)) == loop->header)
3412                   {
3413                     auto_vec<stmt_vec_info, 64> phis (group_size);
3414                     stmt_vec_info phi_info
3415                       = vinfo->lookup_stmt (USE_STMT (use_p));
3416                     for (unsigned i = 0; i < group_size; ++i)
3417                       phis.quick_push (phi_info);
3418                     slp_tree *phi_node = bst_map->get (phis);
3419                     unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3420                     SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3421                       = SLP_INSTANCE_TREE (new_instance);
3422                     SLP_INSTANCE_TREE (new_instance)->refcnt++;
3423                   }
3424             }
3425
3426           vinfo->slp_instances.safe_push (new_instance);
3427
3428           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3429              the number of scalar stmts in the root in a few places.
3430              Verify that assumption holds.  */
3431           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3432                         .length () == group_size);
3433
3434           if (dump_enabled_p ())
3435             {
3436               dump_printf_loc (MSG_NOTE, vect_location,
3437                                "Final SLP tree for instance %p:\n",
3438                                (void *) new_instance);
3439               vect_print_slp_graph (MSG_NOTE, vect_location,
3440                                     SLP_INSTANCE_TREE (new_instance));
3441             }
3442
3443           return true;
3444         }
3445     }
3446   else
3447     {
3448       /* Failed to SLP.  */
3449       /* Free the allocated memory.  */
3450       scalar_stmts.release ();
3451     }
3452
3453   stmt_vec_info stmt_info = stmt_info_;
3454   /* Try to break the group up into pieces.  */
3455   if (kind == slp_inst_kind_store)
3456     {
3457       /* ???  We could delay all the actual splitting of store-groups
3458          until after SLP discovery of the original group completed.
3459          Then we can recurse to vect_build_slp_instance directly.  */
3460       for (i = 0; i < group_size; i++)
3461         if (!matches[i])
3462           break;
3463
3464       /* For basic block SLP, try to break the group up into multiples of
3465          a vector size.  */
3466       if (is_a <bb_vec_info> (vinfo)
3467           && (i > 1 && i < group_size))
3468         {
3469           tree scalar_type
3470             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3471           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3472                                                       1 << floor_log2 (i));
3473           unsigned HOST_WIDE_INT const_nunits;
3474           if (vectype
3475               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3476             {
3477               /* Split into two groups at the first vector boundary.  */
3478               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3479               unsigned group1_size = i & ~(const_nunits - 1);
3480
3481               if (dump_enabled_p ())
3482                 dump_printf_loc (MSG_NOTE, vect_location,
3483                                  "Splitting SLP group at stmt %u\n", i);
3484               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3485                                                                group1_size);
3486               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3487                                                     kind, max_tree_size,
3488                                                     limit);
3489               /* Split the rest at the failure point and possibly
3490                  re-analyze the remaining matching part if it has
3491                  at least two lanes.  */
3492               if (group1_size < i
3493                   && (i + 1 < group_size
3494                       || i - group1_size > 1))
3495                 {
3496                   stmt_vec_info rest2 = rest;
3497                   rest = vect_split_slp_store_group (rest, i - group1_size);
3498                   if (i - group1_size > 1)
3499                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3500                                                       kind, max_tree_size,
3501                                                       limit);
3502                 }
3503               /* Re-analyze the non-matching tail if it has at least
3504                  two lanes.  */
3505               if (i + 1 < group_size)
3506                 res |= vect_analyze_slp_instance (vinfo, bst_map,
3507                                                   rest, kind, max_tree_size,
3508                                                   limit);
3509               return res;
3510             }
3511         }
3512
3513       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3514       if (is_a <loop_vec_info> (vinfo)
3515           && (i > 1 && i < group_size)
3516           && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3517         {
3518           unsigned group1_size = i;
3519
3520           if (dump_enabled_p ())
3521             dump_printf_loc (MSG_NOTE, vect_location,
3522                              "Splitting SLP group at stmt %u\n", i);
3523
3524           stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3525                                                            group1_size);
3526           /* Loop vectorization cannot handle gaps in stores, make sure
3527              the split group appears as strided.  */
3528           STMT_VINFO_STRIDED_P (rest) = 1;
3529           DR_GROUP_GAP (rest) = 0;
3530           STMT_VINFO_STRIDED_P (stmt_info) = 1;
3531           DR_GROUP_GAP (stmt_info) = 0;
3532
3533           bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3534                                                 kind, max_tree_size, limit);
3535           if (i + 1 < group_size)
3536             res |= vect_analyze_slp_instance (vinfo, bst_map,
3537                                               rest, kind, max_tree_size, limit);
3538
3539           return res;
3540         }
3541
3542       /* Even though the first vector did not all match, we might be able to SLP
3543          (some) of the remainder.  FORNOW ignore this possibility.  */
3544     }
3545
3546   /* Failed to SLP.  */
3547   if (dump_enabled_p ())
3548     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3549   return false;
3550 }
3551
3552
3553 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3554    vect_build_slp_tree to build a tree of packed stmts if possible.
3555    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3556
3557 static bool
3558 vect_analyze_slp_instance (vec_info *vinfo,
3559                            scalar_stmts_to_slp_tree_map_t *bst_map,
3560                            stmt_vec_info stmt_info,
3561                            slp_instance_kind kind,
3562                            unsigned max_tree_size, unsigned *limit)
3563 {
3564   unsigned int i;
3565   vec<stmt_vec_info> scalar_stmts;
3566
3567   if (is_a <bb_vec_info> (vinfo))
3568     vect_location = stmt_info->stmt;
3569
3570   stmt_vec_info next_info = stmt_info;
3571   if (kind == slp_inst_kind_store)
3572     {
3573       /* Collect the stores and store them in scalar_stmts.  */
3574       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3575       while (next_info)
3576         {
3577           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578           next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3579         }
3580     }
3581   else if (kind == slp_inst_kind_reduc_chain)
3582     {
3583       /* Collect the reduction stmts and store them in scalar_stmts.  */
3584       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3585       while (next_info)
3586         {
3587           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3588           next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3589         }
3590       /* Mark the first element of the reduction chain as reduction to properly
3591          transform the node.  In the reduction analysis phase only the last
3592          element of the chain is marked as reduction.  */
3593       STMT_VINFO_DEF_TYPE (stmt_info)
3594         = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3595       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3596         = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3597     }
3598   else if (kind == slp_inst_kind_reduc_group)
3599     {
3600       /* Collect reduction statements.  */
3601       const vec<stmt_vec_info> &reductions
3602         = as_a <loop_vec_info> (vinfo)->reductions;
3603       scalar_stmts.create (reductions.length ());
3604       for (i = 0; reductions.iterate (i, &next_info); i++)
3605         if ((STMT_VINFO_RELEVANT_P (next_info)
3606              || STMT_VINFO_LIVE_P (next_info))
3607             /* ???  Make sure we didn't skip a conversion around a reduction
3608                path.  In that case we'd have to reverse engineer that conversion
3609                stmt following the chain using reduc_idx and from the PHI
3610                using reduc_def.  */
3611             && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3612           scalar_stmts.quick_push (next_info);
3613       /* If less than two were relevant/live there's nothing to SLP.  */
3614       if (scalar_stmts.length () < 2)
3615         return false;
3616     }
3617   else
3618     gcc_unreachable ();
3619
3620   vec<stmt_vec_info> roots = vNULL;
3621   vec<tree> remain = vNULL;
3622   /* Build the tree for the SLP instance.  */
3623   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3624                                       roots, remain,
3625                                       max_tree_size, limit, bst_map,
3626                                       kind == slp_inst_kind_store
3627                                       ? stmt_info : NULL);
3628
3629   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3630      where we should do store group splitting.  */
3631
3632   return res;
3633 }
3634
3635 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
3636    trees of packed scalar stmts if SLP is possible.  */
3637
3638 opt_result
3639 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3640 {
3641   unsigned int i;
3642   stmt_vec_info first_element;
3643   slp_instance instance;
3644
3645   DUMP_VECT_SCOPE ("vect_analyze_slp");
3646
3647   unsigned limit = max_tree_size;
3648
3649   scalar_stmts_to_slp_tree_map_t *bst_map
3650     = new scalar_stmts_to_slp_tree_map_t ();
3651
3652   /* Find SLP sequences starting from groups of grouped stores.  */
3653   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3654     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3655                                slp_inst_kind_store, max_tree_size, &limit);
3656
3657   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3658     {
3659       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3660         {
3661           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3662           /* Apply patterns.  */
3663           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3664             bb_vinfo->roots[i].stmts[j]
3665               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3666           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3667                                        bb_vinfo->roots[i].stmts,
3668                                        bb_vinfo->roots[i].roots,
3669                                        bb_vinfo->roots[i].remain,
3670                                        max_tree_size, &limit, bst_map, NULL))
3671             {
3672               bb_vinfo->roots[i].stmts = vNULL;
3673               bb_vinfo->roots[i].roots = vNULL;
3674               bb_vinfo->roots[i].remain = vNULL;
3675             }
3676         }
3677     }
3678
3679   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3680     {
3681       /* Find SLP sequences starting from reduction chains.  */
3682       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3683         if (! STMT_VINFO_RELEVANT_P (first_element)
3684             && ! STMT_VINFO_LIVE_P (first_element))
3685           ;
3686         else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3687                                               slp_inst_kind_reduc_chain,
3688                                               max_tree_size, &limit))
3689           {
3690             /* Dissolve reduction chain group.  */
3691             stmt_vec_info vinfo = first_element;
3692             stmt_vec_info last = NULL;
3693             while (vinfo)
3694               {
3695                 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3696                 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3697                 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3698                 last = vinfo;
3699                 vinfo = next;
3700               }
3701             STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3702             /* It can be still vectorized as part of an SLP reduction.  */
3703             loop_vinfo->reductions.safe_push (last);
3704           }
3705
3706       /* Find SLP sequences starting from groups of reductions.  */
3707       if (loop_vinfo->reductions.length () > 1)
3708         vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3709                                    slp_inst_kind_reduc_group, max_tree_size,
3710                                    &limit);
3711     }
3712
3713   hash_set<slp_tree> visited_patterns;
3714   slp_tree_to_load_perm_map_t perm_cache;
3715   slp_compat_nodes_map_t compat_cache;
3716
3717   /* See if any patterns can be found in the SLP tree.  */
3718   bool pattern_found = false;
3719   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3720     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3721                                               &visited_patterns, &perm_cache,
3722                                               &compat_cache);
3723
3724   /* If any were found optimize permutations of loads.  */
3725   if (pattern_found)
3726     {
3727       hash_map<slp_tree, slp_tree> load_map;
3728       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3729         {
3730           slp_tree root = SLP_INSTANCE_TREE (instance);
3731           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3732                                         &load_map, root);
3733         }
3734     }
3735
3736
3737
3738   /* The map keeps a reference on SLP nodes built, release that.  */
3739   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3740        it != bst_map->end (); ++it)
3741     if ((*it).second)
3742       vect_free_slp_tree ((*it).second);
3743   delete bst_map;
3744
3745   if (pattern_found && dump_enabled_p ())
3746     {
3747       dump_printf_loc (MSG_NOTE, vect_location,
3748                        "Pattern matched SLP tree\n");
3749       hash_set<slp_tree> visited;
3750       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3751         vect_print_slp_graph (MSG_NOTE, vect_location,
3752                               SLP_INSTANCE_TREE (instance), visited);
3753     }
3754
3755   return opt_result::success ();
3756 }
3757
3758 /* Estimates the cost of inserting layout changes into the SLP graph.
3759    It can also say that the insertion is impossible.  */
3760
3761 struct slpg_layout_cost
3762 {
3763   slpg_layout_cost () = default;
3764   slpg_layout_cost (sreal, bool);
3765
3766   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3767   bool is_possible () const { return depth != sreal::max (); }
3768
3769   bool operator== (const slpg_layout_cost &) const;
3770   bool operator!= (const slpg_layout_cost &) const;
3771
3772   bool is_better_than (const slpg_layout_cost &, bool) const;
3773
3774   void add_parallel_cost (const slpg_layout_cost &);
3775   void add_serial_cost (const slpg_layout_cost &);
3776   void split (unsigned int);
3777
3778   /* The longest sequence of layout changes needed during any traversal
3779      of the partition dag, weighted by execution frequency.
3780
3781      This is the most important metric when optimizing for speed, since
3782      it helps to ensure that we keep the number of operations on
3783      critical paths to a minimum.  */
3784   sreal depth = 0;
3785
3786   /* An estimate of the total number of operations needed.  It is weighted by
3787      execution frequency when optimizing for speed but not when optimizing for
3788      size.  In order to avoid double-counting, a node with a fanout of N will
3789      distribute 1/N of its total cost to each successor.
3790
3791      This is the most important metric when optimizing for size, since
3792      it helps to keep the total number of operations to a minimum,  */
3793   sreal total = 0;
3794 };
3795
3796 /* Construct costs for a node with weight WEIGHT.  A higher weight
3797    indicates more frequent execution.  IS_FOR_SIZE is true if we are
3798    optimizing for size rather than speed.  */
3799
3800 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3801   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3802 {
3803 }
3804
3805 bool
3806 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3807 {
3808   return depth == other.depth && total == other.total;
3809 }
3810
3811 bool
3812 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3813 {
3814   return !operator== (other);
3815 }
3816
3817 /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
3818    true if we are optimizing for size rather than speed.  */
3819
3820 bool
3821 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3822                                   bool is_for_size) const
3823 {
3824   if (is_for_size)
3825     {
3826       if (total != other.total)
3827         return total < other.total;
3828       return depth < other.depth;
3829     }
3830   else
3831     {
3832       if (depth != other.depth)
3833         return depth < other.depth;
3834       return total < other.total;
3835     }
3836 }
3837
3838 /* Increase the costs to account for something with cost INPUT_COST
3839    happening in parallel with the current costs.  */
3840
3841 void
3842 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3843 {
3844   depth = std::max (depth, input_cost.depth);
3845   total += input_cost.total;
3846 }
3847
3848 /* Increase the costs to account for something with cost INPUT_COST
3849    happening in series with the current costs.  */
3850
3851 void
3852 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3853 {
3854   depth += other.depth;
3855   total += other.total;
3856 }
3857
3858 /* Split the total cost among TIMES successors or predecessors.  */
3859
3860 void
3861 slpg_layout_cost::split (unsigned int times)
3862 {
3863   if (times > 1)
3864     total /= times;
3865 }
3866
3867 /* Information about one node in the SLP graph, for use during
3868    vect_optimize_slp_pass.  */
3869
3870 struct slpg_vertex
3871 {
3872   slpg_vertex (slp_tree node_) : node (node_) {}
3873
3874   /* The node itself.  */
3875   slp_tree node;
3876
3877   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
3878      partitions are flexible; they can have whichever layout consumers
3879      want them to have.  */
3880   int partition = -1;
3881
3882   /* The number of nodes that directly use the result of this one
3883      (i.e. the number of nodes that count this one as a child).  */
3884   unsigned int out_degree = 0;
3885
3886   /* The execution frequency of the node.  */
3887   sreal weight = 0;
3888
3889   /* The total execution frequency of all nodes that directly use the
3890      result of this one.  */
3891   sreal out_weight = 0;
3892 };
3893
3894 /* Information about one partition of the SLP graph, for use during
3895    vect_optimize_slp_pass.  */
3896
3897 struct slpg_partition_info
3898 {
3899   /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3900      of m_partitioned_nodes.  */
3901   unsigned int node_begin = 0;
3902   unsigned int node_end = 0;
3903
3904   /* Which layout we've chosen to use for this partition, or -1 if
3905      we haven't picked one yet.  */
3906   int layout = -1;
3907
3908   /* The number of predecessors and successors in the partition dag.
3909      The predecessors always have lower partition numbers and the
3910      successors always have higher partition numbers.
3911
3912      Note that the directions of these edges are not necessarily the
3913      same as in the data flow graph.  For example, if an SCC has separate
3914      partitions for an inner loop and an outer loop, the inner loop's
3915      partition will have at least two incoming edges from the outer loop's
3916      partition: one for a live-in value and one for a live-out value.
3917      In data flow terms, one of these edges would also be from the outer loop
3918      to the inner loop, but the other would be in the opposite direction.  */
3919   unsigned int in_degree = 0;
3920   unsigned int out_degree = 0;
3921 };
3922
3923 /* Information about the costs of using a particular layout for a
3924    particular partition.  It can also say that the combination is
3925    impossible.  */
3926
3927 struct slpg_partition_layout_costs
3928 {
3929   bool is_possible () const { return internal_cost.is_possible (); }
3930   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3931
3932   /* The costs inherited from predecessor partitions.  */
3933   slpg_layout_cost in_cost;
3934
3935   /* The inherent cost of the layout within the node itself.  For example,
3936      this is nonzero for a load if choosing a particular layout would require
3937      the load to permute the loaded elements.  It is nonzero for a
3938      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3939      to full-vector moves.  */
3940   slpg_layout_cost internal_cost;
3941
3942   /* The costs inherited from successor partitions.  */
3943   slpg_layout_cost out_cost;
3944 };
3945
3946 /* This class tries to optimize the layout of vectors in order to avoid
3947    unnecessary shuffling.  At the moment, the set of possible layouts are
3948    restricted to bijective permutations.
3949
3950    The goal of the pass depends on whether we're optimizing for size or
3951    for speed.  When optimizing for size, the goal is to reduce the overall
3952    number of layout changes (including layout changes implied by things
3953    like load permutations).  When optimizing for speed, the goal is to
3954    reduce the maximum latency attributable to layout changes on any
3955    non-cyclical path through the data flow graph.
3956
3957    For example, when optimizing a loop nest for speed, we will prefer
3958    to make layout changes outside of a loop rather than inside of a loop,
3959    and will prefer to make layout changes in parallel rather than serially,
3960    even if that increases the overall number of layout changes.
3961
3962    The high-level procedure is:
3963
3964    (1) Build a graph in which edges go from uses (parents) to definitions
3965        (children).
3966
3967    (2) Divide the graph into a dag of strongly-connected components (SCCs).
3968
3969    (3) When optimizing for speed, partition the nodes in each SCC based
3970        on their containing cfg loop.  When optimizing for size, treat
3971        each SCC as a single partition.
3972
3973        This gives us a dag of partitions.  The goal is now to assign a
3974        layout to each partition.
3975
3976    (4) Construct a set of vector layouts that are worth considering.
3977        Record which nodes must keep their current layout.
3978
3979    (5) Perform a forward walk over the partition dag (from loads to stores)
3980        accumulating the "forward" cost of using each layout.  When visiting
3981        each partition, assign a tentative choice of layout to the partition
3982        and use that choice when calculating the cost of using a different
3983        layout in successor partitions.
3984
3985    (6) Perform a backward walk over the partition dag (from stores to loads),
3986        accumulating the "backward" cost of using each layout.  When visiting
3987        each partition, make a final choice of layout for that partition based
3988        on the accumulated forward costs (from (5)) and backward costs
3989        (from (6)).
3990
3991    (7) Apply the chosen layouts to the SLP graph.
3992
3993    For example, consider the SLP statements:
3994
3995    S1:      a_1 = load
3996        loop:
3997    S2:      a_2 = PHI<a_1, a_3>
3998    S3:      b_1 = load
3999    S4:      a_3 = a_2 + b_1
4000        exit:
4001    S5:      a_4 = PHI<a_3>
4002    S6:      store a_4
4003
4004    S2 and S4 form an SCC and are part of the same loop.  Every other
4005    statement is in a singleton SCC.  In this example there is a one-to-one
4006    mapping between SCCs and partitions and the partition dag looks like this;
4007
4008         S1     S3
4009          \     /
4010           S2+S4
4011             |
4012            S5
4013             |
4014            S6
4015
4016    S2, S3 and S4 will have a higher execution frequency than the other
4017    statements, so when optimizing for speed, the goal is to avoid any
4018    layout changes:
4019
4020    - within S3
4021    - within S2+S4
4022    - on the S3->S2+S4 edge
4023
4024    For example, if S3 was originally a reversing load, the goal of the
4025    pass is to make it an unreversed load and change the layout on the
4026    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
4027    on S1->S2+S4 and S5->S6 would also be acceptable.)
4028
4029    The difference between SCCs and partitions becomes important if we
4030    add an outer loop:
4031
4032    S1:      a_1 = ...
4033        loop1:
4034    S2:      a_2 = PHI<a_1, a_6>
4035    S3:      b_1 = load
4036    S4:      a_3 = a_2 + b_1
4037        loop2:
4038    S5:      a_4 = PHI<a_3, a_5>
4039    S6:      c_1 = load
4040    S7:      a_5 = a_4 + c_1
4041        exit2:
4042    S8:      a_6 = PHI<a_5>
4043    S9:      store a_6
4044        exit1:
4045
4046    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
4047    for speed, we usually do not want restrictions in the outer loop to "infect"
4048    the decision for the inner loop.  For example, if an outer-loop node
4049    in the SCC contains a statement with a fixed layout, that should not
4050    prevent the inner loop from using a different layout.  Conversely,
4051    the inner loop should not dictate a layout to the outer loop: if the
4052    outer loop does a lot of computation, then it may not be efficient to
4053    do all of that computation in the inner loop's preferred layout.
4054
4055    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4056    and S5+S7 (inner).  We also try to arrange partitions so that:
4057
4058    - the partition for an outer loop comes before the partition for
4059      an inner loop
4060
4061    - if a sibling loop A dominates a sibling loop B, A's partition
4062      comes before B's
4063
4064    This gives the following partition dag for the example above:
4065
4066         S1        S3
4067          \        /
4068           S2+S4+S8   S6
4069            |   \\    /
4070            |    S5+S7
4071            |
4072           S9
4073
4074    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4075    one for a reversal of the edge S7->S8.
4076
4077    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
4078    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4079    preferred layout against the cost of changing the layout on entry to the
4080    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4081
4082    Although this works well when optimizing for speed, it has the downside
4083    when optimizing for size that the choice of layout for S5+S7 is completely
4084    independent of S9, which lessens the chance of reducing the overall number
4085    of permutations.  We therefore do not partition SCCs when optimizing
4086    for size.
4087
4088    To give a concrete example of the difference between optimizing
4089    for size and speed, consider:
4090
4091    a[0] = (b[1] << c[3]) - d[1];
4092    a[1] = (b[0] << c[2]) - d[0];
4093    a[2] = (b[3] << c[1]) - d[3];
4094    a[3] = (b[2] << c[0]) - d[2];
4095
4096    There are three different layouts here: one for a, one for b and d,
4097    and one for c.  When optimizing for speed it is better to permute each
4098    of b, c and d into the order required by a, since those permutations
4099    happen in parallel.  But when optimizing for size, it is better to:
4100
4101    - permute c into the same order as b
4102    - do the arithmetic
4103    - permute the result into the order required by a
4104
4105    This gives 2 permutations rather than 3.  */
4106
4107 class vect_optimize_slp_pass
4108 {
4109 public:
4110   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4111   void run ();
4112
4113 private:
4114   /* Graph building.  */
4115   struct loop *containing_loop (slp_tree);
4116   bool is_cfg_latch_edge (graph_edge *);
4117   void build_vertices (hash_set<slp_tree> &, slp_tree);
4118   void build_vertices ();
4119   void build_graph ();
4120
4121   /* Partitioning.  */
4122   void create_partitions ();
4123   template<typename T> void for_each_partition_edge (unsigned int, T);
4124
4125   /* Layout selection.  */
4126   bool is_compatible_layout (slp_tree, unsigned int);
4127   int change_layout_cost (slp_tree, unsigned int, unsigned int);
4128   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4129                                                        unsigned int);
4130   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4131                                int, unsigned int);
4132   int internal_node_cost (slp_tree, int, unsigned int);
4133   void start_choosing_layouts ();
4134
4135   /* Cost propagation.  */
4136   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4137                                      unsigned int, unsigned int);
4138   slpg_layout_cost total_in_cost (unsigned int);
4139   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4140   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4141   void forward_pass ();
4142   void backward_pass ();
4143
4144   /* Rematerialization.  */
4145   slp_tree get_result_with_layout (slp_tree, unsigned int);
4146   void materialize ();
4147
4148   /* Clean-up.  */
4149   void remove_redundant_permutations ();
4150
4151   void dump ();
4152
4153   vec_info *m_vinfo;
4154
4155   /* True if we should optimize the graph for size, false if we should
4156      optimize it for speed.  (It wouldn't be easy to make this decision
4157      more locally.)  */
4158   bool m_optimize_size;
4159
4160   /* A graph of all SLP nodes, with edges leading from uses to definitions.
4161      In other words, a node's predecessors are its slp_tree parents and
4162      a node's successors are its slp_tree children.  */
4163   graph *m_slpg = nullptr;
4164
4165   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
4166   auto_vec<slpg_vertex> m_vertices;
4167
4168   /* The list of all leaves of M_SLPG. such as external definitions, constants,
4169      and loads.  */
4170   auto_vec<int> m_leafs;
4171
4172   /* This array has one entry for every vector layout that we're considering.
4173      Element 0 is null and indicates "no change".  Other entries describe
4174      permutations that are inherent in the current graph and that we would
4175      like to reverse if possible.
4176
4177      For example, a permutation { 1, 2, 3, 0 } means that something has
4178      effectively been permuted in that way, such as a load group
4179      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4180      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4181      in order to put things "back" in order.  */
4182   auto_vec<vec<unsigned> > m_perms;
4183
4184   /* A partitioning of the nodes for which a layout must be chosen.
4185      Each partition represents an <SCC, cfg loop> pair; that is,
4186      nodes in different SCCs belong to different partitions, and nodes
4187      within an SCC can be further partitioned according to a containing
4188      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
4189
4190      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4191        from leaves (such as loads) to roots (such as stores).
4192
4193      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
4194   auto_vec<slpg_partition_info> m_partitions;
4195
4196   /* The list of all nodes for which a layout must be chosen.  Nodes for
4197      partition P come before the nodes for partition P+1.  Nodes within a
4198      partition are in reverse postorder.  */
4199   auto_vec<unsigned int> m_partitioned_nodes;
4200
4201   /* Index P * num-layouts + L contains the cost of using layout L
4202      for partition P.  */
4203   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4204
4205   /* Index N * num-layouts + L, if nonnull, is a node that provides the
4206      original output of node N adjusted to have layout L.  */
4207   auto_vec<slp_tree> m_node_layouts;
4208 };
4209
4210 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4211    Also record whether we should optimize anything for speed rather
4212    than size.  */
4213
4214 void
4215 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4216                                         slp_tree node)
4217 {
4218   unsigned i;
4219   slp_tree child;
4220
4221   if (visited.add (node))
4222     return;
4223
4224   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4225     {
4226       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4227       if (optimize_bb_for_speed_p (bb))
4228         m_optimize_size = false;
4229     }
4230
4231   node->vertex = m_vertices.length ();
4232   m_vertices.safe_push (slpg_vertex (node));
4233
4234   bool leaf = true;
4235   bool force_leaf = false;
4236   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4237     if (child)
4238       {
4239         leaf = false;
4240         build_vertices (visited, child);
4241       }
4242     else
4243       force_leaf = true;
4244   /* Since SLP discovery works along use-def edges all cycles have an
4245      entry - but there's the exception of cycles where we do not handle
4246      the entry explicitely (but with a NULL SLP node), like some reductions
4247      and inductions.  Force those SLP PHIs to act as leafs to make them
4248      backwards reachable.  */
4249   if (leaf || force_leaf)
4250     m_leafs.safe_push (node->vertex);
4251 }
4252
4253 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
4254
4255 void
4256 vect_optimize_slp_pass::build_vertices ()
4257 {
4258   hash_set<slp_tree> visited;
4259   unsigned i;
4260   slp_instance instance;
4261   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4262     build_vertices (visited, SLP_INSTANCE_TREE (instance));
4263 }
4264
4265 /* Apply (reverse) bijectite PERM to VEC.  */
4266
4267 template <class T>
4268 static void
4269 vect_slp_permute (vec<unsigned> perm,
4270                   vec<T> &vec, bool reverse)
4271 {
4272   auto_vec<T, 64> saved;
4273   saved.create (vec.length ());
4274   for (unsigned i = 0; i < vec.length (); ++i)
4275     saved.quick_push (vec[i]);
4276
4277   if (reverse)
4278     {
4279       for (unsigned i = 0; i < vec.length (); ++i)
4280         vec[perm[i]] = saved[i];
4281       for (unsigned i = 0; i < vec.length (); ++i)
4282         gcc_assert (vec[perm[i]] == saved[i]);
4283     }
4284   else
4285     {
4286       for (unsigned i = 0; i < vec.length (); ++i)
4287         vec[i] = saved[perm[i]];
4288       for (unsigned i = 0; i < vec.length (); ++i)
4289         gcc_assert (vec[i] == saved[perm[i]]);
4290     }
4291 }
4292
4293 /* Return the cfg loop that contains NODE.  */
4294
4295 struct loop *
4296 vect_optimize_slp_pass::containing_loop (slp_tree node)
4297 {
4298   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4299   if (!rep)
4300     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4301   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4302 }
4303
4304 /* Return true if UD (an edge from a use to a definition) is associated
4305    with a loop latch edge in the cfg.  */
4306
4307 bool
4308 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4309 {
4310   slp_tree use = m_vertices[ud->src].node;
4311   slp_tree def = m_vertices[ud->dest].node;
4312   if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4313       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4314     return false;
4315
4316   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4317   return (is_a<gphi *> (use_rep->stmt)
4318           && bb_loop_header_p (gimple_bb (use_rep->stmt))
4319           && containing_loop (def) == containing_loop (use));
4320 }
4321
4322 /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
4323    a nonnull data field.  */
4324
4325 void
4326 vect_optimize_slp_pass::build_graph ()
4327 {
4328   m_optimize_size = true;
4329   build_vertices ();
4330
4331   m_slpg = new_graph (m_vertices.length ());
4332   for (slpg_vertex &v : m_vertices)
4333     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4334       if (child)
4335         {
4336           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4337           if (is_cfg_latch_edge (ud))
4338             ud->data = this;
4339         }
4340 }
4341
4342 /* Return true if E corresponds to a loop latch edge in the cfg.  */
4343
4344 static bool
4345 skip_cfg_latch_edges (graph_edge *e)
4346 {
4347   return e->data;
4348 }
4349
4350 /* Create the node partitions.  */
4351
4352 void
4353 vect_optimize_slp_pass::create_partitions ()
4354 {
4355   /* Calculate a postorder of the graph, ignoring edges that correspond
4356      to natural latch edges in the cfg.  Reading the vector from the end
4357      to the beginning gives the reverse postorder.  */
4358   auto_vec<int> initial_rpo;
4359   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4360                false, NULL, skip_cfg_latch_edges);
4361   gcc_assert (initial_rpo.length () == m_vertices.length ());
4362
4363   /* Calculate the strongly connected components of the graph.  */
4364   auto_vec<int> scc_grouping;
4365   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4366
4367   /* Create a new index order in which all nodes from the same SCC are
4368      consecutive.  Use scc_pos to record the index of the first node in
4369      each SCC.  */
4370   auto_vec<unsigned int> scc_pos (num_sccs);
4371   int last_component = -1;
4372   unsigned int node_count = 0;
4373   for (unsigned int node_i : scc_grouping)
4374     {
4375       if (last_component != m_slpg->vertices[node_i].component)
4376         {
4377           last_component = m_slpg->vertices[node_i].component;
4378           gcc_assert (last_component == int (scc_pos.length ()));
4379           scc_pos.quick_push (node_count);
4380         }
4381       node_count += 1;
4382     }
4383   gcc_assert (node_count == initial_rpo.length ()
4384               && last_component + 1 == int (num_sccs));
4385
4386   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4387      inside each SCC following the RPO we calculated above.  The fact that
4388      we ignored natural latch edges when calculating the RPO should ensure
4389      that, for natural loop nests:
4390
4391      - the first node that we encounter in a cfg loop is the loop header phi
4392      - the loop header phis are in dominance order
4393
4394      Arranging for this is an optimization (see below) rather than a
4395      correctness issue.  Unnatural loops with a tangled mess of backedges
4396      will still work correctly, but might give poorer results.
4397
4398      Also update scc_pos so that it gives 1 + the index of the last node
4399      in the SCC.  */
4400   m_partitioned_nodes.safe_grow (node_count);
4401   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4402     {
4403       unsigned int node_i = initial_rpo[old_i];
4404       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4405       m_partitioned_nodes[new_i] = node_i;
4406     }
4407
4408   /* When optimizing for speed, partition each SCC based on the containing
4409      cfg loop. The order we constructed above should ensure that, for natural
4410      cfg loops, we'll create sub-SCC partitions for outer loops before
4411      the corresponding sub-SCC partitions for inner loops.  Similarly,
4412      when one sibling loop A dominates another sibling loop B, we should
4413      create a sub-SCC partition for A before a sub-SCC partition for B.
4414
4415      As above, nothing depends for correctness on whether this achieves
4416      a natural nesting, but we should get better results when it does.  */
4417   m_partitions.reserve (m_vertices.length ());
4418   unsigned int next_partition_i = 0;
4419   hash_map<struct loop *, int> loop_partitions;
4420   unsigned int rpo_begin = 0;
4421   unsigned int num_partitioned_nodes = 0;
4422   for (unsigned int rpo_end : scc_pos)
4423     {
4424       loop_partitions.empty ();
4425       unsigned int partition_i = next_partition_i;
4426       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4427         {
4428           /* Handle externals and constants optimistically throughout.
4429              But treat existing vectors as fixed since we do not handle
4430              permuting them.  */
4431           unsigned int node_i = m_partitioned_nodes[rpo_i];
4432           auto &vertex = m_vertices[node_i];
4433           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4434                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4435               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4436             vertex.partition = -1;
4437           else
4438             {
4439               bool existed;
4440               if (m_optimize_size)
4441                 existed = next_partition_i > partition_i;
4442               else
4443                 {
4444                   struct loop *loop = containing_loop (vertex.node);
4445                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
4446                   if (!existed)
4447                     entry = next_partition_i;
4448                   partition_i = entry;
4449                 }
4450               if (!existed)
4451                 {
4452                   m_partitions.quick_push (slpg_partition_info ());
4453                   next_partition_i += 1;
4454                 }
4455               vertex.partition = partition_i;
4456               num_partitioned_nodes += 1;
4457               m_partitions[partition_i].node_end += 1;
4458             }
4459         }
4460       rpo_begin = rpo_end;
4461     }
4462
4463   /* Assign ranges of consecutive node indices to each partition,
4464      in partition order.  Start with node_end being the same as
4465      node_begin so that the next loop can use it as a counter.  */
4466   unsigned int node_begin = 0;
4467   for (auto &partition : m_partitions)
4468     {
4469       partition.node_begin = node_begin;
4470       node_begin += partition.node_end;
4471       partition.node_end = partition.node_begin;
4472     }
4473   gcc_assert (node_begin == num_partitioned_nodes);
4474
4475   /* Finally build the list of nodes in partition order.  */
4476   m_partitioned_nodes.truncate (num_partitioned_nodes);
4477   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4478     {
4479       int partition_i = m_vertices[node_i].partition;
4480       if (partition_i >= 0)
4481         {
4482           unsigned int order_i = m_partitions[partition_i].node_end++;
4483           m_partitioned_nodes[order_i] = node_i;
4484         }
4485     }
4486 }
4487
4488 /* Look for edges from earlier partitions into node NODE_I and edges from
4489    node NODE_I into later partitions.  Call:
4490
4491       FN (ud, other_node_i)
4492
4493    for each such use-to-def edge ud, where other_node_i is the node at the
4494    other end of the edge.  */
4495
4496 template<typename T>
4497 void
4498 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4499 {
4500   int partition_i = m_vertices[node_i].partition;
4501   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4502        pred; pred = pred->pred_next)
4503     {
4504       int src_partition_i = m_vertices[pred->src].partition;
4505       if (src_partition_i >= 0 && src_partition_i != partition_i)
4506         fn (pred, pred->src);
4507     }
4508   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4509        succ; succ = succ->succ_next)
4510     {
4511       int dest_partition_i = m_vertices[succ->dest].partition;
4512       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4513         fn (succ, succ->dest);
4514     }
4515 }
4516
4517 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4518    that NODE would operate on.  This test is independent of NODE's actual
4519    operation.  */
4520
4521 bool
4522 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4523                                               unsigned int layout_i)
4524 {
4525   if (layout_i == 0)
4526     return true;
4527
4528   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4529     return false;
4530
4531   return true;
4532 }
4533
4534 /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
4535    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
4536    layouts is incompatible with NODE or if the change is not possible for
4537    some other reason.
4538
4539    The properties taken from NODE include the number of lanes and the
4540    vector type.  The actual operation doesn't matter.  */
4541
4542 int
4543 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4544                                             unsigned int from_layout_i,
4545                                             unsigned int to_layout_i)
4546 {
4547   if (!is_compatible_layout (node, from_layout_i)
4548       || !is_compatible_layout (node, to_layout_i))
4549     return -1;
4550
4551   if (from_layout_i == to_layout_i)
4552     return 0;
4553
4554   auto_vec<slp_tree, 1> children (1);
4555   children.quick_push (node);
4556   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4557   if (from_layout_i > 0)
4558     for (unsigned int i : m_perms[from_layout_i])
4559       perm.quick_push ({ 0, i });
4560   else
4561     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4562       perm.quick_push ({ 0, i });
4563   if (to_layout_i > 0)
4564     vect_slp_permute (m_perms[to_layout_i], perm, true);
4565   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4566                                                children, false);
4567   if (count >= 0)
4568     return MAX (count, 1);
4569
4570   /* ??? In principle we could try changing via layout 0, giving two
4571      layout changes rather than 1.  Doing that would require
4572      corresponding support in get_result_with_layout.  */
4573   return -1;
4574 }
4575
4576 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
4577
4578 inline slpg_partition_layout_costs &
4579 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4580                                                 unsigned int layout_i)
4581 {
4582   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4583 }
4584
4585 /* Change PERM in one of two ways:
4586
4587    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4588      chosen for child I of NODE.
4589
4590    - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
4591
4592    In both cases, arrange for the output to have layout OUT_LAYOUT_I  */
4593
4594 void
4595 vect_optimize_slp_pass::
4596 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4597                         int in_layout_i, unsigned int out_layout_i)
4598 {
4599   for (auto &entry : perm)
4600     {
4601       int this_in_layout_i = in_layout_i;
4602       if (this_in_layout_i < 0)
4603         {
4604           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4605           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4606           this_in_layout_i = m_partitions[in_partition_i].layout;
4607         }
4608       if (this_in_layout_i > 0)
4609         entry.second = m_perms[this_in_layout_i][entry.second];
4610     }
4611   if (out_layout_i > 0)
4612     vect_slp_permute (m_perms[out_layout_i], perm, true);
4613 }
4614
4615 /* Check whether the target allows NODE to be rearranged so that the node's
4616    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
4617    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
4618
4619    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4620    NODE can adapt to the layout changes that have (perhaps provisionally)
4621    been chosen for NODE's children, so that no extra permutations are
4622    needed on either the input or the output of NODE.
4623
4624    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4625    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4626
4627    IN_LAYOUT_I has no meaning for other types of node.
4628
4629    Keeping the node as-is is always valid.  If the target doesn't appear
4630    to support the node as-is, but might realistically support other layouts,
4631    then layout 0 instead has the cost of a worst-case permutation.  On the
4632    one hand, this ensures that every node has at least one valid layout,
4633    avoiding what would otherwise be an awkward special case.  On the other,
4634    it still encourages the pass to change an invalid pre-existing layout
4635    choice into a valid one.  */
4636
4637 int
4638 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4639                                             unsigned int out_layout_i)
4640 {
4641   const int fallback_cost = 1;
4642
4643   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4644     {
4645       auto_lane_permutation_t tmp_perm;
4646       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4647
4648       /* Check that the child nodes support the chosen layout.  Checking
4649          the first child is enough, since any second child would have the
4650          same shape.  */
4651       auto first_child = SLP_TREE_CHILDREN (node)[0];
4652       if (in_layout_i > 0
4653           && !is_compatible_layout (first_child, in_layout_i))
4654         return -1;
4655
4656       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4657       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4658                                                   node, tmp_perm,
4659                                                   SLP_TREE_CHILDREN (node),
4660                                                   false);
4661       if (count < 0)
4662         {
4663           if (in_layout_i == 0 && out_layout_i == 0)
4664             {
4665               /* Use the fallback cost if the node could in principle support
4666                  some nonzero layout for both the inputs and the outputs.
4667                  Otherwise assume that the node will be rejected later
4668                  and rebuilt from scalars.  */
4669               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4670                 return fallback_cost;
4671               return 0;
4672             }
4673           return -1;
4674         }
4675
4676       /* We currently have no way of telling whether the new layout is cheaper
4677          or more expensive than the old one.  But at least in principle,
4678          it should be worth making zero permutations (whole-vector shuffles)
4679          cheaper than real permutations, in case the pass is able to remove
4680          the latter.  */
4681       return count == 0 ? 0 : 1;
4682     }
4683
4684   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4685   if (rep
4686       && STMT_VINFO_DATA_REF (rep)
4687       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4688       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4689     {
4690       auto_load_permutation_t tmp_perm;
4691       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4692       if (out_layout_i > 0)
4693         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4694
4695       poly_uint64 vf = 1;
4696       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4697         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4698       unsigned int n_perms;
4699       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4700                                            nullptr, vf, true, false, &n_perms))
4701         {
4702           auto rep = SLP_TREE_REPRESENTATIVE (node);
4703           if (out_layout_i == 0)
4704             {
4705               /* Use the fallback cost if the load is an N-to-N permutation.
4706                  Otherwise assume that the node will be rejected later
4707                  and rebuilt from scalars.  */
4708               if (STMT_VINFO_GROUPED_ACCESS (rep)
4709                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4710                       == SLP_TREE_LANES (node)))
4711                 return fallback_cost;
4712               return 0;
4713             }
4714           return -1;
4715         }
4716
4717       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
4718       return n_perms == 0 ? 0 : 1;
4719     }
4720
4721   return 0;
4722 }
4723
4724 /* Decide which element layouts we should consider using.  Calculate the
4725    weights associated with inserting layout changes on partition edges.
4726    Also mark partitions that cannot change layout, by setting their
4727    layout to zero.  */
4728
4729 void
4730 vect_optimize_slp_pass::start_choosing_layouts ()
4731 {
4732   /* Used to assign unique permutation indices.  */
4733   using perm_hash = unbounded_hashmap_traits<
4734     vec_free_hash_base<int_hash_base<unsigned>>,
4735     int_hash<int, -1, -2>
4736   >;
4737   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4738
4739   /* Layout 0 is "no change".  */
4740   m_perms.safe_push (vNULL);
4741
4742   /* Create layouts from existing permutations.  */
4743   auto_load_permutation_t tmp_perm;
4744   for (unsigned int node_i : m_partitioned_nodes)
4745     {
4746       /* Leafs also double as entries to the reverse graph.  Allow the
4747          layout of those to be changed.  */
4748       auto &vertex = m_vertices[node_i];
4749       auto &partition = m_partitions[vertex.partition];
4750       if (!m_slpg->vertices[node_i].succ)
4751         partition.layout = 0;
4752
4753       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
4754       slp_tree node = vertex.node;
4755       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4756       slp_tree child;
4757       unsigned HOST_WIDE_INT imin, imax = 0;
4758       bool any_permute = false;
4759       tmp_perm.truncate (0);
4760       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4761         {
4762           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4763              unpermuted, record a layout that reverses this permutation.
4764
4765              We would need more work to cope with loads that are internally
4766              permuted and also have inputs (such as masks for
4767              IFN_MASK_LOADs).  */
4768           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4769           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4770             {
4771               partition.layout = -1;
4772               continue;
4773             }
4774           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4775           imin = DR_GROUP_SIZE (dr_stmt) + 1;
4776           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4777         }
4778       else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4779                && SLP_TREE_CHILDREN (node).length () == 1
4780                && (child = SLP_TREE_CHILDREN (node)[0])
4781                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4782                    .is_constant (&imin)))
4783         {
4784           /* If the child has the same vector size as this node,
4785              reversing the permutation can make the permutation a no-op.
4786              In other cases it can change a true permutation into a
4787              full-vector extract.  */
4788           tmp_perm.reserve (SLP_TREE_LANES (node));
4789           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4790             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4791         }
4792       else
4793         continue;
4794
4795       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4796         {
4797           unsigned idx = tmp_perm[j];
4798           imin = MIN (imin, idx);
4799           imax = MAX (imax, idx);
4800           if (idx - tmp_perm[0] != j)
4801             any_permute = true;
4802         }
4803       /* If the span doesn't match we'd disrupt VF computation, avoid
4804          that for now.  */
4805       if (imax - imin + 1 != SLP_TREE_LANES (node))
4806         continue;
4807       /* If there's no permute no need to split one out.  In this case
4808          we can consider turning a load into a permuted load, if that
4809          turns out to be cheaper than alternatives.  */
4810       if (!any_permute)
4811         {
4812           partition.layout = -1;
4813           continue;
4814         }
4815
4816       /* For now only handle true permutes, like
4817          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
4818          when permuting constants and invariants keeping the permute
4819          bijective.  */
4820       auto_sbitmap load_index (SLP_TREE_LANES (node));
4821       bitmap_clear (load_index);
4822       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4823         bitmap_set_bit (load_index, tmp_perm[j] - imin);
4824       unsigned j;
4825       for (j = 0; j < SLP_TREE_LANES (node); ++j)
4826         if (!bitmap_bit_p (load_index, j))
4827           break;
4828       if (j != SLP_TREE_LANES (node))
4829         continue;
4830
4831       vec<unsigned> perm = vNULL;
4832       perm.safe_grow (SLP_TREE_LANES (node), true);
4833       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4834         perm[j] = tmp_perm[j] - imin;
4835
4836       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4837         {
4838           /* Continue to use existing layouts, but don't add any more.  */
4839           int *entry = layout_ids.get (perm);
4840           partition.layout = entry ? *entry : 0;
4841           perm.release ();
4842         }
4843       else
4844         {
4845           bool existed;
4846           int &layout_i = layout_ids.get_or_insert (perm, &existed);
4847           if (existed)
4848             perm.release ();
4849           else
4850             {
4851               layout_i = m_perms.length ();
4852               m_perms.safe_push (perm);
4853             }
4854           partition.layout = layout_i;
4855         }
4856     }
4857
4858   /* Initially assume that every layout is possible and has zero cost
4859      in every partition.  */
4860   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4861                                               * m_perms.length ());
4862
4863   /* We have to mark outgoing permutations facing non-associating-reduction
4864      graph entries that are not represented as to be materialized.
4865      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
4866   for (slp_instance instance : m_vinfo->slp_instances)
4867     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4868       {
4869         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4870         m_partitions[m_vertices[node_i].partition].layout = 0;
4871       }
4872     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4873       {
4874         stmt_vec_info stmt_info
4875           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4876         stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4877         if (needs_fold_left_reduction_p (TREE_TYPE
4878                                            (gimple_get_lhs (stmt_info->stmt)),
4879                                          STMT_VINFO_REDUC_CODE (reduc_info)))
4880           {
4881             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4882             m_partitions[m_vertices[node_i].partition].layout = 0;
4883           }
4884       }
4885
4886   /* Check which layouts each node and partition can handle.  Calculate the
4887      weights associated with inserting layout changes on edges.  */
4888   for (unsigned int node_i : m_partitioned_nodes)
4889     {
4890       auto &vertex = m_vertices[node_i];
4891       auto &partition = m_partitions[vertex.partition];
4892       slp_tree node = vertex.node;
4893
4894       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4895         {
4896           vertex.weight = vect_slp_node_weight (node);
4897
4898           /* We do not handle stores with a permutation, so all
4899              incoming permutations must have been materialized.
4900
4901              We also don't handle masked grouped loads, which lack a
4902              permutation vector.  In this case the memory locations
4903              form an implicit second input to the loads, on top of the
4904              explicit mask input, and the memory input's layout cannot
4905              be changed.
4906
4907              On the other hand, we do support permuting gather loads and
4908              masked gather loads, where each scalar load is independent
4909              of the others.  This can be useful if the address/index input
4910              benefits from permutation.  */
4911           if (STMT_VINFO_DATA_REF (rep)
4912               && STMT_VINFO_GROUPED_ACCESS (rep)
4913               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4914             partition.layout = 0;
4915
4916           /* We cannot change the layout of an operation that is
4917              not independent on lanes.  Note this is an explicit
4918              negative list since that's much shorter than the respective
4919              positive one but it's critical to keep maintaining it.  */
4920           if (is_gimple_call (STMT_VINFO_STMT (rep)))
4921             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4922               {
4923               case CFN_COMPLEX_ADD_ROT90:
4924               case CFN_COMPLEX_ADD_ROT270:
4925               case CFN_COMPLEX_MUL:
4926               case CFN_COMPLEX_MUL_CONJ:
4927               case CFN_VEC_ADDSUB:
4928               case CFN_VEC_FMADDSUB:
4929               case CFN_VEC_FMSUBADD:
4930                 partition.layout = 0;
4931               default:;
4932               }
4933         }
4934
4935       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4936         {
4937           auto &other_vertex = m_vertices[other_node_i];
4938
4939           /* Count the number of edges from earlier partitions and the number
4940              of edges to later partitions.  */
4941           if (other_vertex.partition < vertex.partition)
4942             partition.in_degree += 1;
4943           else
4944             partition.out_degree += 1;
4945
4946           /* If the current node uses the result of OTHER_NODE_I, accumulate
4947              the effects of that.  */
4948           if (ud->src == int (node_i))
4949             {
4950               other_vertex.out_weight += vertex.weight;
4951               other_vertex.out_degree += 1;
4952             }
4953         };
4954       for_each_partition_edge (node_i, process_edge);
4955     }
4956 }
4957
4958 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4959    its current (provisional) choice of layout.  The inputs do not necessarily
4960    have the same layout as each other.  */
4961
4962 slpg_layout_cost
4963 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4964 {
4965   auto &vertex = m_vertices[node_i];
4966   slpg_layout_cost cost;
4967   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4968     {
4969       auto &other_vertex = m_vertices[other_node_i];
4970       if (other_vertex.partition < vertex.partition)
4971         {
4972           auto &other_partition = m_partitions[other_vertex.partition];
4973           auto &other_costs = partition_layout_costs (other_vertex.partition,
4974                                                       other_partition.layout);
4975           slpg_layout_cost this_cost = other_costs.in_cost;
4976           this_cost.add_serial_cost (other_costs.internal_cost);
4977           this_cost.split (other_partition.out_degree);
4978           cost.add_parallel_cost (this_cost);
4979         }
4980     };
4981   for_each_partition_edge (node_i, add_cost);
4982   return cost;
4983 }
4984
4985 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4986    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
4987    slpg_layout_cost::impossible () if the change isn't possible.  */
4988
4989 slpg_layout_cost
4990 vect_optimize_slp_pass::
4991 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4992                   unsigned int layout2_i)
4993 {
4994   auto &def_vertex = m_vertices[ud->dest];
4995   auto &use_vertex = m_vertices[ud->src];
4996   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4997   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4998   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4999                                     use_layout_i);
5000   if (factor < 0)
5001     return slpg_layout_cost::impossible ();
5002
5003   /* We have a choice of putting the layout change at the site of the
5004      definition or at the site of the use.  Prefer the former when
5005      optimizing for size or when the execution frequency of the
5006      definition is no greater than the combined execution frequencies of
5007      the uses.  When putting the layout change at the site of the definition,
5008      divvy up the cost among all consumers.  */
5009   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5010     {
5011       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5012       cost.split (def_vertex.out_degree);
5013       return cost;
5014     }
5015   return { use_vertex.weight * factor, m_optimize_size };
5016 }
5017
5018 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5019    partition; FROM_NODE_I could be the definition node or the use node.
5020    The node at the other end of the link wants to use layout TO_LAYOUT_I.
5021    Return the cost of any necessary fix-ups on edge UD, or return
5022    slpg_layout_cost::impossible () if the change isn't possible.
5023
5024    At this point, FROM_NODE_I's partition has chosen the cheapest
5025    layout based on the information available so far, but this choice
5026    is only provisional.  */
5027
5028 slpg_layout_cost
5029 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5030                                       unsigned int to_layout_i)
5031 {
5032   auto &from_vertex = m_vertices[from_node_i];
5033   unsigned int from_partition_i = from_vertex.partition;
5034   slpg_partition_info &from_partition = m_partitions[from_partition_i];
5035   gcc_assert (from_partition.layout >= 0);
5036
5037   /* First calculate the cost on the assumption that FROM_PARTITION sticks
5038      with its current layout preference.  */
5039   slpg_layout_cost cost = slpg_layout_cost::impossible ();
5040   auto edge_cost = edge_layout_cost (ud, from_node_i,
5041                                      from_partition.layout, to_layout_i);
5042   if (edge_cost.is_possible ())
5043     {
5044       auto &from_costs = partition_layout_costs (from_partition_i,
5045                                                  from_partition.layout);
5046       cost = from_costs.in_cost;
5047       cost.add_serial_cost (from_costs.internal_cost);
5048       cost.split (from_partition.out_degree);
5049       cost.add_serial_cost (edge_cost);
5050     }
5051   else if (from_partition.layout == 0)
5052     /* We must allow the source partition to have layout 0 as a fallback,
5053        in case all other options turn out to be impossible.  */
5054     return cost;
5055
5056   /* Take the minimum of that cost and the cost that applies if
5057      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
5058   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5059                                                       to_layout_i);
5060   if (direct_layout_costs.is_possible ())
5061     {
5062       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5063       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5064       direct_cost.split (from_partition.out_degree);
5065       if (!cost.is_possible ()
5066           || direct_cost.is_better_than (cost, m_optimize_size))
5067         cost = direct_cost;
5068     }
5069
5070   return cost;
5071 }
5072
5073 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5074    partition; TO_NODE_I could be the definition node or the use node.
5075    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5076    return the cost of any necessary fix-ups on edge UD, or
5077    slpg_layout_cost::impossible () if the choice cannot be made.
5078
5079    At this point, TO_NODE_I's partition has a fixed choice of layout.  */
5080
5081 slpg_layout_cost
5082 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5083                                        unsigned int from_layout_i)
5084 {
5085   auto &to_vertex = m_vertices[to_node_i];
5086   unsigned int to_partition_i = to_vertex.partition;
5087   slpg_partition_info &to_partition = m_partitions[to_partition_i];
5088   gcc_assert (to_partition.layout >= 0);
5089
5090   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5091      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
5092      any other inputs keep their current choice of layout.  */
5093   auto &to_costs = partition_layout_costs (to_partition_i,
5094                                            to_partition.layout);
5095   if (ud->src == int (to_node_i)
5096       && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5097     {
5098       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5099       auto old_layout = from_partition.layout;
5100       from_partition.layout = from_layout_i;
5101       int factor = internal_node_cost (to_vertex.node, -1,
5102                                        to_partition.layout);
5103       from_partition.layout = old_layout;
5104       if (factor >= 0)
5105         {
5106           slpg_layout_cost cost = to_costs.out_cost;
5107           cost.add_serial_cost ({ to_vertex.weight * factor,
5108                                   m_optimize_size });
5109           cost.split (to_partition.in_degree);
5110           return cost;
5111         }
5112     }
5113
5114   /* Compute the cost if we insert any necessary layout change on edge UD.  */
5115   auto edge_cost = edge_layout_cost (ud, to_node_i,
5116                                      to_partition.layout, from_layout_i);
5117   if (edge_cost.is_possible ())
5118     {
5119       slpg_layout_cost cost = to_costs.out_cost;
5120       cost.add_serial_cost (to_costs.internal_cost);
5121       cost.split (to_partition.in_degree);
5122       cost.add_serial_cost (edge_cost);
5123       return cost;
5124     }
5125
5126   return slpg_layout_cost::impossible ();
5127 }
5128
5129 /* Make a forward pass through the partitions, accumulating input costs.
5130    Make a tentative (provisional) choice of layout for each partition,
5131    ensuring that this choice still allows later partitions to keep
5132    their original layout.  */
5133
5134 void
5135 vect_optimize_slp_pass::forward_pass ()
5136 {
5137   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5138        ++partition_i)
5139     {
5140       auto &partition = m_partitions[partition_i];
5141
5142       /* If the partition consists of a single VEC_PERM_EXPR, precompute
5143          the incoming cost that would apply if every predecessor partition
5144          keeps its current layout.  This is used within the loop below.  */
5145       slpg_layout_cost in_cost;
5146       slp_tree single_node = nullptr;
5147       if (partition.node_end == partition.node_begin + 1)
5148         {
5149           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5150           single_node = m_vertices[node_i].node;
5151           if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5152             in_cost = total_in_cost (node_i);
5153         }
5154
5155       /* Go through the possible layouts.  Decide which ones are valid
5156          for this partition and record which of the valid layouts has
5157          the lowest cost.  */
5158       unsigned int min_layout_i = 0;
5159       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5160       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5161         {
5162           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5163           if (!layout_costs.is_possible ())
5164             continue;
5165
5166           /* If the recorded layout is already 0 then the layout cannot
5167              change.  */
5168           if (partition.layout == 0 && layout_i != 0)
5169             {
5170               layout_costs.mark_impossible ();
5171               continue;
5172             }
5173
5174           bool is_possible = true;
5175           for (unsigned int order_i = partition.node_begin;
5176                order_i < partition.node_end; ++order_i)
5177             {
5178               unsigned int node_i = m_partitioned_nodes[order_i];
5179               auto &vertex = m_vertices[node_i];
5180
5181               /* Reject the layout if it is individually incompatible
5182                  with any node in the partition.  */
5183               if (!is_compatible_layout (vertex.node, layout_i))
5184                 {
5185                   is_possible = false;
5186                   break;
5187                 }
5188
5189               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5190                 {
5191                   auto &other_vertex = m_vertices[other_node_i];
5192                   if (other_vertex.partition < vertex.partition)
5193                     {
5194                       /* Accumulate the incoming costs from earlier
5195                          partitions, plus the cost of any layout changes
5196                          on UD itself.  */
5197                       auto cost = forward_cost (ud, other_node_i, layout_i);
5198                       if (!cost.is_possible ())
5199                         is_possible = false;
5200                       else
5201                         layout_costs.in_cost.add_parallel_cost (cost);
5202                     }
5203                   else
5204                     /* Reject the layout if it would make layout 0 impossible
5205                        for later partitions.  This amounts to testing that the
5206                        target supports reversing the layout change on edges
5207                        to later partitions.
5208
5209                        In principle, it might be possible to push a layout
5210                        change all the way down a graph, so that it never
5211                        needs to be reversed and so that the target doesn't
5212                        need to support the reverse operation.  But it would
5213                        be awkward to bail out if we hit a partition that
5214                        does not support the new layout, especially since
5215                        we are not dealing with a lattice.  */
5216                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
5217                                                      layout_i).is_possible ();
5218                 };
5219               for_each_partition_edge (node_i, add_cost);
5220
5221               /* Accumulate the cost of using LAYOUT_I within NODE,
5222                  both for the inputs and the outputs.  */
5223               int factor = internal_node_cost (vertex.node, layout_i,
5224                                                layout_i);
5225               if (factor < 0)
5226                 {
5227                   is_possible = false;
5228                   break;
5229                 }
5230               else if (factor)
5231                 layout_costs.internal_cost.add_serial_cost
5232                   ({ vertex.weight * factor, m_optimize_size });
5233             }
5234           if (!is_possible)
5235             {
5236               layout_costs.mark_impossible ();
5237               continue;
5238             }
5239
5240           /* Combine the incoming and partition-internal costs.  */
5241           slpg_layout_cost combined_cost = layout_costs.in_cost;
5242           combined_cost.add_serial_cost (layout_costs.internal_cost);
5243
5244           /* If this partition consists of a single VEC_PERM_EXPR, see
5245              if the VEC_PERM_EXPR can be changed to support output layout
5246              LAYOUT_I while keeping all the provisional choices of input
5247              layout.  */
5248           if (single_node
5249               && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5250             {
5251               int factor = internal_node_cost (single_node, -1, layout_i);
5252               if (factor >= 0)
5253                 {
5254                   auto weight = m_vertices[single_node->vertex].weight;
5255                   slpg_layout_cost internal_cost
5256                     = { weight * factor, m_optimize_size };
5257
5258                   slpg_layout_cost alt_cost = in_cost;
5259                   alt_cost.add_serial_cost (internal_cost);
5260                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5261                     {
5262                       combined_cost = alt_cost;
5263                       layout_costs.in_cost = in_cost;
5264                       layout_costs.internal_cost = internal_cost;
5265                     }
5266                 }
5267             }
5268
5269           /* Record the layout with the lowest cost.  Prefer layout 0 in
5270              the event of a tie between it and another layout.  */
5271           if (!min_layout_cost.is_possible ()
5272               || combined_cost.is_better_than (min_layout_cost,
5273                                                m_optimize_size))
5274             {
5275               min_layout_i = layout_i;
5276               min_layout_cost = combined_cost;
5277             }
5278         }
5279
5280       /* This loop's handling of earlier partitions should ensure that
5281          choosing the original layout for the current partition is no
5282          less valid than it was in the original graph, even with the
5283          provisional layout choices for those earlier partitions.  */
5284       gcc_assert (min_layout_cost.is_possible ());
5285       partition.layout = min_layout_i;
5286     }
5287 }
5288
5289 /* Make a backward pass through the partitions, accumulating output costs.
5290    Make a final choice of layout for each partition.  */
5291
5292 void
5293 vect_optimize_slp_pass::backward_pass ()
5294 {
5295   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5296     {
5297       auto &partition = m_partitions[partition_i];
5298
5299       unsigned int min_layout_i = 0;
5300       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5301       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5302         {
5303           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5304           if (!layout_costs.is_possible ())
5305             continue;
5306
5307           /* Accumulate the costs from successor partitions.  */
5308           bool is_possible = true;
5309           for (unsigned int order_i = partition.node_begin;
5310                order_i < partition.node_end; ++order_i)
5311             {
5312               unsigned int node_i = m_partitioned_nodes[order_i];
5313               auto &vertex = m_vertices[node_i];
5314               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5315                 {
5316                   auto &other_vertex = m_vertices[other_node_i];
5317                   auto &other_partition = m_partitions[other_vertex.partition];
5318                   if (other_vertex.partition > vertex.partition)
5319                     {
5320                       /* Accumulate the incoming costs from later
5321                          partitions, plus the cost of any layout changes
5322                          on UD itself.  */
5323                       auto cost = backward_cost (ud, other_node_i, layout_i);
5324                       if (!cost.is_possible ())
5325                         is_possible = false;
5326                       else
5327                         layout_costs.out_cost.add_parallel_cost (cost);
5328                     }
5329                   else
5330                     /* Make sure that earlier partitions can (if necessary
5331                        or beneficial) keep the layout that they chose in
5332                        the forward pass.  This ensures that there is at
5333                        least one valid choice of layout.  */
5334                     is_possible &= edge_layout_cost (ud, other_node_i,
5335                                                      other_partition.layout,
5336                                                      layout_i).is_possible ();
5337                 };
5338               for_each_partition_edge (node_i, add_cost);
5339             }
5340           if (!is_possible)
5341             {
5342               layout_costs.mark_impossible ();
5343               continue;
5344             }
5345
5346           /* Locally combine the costs from the forward and backward passes.
5347              (This combined cost is not passed on, since that would lead
5348              to double counting.)  */
5349           slpg_layout_cost combined_cost = layout_costs.in_cost;
5350           combined_cost.add_serial_cost (layout_costs.internal_cost);
5351           combined_cost.add_serial_cost (layout_costs.out_cost);
5352
5353           /* Record the layout with the lowest cost.  Prefer layout 0 in
5354              the event of a tie between it and another layout.  */
5355           if (!min_layout_cost.is_possible ()
5356               || combined_cost.is_better_than (min_layout_cost,
5357                                                m_optimize_size))
5358             {
5359               min_layout_i = layout_i;
5360               min_layout_cost = combined_cost;
5361             }
5362         }
5363
5364       gcc_assert (min_layout_cost.is_possible ());
5365       partition.layout = min_layout_i;
5366     }
5367 }
5368
5369 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5370    NODE already has the layout that was selected for its partition.  */
5371
5372 slp_tree
5373 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5374                                                 unsigned int to_layout_i)
5375 {
5376   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5377   slp_tree result = m_node_layouts[result_i];
5378   if (result)
5379     return result;
5380
5381   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5382       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5383           /* We can't permute vector defs in place.  */
5384           && SLP_TREE_VEC_DEFS (node).is_empty ()))
5385     {
5386       /* If the vector is uniform or unchanged, there's nothing to do.  */
5387       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5388         result = node;
5389       else
5390         {
5391           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5392           result = vect_create_new_slp_node (scalar_ops);
5393           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5394         }
5395     }
5396   else
5397     {
5398       unsigned int partition_i = m_vertices[node->vertex].partition;
5399       unsigned int from_layout_i = m_partitions[partition_i].layout;
5400       if (from_layout_i == to_layout_i)
5401         return node;
5402
5403       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5404          permutation instead of a serial one.  Leave the new permutation
5405          in TMP_PERM on success.  */
5406       auto_lane_permutation_t tmp_perm;
5407       unsigned int num_inputs = 1;
5408       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5409         {
5410           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5411           if (from_layout_i != 0)
5412             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5413           if (to_layout_i != 0)
5414             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5415           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5416                                               tmp_perm,
5417                                               SLP_TREE_CHILDREN (node),
5418                                               false) >= 0)
5419             num_inputs = SLP_TREE_CHILDREN (node).length ();
5420           else
5421             tmp_perm.truncate (0);
5422         }
5423
5424       if (dump_enabled_p ())
5425         {
5426           if (tmp_perm.length () > 0)
5427             dump_printf_loc (MSG_NOTE, vect_location,
5428                              "duplicating permutation node %p with"
5429                              " layout %d\n",
5430                              (void *) node, to_layout_i);
5431           else
5432             dump_printf_loc (MSG_NOTE, vect_location,
5433                              "inserting permutation node in place of %p\n",
5434                              (void *) node);
5435         }
5436
5437       unsigned int num_lanes = SLP_TREE_LANES (node);
5438       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5439       if (SLP_TREE_SCALAR_STMTS (node).length ())
5440         {
5441           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5442           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5443           if (from_layout_i != 0)
5444             vect_slp_permute (m_perms[from_layout_i], stmts, false);
5445           if (to_layout_i != 0)
5446             vect_slp_permute (m_perms[to_layout_i], stmts, true);
5447         }
5448       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5449       SLP_TREE_LANES (result) = num_lanes;
5450       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5451       result->vertex = -1;
5452
5453       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5454       if (tmp_perm.length ())
5455         {
5456           lane_perm.safe_splice (tmp_perm);
5457           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5458         }
5459       else
5460         {
5461           lane_perm.create (num_lanes);
5462           for (unsigned j = 0; j < num_lanes; ++j)
5463             lane_perm.quick_push ({ 0, j });
5464           if (from_layout_i != 0)
5465             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5466           if (to_layout_i != 0)
5467             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5468           SLP_TREE_CHILDREN (result).safe_push (node);
5469         }
5470       for (slp_tree child : SLP_TREE_CHILDREN (result))
5471         child->refcnt++;
5472     }
5473   m_node_layouts[result_i] = result;
5474   return result;
5475 }
5476
5477 /* Apply the chosen vector layouts to the SLP graph.  */
5478
5479 void
5480 vect_optimize_slp_pass::materialize ()
5481 {
5482   /* We no longer need the costs, so avoid having two O(N * P) arrays
5483      live at the same time.  */
5484   m_partition_layout_costs.release ();
5485   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5486
5487   auto_sbitmap fully_folded (m_vertices.length ());
5488   bitmap_clear (fully_folded);
5489   for (unsigned int node_i : m_partitioned_nodes)
5490     {
5491       auto &vertex = m_vertices[node_i];
5492       slp_tree node = vertex.node;
5493       int layout_i = m_partitions[vertex.partition].layout;
5494       gcc_assert (layout_i >= 0);
5495
5496       /* Rearrange the scalar statements to match the chosen layout.  */
5497       if (layout_i > 0)
5498         vect_slp_permute (m_perms[layout_i],
5499                           SLP_TREE_SCALAR_STMTS (node), true);
5500
5501       /* Update load and lane permutations.  */
5502       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5503         {
5504           /* First try to absorb the input vector layouts.  If that fails,
5505              force the inputs to have layout LAYOUT_I too.  We checked that
5506              that was possible before deciding to use nonzero output layouts.
5507              (Note that at this stage we don't really have any guarantee that
5508              the target supports the original VEC_PERM_EXPR.)  */
5509           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5510           auto_lane_permutation_t tmp_perm;
5511           tmp_perm.safe_splice (perm);
5512           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5513           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5514                                               tmp_perm,
5515                                               SLP_TREE_CHILDREN (node),
5516                                               false) >= 0)
5517             {
5518               if (dump_enabled_p ()
5519                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5520                                   perm.begin ()))
5521                 dump_printf_loc (MSG_NOTE, vect_location,
5522                                  "absorbing input layouts into %p\n",
5523                                  (void *) node);
5524               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5525               bitmap_set_bit (fully_folded, node_i);
5526             }
5527           else
5528             {
5529               /* Not MSG_MISSED because it would make no sense to users.  */
5530               if (dump_enabled_p ())
5531                 dump_printf_loc (MSG_NOTE, vect_location,
5532                                  "failed to absorb input layouts into %p\n",
5533                                  (void *) node);
5534               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5535             }
5536         }
5537       else
5538         {
5539           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5540           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5541           if (layout_i > 0)
5542             /* ???  When we handle non-bijective permutes the idea
5543                is that we can force the load-permutation to be
5544                { min, min + 1, min + 2, ... max }.  But then the
5545                scalar defs might no longer match the lane content
5546                which means wrong-code with live lane vectorization.
5547                So we possibly have to have NULL entries for those.  */
5548             vect_slp_permute (m_perms[layout_i], load_perm, true);
5549         }
5550     }
5551
5552   /* Do this before any nodes disappear, since it involves a walk
5553      over the leaves.  */
5554   remove_redundant_permutations ();
5555
5556   /* Replace each child with a correctly laid-out version.  */
5557   for (unsigned int node_i : m_partitioned_nodes)
5558     {
5559       /* Skip nodes that have already been handled above.  */
5560       if (bitmap_bit_p (fully_folded, node_i))
5561         continue;
5562
5563       auto &vertex = m_vertices[node_i];
5564       int in_layout_i = m_partitions[vertex.partition].layout;
5565       gcc_assert (in_layout_i >= 0);
5566
5567       unsigned j;
5568       slp_tree child;
5569       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5570         {
5571           if (!child)
5572             continue;
5573
5574           slp_tree new_child = get_result_with_layout (child, in_layout_i);
5575           if (new_child != child)
5576             {
5577               vect_free_slp_tree (child);
5578               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5579               new_child->refcnt += 1;
5580             }
5581         }
5582     }
5583 }
5584
5585 /* Elide load permutations that are not necessary.  Such permutations might
5586    be pre-existing, rather than created by the layout optimizations.  */
5587
5588 void
5589 vect_optimize_slp_pass::remove_redundant_permutations ()
5590 {
5591   for (unsigned int node_i : m_leafs)
5592     {
5593       slp_tree node = m_vertices[node_i].node;
5594       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5595         continue;
5596
5597       /* In basic block vectorization we allow any subchain of an interleaving
5598          chain.
5599          FORNOW: not in loop SLP because of realignment complications.  */
5600       if (is_a <bb_vec_info> (m_vinfo))
5601         {
5602           bool subchain_p = true;
5603           stmt_vec_info next_load_info = NULL;
5604           stmt_vec_info load_info;
5605           unsigned j;
5606           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5607             {
5608               if (j != 0
5609                   && (next_load_info != load_info
5610                       || DR_GROUP_GAP (load_info) != 1))
5611                 {
5612                   subchain_p = false;
5613                   break;
5614                 }
5615               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5616             }
5617           if (subchain_p)
5618             {
5619               SLP_TREE_LOAD_PERMUTATION (node).release ();
5620               continue;
5621             }
5622         }
5623       else
5624         {
5625           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5626           stmt_vec_info load_info;
5627           bool this_load_permuted = false;
5628           unsigned j;
5629           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5630             if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5631               {
5632                 this_load_permuted = true;
5633                 break;
5634               }
5635           /* When this isn't a grouped access we know it's single element
5636              and contiguous.  */
5637           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5638             {
5639               if (!this_load_permuted
5640                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5641                       || SLP_TREE_LANES (node) == 1))
5642                 SLP_TREE_LOAD_PERMUTATION (node).release ();
5643               continue;
5644             }
5645           stmt_vec_info first_stmt_info
5646             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5647           if (!this_load_permuted
5648               /* The load requires permutation when unrolling exposes
5649                  a gap either because the group is larger than the SLP
5650                  group-size or because there is a gap between the groups.  */
5651               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5652                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5653                       && DR_GROUP_GAP (first_stmt_info) == 0)))
5654             {
5655               SLP_TREE_LOAD_PERMUTATION (node).release ();
5656               continue;
5657             }
5658         }
5659     }
5660 }
5661
5662 /* Print the partition graph and layout information to the dump file.  */
5663
5664 void
5665 vect_optimize_slp_pass::dump ()
5666 {
5667   dump_printf_loc (MSG_NOTE, vect_location,
5668                    "SLP optimize permutations:\n");
5669   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5670     {
5671       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
5672       const char *sep = "";
5673       for (unsigned int idx : m_perms[layout_i])
5674         {
5675           dump_printf (MSG_NOTE, "%s%d", sep, idx);
5676           sep = ", ";
5677         }
5678       dump_printf (MSG_NOTE, " }\n");
5679     }
5680   dump_printf_loc (MSG_NOTE, vect_location,
5681                    "SLP optimize partitions:\n");
5682   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5683        ++partition_i)
5684     {
5685       auto &partition = m_partitions[partition_i];
5686       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
5687       dump_printf_loc (MSG_NOTE, vect_location,
5688                        "  partition %d (layout %d):\n",
5689                        partition_i, partition.layout);
5690       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
5691       for (unsigned int order_i = partition.node_begin;
5692            order_i < partition.node_end; ++order_i)
5693         {
5694           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5695           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
5696                            (void *) vertex.node);
5697           dump_printf_loc (MSG_NOTE, vect_location,
5698                            "          weight: %f\n",
5699                            vertex.weight.to_double ());
5700           if (vertex.out_degree)
5701             dump_printf_loc (MSG_NOTE, vect_location,
5702                              "          out weight: %f (degree %d)\n",
5703                              vertex.out_weight.to_double (),
5704                              vertex.out_degree);
5705           if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5706             dump_printf_loc (MSG_NOTE, vect_location,
5707                              "          op: VEC_PERM_EXPR\n");
5708           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5709             dump_printf_loc (MSG_NOTE, vect_location,
5710                              "          op template: %G", rep->stmt);
5711         }
5712       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
5713       for (unsigned int order_i = partition.node_begin;
5714            order_i < partition.node_end; ++order_i)
5715         {
5716           unsigned int node_i = m_partitioned_nodes[order_i];
5717           auto &vertex = m_vertices[node_i];
5718           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5719             {
5720               auto &other_vertex = m_vertices[other_node_i];
5721               if (other_vertex.partition < vertex.partition)
5722                 dump_printf_loc (MSG_NOTE, vect_location,
5723                                  "      - %p [%d] --> %p\n",
5724                                  (void *) other_vertex.node,
5725                                  other_vertex.partition,
5726                                  (void *) vertex.node);
5727               else
5728                 dump_printf_loc (MSG_NOTE, vect_location,
5729                                  "      - %p --> [%d] %p\n",
5730                                  (void *) vertex.node,
5731                                  other_vertex.partition,
5732                                  (void *) other_vertex.node);
5733             };
5734           for_each_partition_edge (node_i, print_edge);
5735         }
5736
5737       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5738         {
5739           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5740           if (layout_costs.is_possible ())
5741             {
5742               dump_printf_loc (MSG_NOTE, vect_location,
5743                                "    layout %d:%s\n", layout_i,
5744                                partition.layout == int (layout_i)
5745                                ? " (*)" : "");
5746               slpg_layout_cost combined_cost = layout_costs.in_cost;
5747               combined_cost.add_serial_cost (layout_costs.internal_cost);
5748               combined_cost.add_serial_cost (layout_costs.out_cost);
5749 #define TEMPLATE "{depth: %f, total: %f}"
5750               dump_printf_loc (MSG_NOTE, vect_location,
5751                                "        " TEMPLATE "\n",
5752                                layout_costs.in_cost.depth.to_double (),
5753                                layout_costs.in_cost.total.to_double ());
5754               dump_printf_loc (MSG_NOTE, vect_location,
5755                                "      + " TEMPLATE "\n",
5756                                layout_costs.internal_cost.depth.to_double (),
5757                                layout_costs.internal_cost.total.to_double ());
5758               dump_printf_loc (MSG_NOTE, vect_location,
5759                                "      + " TEMPLATE "\n",
5760                                layout_costs.out_cost.depth.to_double (),
5761                                layout_costs.out_cost.total.to_double ());
5762               dump_printf_loc (MSG_NOTE, vect_location,
5763                                "      = " TEMPLATE "\n",
5764                                combined_cost.depth.to_double (),
5765                                combined_cost.total.to_double ());
5766 #undef TEMPLATE
5767             }
5768           else
5769             dump_printf_loc (MSG_NOTE, vect_location,
5770                              "    layout %d: rejected\n", layout_i);
5771         }
5772     }
5773 }
5774
5775 /* Main entry point for the SLP graph optimization pass.  */
5776
5777 void
5778 vect_optimize_slp_pass::run ()
5779 {
5780   build_graph ();
5781   create_partitions ();
5782   start_choosing_layouts ();
5783   if (m_perms.length () > 1)
5784     {
5785       forward_pass ();
5786       backward_pass ();
5787       if (dump_enabled_p ())
5788         dump ();
5789       materialize ();
5790       while (!m_perms.is_empty ())
5791         m_perms.pop ().release ();
5792     }
5793   else
5794     remove_redundant_permutations ();
5795   free_graph (m_slpg);
5796 }
5797
5798 /* Optimize the SLP graph of VINFO.  */
5799
5800 void
5801 vect_optimize_slp (vec_info *vinfo)
5802 {
5803   if (vinfo->slp_instances.is_empty ())
5804     return;
5805   vect_optimize_slp_pass (vinfo).run ();
5806 }
5807
5808 /* Gather loads reachable from the individual SLP graph entries.  */
5809
5810 void
5811 vect_gather_slp_loads (vec_info *vinfo)
5812 {
5813   unsigned i;
5814   slp_instance instance;
5815   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5816     {
5817       hash_set<slp_tree> visited;
5818       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5819                              SLP_INSTANCE_TREE (instance), visited);
5820     }
5821 }
5822
5823
5824 /* For each possible SLP instance decide whether to SLP it and calculate overall
5825    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
5826    least one instance.  */
5827
5828 bool
5829 vect_make_slp_decision (loop_vec_info loop_vinfo)
5830 {
5831   unsigned int i;
5832   poly_uint64 unrolling_factor = 1;
5833   const vec<slp_instance> &slp_instances
5834     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5835   slp_instance instance;
5836   int decided_to_slp = 0;
5837
5838   DUMP_VECT_SCOPE ("vect_make_slp_decision");
5839
5840   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5841     {
5842       /* FORNOW: SLP if you can.  */
5843       /* All unroll factors have the form:
5844
5845            GET_MODE_SIZE (vinfo->vector_mode) * X
5846
5847          for some rational X, so they must have a common multiple.  */
5848       unrolling_factor
5849         = force_common_multiple (unrolling_factor,
5850                                  SLP_INSTANCE_UNROLLING_FACTOR (instance));
5851
5852       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
5853          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5854          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
5855       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5856       decided_to_slp++;
5857     }
5858
5859   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5860
5861   if (decided_to_slp && dump_enabled_p ())
5862     {
5863       dump_printf_loc (MSG_NOTE, vect_location,
5864                        "Decided to SLP %d instances. Unrolling factor ",
5865                        decided_to_slp);
5866       dump_dec (MSG_NOTE, unrolling_factor);
5867       dump_printf (MSG_NOTE, "\n");
5868     }
5869
5870   return (decided_to_slp > 0);
5871 }
5872
5873 /* Private data for vect_detect_hybrid_slp.  */
5874 struct vdhs_data
5875 {
5876   loop_vec_info loop_vinfo;
5877   vec<stmt_vec_info> *worklist;
5878 };
5879
5880 /* Walker for walk_gimple_op.  */
5881
5882 static tree
5883 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5884 {
5885   walk_stmt_info *wi = (walk_stmt_info *)data;
5886   vdhs_data *dat = (vdhs_data *)wi->info;
5887
5888   if (wi->is_lhs)
5889     return NULL_TREE;
5890
5891   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5892   if (!def_stmt_info)
5893     return NULL_TREE;
5894   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5895   if (PURE_SLP_STMT (def_stmt_info))
5896     {
5897       if (dump_enabled_p ())
5898         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5899                          def_stmt_info->stmt);
5900       STMT_SLP_TYPE (def_stmt_info) = hybrid;
5901       dat->worklist->safe_push (def_stmt_info);
5902     }
5903
5904   return NULL_TREE;
5905 }
5906
5907 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5908    if so, otherwise pushing it to WORKLIST.  */
5909
5910 static void
5911 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5912                                vec<stmt_vec_info> &worklist,
5913                                stmt_vec_info stmt_info)
5914 {
5915   if (dump_enabled_p ())
5916     dump_printf_loc (MSG_NOTE, vect_location,
5917                      "Processing hybrid candidate : %G", stmt_info->stmt);
5918   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5919   imm_use_iterator iter2;
5920   ssa_op_iter iter1;
5921   use_operand_p use_p;
5922   def_operand_p def_p;
5923   bool any_def = false;
5924   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5925     {
5926       any_def = true;
5927       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5928         {
5929           if (is_gimple_debug (USE_STMT (use_p)))
5930             continue;
5931           stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5932           /* An out-of loop use means this is a loop_vect sink.  */
5933           if (!use_info)
5934             {
5935               if (dump_enabled_p ())
5936                 dump_printf_loc (MSG_NOTE, vect_location,
5937                                  "Found loop_vect sink: %G", stmt_info->stmt);
5938               worklist.safe_push (stmt_info);
5939               return;
5940             }
5941           else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5942             {
5943               if (dump_enabled_p ())
5944                 dump_printf_loc (MSG_NOTE, vect_location,
5945                                  "Found loop_vect use: %G", use_info->stmt);
5946               worklist.safe_push (stmt_info);
5947               return;
5948             }
5949         }
5950     }
5951   /* No def means this is a loo_vect sink.  */
5952   if (!any_def)
5953     {
5954       if (dump_enabled_p ())
5955         dump_printf_loc (MSG_NOTE, vect_location,
5956                          "Found loop_vect sink: %G", stmt_info->stmt);
5957       worklist.safe_push (stmt_info);
5958       return;
5959     }
5960   if (dump_enabled_p ())
5961     dump_printf_loc (MSG_NOTE, vect_location,
5962                      "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5963   STMT_SLP_TYPE (stmt_info) = pure_slp;
5964 }
5965
5966 /* Find stmts that must be both vectorized and SLPed.  */
5967
5968 void
5969 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5970 {
5971   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5972
5973   /* All stmts participating in SLP are marked pure_slp, all other
5974      stmts are loop_vect.
5975      First collect all loop_vect stmts into a worklist.
5976      SLP patterns cause not all original scalar stmts to appear in
5977      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5978      Rectify this here and do a backward walk over the IL only considering
5979      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5980      mark them as pure_slp.  */
5981   auto_vec<stmt_vec_info> worklist;
5982   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5983     {
5984       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5985       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5986            gsi_next (&gsi))
5987         {
5988           gphi *phi = gsi.phi ();
5989           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5990           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5991             maybe_push_to_hybrid_worklist (loop_vinfo,
5992                                            worklist, stmt_info);
5993         }
5994       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5995            gsi_prev (&gsi))
5996         {
5997           gimple *stmt = gsi_stmt (gsi);
5998           if (is_gimple_debug (stmt))
5999             continue;
6000           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6001           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6002             {
6003               for (gimple_stmt_iterator gsi2
6004                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6005                    !gsi_end_p (gsi2); gsi_next (&gsi2))
6006                 {
6007                   stmt_vec_info patt_info
6008                     = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6009                   if (!STMT_SLP_TYPE (patt_info)
6010                       && STMT_VINFO_RELEVANT (patt_info))
6011                     maybe_push_to_hybrid_worklist (loop_vinfo,
6012                                                    worklist, patt_info);
6013                 }
6014               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6015             }
6016           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6017             maybe_push_to_hybrid_worklist (loop_vinfo,
6018                                            worklist, stmt_info);
6019         }
6020     }
6021
6022   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6023      mark any SLP vectorized stmt as hybrid.
6024      ???  We're visiting def stmts N times (once for each non-SLP and
6025      once for each hybrid-SLP use).  */
6026   walk_stmt_info wi;
6027   vdhs_data dat;
6028   dat.worklist = &worklist;
6029   dat.loop_vinfo = loop_vinfo;
6030   memset (&wi, 0, sizeof (wi));
6031   wi.info = (void *)&dat;
6032   while (!worklist.is_empty ())
6033     {
6034       stmt_vec_info stmt_info = worklist.pop ();
6035       /* Since SSA operands are not set up for pattern stmts we need
6036          to use walk_gimple_op.  */
6037       wi.is_lhs = 0;
6038       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6039       /* For gather/scatter make sure to walk the offset operand, that
6040          can be a scaling and conversion away.  */
6041       gather_scatter_info gs_info;
6042       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6043           && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6044         {
6045           int dummy;
6046           vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6047         }
6048     }
6049 }
6050
6051
6052 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
6053
6054 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6055   : vec_info (vec_info::bb, shared),
6056     bbs (_bbs),
6057     roots (vNULL)
6058 {
6059   for (unsigned i = 0; i < bbs.length (); ++i)
6060     {
6061       if (i != 0)
6062         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6063              gsi_next (&si))
6064           {
6065             gphi *phi = si.phi ();
6066             gimple_set_uid (phi, 0);
6067             add_stmt (phi);
6068           }
6069       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6070            !gsi_end_p (gsi); gsi_next (&gsi))
6071         {
6072           gimple *stmt = gsi_stmt (gsi);
6073           gimple_set_uid (stmt, 0);
6074           if (is_gimple_debug (stmt))
6075             continue;
6076           add_stmt (stmt);
6077         }
6078     }
6079 }
6080
6081
6082 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6083    stmts in the basic block.  */
6084
6085 _bb_vec_info::~_bb_vec_info ()
6086 {
6087   /* Reset region marker.  */
6088   for (unsigned i = 0; i < bbs.length (); ++i)
6089     {
6090       if (i != 0)
6091         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6092              gsi_next (&si))
6093           {
6094             gphi *phi = si.phi ();
6095             gimple_set_uid (phi, -1);
6096           }
6097       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6098            !gsi_end_p (gsi); gsi_next (&gsi))
6099         {
6100           gimple *stmt = gsi_stmt (gsi);
6101           gimple_set_uid (stmt, -1);
6102         }
6103     }
6104
6105   for (unsigned i = 0; i < roots.length (); ++i)
6106     {
6107       roots[i].stmts.release ();
6108       roots[i].roots.release ();
6109       roots[i].remain.release ();
6110     }
6111   roots.release ();
6112 }
6113
6114 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
6115    given then that child nodes have already been processed, and that
6116    their def types currently match their SLP node's def type.  */
6117
6118 static bool
6119 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6120                                     slp_instance node_instance,
6121                                     stmt_vector_for_cost *cost_vec)
6122 {
6123   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6124
6125   /* Calculate the number of vector statements to be created for the
6126      scalar stmts in this node.  For SLP reductions it is equal to the
6127      number of vector statements in the children (which has already been
6128      calculated by the recursive call).  Otherwise it is the number of
6129      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6130      VF divided by the number of elements in a vector.  */
6131   if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6132       && !STMT_VINFO_DATA_REF (stmt_info)
6133       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6134     {
6135       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6136         if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6137           {
6138             SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6139               = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6140             break;
6141           }
6142     }
6143   else
6144     {
6145       poly_uint64 vf;
6146       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6147         vf = loop_vinfo->vectorization_factor;
6148       else
6149         vf = 1;
6150       unsigned int group_size = SLP_TREE_LANES (node);
6151       tree vectype = SLP_TREE_VECTYPE (node);
6152       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6153         = vect_get_num_vectors (vf * group_size, vectype);
6154     }
6155
6156   /* Handle purely internal nodes.  */
6157   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6158     {
6159       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6160         return false;
6161
6162       stmt_vec_info slp_stmt_info;
6163       unsigned int i;
6164       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6165         {
6166           if (STMT_VINFO_LIVE_P (slp_stmt_info)
6167               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6168                                                node_instance, i,
6169                                                false, cost_vec))
6170             return false;
6171         }
6172       return true;
6173     }
6174
6175   bool dummy;
6176   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6177                             node, node_instance, cost_vec);
6178 }
6179
6180 /* Try to build NODE from scalars, returning true on success.
6181    NODE_INSTANCE is the SLP instance that contains NODE.  */
6182
6183 static bool
6184 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6185                               slp_instance node_instance)
6186 {
6187   stmt_vec_info stmt_info;
6188   unsigned int i;
6189
6190   if (!is_a <bb_vec_info> (vinfo)
6191       || node == SLP_INSTANCE_TREE (node_instance)
6192       || !SLP_TREE_SCALAR_STMTS (node).exists ()
6193       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6194       /* Force the mask use to be built from scalars instead.  */
6195       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6196     return false;
6197
6198   if (dump_enabled_p ())
6199     dump_printf_loc (MSG_NOTE, vect_location,
6200                      "Building vector operands of %p from scalars instead\n",
6201                      (void *) node);
6202
6203   /* Don't remove and free the child nodes here, since they could be
6204      referenced by other structures.  The analysis and scheduling phases
6205      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
6206   unsigned int group_size = SLP_TREE_LANES (node);
6207   SLP_TREE_DEF_TYPE (node) = vect_external_def;
6208   /* Invariants get their vector type from the uses.  */
6209   SLP_TREE_VECTYPE (node) = NULL_TREE;
6210   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6211   SLP_TREE_LOAD_PERMUTATION (node).release ();
6212   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6213     {
6214       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6215       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6216     }
6217   return true;
6218 }
6219
6220 /* Return true if all elements of the slice are the same.  */
6221 bool
6222 vect_scalar_ops_slice::all_same_p () const
6223 {
6224   for (unsigned int i = 1; i < length; ++i)
6225     if (!operand_equal_p (op (0), op (i)))
6226       return false;
6227   return true;
6228 }
6229
6230 hashval_t
6231 vect_scalar_ops_slice_hash::hash (const value_type &s)
6232 {
6233   hashval_t hash = 0;
6234   for (unsigned i = 0; i < s.length; ++i)
6235     hash = iterative_hash_expr (s.op (i), hash);
6236   return hash;
6237 }
6238
6239 bool
6240 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6241                                    const compare_type &s2)
6242 {
6243   if (s1.length != s2.length)
6244     return false;
6245   for (unsigned i = 0; i < s1.length; ++i)
6246     if (!operand_equal_p (s1.op (i), s2.op (i)))
6247       return false;
6248   return true;
6249 }
6250
6251 /* Compute the prologue cost for invariant or constant operands represented
6252    by NODE.  */
6253
6254 static void
6255 vect_prologue_cost_for_slp (slp_tree node,
6256                             stmt_vector_for_cost *cost_vec)
6257 {
6258   /* There's a special case of an existing vector, that costs nothing.  */
6259   if (SLP_TREE_SCALAR_OPS (node).length () == 0
6260       && !SLP_TREE_VEC_DEFS (node).is_empty ())
6261     return;
6262   /* Without looking at the actual initializer a vector of
6263      constants can be implemented as load from the constant pool.
6264      When all elements are the same we can use a splat.  */
6265   tree vectype = SLP_TREE_VECTYPE (node);
6266   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6267   unsigned HOST_WIDE_INT const_nunits;
6268   unsigned nelt_limit;
6269   auto ops = &SLP_TREE_SCALAR_OPS (node);
6270   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6271   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6272       && ! multiple_p (const_nunits, group_size))
6273     {
6274       nelt_limit = const_nunits;
6275       hash_set<vect_scalar_ops_slice_hash> vector_ops;
6276       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6277         if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6278           starts.quick_push (i * const_nunits);
6279     }
6280   else
6281     {
6282       /* If either the vector has variable length or the vectors
6283          are composed of repeated whole groups we only need to
6284          cost construction once.  All vectors will be the same.  */
6285       nelt_limit = group_size;
6286       starts.quick_push (0);
6287     }
6288   /* ???  We're just tracking whether vectors in a single node are the same.
6289      Ideally we'd do something more global.  */
6290   bool passed = false;
6291   for (unsigned int start : starts)
6292     {
6293       vect_cost_for_stmt kind;
6294       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6295         kind = vector_load;
6296       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6297         kind = scalar_to_vec;
6298       else
6299         kind = vec_construct;
6300       /* The target cost hook has no idea which part of the SLP node
6301          we are costing so avoid passing it down more than once.  Pass
6302          it to the first vec_construct or scalar_to_vec part since for those
6303          the x86 backend tries to account for GPR to XMM register moves.  */
6304       record_stmt_cost (cost_vec, 1, kind,
6305                         (kind != vector_load && !passed) ? node : nullptr,
6306                         vectype, 0, vect_prologue);
6307       if (kind != vector_load)
6308         passed = true;
6309     }
6310 }
6311
6312 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6313    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6314
6315    Return true if the operations are supported.  */
6316
6317 static bool
6318 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6319                                   slp_instance node_instance,
6320                                   hash_set<slp_tree> &visited_set,
6321                                   vec<slp_tree> &visited_vec,
6322                                   stmt_vector_for_cost *cost_vec)
6323 {
6324   int i, j;
6325   slp_tree child;
6326
6327   /* Assume we can code-generate all invariants.  */
6328   if (!node
6329       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6330       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6331     return true;
6332
6333   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6334     {
6335       if (dump_enabled_p ())
6336         dump_printf_loc (MSG_NOTE, vect_location,
6337                          "Failed cyclic SLP reference in %p\n", (void *) node);
6338       return false;
6339     }
6340   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6341
6342   /* If we already analyzed the exact same set of scalar stmts we're done.
6343      We share the generated vector stmts for those.  */
6344   if (visited_set.add (node))
6345     return true;
6346   visited_vec.safe_push (node);
6347
6348   bool res = true;
6349   unsigned visited_rec_start = visited_vec.length ();
6350   unsigned cost_vec_rec_start = cost_vec->length ();
6351   bool seen_non_constant_child = false;
6352   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6353     {
6354       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6355                                               visited_set, visited_vec,
6356                                               cost_vec);
6357       if (!res)
6358         break;
6359       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6360         seen_non_constant_child = true;
6361     }
6362   /* We're having difficulties scheduling nodes with just constant
6363      operands and no scalar stmts since we then cannot compute a stmt
6364      insertion place.  */
6365   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6366     {
6367       if (dump_enabled_p ())
6368         dump_printf_loc (MSG_NOTE, vect_location,
6369                          "Cannot vectorize all-constant op node %p\n",
6370                          (void *) node);
6371       res = false;
6372     }
6373
6374   if (res)
6375     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6376                                               cost_vec);
6377   /* If analysis failed we have to pop all recursive visited nodes
6378      plus ourselves.  */
6379   if (!res)
6380     {
6381       while (visited_vec.length () >= visited_rec_start)
6382         visited_set.remove (visited_vec.pop ());
6383       cost_vec->truncate (cost_vec_rec_start);
6384     }
6385
6386   /* When the node can be vectorized cost invariant nodes it references.
6387      This is not done in DFS order to allow the refering node
6388      vectorizable_* calls to nail down the invariant nodes vector type
6389      and possibly unshare it if it needs a different vector type than
6390      other referrers.  */
6391   if (res)
6392     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6393       if (child
6394           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6395               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6396           /* Perform usual caching, note code-generation still
6397              code-gens these nodes multiple times but we expect
6398              to CSE them later.  */
6399           && !visited_set.add (child))
6400         {
6401           visited_vec.safe_push (child);
6402           /* ???  After auditing more code paths make a "default"
6403              and push the vector type from NODE to all children
6404              if it is not already set.  */
6405           /* Compute the number of vectors to be generated.  */
6406           tree vector_type = SLP_TREE_VECTYPE (child);
6407           if (!vector_type)
6408             {
6409               /* For shifts with a scalar argument we don't need
6410                  to cost or code-generate anything.
6411                  ???  Represent this more explicitely.  */
6412               gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6413                            == shift_vec_info_type)
6414                           && j == 1);
6415               continue;
6416             }
6417           unsigned group_size = SLP_TREE_LANES (child);
6418           poly_uint64 vf = 1;
6419           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6420             vf = loop_vinfo->vectorization_factor;
6421           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6422             = vect_get_num_vectors (vf * group_size, vector_type);
6423           /* And cost them.  */
6424           vect_prologue_cost_for_slp (child, cost_vec);
6425         }
6426
6427   /* If this node or any of its children can't be vectorized, try pruning
6428      the tree here rather than felling the whole thing.  */
6429   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6430     {
6431       /* We'll need to revisit this for invariant costing and number
6432          of vectorized stmt setting.   */
6433       res = true;
6434     }
6435
6436   return res;
6437 }
6438
6439 /* Given a definition DEF, analyze if it will have any live scalar use after
6440    performing SLP vectorization whose information is represented by BB_VINFO,
6441    and record result into hash map SCALAR_USE_MAP as cache for later fast
6442    check.  If recursion DEPTH exceeds a limit, stop analysis and make a
6443    conservative assumption.  Return 0 if no scalar use, 1 if there is, -1
6444    means recursion is limited.  */
6445
6446 static int
6447 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6448                         hash_map<tree, int> &scalar_use_map,
6449                         int depth = 0)
6450 {
6451   const int depth_limit = 2;
6452   imm_use_iterator use_iter;
6453   gimple *use_stmt;
6454
6455   if (int *res = scalar_use_map.get (def))
6456     return *res;
6457
6458   int scalar_use = 1;
6459
6460   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6461     {
6462       if (is_gimple_debug (use_stmt))
6463         continue;
6464
6465       stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6466
6467       if (!use_stmt_info)
6468         break;
6469
6470       if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6471         continue;
6472
6473       /* Do not step forward when encounter PHI statement, since it may
6474          involve cyclic reference and cause infinite recursive invocation.  */
6475       if (gimple_code (use_stmt) == GIMPLE_PHI)
6476         break;
6477
6478       /* When pattern recognition is involved, a statement whose definition is
6479          consumed in some pattern, may not be included in the final replacement
6480          pattern statements, so would be skipped when building SLP graph.
6481
6482          * Original
6483           char a_c = *(char *) a;
6484           char b_c = *(char *) b;
6485           unsigned short a_s = (unsigned short) a_c;
6486           int a_i = (int) a_s;
6487           int b_i = (int) b_c;
6488           int r_i = a_i - b_i;
6489
6490          * After pattern replacement
6491           a_s = (unsigned short) a_c;
6492           a_i = (int) a_s;
6493
6494           patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
6495           patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
6496
6497           patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
6498           patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
6499
6500          The definitions of a_i(original statement) and b_i(pattern statement)
6501          are related to, but actually not part of widen_minus pattern.
6502          Vectorizing the pattern does not cause these definition statements to
6503          be marked as PURE_SLP.  For this case, we need to recursively check
6504          whether their uses are all absorbed into vectorized code.  But there
6505          is an exception that some use may participate in an vectorized
6506          operation via an external SLP node containing that use as an element.
6507          The parameter "scalar_use_map" tags such kind of SSA as having scalar
6508          use in advance.  */
6509       tree lhs = gimple_get_lhs (use_stmt);
6510
6511       if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6512         break;
6513
6514       if (depth_limit && depth >= depth_limit)
6515         return -1;
6516
6517       if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6518                                                 depth + 1)))
6519         break;
6520     }
6521
6522   if (end_imm_use_stmt_p (&use_iter))
6523     scalar_use = 0;
6524
6525   /* If recursion is limited, do not cache result for non-root defs.  */
6526   if (!depth || scalar_use >= 0)
6527     {
6528       bool added = scalar_use_map.put (def, scalar_use);
6529       gcc_assert (!added);
6530     }
6531
6532   return scalar_use;
6533 }
6534
6535 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6536    region and that can be vectorized using vectorizable_live_operation
6537    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
6538    scalar code computing it to be retained.  */
6539
6540 static void
6541 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6542                              slp_instance instance,
6543                              stmt_vector_for_cost *cost_vec,
6544                              hash_map<tree, int> &scalar_use_map,
6545                              hash_set<stmt_vec_info> &svisited,
6546                              hash_set<slp_tree> &visited)
6547 {
6548   if (visited.add (node))
6549     return;
6550
6551   unsigned i;
6552   stmt_vec_info stmt_info;
6553   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6554   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6555     {
6556       if (svisited.contains (stmt_info))
6557         continue;
6558       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6559       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6560           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6561         /* Only the pattern root stmt computes the original scalar value.  */
6562         continue;
6563       bool mark_visited = true;
6564       gimple *orig_stmt = orig_stmt_info->stmt;
6565       ssa_op_iter op_iter;
6566       def_operand_p def_p;
6567       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6568         {
6569           if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6570                                       scalar_use_map))
6571             {
6572               STMT_VINFO_LIVE_P (stmt_info) = true;
6573               if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6574                                                instance, i, false, cost_vec))
6575                 /* ???  So we know we can vectorize the live stmt from one SLP
6576                    node.  If we cannot do so from all or none consistently
6577                    we'd have to record which SLP node (and lane) we want to
6578                    use for the live operation.  So make sure we can
6579                    code-generate from all nodes.  */
6580                 mark_visited = false;
6581               else
6582                 STMT_VINFO_LIVE_P (stmt_info) = false;
6583             }
6584
6585           /* We have to verify whether we can insert the lane extract
6586              before all uses.  The following is a conservative approximation.
6587              We cannot put this into vectorizable_live_operation because
6588              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6589              doesn't work.
6590              Note that while the fact that we emit code for loads at the
6591              first load should make this a non-problem leafs we construct
6592              from scalars are vectorized after the last scalar def.
6593              ???  If we'd actually compute the insert location during
6594              analysis we could use sth less conservative than the last
6595              scalar stmt in the node for the dominance check.  */
6596           /* ???  What remains is "live" uses in vector CTORs in the same
6597              SLP graph which is where those uses can end up code-generated
6598              right after their definition instead of close to their original
6599              use.  But that would restrict us to code-generate lane-extracts
6600              from the latest stmt in a node.  So we compensate for this
6601              during code-generation, simply not replacing uses for those
6602              hopefully rare cases.  */
6603           imm_use_iterator use_iter;
6604           gimple *use_stmt;
6605           stmt_vec_info use_stmt_info;
6606
6607           if (STMT_VINFO_LIVE_P (stmt_info))
6608             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6609               if (!is_gimple_debug (use_stmt)
6610                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6611                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6612                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6613                 {
6614                   if (dump_enabled_p ())
6615                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6616                                      "Cannot determine insertion place for "
6617                                      "lane extract\n");
6618                   STMT_VINFO_LIVE_P (stmt_info) = false;
6619                   mark_visited = true;
6620                 }
6621         }
6622       if (mark_visited)
6623         svisited.add (stmt_info);
6624     }
6625
6626   slp_tree child;
6627   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6628     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6629       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6630                                    scalar_use_map, svisited, visited);
6631 }
6632
6633 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6634    are live outside of the basic-block vectorized region and that can be
6635    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
6636
6637 static void
6638 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6639 {
6640   if (bb_vinfo->slp_instances.is_empty ())
6641     return;
6642
6643   hash_set<stmt_vec_info> svisited;
6644   hash_set<slp_tree> visited;
6645   hash_map<tree, int> scalar_use_map;
6646   auto_vec<slp_tree> worklist;
6647
6648   for (slp_instance instance : bb_vinfo->slp_instances)
6649     {
6650       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6651         for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6652           if (TREE_CODE (op) == SSA_NAME)
6653             scalar_use_map.put (op, 1);
6654       if (!visited.add (SLP_INSTANCE_TREE (instance)))
6655         worklist.safe_push (SLP_INSTANCE_TREE (instance));
6656     }
6657
6658   do
6659     {
6660       slp_tree node = worklist.pop ();
6661
6662       if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6663         {
6664           for (tree op : SLP_TREE_SCALAR_OPS (node))
6665             if (TREE_CODE (op) == SSA_NAME)
6666               scalar_use_map.put (op, 1);
6667         }
6668       else
6669         {
6670           for (slp_tree child : SLP_TREE_CHILDREN (node))
6671             if (child && !visited.add (child))
6672               worklist.safe_push (child);
6673         }
6674     }
6675   while (!worklist.is_empty ());
6676
6677   visited.empty ();
6678
6679   for (slp_instance instance : bb_vinfo->slp_instances)
6680     {
6681       vect_location = instance->location ();
6682       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6683                                    instance, &instance->cost_vec,
6684                                    scalar_use_map, svisited, visited);
6685     }
6686 }
6687
6688 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
6689
6690 static bool
6691 vectorizable_bb_reduc_epilogue (slp_instance instance,
6692                                 stmt_vector_for_cost *cost_vec)
6693 {
6694   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6695   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6696   if (reduc_code == MINUS_EXPR)
6697     reduc_code = PLUS_EXPR;
6698   internal_fn reduc_fn;
6699   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6700   if (!vectype
6701       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6702       || reduc_fn == IFN_LAST
6703       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6704       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6705                                      TREE_TYPE (vectype)))
6706     {
6707       if (dump_enabled_p ())
6708         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6709                          "not vectorized: basic block reduction epilogue "
6710                          "operation unsupported.\n");
6711       return false;
6712     }
6713
6714   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6715      cost log2 vector operations plus shuffles and one extraction.  */
6716   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6717   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6718                     vectype, 0, vect_body);
6719   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6720                     vectype, 0, vect_body);
6721   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6722                     vectype, 0, vect_body);
6723
6724   /* Since we replace all stmts of a possibly longer scalar reduction
6725      chain account for the extra scalar stmts for that.  */
6726   record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6727                     instance->root_stmts[0], 0, vect_body);
6728   return true;
6729 }
6730
6731 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6732    and recurse to children.  */
6733
6734 static void
6735 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6736                               hash_set<slp_tree> &visited)
6737 {
6738   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6739       || visited.add (node))
6740     return;
6741
6742   stmt_vec_info stmt;
6743   unsigned i;
6744   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6745     roots.remove (vect_orig_stmt (stmt));
6746
6747   slp_tree child;
6748   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6749     if (child)
6750       vect_slp_prune_covered_roots (child, roots, visited);
6751 }
6752
6753 /* Analyze statements in SLP instances of VINFO.  Return true if the
6754    operations are supported. */
6755
6756 bool
6757 vect_slp_analyze_operations (vec_info *vinfo)
6758 {
6759   slp_instance instance;
6760   int i;
6761
6762   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6763
6764   hash_set<slp_tree> visited;
6765   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6766     {
6767       auto_vec<slp_tree> visited_vec;
6768       stmt_vector_for_cost cost_vec;
6769       cost_vec.create (2);
6770       if (is_a <bb_vec_info> (vinfo))
6771         vect_location = instance->location ();
6772       if (!vect_slp_analyze_node_operations (vinfo,
6773                                              SLP_INSTANCE_TREE (instance),
6774                                              instance, visited, visited_vec,
6775                                              &cost_vec)
6776           /* CTOR instances require vectorized defs for the SLP tree root.  */
6777           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6778               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6779                   != vect_internal_def
6780                   /* Make sure we vectorized with the expected type.  */
6781                   || !useless_type_conversion_p
6782                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6783                                               (instance->root_stmts[0]->stmt))),
6784                          TREE_TYPE (SLP_TREE_VECTYPE
6785                                             (SLP_INSTANCE_TREE (instance))))))
6786           /* Check we can vectorize the reduction.  */
6787           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6788               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6789         {
6790           slp_tree node = SLP_INSTANCE_TREE (instance);
6791           stmt_vec_info stmt_info;
6792           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6793             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6794           else
6795             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6796           if (dump_enabled_p ())
6797             dump_printf_loc (MSG_NOTE, vect_location,
6798                              "removing SLP instance operations starting from: %G",
6799                              stmt_info->stmt);
6800           vect_free_slp_instance (instance);
6801           vinfo->slp_instances.ordered_remove (i);
6802           cost_vec.release ();
6803           while (!visited_vec.is_empty ())
6804             visited.remove (visited_vec.pop ());
6805         }
6806       else
6807         {
6808           i++;
6809           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6810             {
6811               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6812               cost_vec.release ();
6813             }
6814           else
6815             /* For BB vectorization remember the SLP graph entry
6816                cost for later.  */
6817             instance->cost_vec = cost_vec;
6818         }
6819     }
6820
6821   /* Now look for SLP instances with a root that are covered by other
6822      instances and remove them.  */
6823   hash_set<stmt_vec_info> roots;
6824   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6825     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6826       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6827   if (!roots.is_empty ())
6828     {
6829       visited.empty ();
6830       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6831         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6832                                       visited);
6833       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6834         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6835             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6836           {
6837             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6838             if (dump_enabled_p ())
6839               dump_printf_loc (MSG_NOTE, vect_location,
6840                                "removing SLP instance operations starting "
6841                                "from: %G", root->stmt);
6842             vect_free_slp_instance (instance);
6843             vinfo->slp_instances.ordered_remove (i);
6844           }
6845         else
6846           ++i;
6847     }
6848
6849   /* Compute vectorizable live stmts.  */
6850   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6851     vect_bb_slp_mark_live_stmts (bb_vinfo);
6852
6853   return !vinfo->slp_instances.is_empty ();
6854 }
6855
6856 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6857    closing the eventual chain.  */
6858
6859 static slp_instance
6860 get_ultimate_leader (slp_instance instance,
6861                      hash_map<slp_instance, slp_instance> &instance_leader)
6862 {
6863   auto_vec<slp_instance *, 8> chain;
6864   slp_instance *tem;
6865   while (*(tem = instance_leader.get (instance)) != instance)
6866     {
6867       chain.safe_push (tem);
6868       instance = *tem;
6869     }
6870   while (!chain.is_empty ())
6871     *chain.pop () = instance;
6872   return instance;
6873 }
6874
6875 namespace {
6876 /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
6877    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6878    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
6879
6880    INSTANCE_LEADER is as for get_ultimate_leader.  */
6881
6882 template<typename T>
6883 bool
6884 vect_map_to_instance (slp_instance instance, T key,
6885                       hash_map<T, slp_instance> &key_to_instance,
6886                       hash_map<slp_instance, slp_instance> &instance_leader)
6887 {
6888   bool existed_p;
6889   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6890   if (!existed_p)
6891     ;
6892   else if (key_instance != instance)
6893     {
6894       /* If we're running into a previously marked key make us the
6895          leader of the current ultimate leader.  This keeps the
6896          leader chain acyclic and works even when the current instance
6897          connects two previously independent graph parts.  */
6898       slp_instance key_leader
6899         = get_ultimate_leader (key_instance, instance_leader);
6900       if (key_leader != instance)
6901         instance_leader.put (key_leader, instance);
6902     }
6903   key_instance = instance;
6904   return existed_p;
6905 }
6906 }
6907
6908 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
6909
6910 static void
6911 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6912                            slp_instance instance, slp_tree node,
6913                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6914                            hash_map<slp_tree, slp_instance> &node_to_instance,
6915                            hash_map<slp_instance, slp_instance> &instance_leader)
6916 {
6917   stmt_vec_info stmt_info;
6918   unsigned i;
6919
6920   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6921     vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6922                           instance_leader);
6923
6924   if (vect_map_to_instance (instance, node, node_to_instance,
6925                             instance_leader))
6926     return;
6927
6928   slp_tree child;
6929   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6930     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6931       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6932                                  node_to_instance, instance_leader);
6933 }
6934
6935 /* Partition the SLP graph into pieces that can be costed independently.  */
6936
6937 static void
6938 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6939 {
6940   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6941
6942   /* First walk the SLP graph assigning each involved scalar stmt a
6943      corresponding SLP graph entry and upon visiting a previously
6944      marked stmt, make the stmts leader the current SLP graph entry.  */
6945   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6946   hash_map<slp_tree, slp_instance> node_to_instance;
6947   hash_map<slp_instance, slp_instance> instance_leader;
6948   slp_instance instance;
6949   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6950     {
6951       instance_leader.put (instance, instance);
6952       vect_bb_partition_graph_r (bb_vinfo,
6953                                  instance, SLP_INSTANCE_TREE (instance),
6954                                  stmt_to_instance, node_to_instance,
6955                                  instance_leader);
6956     }
6957
6958   /* Then collect entries to each independent subgraph.  */
6959   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6960     {
6961       slp_instance leader = get_ultimate_leader (instance, instance_leader);
6962       leader->subgraph_entries.safe_push (instance);
6963       if (dump_enabled_p ()
6964           && leader != instance)
6965         dump_printf_loc (MSG_NOTE, vect_location,
6966                          "instance %p is leader of %p\n",
6967                          (void *) leader, (void *) instance);
6968     }
6969 }
6970
6971 /* Compute the set of scalar stmts participating in internal and external
6972    nodes.  */
6973
6974 static void
6975 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6976                                          hash_set<slp_tree> &visited,
6977                                          hash_set<stmt_vec_info> &vstmts,
6978                                          hash_set<stmt_vec_info> &estmts)
6979 {
6980   int i;
6981   stmt_vec_info stmt_info;
6982   slp_tree child;
6983
6984   if (visited.add (node))
6985     return;
6986
6987   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6988     {
6989       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6990         vstmts.add (stmt_info);
6991
6992       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6993         if (child)
6994           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6995                                                    vstmts, estmts);
6996     }
6997   else
6998     for (tree def : SLP_TREE_SCALAR_OPS (node))
6999       {
7000         stmt_vec_info def_stmt = vinfo->lookup_def (def);
7001         if (def_stmt)
7002           estmts.add (def_stmt);
7003       }
7004 }
7005
7006
7007 /* Compute the scalar cost of the SLP node NODE and its children
7008    and return it.  Do not account defs that are marked in LIFE and
7009    update LIFE according to uses of NODE.  */
7010
7011 static void
7012 vect_bb_slp_scalar_cost (vec_info *vinfo,
7013                          slp_tree node, vec<bool, va_heap> *life,
7014                          stmt_vector_for_cost *cost_vec,
7015                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7016                          hash_set<slp_tree> &visited)
7017 {
7018   unsigned i;
7019   stmt_vec_info stmt_info;
7020   slp_tree child;
7021
7022   if (visited.add (node))
7023     return;
7024
7025   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7026     {
7027       ssa_op_iter op_iter;
7028       def_operand_p def_p;
7029
7030       if ((*life)[i])
7031         continue;
7032
7033       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7034       gimple *orig_stmt = orig_stmt_info->stmt;
7035
7036       /* If there is a non-vectorized use of the defs then the scalar
7037          stmt is kept live in which case we do not account it or any
7038          required defs in the SLP children in the scalar cost.  This
7039          way we make the vectorization more costly when compared to
7040          the scalar cost.  */
7041       if (!STMT_VINFO_LIVE_P (stmt_info))
7042         {
7043           auto_vec<gimple *, 8> worklist;
7044           hash_set<gimple *> *worklist_visited = NULL;
7045           worklist.quick_push (orig_stmt);
7046           do
7047             {
7048               gimple *work_stmt = worklist.pop ();
7049               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7050                 {
7051                   imm_use_iterator use_iter;
7052                   gimple *use_stmt;
7053                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7054                                          DEF_FROM_PTR (def_p))
7055                     if (!is_gimple_debug (use_stmt))
7056                       {
7057                         stmt_vec_info use_stmt_info
7058                           = vinfo->lookup_stmt (use_stmt);
7059                         if (!use_stmt_info
7060                             || !vectorized_scalar_stmts.contains (use_stmt_info))
7061                           {
7062                             if (use_stmt_info
7063                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7064                               {
7065                                 /* For stmts participating in patterns we have
7066                                    to check its uses recursively.  */
7067                                 if (!worklist_visited)
7068                                   worklist_visited = new hash_set<gimple *> ();
7069                                 if (!worklist_visited->add (use_stmt))
7070                                   worklist.safe_push (use_stmt);
7071                                 continue;
7072                               }
7073                             (*life)[i] = true;
7074                             goto next_lane;
7075                           }
7076                       }
7077                 }
7078             }
7079           while (!worklist.is_empty ());
7080 next_lane:
7081           if (worklist_visited)
7082             delete worklist_visited;
7083           if ((*life)[i])
7084             continue;
7085         }
7086
7087       /* Count scalar stmts only once.  */
7088       if (gimple_visited_p (orig_stmt))
7089         continue;
7090       gimple_set_visited (orig_stmt, true);
7091
7092       vect_cost_for_stmt kind;
7093       if (STMT_VINFO_DATA_REF (orig_stmt_info))
7094         {
7095           if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7096             kind = scalar_load;
7097           else
7098             kind = scalar_store;
7099         }
7100       else if (vect_nop_conversion_p (orig_stmt_info))
7101         continue;
7102       /* For single-argument PHIs assume coalescing which means zero cost
7103          for the scalar and the vector PHIs.  This avoids artificially
7104          favoring the vector path (but may pessimize it in some cases).  */
7105       else if (is_a <gphi *> (orig_stmt_info->stmt)
7106                && gimple_phi_num_args
7107                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7108         continue;
7109       else
7110         kind = scalar_stmt;
7111       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7112                         SLP_TREE_VECTYPE (node), 0, vect_body);
7113     }
7114
7115   auto_vec<bool, 20> subtree_life;
7116   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7117     {
7118       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7119         {
7120           /* Do not directly pass LIFE to the recursive call, copy it to
7121              confine changes in the callee to the current child/subtree.  */
7122           if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7123             {
7124               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7125               for (unsigned j = 0;
7126                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7127                 {
7128                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7129                   if (perm.first == i)
7130                     subtree_life[perm.second] = (*life)[j];
7131                 }
7132             }
7133           else
7134             {
7135               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7136               subtree_life.safe_splice (*life);
7137             }
7138           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7139                                    vectorized_scalar_stmts, visited);
7140           subtree_life.truncate (0);
7141         }
7142     }
7143 }
7144
7145 /* Comparator for the loop-index sorted cost vectors.  */
7146
7147 static int
7148 li_cost_vec_cmp (const void *a_, const void *b_)
7149 {
7150   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7151   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7152   if (a->first < b->first)
7153     return -1;
7154   else if (a->first == b->first)
7155     return 0;
7156   return 1;
7157 }
7158
7159 /* Check if vectorization of the basic block is profitable for the
7160    subgraph denoted by SLP_INSTANCES.  */
7161
7162 static bool
7163 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7164                                     vec<slp_instance> slp_instances,
7165                                     loop_p orig_loop)
7166 {
7167   slp_instance instance;
7168   int i;
7169   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7170   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7171
7172   if (dump_enabled_p ())
7173     {
7174       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7175       hash_set<slp_tree> visited;
7176       FOR_EACH_VEC_ELT (slp_instances, i, instance)
7177         vect_print_slp_graph (MSG_NOTE, vect_location,
7178                               SLP_INSTANCE_TREE (instance), visited);
7179     }
7180
7181   /* Compute the set of scalar stmts we know will go away 'locally' when
7182      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
7183      not accurate for nodes promoted extern late or for scalar stmts that
7184      are used both in extern defs and in vectorized defs.  */
7185   hash_set<stmt_vec_info> vectorized_scalar_stmts;
7186   hash_set<stmt_vec_info> scalar_stmts_in_externs;
7187   hash_set<slp_tree> visited;
7188   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7189     {
7190       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7191                                                SLP_INSTANCE_TREE (instance),
7192                                                visited,
7193                                                vectorized_scalar_stmts,
7194                                                scalar_stmts_in_externs);
7195       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7196         vectorized_scalar_stmts.add (rstmt);
7197     }
7198   /* Scalar stmts used as defs in external nodes need to be preseved, so
7199      remove them from vectorized_scalar_stmts.  */
7200   for (stmt_vec_info stmt : scalar_stmts_in_externs)
7201     vectorized_scalar_stmts.remove (stmt);
7202
7203   /* Calculate scalar cost and sum the cost for the vector stmts
7204      previously collected.  */
7205   stmt_vector_for_cost scalar_costs = vNULL;
7206   stmt_vector_for_cost vector_costs = vNULL;
7207   visited.empty ();
7208   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7209     {
7210       auto_vec<bool, 20> life;
7211       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7212                               true);
7213       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7214         record_stmt_cost (&scalar_costs,
7215                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
7216                           scalar_stmt,
7217                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7218       vect_bb_slp_scalar_cost (bb_vinfo,
7219                                SLP_INSTANCE_TREE (instance),
7220                                &life, &scalar_costs, vectorized_scalar_stmts,
7221                                visited);
7222       vector_costs.safe_splice (instance->cost_vec);
7223       instance->cost_vec.release ();
7224     }
7225
7226   if (dump_enabled_p ())
7227     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7228
7229   /* When costing non-loop vectorization we need to consider each covered
7230      loop independently and make sure vectorization is profitable.  For
7231      now we assume a loop may be not entered or executed an arbitrary
7232      number of iterations (???  static information can provide more
7233      precise info here) which means we can simply cost each containing
7234      loops stmts separately.  */
7235
7236   /* First produce cost vectors sorted by loop index.  */
7237   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7238     li_scalar_costs (scalar_costs.length ());
7239   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7240     li_vector_costs (vector_costs.length ());
7241   stmt_info_for_cost *cost;
7242   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7243     {
7244       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7245       li_scalar_costs.quick_push (std::make_pair (l, cost));
7246     }
7247   /* Use a random used loop as fallback in case the first vector_costs
7248      entry does not have a stmt_info associated with it.  */
7249   unsigned l = li_scalar_costs[0].first;
7250   FOR_EACH_VEC_ELT (vector_costs, i, cost)
7251     {
7252       /* We inherit from the previous COST, invariants, externals and
7253          extracts immediately follow the cost for the related stmt.  */
7254       if (cost->stmt_info)
7255         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7256       li_vector_costs.quick_push (std::make_pair (l, cost));
7257     }
7258   li_scalar_costs.qsort (li_cost_vec_cmp);
7259   li_vector_costs.qsort (li_cost_vec_cmp);
7260
7261   /* Now cost the portions individually.  */
7262   unsigned vi = 0;
7263   unsigned si = 0;
7264   bool profitable = true;
7265   while (si < li_scalar_costs.length ()
7266          && vi < li_vector_costs.length ())
7267     {
7268       unsigned sl = li_scalar_costs[si].first;
7269       unsigned vl = li_vector_costs[vi].first;
7270       if (sl != vl)
7271         {
7272           if (dump_enabled_p ())
7273             dump_printf_loc (MSG_NOTE, vect_location,
7274                              "Scalar %d and vector %d loop part do not "
7275                              "match up, skipping scalar part\n", sl, vl);
7276           /* Skip the scalar part, assuming zero cost on the vector side.  */
7277           do
7278             {
7279               si++;
7280             }
7281           while (si < li_scalar_costs.length ()
7282                  && li_scalar_costs[si].first == sl);
7283           continue;
7284         }
7285
7286       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7287       do
7288         {
7289           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7290           si++;
7291         }
7292       while (si < li_scalar_costs.length ()
7293              && li_scalar_costs[si].first == sl);
7294       unsigned dummy;
7295       finish_cost (scalar_target_cost_data, nullptr,
7296                    &dummy, &scalar_cost, &dummy);
7297
7298       /* Complete the target-specific vector cost calculation.  */
7299       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7300       do
7301         {
7302           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7303           vi++;
7304         }
7305       while (vi < li_vector_costs.length ()
7306              && li_vector_costs[vi].first == vl);
7307       finish_cost (vect_target_cost_data, scalar_target_cost_data,
7308                    &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7309       delete scalar_target_cost_data;
7310       delete vect_target_cost_data;
7311
7312       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7313
7314       if (dump_enabled_p ())
7315         {
7316           dump_printf_loc (MSG_NOTE, vect_location,
7317                            "Cost model analysis for part in loop %d:\n", sl);
7318           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
7319                        vec_inside_cost + vec_outside_cost);
7320           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
7321         }
7322
7323       /* Vectorization is profitable if its cost is more than the cost of scalar
7324          version.  Note that we err on the vector side for equal cost because
7325          the cost estimate is otherwise quite pessimistic (constant uses are
7326          free on the scalar side but cost a load on the vector side for
7327          example).  */
7328       if (vec_outside_cost + vec_inside_cost > scalar_cost)
7329         {
7330           profitable = false;
7331           break;
7332         }
7333     }
7334   if (profitable && vi < li_vector_costs.length ())
7335     {
7336       if (dump_enabled_p ())
7337         dump_printf_loc (MSG_NOTE, vect_location,
7338                          "Excess vector cost for part in loop %d:\n",
7339                          li_vector_costs[vi].first);
7340       profitable = false;
7341     }
7342
7343   /* Unset visited flag.  This is delayed when the subgraph is profitable
7344      and we process the loop for remaining unvectorized if-converted code.  */
7345   if (!orig_loop || !profitable)
7346     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7347       gimple_set_visited  (cost->stmt_info->stmt, false);
7348
7349   scalar_costs.release ();
7350   vector_costs.release ();
7351
7352   return profitable;
7353 }
7354
7355 /* qsort comparator for lane defs.  */
7356
7357 static int
7358 vld_cmp (const void *a_, const void *b_)
7359 {
7360   auto *a = (const std::pair<unsigned, tree> *)a_;
7361   auto *b = (const std::pair<unsigned, tree> *)b_;
7362   return a->first - b->first;
7363 }
7364
7365 /* Return true if USE_STMT is a vector lane insert into VEC and set
7366    *THIS_LANE to the lane number that is set.  */
7367
7368 static bool
7369 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7370 {
7371   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7372   if (!use_ass
7373       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7374       || (vec
7375           ? gimple_assign_rhs1 (use_ass) != vec
7376           : ((vec = gimple_assign_rhs1 (use_ass)), false))
7377       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7378                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7379       || !constant_multiple_p
7380             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7381              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7382              this_lane))
7383     return false;
7384   return true;
7385 }
7386
7387 /* Find any vectorizable constructors and add them to the grouped_store
7388    array.  */
7389
7390 static void
7391 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7392 {
7393   for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7394     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7395          !gsi_end_p (gsi); gsi_next (&gsi))
7396     {
7397       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7398       if (!assign)
7399         continue;
7400
7401       tree rhs = gimple_assign_rhs1 (assign);
7402       enum tree_code code = gimple_assign_rhs_code (assign);
7403       use_operand_p use_p;
7404       gimple *use_stmt;
7405       if (code == CONSTRUCTOR)
7406         {
7407           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7408               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7409                            CONSTRUCTOR_NELTS (rhs))
7410               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7411               || uniform_vector_p (rhs))
7412             continue;
7413
7414           unsigned j;
7415           tree val;
7416           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7417             if (TREE_CODE (val) != SSA_NAME
7418                 || !bb_vinfo->lookup_def (val))
7419               break;
7420           if (j != CONSTRUCTOR_NELTS (rhs))
7421             continue;
7422
7423           vec<stmt_vec_info> roots = vNULL;
7424           roots.safe_push (bb_vinfo->lookup_stmt (assign));
7425           vec<stmt_vec_info> stmts;
7426           stmts.create (CONSTRUCTOR_NELTS (rhs));
7427           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7428             stmts.quick_push
7429               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7430           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7431                                                stmts, roots));
7432         }
7433       else if (code == BIT_INSERT_EXPR
7434                && VECTOR_TYPE_P (TREE_TYPE (rhs))
7435                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7436                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7437                && integer_zerop (gimple_assign_rhs3 (assign))
7438                && useless_type_conversion_p
7439                     (TREE_TYPE (TREE_TYPE (rhs)),
7440                      TREE_TYPE (gimple_assign_rhs2 (assign)))
7441                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7442         {
7443           /* We start to match on insert to lane zero but since the
7444              inserts need not be ordered we'd have to search both
7445              the def and the use chains.  */
7446           tree vectype = TREE_TYPE (rhs);
7447           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7448           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7449           auto_sbitmap lanes (nlanes);
7450           bitmap_clear (lanes);
7451           bitmap_set_bit (lanes, 0);
7452           tree def = gimple_assign_lhs (assign);
7453           lane_defs.quick_push
7454                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
7455           unsigned lanes_found = 1;
7456           /* Start with the use chains, the last stmt will be the root.  */
7457           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7458           vec<stmt_vec_info> roots = vNULL;
7459           roots.safe_push (last);
7460           do
7461             {
7462               use_operand_p use_p;
7463               gimple *use_stmt;
7464               if (!single_imm_use (def, &use_p, &use_stmt))
7465                 break;
7466               unsigned this_lane;
7467               if (!bb_vinfo->lookup_stmt (use_stmt)
7468                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7469                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7470                 break;
7471               if (bitmap_bit_p (lanes, this_lane))
7472                 break;
7473               lanes_found++;
7474               bitmap_set_bit (lanes, this_lane);
7475               gassign *use_ass = as_a <gassign *> (use_stmt);
7476               lane_defs.quick_push (std::make_pair
7477                                      (this_lane, gimple_assign_rhs2 (use_ass)));
7478               last = bb_vinfo->lookup_stmt (use_ass);
7479               roots.safe_push (last);
7480               def = gimple_assign_lhs (use_ass);
7481             }
7482           while (lanes_found < nlanes);
7483           if (roots.length () > 1)
7484             std::swap(roots[0], roots[roots.length () - 1]);
7485           if (lanes_found < nlanes)
7486             {
7487               /* Now search the def chain.  */
7488               def = gimple_assign_rhs1 (assign);
7489               do
7490                 {
7491                   if (TREE_CODE (def) != SSA_NAME
7492                       || !has_single_use (def))
7493                     break;
7494                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7495                   unsigned this_lane;
7496                   if (!bb_vinfo->lookup_stmt (def_stmt)
7497                       || !vect_slp_is_lane_insert (def_stmt,
7498                                                    NULL_TREE, &this_lane)
7499                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7500                     break;
7501                   if (bitmap_bit_p (lanes, this_lane))
7502                     break;
7503                   lanes_found++;
7504                   bitmap_set_bit (lanes, this_lane);
7505                   lane_defs.quick_push (std::make_pair
7506                                           (this_lane,
7507                                            gimple_assign_rhs2 (def_stmt)));
7508                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7509                   def = gimple_assign_rhs1 (def_stmt);
7510                 }
7511               while (lanes_found < nlanes);
7512             }
7513           if (lanes_found == nlanes)
7514             {
7515               /* Sort lane_defs after the lane index and register the root.  */
7516               lane_defs.qsort (vld_cmp);
7517               vec<stmt_vec_info> stmts;
7518               stmts.create (nlanes);
7519               for (unsigned i = 0; i < nlanes; ++i)
7520                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7521               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7522                                                    stmts, roots));
7523             }
7524           else
7525             roots.release ();
7526         }
7527       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7528                && (associative_tree_code (code) || code == MINUS_EXPR)
7529                /* ???  This pessimizes a two-element reduction.  PR54400.
7530                   ???  In-order reduction could be handled if we only
7531                   traverse one operand chain in vect_slp_linearize_chain.  */
7532                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7533                /* Ops with constants at the tail can be stripped here.  */
7534                && TREE_CODE (rhs) == SSA_NAME
7535                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7536                /* Should be the chain end.  */
7537                && (!single_imm_use (gimple_assign_lhs (assign),
7538                                     &use_p, &use_stmt)
7539                    || !is_gimple_assign (use_stmt)
7540                    || (gimple_assign_rhs_code (use_stmt) != code
7541                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
7542                            || (gimple_assign_rhs_code (use_stmt)
7543                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7544         {
7545           /* We start the match at the end of a possible association
7546              chain.  */
7547           auto_vec<chain_op_t> chain;
7548           auto_vec<std::pair<tree_code, gimple *> > worklist;
7549           auto_vec<gimple *> chain_stmts;
7550           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7551           if (code == MINUS_EXPR)
7552             code = PLUS_EXPR;
7553           internal_fn reduc_fn;
7554           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7555               || reduc_fn == IFN_LAST)
7556             continue;
7557           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7558                                     /* ??? */
7559                                     code_stmt, alt_code_stmt, &chain_stmts);
7560           if (chain.length () > 1)
7561             {
7562               /* Sort the chain according to def_type and operation.  */
7563               chain.sort (dt_sort_cmp, bb_vinfo);
7564               /* ???  Now we'd want to strip externals and constants
7565                  but record those to be handled in the epilogue.  */
7566               /* ???  For now do not allow mixing ops or externs/constants.  */
7567               bool invalid = false;
7568               unsigned remain_cnt = 0;
7569               unsigned last_idx = 0;
7570               for (unsigned i = 0; i < chain.length (); ++i)
7571                 {
7572                   if (chain[i].code != code)
7573                     {
7574                       invalid = true;
7575                       break;
7576                     }
7577                   if (chain[i].dt != vect_internal_def
7578                       /* Avoid stmts where the def is not the LHS, like
7579                          ASMs.  */
7580                       || (gimple_get_lhs (bb_vinfo->lookup_def
7581                                                       (chain[i].op)->stmt)
7582                           != chain[i].op))
7583                     remain_cnt++;
7584                   else
7585                     last_idx = i;
7586                 }
7587               /* Make sure to have an even number of lanes as we later do
7588                  all-or-nothing discovery, not trying to split further.  */
7589               if ((chain.length () - remain_cnt) & 1)
7590                 remain_cnt++;
7591               if (!invalid && chain.length () - remain_cnt > 1)
7592                 {
7593                   vec<stmt_vec_info> stmts;
7594                   vec<tree> remain = vNULL;
7595                   stmts.create (chain.length ());
7596                   if (remain_cnt > 0)
7597                     remain.create (remain_cnt);
7598                   for (unsigned i = 0; i < chain.length (); ++i)
7599                     {
7600                       stmt_vec_info stmt_info;
7601                       if (chain[i].dt == vect_internal_def
7602                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7603                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7604                           && (i != last_idx
7605                               || (stmts.length () & 1)))
7606                         stmts.quick_push (stmt_info);
7607                       else
7608                         remain.quick_push (chain[i].op);
7609                     }
7610                   vec<stmt_vec_info> roots;
7611                   roots.create (chain_stmts.length ());
7612                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
7613                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7614                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7615                                                        stmts, roots, remain));
7616                 }
7617             }
7618         }
7619     }
7620 }
7621
7622 /* Walk the grouped store chains and replace entries with their
7623    pattern variant if any.  */
7624
7625 static void
7626 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7627 {
7628   stmt_vec_info first_element;
7629   unsigned i;
7630
7631   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7632     {
7633       /* We also have CTORs in this array.  */
7634       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7635         continue;
7636       if (STMT_VINFO_IN_PATTERN_P (first_element))
7637         {
7638           stmt_vec_info orig = first_element;
7639           first_element = STMT_VINFO_RELATED_STMT (first_element);
7640           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7641           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7642           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7643           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7644           vinfo->grouped_stores[i] = first_element;
7645         }
7646       stmt_vec_info prev = first_element;
7647       while (DR_GROUP_NEXT_ELEMENT (prev))
7648         {
7649           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7650           if (STMT_VINFO_IN_PATTERN_P (elt))
7651             {
7652               stmt_vec_info orig = elt;
7653               elt = STMT_VINFO_RELATED_STMT (elt);
7654               DR_GROUP_NEXT_ELEMENT (prev) = elt;
7655               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7656               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7657             }
7658           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7659           prev = elt;
7660         }
7661     }
7662 }
7663
7664 /* Check if the region described by BB_VINFO can be vectorized, returning
7665    true if so.  When returning false, set FATAL to true if the same failure
7666    would prevent vectorization at other vector sizes, false if it is still
7667    worth trying other sizes.  N_STMTS is the number of statements in the
7668    region.  */
7669
7670 static bool
7671 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7672                        vec<int> *dataref_groups)
7673 {
7674   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7675
7676   slp_instance instance;
7677   int i;
7678   poly_uint64 min_vf = 2;
7679
7680   /* The first group of checks is independent of the vector size.  */
7681   fatal = true;
7682
7683   /* Analyze the data references.  */
7684
7685   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7686     {
7687       if (dump_enabled_p ())
7688         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7689                          "not vectorized: unhandled data-ref in basic "
7690                          "block.\n");
7691       return false;
7692     }
7693
7694   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7695     {
7696      if (dump_enabled_p ())
7697        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7698                         "not vectorized: unhandled data access in "
7699                         "basic block.\n");
7700       return false;
7701     }
7702
7703   vect_slp_check_for_roots (bb_vinfo);
7704
7705   /* If there are no grouped stores and no constructors in the region
7706      there is no need to continue with pattern recog as vect_analyze_slp
7707      will fail anyway.  */
7708   if (bb_vinfo->grouped_stores.is_empty ()
7709       && bb_vinfo->roots.is_empty ())
7710     {
7711       if (dump_enabled_p ())
7712         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7713                          "not vectorized: no grouped stores in "
7714                          "basic block.\n");
7715       return false;
7716     }
7717
7718   /* While the rest of the analysis below depends on it in some way.  */
7719   fatal = false;
7720
7721   vect_pattern_recog (bb_vinfo);
7722
7723   /* Update store groups from pattern processing.  */
7724   vect_fixup_store_groups_with_patterns (bb_vinfo);
7725
7726   /* Check the SLP opportunities in the basic block, analyze and build SLP
7727      trees.  */
7728   if (!vect_analyze_slp (bb_vinfo, n_stmts))
7729     {
7730       if (dump_enabled_p ())
7731         {
7732           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7733                            "Failed to SLP the basic block.\n");
7734           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7735                            "not vectorized: failed to find SLP opportunities "
7736                            "in basic block.\n");
7737         }
7738       return false;
7739     }
7740
7741   /* Optimize permutations.  */
7742   vect_optimize_slp (bb_vinfo);
7743
7744   /* Gather the loads reachable from the SLP graph entries.  */
7745   vect_gather_slp_loads (bb_vinfo);
7746
7747   vect_record_base_alignments (bb_vinfo);
7748
7749   /* Analyze and verify the alignment of data references and the
7750      dependence in the SLP instances.  */
7751   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7752     {
7753       vect_location = instance->location ();
7754       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7755           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7756         {
7757           slp_tree node = SLP_INSTANCE_TREE (instance);
7758           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7759           if (dump_enabled_p ())
7760             dump_printf_loc (MSG_NOTE, vect_location,
7761                              "removing SLP instance operations starting from: %G",
7762                              stmt_info->stmt);
7763           vect_free_slp_instance (instance);
7764           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7765           continue;
7766         }
7767
7768       /* Mark all the statements that we want to vectorize as pure SLP and
7769          relevant.  */
7770       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7771       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7772       unsigned j;
7773       stmt_vec_info root;
7774       /* Likewise consider instance root stmts as vectorized.  */
7775       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7776         STMT_SLP_TYPE (root) = pure_slp;
7777
7778       i++;
7779     }
7780   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7781     return false;
7782
7783   if (!vect_slp_analyze_operations (bb_vinfo))
7784     {
7785       if (dump_enabled_p ())
7786         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7787                          "not vectorized: bad operation in basic block.\n");
7788       return false;
7789     }
7790
7791   vect_bb_partition_graph (bb_vinfo);
7792
7793   return true;
7794 }
7795
7796 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
7797    basic blocks in BBS, returning true on success.
7798    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
7799
7800 static bool
7801 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7802                  vec<int> *dataref_groups, unsigned int n_stmts,
7803                  loop_p orig_loop)
7804 {
7805   bb_vec_info bb_vinfo;
7806   auto_vector_modes vector_modes;
7807
7808   /* Autodetect first vector size we try.  */
7809   machine_mode next_vector_mode = VOIDmode;
7810   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7811   unsigned int mode_i = 0;
7812
7813   vec_info_shared shared;
7814
7815   machine_mode autodetected_vector_mode = VOIDmode;
7816   while (1)
7817     {
7818       bool vectorized = false;
7819       bool fatal = false;
7820       bb_vinfo = new _bb_vec_info (bbs, &shared);
7821
7822       bool first_time_p = shared.datarefs.is_empty ();
7823       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7824       if (first_time_p)
7825         bb_vinfo->shared->save_datarefs ();
7826       else
7827         bb_vinfo->shared->check_datarefs ();
7828       bb_vinfo->vector_mode = next_vector_mode;
7829
7830       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7831         {
7832           if (dump_enabled_p ())
7833             {
7834               dump_printf_loc (MSG_NOTE, vect_location,
7835                                "***** Analysis succeeded with vector mode"
7836                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7837               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7838             }
7839
7840           bb_vinfo->shared->check_datarefs ();
7841
7842           bool force_clear = false;
7843           auto_vec<slp_instance> profitable_subgraphs;
7844           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7845             {
7846               if (instance->subgraph_entries.is_empty ())
7847                 continue;
7848
7849               dump_user_location_t saved_vect_location = vect_location;
7850               vect_location = instance->location ();
7851               if (!unlimited_cost_model (NULL)
7852                   && !vect_bb_vectorization_profitable_p
7853                         (bb_vinfo, instance->subgraph_entries, orig_loop))
7854                 {
7855                   if (dump_enabled_p ())
7856                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7857                                      "not vectorized: vectorization is not "
7858                                      "profitable.\n");
7859                   vect_location = saved_vect_location;
7860                   continue;
7861                 }
7862
7863               vect_location = saved_vect_location;
7864               if (!dbg_cnt (vect_slp))
7865                 {
7866                   force_clear = true;
7867                   continue;
7868                 }
7869
7870               profitable_subgraphs.safe_push (instance);
7871             }
7872
7873           /* When we're vectorizing an if-converted loop body make sure
7874              we vectorized all if-converted code.  */
7875           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7876             {
7877               gcc_assert (bb_vinfo->bbs.length () == 1);
7878               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7879                    !gsi_end_p (gsi); gsi_next (&gsi))
7880                 {
7881                   /* The costing above left us with DCEable vectorized scalar
7882                      stmts having the visited flag set on profitable
7883                      subgraphs.  Do the delayed clearing of the flag here.  */
7884                   if (gimple_visited_p (gsi_stmt (gsi)))
7885                     {
7886                       gimple_set_visited (gsi_stmt (gsi), false);
7887                       continue;
7888                     }
7889                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7890                     continue;
7891
7892                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7893                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
7894                       {
7895                         if (!profitable_subgraphs.is_empty ()
7896                             && dump_enabled_p ())
7897                           dump_printf_loc (MSG_NOTE, vect_location,
7898                                            "not profitable because of "
7899                                            "unprofitable if-converted scalar "
7900                                            "code\n");
7901                         profitable_subgraphs.truncate (0);
7902                       }
7903                 }
7904             }
7905
7906           /* Finally schedule the profitable subgraphs.  */
7907           for (slp_instance instance : profitable_subgraphs)
7908             {
7909               if (!vectorized && dump_enabled_p ())
7910                 dump_printf_loc (MSG_NOTE, vect_location,
7911                                  "Basic block will be vectorized "
7912                                  "using SLP\n");
7913               vectorized = true;
7914
7915               /* Dump before scheduling as store vectorization will remove
7916                  the original stores and mess with the instance tree
7917                  so querying its location will eventually ICE.  */
7918               if (flag_checking)
7919                 for (slp_instance sub : instance->subgraph_entries)
7920                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7921               unsigned HOST_WIDE_INT bytes;
7922               if (dump_enabled_p ())
7923                 for (slp_instance sub : instance->subgraph_entries)
7924                   {
7925                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7926                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7927                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7928                                        sub->location (),
7929                                        "basic block part vectorized using %wu "
7930                                        "byte vectors\n", bytes);
7931                     else
7932                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7933                                        sub->location (),
7934                                        "basic block part vectorized using "
7935                                        "variable length vectors\n");
7936                   }
7937
7938               dump_user_location_t saved_vect_location = vect_location;
7939               vect_location = instance->location ();
7940
7941               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7942
7943               vect_location = saved_vect_location;
7944             }
7945         }
7946       else
7947         {
7948           if (dump_enabled_p ())
7949             dump_printf_loc (MSG_NOTE, vect_location,
7950                              "***** Analysis failed with vector mode %s\n",
7951                              GET_MODE_NAME (bb_vinfo->vector_mode));
7952         }
7953
7954       if (mode_i == 0)
7955         autodetected_vector_mode = bb_vinfo->vector_mode;
7956
7957       if (!fatal)
7958         while (mode_i < vector_modes.length ()
7959                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7960           {
7961             if (dump_enabled_p ())
7962               dump_printf_loc (MSG_NOTE, vect_location,
7963                                "***** The result for vector mode %s would"
7964                                " be the same\n",
7965                                GET_MODE_NAME (vector_modes[mode_i]));
7966             mode_i += 1;
7967           }
7968
7969       delete bb_vinfo;
7970
7971       if (mode_i < vector_modes.length ()
7972           && VECTOR_MODE_P (autodetected_vector_mode)
7973           && (related_vector_mode (vector_modes[mode_i],
7974                                    GET_MODE_INNER (autodetected_vector_mode))
7975               == autodetected_vector_mode)
7976           && (related_vector_mode (autodetected_vector_mode,
7977                                    GET_MODE_INNER (vector_modes[mode_i]))
7978               == vector_modes[mode_i]))
7979         {
7980           if (dump_enabled_p ())
7981             dump_printf_loc (MSG_NOTE, vect_location,
7982                              "***** Skipping vector mode %s, which would"
7983                              " repeat the analysis for %s\n",
7984                              GET_MODE_NAME (vector_modes[mode_i]),
7985                              GET_MODE_NAME (autodetected_vector_mode));
7986           mode_i += 1;
7987         }
7988
7989       if (vectorized
7990           || mode_i == vector_modes.length ()
7991           || autodetected_vector_mode == VOIDmode
7992           /* If vect_slp_analyze_bb_1 signaled that analysis for all
7993              vector sizes will fail do not bother iterating.  */
7994           || fatal)
7995         return vectorized;
7996
7997       /* Try the next biggest vector size.  */
7998       next_vector_mode = vector_modes[mode_i++];
7999       if (dump_enabled_p ())
8000         dump_printf_loc (MSG_NOTE, vect_location,
8001                          "***** Re-trying analysis with vector mode %s\n",
8002                          GET_MODE_NAME (next_vector_mode));
8003     }
8004 }
8005
8006
8007 /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
8008    true if anything in the basic-block was vectorized.  */
8009
8010 static bool
8011 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8012 {
8013   vec<data_reference_p> datarefs = vNULL;
8014   auto_vec<int> dataref_groups;
8015   int insns = 0;
8016   int current_group = 0;
8017
8018   for (unsigned i = 0; i < bbs.length (); i++)
8019     {
8020       basic_block bb = bbs[i];
8021       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
8022            gsi_next (&gsi))
8023         {
8024           gimple *stmt = gsi_stmt (gsi);
8025           if (is_gimple_debug (stmt))
8026             continue;
8027
8028           insns++;
8029
8030           if (gimple_location (stmt) != UNKNOWN_LOCATION)
8031             vect_location = stmt;
8032
8033           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8034                                               &dataref_groups, current_group))
8035             ++current_group;
8036         }
8037       /* New BBs always start a new DR group.  */
8038       ++current_group;
8039     }
8040
8041   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8042 }
8043
8044 /* Special entry for the BB vectorizer.  Analyze and transform a single
8045    if-converted BB with ORIG_LOOPs body being the not if-converted
8046    representation.  Returns true if anything in the basic-block was
8047    vectorized.  */
8048
8049 bool
8050 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8051 {
8052   auto_vec<basic_block> bbs;
8053   bbs.safe_push (bb);
8054   return vect_slp_bbs (bbs, orig_loop);
8055 }
8056
8057 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
8058    true if anything in the basic-block was vectorized.  */
8059
8060 bool
8061 vect_slp_function (function *fun)
8062 {
8063   bool r = false;
8064   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8065   auto_bitmap exit_bbs;
8066   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8067   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8068   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8069                                                       true, rpo, NULL);
8070
8071   /* For the moment split the function into pieces to avoid making
8072      the iteration on the vector mode moot.  Split at points we know
8073      to not handle well which is CFG merges (SLP discovery doesn't
8074      handle non-loop-header PHIs) and loop exits.  Since pattern
8075      recog requires reverse iteration to visit uses before defs
8076      simply chop RPO into pieces.  */
8077   auto_vec<basic_block> bbs;
8078   for (unsigned i = 0; i < n; i++)
8079     {
8080       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8081       bool split = false;
8082
8083       /* Split when a BB is not dominated by the first block.  */
8084       if (!bbs.is_empty ()
8085           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8086         {
8087           if (dump_enabled_p ())
8088             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8089                              "splitting region at dominance boundary bb%d\n",
8090                              bb->index);
8091           split = true;
8092         }
8093       /* Split when the loop determined by the first block
8094          is exited.  This is because we eventually insert
8095          invariants at region begin.  */
8096       else if (!bbs.is_empty ()
8097                && bbs[0]->loop_father != bb->loop_father
8098                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8099         {
8100           if (dump_enabled_p ())
8101             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8102                              "splitting region at loop %d exit at bb%d\n",
8103                              bbs[0]->loop_father->num, bb->index);
8104           split = true;
8105         }
8106       else if (!bbs.is_empty ()
8107                && bb->loop_father->header == bb
8108                && bb->loop_father->dont_vectorize)
8109         {
8110           if (dump_enabled_p ())
8111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112                              "splitting region at dont-vectorize loop %d "
8113                              "entry at bb%d\n",
8114                              bb->loop_father->num, bb->index);
8115           split = true;
8116         }
8117
8118       if (split && !bbs.is_empty ())
8119         {
8120           r |= vect_slp_bbs (bbs, NULL);
8121           bbs.truncate (0);
8122         }
8123
8124       if (bbs.is_empty ())
8125         {
8126           /* We need to be able to insert at the head of the region which
8127              we cannot for region starting with a returns-twice call.  */
8128           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8129             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8130               {
8131                 if (dump_enabled_p ())
8132                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8133                                    "skipping bb%d as start of region as it "
8134                                    "starts with returns-twice call\n",
8135                                    bb->index);
8136                 continue;
8137               }
8138           /* If the loop this BB belongs to is marked as not to be vectorized
8139              honor that also for BB vectorization.  */
8140           if (bb->loop_father->dont_vectorize)
8141             continue;
8142         }
8143
8144       bbs.safe_push (bb);
8145
8146       /* When we have a stmt ending this block and defining a
8147          value we have to insert on edges when inserting after it for
8148          a vector containing its definition.  Avoid this for now.  */
8149       if (gimple *last = *gsi_last_bb (bb))
8150         if (gimple_get_lhs (last)
8151             && is_ctrl_altering_stmt (last))
8152           {
8153             if (dump_enabled_p ())
8154               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8155                                "splitting region at control altering "
8156                                "definition %G", last);
8157             r |= vect_slp_bbs (bbs, NULL);
8158             bbs.truncate (0);
8159           }
8160     }
8161
8162   if (!bbs.is_empty ())
8163     r |= vect_slp_bbs (bbs, NULL);
8164
8165   free (rpo);
8166
8167   return r;
8168 }
8169
8170 /* Build a variable-length vector in which the elements in ELTS are repeated
8171    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8172    RESULTS and add any new instructions to SEQ.
8173
8174    The approach we use is:
8175
8176    (1) Find a vector mode VM with integer elements of mode IM.
8177
8178    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8179        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
8180        from small vectors to IM.
8181
8182    (3) Duplicate each ELTS'[I] into a vector of mode VM.
8183
8184    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8185        correct byte contents.
8186
8187    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8188
8189    We try to find the largest IM for which this sequence works, in order
8190    to cut down on the number of interleaves.  */
8191
8192 void
8193 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8194                           const vec<tree> &elts, unsigned int nresults,
8195                           vec<tree> &results)
8196 {
8197   unsigned int nelts = elts.length ();
8198   tree element_type = TREE_TYPE (vector_type);
8199
8200   /* (1) Find a vector mode VM with integer elements of mode IM.  */
8201   unsigned int nvectors = 1;
8202   tree new_vector_type;
8203   tree permutes[2];
8204   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8205                                        &nvectors, &new_vector_type,
8206                                        permutes))
8207     gcc_unreachable ();
8208
8209   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
8210   unsigned int partial_nelts = nelts / nvectors;
8211   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8212
8213   tree_vector_builder partial_elts;
8214   auto_vec<tree, 32> pieces (nvectors * 2);
8215   pieces.quick_grow_cleared (nvectors * 2);
8216   for (unsigned int i = 0; i < nvectors; ++i)
8217     {
8218       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8219              ELTS' has mode IM.  */
8220       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8221       for (unsigned int j = 0; j < partial_nelts; ++j)
8222         partial_elts.quick_push (elts[i * partial_nelts + j]);
8223       tree t = gimple_build_vector (seq, &partial_elts);
8224       t = gimple_build (seq, VIEW_CONVERT_EXPR,
8225                         TREE_TYPE (new_vector_type), t);
8226
8227       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
8228       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8229     }
8230
8231   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8232          correct byte contents.
8233
8234      Conceptually, we need to repeat the following operation log2(nvectors)
8235      times, where hi_start = nvectors / 2:
8236
8237         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8238         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8239
8240      However, if each input repeats every N elements and the VF is
8241      a multiple of N * 2, the HI result is the same as the LO result.
8242      This will be true for the first N1 iterations of the outer loop,
8243      followed by N2 iterations for which both the LO and HI results
8244      are needed.  I.e.:
8245
8246         N1 + N2 = log2(nvectors)
8247
8248      Each "N1 iteration" doubles the number of redundant vectors and the
8249      effect of the process as a whole is to have a sequence of nvectors/2**N1
8250      vectors that repeats 2**N1 times.  Rather than generate these redundant
8251      vectors, we halve the number of vectors for each N1 iteration.  */
8252   unsigned int in_start = 0;
8253   unsigned int out_start = nvectors;
8254   unsigned int new_nvectors = nvectors;
8255   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8256     {
8257       unsigned int hi_start = new_nvectors / 2;
8258       unsigned int out_i = 0;
8259       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8260         {
8261           if ((in_i & 1) != 0
8262               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8263                              2 * in_repeat))
8264             continue;
8265
8266           tree output = make_ssa_name (new_vector_type);
8267           tree input1 = pieces[in_start + (in_i / 2)];
8268           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8269           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8270                                                input1, input2,
8271                                                permutes[in_i & 1]);
8272           gimple_seq_add_stmt (seq, stmt);
8273           pieces[out_start + out_i] = output;
8274           out_i += 1;
8275         }
8276       std::swap (in_start, out_start);
8277       new_nvectors = out_i;
8278     }
8279
8280   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
8281   results.reserve (nresults);
8282   for (unsigned int i = 0; i < nresults; ++i)
8283     if (i < new_nvectors)
8284       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8285                                         pieces[in_start + i]));
8286     else
8287       results.quick_push (results[i - new_nvectors]);
8288 }
8289
8290
8291 /* For constant and loop invariant defs in OP_NODE this function creates
8292    vector defs that will be used in the vectorized stmts and stores them
8293    to SLP_TREE_VEC_DEFS of OP_NODE.  */
8294
8295 static void
8296 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8297 {
8298   unsigned HOST_WIDE_INT nunits;
8299   tree vec_cst;
8300   unsigned j, number_of_places_left_in_vector;
8301   tree vector_type;
8302   tree vop;
8303   int group_size = op_node->ops.length ();
8304   unsigned int vec_num, i;
8305   unsigned number_of_copies = 1;
8306   bool constant_p;
8307   gimple_seq ctor_seq = NULL;
8308   auto_vec<tree, 16> permute_results;
8309
8310   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
8311   vector_type = SLP_TREE_VECTYPE (op_node);
8312
8313   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8314   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8315   auto_vec<tree> voprnds (number_of_vectors);
8316
8317   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8318      created vectors. It is greater than 1 if unrolling is performed.
8319
8320      For example, we have two scalar operands, s1 and s2 (e.g., group of
8321      strided accesses of size two), while NUNITS is four (i.e., four scalars
8322      of this type can be packed in a vector).  The output vector will contain
8323      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
8324      will be 2).
8325
8326      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8327      containing the operands.
8328
8329      For example, NUNITS is four as before, and the group size is 8
8330      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
8331      {s5, s6, s7, s8}.  */
8332
8333   /* When using duplicate_and_interleave, we just need one element for
8334      each scalar statement.  */
8335   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8336     nunits = group_size;
8337
8338   number_of_copies = nunits * number_of_vectors / group_size;
8339
8340   number_of_places_left_in_vector = nunits;
8341   constant_p = true;
8342   tree uniform_elt = NULL_TREE;
8343   tree_vector_builder elts (vector_type, nunits, 1);
8344   elts.quick_grow (nunits);
8345   stmt_vec_info insert_after = NULL;
8346   for (j = 0; j < number_of_copies; j++)
8347     {
8348       tree op;
8349       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8350         {
8351           /* Create 'vect_ = {op0,op1,...,opn}'.  */
8352           tree orig_op = op;
8353           if (number_of_places_left_in_vector == nunits)
8354             uniform_elt = op;
8355           else if (uniform_elt && operand_equal_p (uniform_elt, op))
8356             op = elts[number_of_places_left_in_vector];
8357           else
8358             uniform_elt = NULL_TREE;
8359           number_of_places_left_in_vector--;
8360           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8361             {
8362               if (CONSTANT_CLASS_P (op))
8363                 {
8364                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8365                     {
8366                       /* Can't use VIEW_CONVERT_EXPR for booleans because
8367                          of possibly different sizes of scalar value and
8368                          vector element.  */
8369                       if (integer_zerop (op))
8370                         op = build_int_cst (TREE_TYPE (vector_type), 0);
8371                       else if (integer_onep (op))
8372                         op = build_all_ones_cst (TREE_TYPE (vector_type));
8373                       else
8374                         gcc_unreachable ();
8375                     }
8376                   else
8377                     op = fold_unary (VIEW_CONVERT_EXPR,
8378                                      TREE_TYPE (vector_type), op);
8379                   gcc_assert (op && CONSTANT_CLASS_P (op));
8380                 }
8381               else
8382                 {
8383                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8384                   gimple *init_stmt;
8385                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8386                     {
8387                       tree true_val
8388                         = build_all_ones_cst (TREE_TYPE (vector_type));
8389                       tree false_val
8390                         = build_zero_cst (TREE_TYPE (vector_type));
8391                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8392                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8393                                                        op, true_val,
8394                                                        false_val);
8395                     }
8396                   else
8397                     {
8398                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8399                                    op);
8400                       init_stmt
8401                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8402                                                op);
8403                     }
8404                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
8405                   op = new_temp;
8406                 }
8407             }
8408           elts[number_of_places_left_in_vector] = op;
8409           if (!CONSTANT_CLASS_P (op))
8410             constant_p = false;
8411           /* For BB vectorization we have to compute an insert location
8412              when a def is inside the analyzed region since we cannot
8413              simply insert at the BB start in this case.  */
8414           stmt_vec_info opdef;
8415           if (TREE_CODE (orig_op) == SSA_NAME
8416               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8417               && is_a <bb_vec_info> (vinfo)
8418               && (opdef = vinfo->lookup_def (orig_op)))
8419             {
8420               if (!insert_after)
8421                 insert_after = opdef;
8422               else
8423                 insert_after = get_later_stmt (insert_after, opdef);
8424             }
8425
8426           if (number_of_places_left_in_vector == 0)
8427             {
8428               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8429               if (uniform_elt)
8430                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8431                                                         elts[0]);
8432               else if (constant_p
8433                        ? multiple_p (type_nunits, nunits)
8434                        : known_eq (type_nunits, nunits))
8435                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8436               else
8437                 {
8438                   if (permute_results.is_empty ())
8439                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8440                                               elts, number_of_vectors,
8441                                               permute_results);
8442                   vec_cst = permute_results[number_of_vectors - j - 1];
8443                 }
8444               if (!gimple_seq_empty_p (ctor_seq))
8445                 {
8446                   if (insert_after)
8447                     {
8448                       gimple_stmt_iterator gsi;
8449                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8450                         {
8451                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8452                           gsi_insert_seq_before (&gsi, ctor_seq,
8453                                                  GSI_CONTINUE_LINKING);
8454                         }
8455                       else if (!stmt_ends_bb_p (insert_after->stmt))
8456                         {
8457                           gsi = gsi_for_stmt (insert_after->stmt);
8458                           gsi_insert_seq_after (&gsi, ctor_seq,
8459                                                 GSI_CONTINUE_LINKING);
8460                         }
8461                       else
8462                         {
8463                           /* When we want to insert after a def where the
8464                              defining stmt throws then insert on the fallthru
8465                              edge.  */
8466                           edge e = find_fallthru_edge
8467                                      (gimple_bb (insert_after->stmt)->succs);
8468                           basic_block new_bb
8469                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8470                           gcc_assert (!new_bb);
8471                         }
8472                     }
8473                   else
8474                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
8475                   ctor_seq = NULL;
8476                 }
8477               voprnds.quick_push (vec_cst);
8478               insert_after = NULL;
8479               number_of_places_left_in_vector = nunits;
8480               constant_p = true;
8481               elts.new_vector (vector_type, nunits, 1);
8482               elts.quick_grow (nunits);
8483             }
8484         }
8485     }
8486
8487   /* Since the vectors are created in the reverse order, we should invert
8488      them.  */
8489   vec_num = voprnds.length ();
8490   for (j = vec_num; j != 0; j--)
8491     {
8492       vop = voprnds[j - 1];
8493       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8494     }
8495
8496   /* In case that VF is greater than the unrolling factor needed for the SLP
8497      group of stmts, NUMBER_OF_VECTORS to be created is greater than
8498      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8499      to replicate the vectors.  */
8500   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8501     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8502          i++)
8503       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8504 }
8505
8506 /* Get the Ith vectorized definition from SLP_NODE.  */
8507
8508 tree
8509 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8510 {
8511   return SLP_TREE_VEC_DEFS (slp_node)[i];
8512 }
8513
8514 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */
8515
8516 void
8517 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8518 {
8519   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8520   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8521 }
8522
8523 /* Get N vectorized definitions for SLP_NODE.  */
8524
8525 void
8526 vect_get_slp_defs (vec_info *,
8527                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8528 {
8529   if (n == -1U)
8530     n = SLP_TREE_CHILDREN (slp_node).length ();
8531
8532   for (unsigned i = 0; i < n; ++i)
8533     {
8534       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8535       vec<tree> vec_defs = vNULL;
8536       vect_get_slp_defs (child, &vec_defs);
8537       vec_oprnds->quick_push (vec_defs);
8538     }
8539 }
8540
8541 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8542    - PERM gives the permutation that the caller wants to use for NODE,
8543      which might be different from SLP_LOAD_PERMUTATION.
8544    - DUMP_P controls whether the function dumps information.  */
8545
8546 static bool
8547 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8548                                 load_permutation_t &perm,
8549                                 const vec<tree> &dr_chain,
8550                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
8551                                 bool analyze_only, bool dump_p,
8552                                 unsigned *n_perms, unsigned int *n_loads,
8553                                 bool dce_chain)
8554 {
8555   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8556   int vec_index = 0;
8557   tree vectype = SLP_TREE_VECTYPE (node);
8558   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8559   unsigned int mask_element;
8560   unsigned dr_group_size;
8561   machine_mode mode;
8562
8563   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8564     dr_group_size = 1;
8565   else
8566     {
8567       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8568       dr_group_size = DR_GROUP_SIZE (stmt_info);
8569     }
8570
8571   mode = TYPE_MODE (vectype);
8572   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8573   unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8574
8575   /* Initialize the vect stmts of NODE to properly insert the generated
8576      stmts later.  */
8577   if (! analyze_only)
8578     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8579       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8580
8581   /* Generate permutation masks for every NODE. Number of masks for each NODE
8582      is equal to GROUP_SIZE.
8583      E.g., we have a group of three nodes with three loads from the same
8584      location in each node, and the vector size is 4. I.e., we have a
8585      a0b0c0a1b1c1... sequence and we need to create the following vectors:
8586      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8587      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8588      ...
8589
8590      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8591      The last mask is illegal since we assume two operands for permute
8592      operation, and the mask element values can't be outside that range.
8593      Hence, the last mask must be converted into {2,5,5,5}.
8594      For the first two permutations we need the first and the second input
8595      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8596      we need the second and the third vectors: {b1,c1,a2,b2} and
8597      {c2,a3,b3,c3}.  */
8598
8599   int vect_stmts_counter = 0;
8600   unsigned int index = 0;
8601   int first_vec_index = -1;
8602   int second_vec_index = -1;
8603   bool noop_p = true;
8604   *n_perms = 0;
8605
8606   vec_perm_builder mask;
8607   unsigned int nelts_to_build;
8608   unsigned int nvectors_per_build;
8609   unsigned int in_nlanes;
8610   bool repeating_p = (group_size == dr_group_size
8611                       && multiple_p (nunits, group_size));
8612   if (repeating_p)
8613     {
8614       /* A single vector contains a whole number of copies of the node, so:
8615          (a) all permutes can use the same mask; and
8616          (b) the permutes only need a single vector input.  */
8617       mask.new_vector (nunits, group_size, 3);
8618       nelts_to_build = mask.encoded_nelts ();
8619       /* It's possible to obtain zero nstmts during analyze_only, so make
8620          it at least one to ensure the later computation for n_perms
8621          proceed.  */
8622       nvectors_per_build = nstmts > 0 ? nstmts : 1;
8623       in_nlanes = dr_group_size * 3;
8624     }
8625   else
8626     {
8627       /* We need to construct a separate mask for each vector statement.  */
8628       unsigned HOST_WIDE_INT const_nunits, const_vf;
8629       if (!nunits.is_constant (&const_nunits)
8630           || !vf.is_constant (&const_vf))
8631         return false;
8632       mask.new_vector (const_nunits, const_nunits, 1);
8633       nelts_to_build = const_vf * group_size;
8634       nvectors_per_build = 1;
8635       in_nlanes = const_vf * dr_group_size;
8636     }
8637   auto_sbitmap used_in_lanes (in_nlanes);
8638   bitmap_clear (used_in_lanes);
8639   auto_bitmap used_defs;
8640
8641   unsigned int count = mask.encoded_nelts ();
8642   mask.quick_grow (count);
8643   vec_perm_indices indices;
8644
8645   for (unsigned int j = 0; j < nelts_to_build; j++)
8646     {
8647       unsigned int iter_num = j / group_size;
8648       unsigned int stmt_num = j % group_size;
8649       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8650       bitmap_set_bit (used_in_lanes, i);
8651       if (repeating_p)
8652         {
8653           first_vec_index = 0;
8654           mask_element = i;
8655         }
8656       else
8657         {
8658           /* Enforced before the loop when !repeating_p.  */
8659           unsigned int const_nunits = nunits.to_constant ();
8660           vec_index = i / const_nunits;
8661           mask_element = i % const_nunits;
8662           if (vec_index == first_vec_index
8663               || first_vec_index == -1)
8664             {
8665               first_vec_index = vec_index;
8666             }
8667           else if (vec_index == second_vec_index
8668                    || second_vec_index == -1)
8669             {
8670               second_vec_index = vec_index;
8671               mask_element += const_nunits;
8672             }
8673           else
8674             {
8675               if (dump_p)
8676                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8677                                  "permutation requires at "
8678                                  "least three vectors %G",
8679                                  stmt_info->stmt);
8680               gcc_assert (analyze_only);
8681               return false;
8682             }
8683
8684           gcc_assert (mask_element < 2 * const_nunits);
8685         }
8686
8687       if (mask_element != index)
8688         noop_p = false;
8689       mask[index++] = mask_element;
8690
8691       if (index == count)
8692         {
8693           if (!noop_p)
8694             {
8695               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8696               if (!can_vec_perm_const_p (mode, mode, indices))
8697                 {
8698                   if (dump_p)
8699                     {
8700                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8701                                        "unsupported vect permute { ");
8702                       for (i = 0; i < count; ++i)
8703                         {
8704                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8705                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8706                         }
8707                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8708                     }
8709                   gcc_assert (analyze_only);
8710                   return false;
8711                 }
8712
8713               tree mask_vec = NULL_TREE;
8714               if (!analyze_only)
8715                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8716
8717               if (second_vec_index == -1)
8718                 second_vec_index = first_vec_index;
8719
8720               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8721                 {
8722                   ++*n_perms;
8723                   if (analyze_only)
8724                     continue;
8725                   /* Generate the permute statement if necessary.  */
8726                   tree first_vec = dr_chain[first_vec_index + ri];
8727                   tree second_vec = dr_chain[second_vec_index + ri];
8728                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8729                   tree perm_dest
8730                     = vect_create_destination_var (gimple_assign_lhs (stmt),
8731                                                    vectype);
8732                   perm_dest = make_ssa_name (perm_dest);
8733                   gimple *perm_stmt
8734                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8735                                            second_vec, mask_vec);
8736                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8737                                                gsi);
8738                   if (dce_chain)
8739                     {
8740                       bitmap_set_bit (used_defs, first_vec_index + ri);
8741                       bitmap_set_bit (used_defs, second_vec_index + ri);
8742                     }
8743
8744                   /* Store the vector statement in NODE.  */
8745                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8746                 }
8747             }
8748           else if (!analyze_only)
8749             {
8750               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8751                 {
8752                   tree first_vec = dr_chain[first_vec_index + ri];
8753                   /* If mask was NULL_TREE generate the requested
8754                      identity transform.  */
8755                   if (dce_chain)
8756                     bitmap_set_bit (used_defs, first_vec_index + ri);
8757
8758                   /* Store the vector statement in NODE.  */
8759                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8760                 }
8761             }
8762
8763           index = 0;
8764           first_vec_index = -1;
8765           second_vec_index = -1;
8766           noop_p = true;
8767         }
8768     }
8769
8770   if (n_loads)
8771     {
8772       if (repeating_p)
8773         *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8774       else
8775         {
8776           /* Enforced above when !repeating_p.  */
8777           unsigned int const_nunits = nunits.to_constant ();
8778           *n_loads = 0;
8779           bool load_seen = false;
8780           for (unsigned i = 0; i < in_nlanes; ++i)
8781             {
8782               if (i % const_nunits == 0)
8783                 {
8784                   if (load_seen)
8785                     *n_loads += 1;
8786                   load_seen = false;
8787                 }
8788               if (bitmap_bit_p (used_in_lanes, i))
8789                 load_seen = true;
8790             }
8791           if (load_seen)
8792             *n_loads += 1;
8793         }
8794     }
8795
8796   if (dce_chain)
8797     for (unsigned i = 0; i < dr_chain.length (); ++i)
8798       if (!bitmap_bit_p (used_defs, i))
8799         {
8800           tree def = dr_chain[i];
8801           do
8802             {
8803               gimple *stmt = SSA_NAME_DEF_STMT (def);
8804               if (is_gimple_assign (stmt)
8805                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8806                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8807                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8808               else
8809                 def = NULL;
8810               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8811               gsi_remove (&rgsi, true);
8812               release_defs (stmt);
8813             }
8814           while (def);
8815         }
8816
8817   return true;
8818 }
8819
8820 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8821    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8822    permute statements for the SLP node NODE.  Store the number of vector
8823    permute instructions in *N_PERMS and the number of vector load
8824    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
8825    that were not needed.  */
8826
8827 bool
8828 vect_transform_slp_perm_load (vec_info *vinfo,
8829                               slp_tree node, const vec<tree> &dr_chain,
8830                               gimple_stmt_iterator *gsi, poly_uint64 vf,
8831                               bool analyze_only, unsigned *n_perms,
8832                               unsigned int *n_loads, bool dce_chain)
8833 {
8834   return vect_transform_slp_perm_load_1 (vinfo, node,
8835                                          SLP_TREE_LOAD_PERMUTATION (node),
8836                                          dr_chain, gsi, vf, analyze_only,
8837                                          dump_enabled_p (), n_perms, n_loads,
8838                                          dce_chain);
8839 }
8840
8841 /* Produce the next vector result for SLP permutation NODE by adding a vector
8842    statement at GSI.  If MASK_VEC is nonnull, add:
8843
8844       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8845
8846    otherwise add:
8847
8848       <new SSA name> = FIRST_DEF.  */
8849
8850 static void
8851 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8852                           slp_tree node, tree first_def, tree second_def,
8853                           tree mask_vec, poly_uint64 identity_offset)
8854 {
8855   tree vectype = SLP_TREE_VECTYPE (node);
8856
8857   /* ???  We SLP match existing vector element extracts but
8858      allow punning which we need to re-instantiate at uses
8859      but have no good way of explicitly representing.  */
8860   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8861       && !types_compatible_p (TREE_TYPE (first_def), vectype))
8862     {
8863       gassign *conv_stmt
8864         = gimple_build_assign (make_ssa_name (vectype),
8865                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8866       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8867       first_def = gimple_assign_lhs (conv_stmt);
8868     }
8869   gassign *perm_stmt;
8870   tree perm_dest = make_ssa_name (vectype);
8871   if (mask_vec)
8872     {
8873       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8874                            TYPE_SIZE (vectype))
8875           && !types_compatible_p (TREE_TYPE (second_def), vectype))
8876         {
8877           gassign *conv_stmt
8878             = gimple_build_assign (make_ssa_name (vectype),
8879                                    build1 (VIEW_CONVERT_EXPR,
8880                                            vectype, second_def));
8881           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8882           second_def = gimple_assign_lhs (conv_stmt);
8883         }
8884       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8885                                        first_def, second_def,
8886                                        mask_vec);
8887     }
8888   else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8889     {
8890       /* For identity permutes we still need to handle the case
8891          of offsetted extracts or concats.  */
8892       unsigned HOST_WIDE_INT c;
8893       auto first_def_nunits
8894         = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8895       if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8896         {
8897           unsigned HOST_WIDE_INT elsz
8898             = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8899           tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8900                                  TYPE_SIZE (vectype),
8901                                  bitsize_int (identity_offset * elsz));
8902           perm_stmt = gimple_build_assign (perm_dest, lowpart);
8903         }
8904       else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8905                                     first_def_nunits, &c) && c == 2)
8906         {
8907           tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8908                                             NULL_TREE, second_def);
8909           perm_stmt = gimple_build_assign (perm_dest, ctor);
8910         }
8911       else
8912         gcc_unreachable ();
8913     }
8914   else
8915     {
8916       /* We need a copy here in case the def was external.  */
8917       perm_stmt = gimple_build_assign (perm_dest, first_def);
8918     }
8919   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8920   /* Store the vector statement in NODE.  */
8921   node->push_vec_def (perm_stmt);
8922 }
8923
8924 /* Subroutine of vectorizable_slp_permutation.  Check whether the target
8925    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8926    If GSI is nonnull, emit the permutation there.
8927
8928    When GSI is null, the only purpose of NODE is to give properties
8929    of the result, such as the vector type and number of SLP lanes.
8930    The node does not need to be a VEC_PERM_EXPR.
8931
8932    If the target supports the operation, return the number of individual
8933    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
8934    dump file if DUMP_P is true.  */
8935
8936 static int
8937 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8938                                 slp_tree node, lane_permutation_t &perm,
8939                                 vec<slp_tree> &children, bool dump_p)
8940 {
8941   tree vectype = SLP_TREE_VECTYPE (node);
8942
8943   /* ???  We currently only support all same vector input types
8944      while the SLP IL should really do a concat + select and thus accept
8945      arbitrary mismatches.  */
8946   slp_tree child;
8947   unsigned i;
8948   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8949   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8950   tree op_vectype = NULL_TREE;
8951   FOR_EACH_VEC_ELT (children, i, child)
8952     if (SLP_TREE_VECTYPE (child))
8953       {
8954         op_vectype = SLP_TREE_VECTYPE (child);
8955         break;
8956       }
8957   if (!op_vectype)
8958     op_vectype = vectype;
8959   FOR_EACH_VEC_ELT (children, i, child)
8960     {
8961       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8962            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8963           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8964           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8965         {
8966           if (dump_p)
8967             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8968                              "Unsupported vector types in lane permutation\n");
8969           return -1;
8970         }
8971       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8972         repeating_p = false;
8973     }
8974
8975   gcc_assert (perm.length () == SLP_TREE_LANES (node));
8976   if (dump_p)
8977     {
8978       dump_printf_loc (MSG_NOTE, vect_location,
8979                        "vectorizing permutation");
8980       for (unsigned i = 0; i < perm.length (); ++i)
8981         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8982       if (repeating_p)
8983         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8984       dump_printf (MSG_NOTE, "\n");
8985     }
8986
8987   /* REPEATING_P is true if every output vector is guaranteed to use the
8988      same permute vector.  We can handle that case for both variable-length
8989      and constant-length vectors, but we only handle other cases for
8990      constant-length vectors.
8991
8992      Set:
8993
8994      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8995        mask vector that we want to build.
8996
8997      - NCOPIES to the number of copies of PERM that we need in order
8998        to build the necessary permute mask vectors.
8999
9000      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9001        for each permute mask vector.  This is only relevant when GSI is
9002        nonnull.  */
9003   uint64_t npatterns;
9004   unsigned nelts_per_pattern;
9005   uint64_t ncopies;
9006   unsigned noutputs_per_mask;
9007   if (repeating_p)
9008     {
9009       /* We need a single permute mask vector that has the form:
9010
9011            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9012
9013          In other words, the original n-element permute in PERM is
9014          "unrolled" to fill a full vector.  The stepped vector encoding
9015          that we use for permutes requires 3n elements.  */
9016       npatterns = SLP_TREE_LANES (node);
9017       nelts_per_pattern = ncopies = 3;
9018       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9019     }
9020   else
9021     {
9022       /* Calculate every element of every permute mask vector explicitly,
9023          instead of relying on the pattern described above.  */
9024       if (!nunits.is_constant (&npatterns)
9025           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
9026         return -1;
9027       nelts_per_pattern = ncopies = 1;
9028       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
9029         if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
9030           return -1;
9031       noutputs_per_mask = 1;
9032     }
9033   unsigned olanes = ncopies * SLP_TREE_LANES (node);
9034   gcc_assert (repeating_p || multiple_p (olanes, nunits));
9035
9036   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9037      from the { SLP operand, scalar lane } permutation as recorded in the
9038      SLP node as intermediate step.  This part should already work
9039      with SLP children with arbitrary number of lanes.  */
9040   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9041   auto_vec<unsigned> active_lane;
9042   vperm.create (olanes);
9043   active_lane.safe_grow_cleared (children.length (), true);
9044   for (unsigned i = 0; i < ncopies; ++i)
9045     {
9046       for (unsigned pi = 0; pi < perm.length (); ++pi)
9047         {
9048           std::pair<unsigned, unsigned> p = perm[pi];
9049           tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9050           if (repeating_p)
9051             vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9052           else
9053             {
9054               /* We checked above that the vectors are constant-length.  */
9055               unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9056               unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9057               unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9058               vperm.quick_push ({{p.first, vi}, vl});
9059             }
9060         }
9061       /* Advance to the next group.  */
9062       for (unsigned j = 0; j < children.length (); ++j)
9063         active_lane[j] += SLP_TREE_LANES (children[j]);
9064     }
9065
9066   if (dump_p)
9067     {
9068       dump_printf_loc (MSG_NOTE, vect_location,
9069                        "vectorizing permutation");
9070       for (unsigned i = 0; i < perm.length (); ++i)
9071         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9072       if (repeating_p)
9073         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9074       dump_printf (MSG_NOTE, "\n");
9075       dump_printf_loc (MSG_NOTE, vect_location, "as");
9076       for (unsigned i = 0; i < vperm.length (); ++i)
9077         {
9078           if (i != 0
9079               && (repeating_p
9080                   ? multiple_p (i, npatterns)
9081                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9082             dump_printf (MSG_NOTE, ",");
9083           dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9084                        vperm[i].first.first, vperm[i].first.second,
9085                        vperm[i].second);
9086         }
9087       dump_printf (MSG_NOTE, "\n");
9088     }
9089
9090   /* We can only handle two-vector permutes, everything else should
9091      be lowered on the SLP level.  The following is closely inspired
9092      by vect_transform_slp_perm_load and is supposed to eventually
9093      replace it.
9094      ???   As intermediate step do code-gen in the SLP tree representation
9095      somehow?  */
9096   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9097   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9098   unsigned int index = 0;
9099   poly_uint64 mask_element;
9100   vec_perm_builder mask;
9101   mask.new_vector (nunits, npatterns, nelts_per_pattern);
9102   unsigned int count = mask.encoded_nelts ();
9103   mask.quick_grow (count);
9104   vec_perm_indices indices;
9105   unsigned nperms = 0;
9106   for (unsigned i = 0; i < vperm.length (); ++i)
9107     {
9108       mask_element = vperm[i].second;
9109       if (first_vec.first == -1U
9110           || first_vec == vperm[i].first)
9111         first_vec = vperm[i].first;
9112       else if (second_vec.first == -1U
9113                || second_vec == vperm[i].first)
9114         {
9115           second_vec = vperm[i].first;
9116           mask_element += nunits;
9117         }
9118       else
9119         {
9120           if (dump_p)
9121             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9122                              "permutation requires at "
9123                              "least three vectors\n");
9124           gcc_assert (!gsi);
9125           return -1;
9126         }
9127
9128       mask[index++] = mask_element;
9129
9130       if (index == count)
9131         {
9132           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9133                               TYPE_VECTOR_SUBPARTS (op_vectype));
9134           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9135                              && constant_multiple_p (mask[0], nunits));
9136           machine_mode vmode = TYPE_MODE (vectype);
9137           machine_mode op_vmode = TYPE_MODE (op_vectype);
9138           unsigned HOST_WIDE_INT c;
9139           if ((!identity_p
9140                && !can_vec_perm_const_p (vmode, op_vmode, indices))
9141               || (identity_p
9142                   && !known_le (nunits,
9143                                 TYPE_VECTOR_SUBPARTS (op_vectype))
9144                   && (!constant_multiple_p (nunits,
9145                                             TYPE_VECTOR_SUBPARTS (op_vectype),
9146                                             &c) || c != 2)))
9147             {
9148               if (dump_p)
9149                 {
9150                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9151                                    vect_location,
9152                                    "unsupported vect permute { ");
9153                   for (i = 0; i < count; ++i)
9154                     {
9155                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9156                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9157                     }
9158                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9159                 }
9160               gcc_assert (!gsi);
9161               return -1;
9162             }
9163
9164           if (!identity_p)
9165             nperms++;
9166           if (gsi)
9167             {
9168               if (second_vec.first == -1U)
9169                 second_vec = first_vec;
9170
9171               slp_tree
9172                 first_node = children[first_vec.first],
9173                 second_node = children[second_vec.first];
9174
9175               tree mask_vec = NULL_TREE;
9176               if (!identity_p)
9177                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9178
9179               for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9180                 {
9181                   tree first_def
9182                     = vect_get_slp_vect_def (first_node,
9183                                              first_vec.second + vi);
9184                   tree second_def
9185                     = vect_get_slp_vect_def (second_node,
9186                                              second_vec.second + vi);
9187                   vect_add_slp_permutation (vinfo, gsi, node, first_def,
9188                                             second_def, mask_vec, mask[0]);
9189                 }
9190             }
9191
9192           index = 0;
9193           first_vec = std::make_pair (-1U, -1U);
9194           second_vec = std::make_pair (-1U, -1U);
9195         }
9196     }
9197
9198   return nperms;
9199 }
9200
9201 /* Vectorize the SLP permutations in NODE as specified
9202    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9203    child number and lane number.
9204    Interleaving of two two-lane two-child SLP subtrees (not supported):
9205      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9206    A blend of two four-lane two-child SLP subtrees:
9207      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9208    Highpart of a four-lane one-child SLP subtree (not supported):
9209      [ { 0, 2 }, { 0, 3 } ]
9210    Where currently only a subset is supported by code generating below.  */
9211
9212 static bool
9213 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9214                               slp_tree node, stmt_vector_for_cost *cost_vec)
9215 {
9216   tree vectype = SLP_TREE_VECTYPE (node);
9217   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9218   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9219                                                SLP_TREE_CHILDREN (node),
9220                                                dump_enabled_p ());
9221   if (nperms < 0)
9222     return false;
9223
9224   if (!gsi)
9225     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9226
9227   return true;
9228 }
9229
9230 /* Vectorize SLP NODE.  */
9231
9232 static void
9233 vect_schedule_slp_node (vec_info *vinfo,
9234                         slp_tree node, slp_instance instance)
9235 {
9236   gimple_stmt_iterator si;
9237   int i;
9238   slp_tree child;
9239
9240   /* Vectorize externals and constants.  */
9241   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9242       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9243     {
9244       /* ???  vectorizable_shift can end up using a scalar operand which is
9245          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
9246          node in this case.  */
9247       if (!SLP_TREE_VECTYPE (node))
9248         return;
9249
9250       /* There are two reasons vector defs might already exist.  The first
9251          is that we are vectorizing an existing vector def.  The second is
9252          when performing BB vectorization shared constant/external nodes
9253          are not split apart during partitioning so during the code-gen
9254          DFS walk we can end up visiting them twice.  */
9255       if (! SLP_TREE_VEC_DEFS (node).exists ())
9256         vect_create_constant_vectors (vinfo, node);
9257       return;
9258     }
9259
9260   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9261
9262   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9263
9264   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9265   SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9266
9267   if (dump_enabled_p ())
9268     dump_printf_loc (MSG_NOTE, vect_location,
9269                      "------>vectorizing SLP node starting from: %G",
9270                      stmt_info->stmt);
9271
9272   if (STMT_VINFO_DATA_REF (stmt_info)
9273       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9274     {
9275       /* Vectorized loads go before the first scalar load to make it
9276          ready early, vectorized stores go before the last scalar
9277          stmt which is where all uses are ready.  */
9278       stmt_vec_info last_stmt_info = NULL;
9279       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9280         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9281       else /* DR_IS_WRITE */
9282         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9283       si = gsi_for_stmt (last_stmt_info->stmt);
9284     }
9285   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9286             || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9287             || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9288            && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9289     {
9290       /* For PHI node vectorization we do not use the insertion iterator.  */
9291       si = gsi_none ();
9292     }
9293   else
9294     {
9295       /* Emit other stmts after the children vectorized defs which is
9296          earliest possible.  */
9297       gimple *last_stmt = NULL;
9298       if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9299         if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9300             || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9301           {
9302             /* But avoid scheduling internal defs outside of the loop when
9303                we might have only implicitly tracked loop mask/len defs.  */
9304             gimple_stmt_iterator si
9305               = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9306             last_stmt = *si;
9307           }
9308       bool seen_vector_def = false;
9309       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9310         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9311           {
9312             /* For fold-left reductions we are retaining the scalar
9313                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9314                set so the representation isn't perfect.  Resort to the
9315                last scalar def here.  */
9316             if (SLP_TREE_VEC_DEFS (child).is_empty ())
9317               {
9318                 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9319                             == cycle_phi_info_type);
9320                 gphi *phi = as_a <gphi *>
9321                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9322                 if (!last_stmt
9323                     || vect_stmt_dominates_stmt_p (last_stmt, phi))
9324                   last_stmt = phi;
9325               }
9326             /* We are emitting all vectorized stmts in the same place and
9327                the last one is the last.
9328                ???  Unless we have a load permutation applied and that
9329                figures to re-use an earlier generated load.  */
9330             unsigned j;
9331             tree vdef;
9332             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9333               {
9334                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9335                 if (!last_stmt
9336                     || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9337                   last_stmt = vstmt;
9338               }
9339           }
9340         else if (!SLP_TREE_VECTYPE (child))
9341           {
9342             /* For externals we use unvectorized at all scalar defs.  */
9343             unsigned j;
9344             tree def;
9345             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9346               if (TREE_CODE (def) == SSA_NAME
9347                   && !SSA_NAME_IS_DEFAULT_DEF (def))
9348                 {
9349                   gimple *stmt = SSA_NAME_DEF_STMT (def);
9350                   if (!last_stmt
9351                       || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9352                     last_stmt = stmt;
9353                 }
9354           }
9355         else
9356           {
9357             /* For externals we have to look at all defs since their
9358                insertion place is decided per vector.  But beware
9359                of pre-existing vectors where we need to make sure
9360                we do not insert before the region boundary.  */
9361             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9362                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9363               seen_vector_def = true;
9364             else
9365               {
9366                 unsigned j;
9367                 tree vdef;
9368                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9369                   if (TREE_CODE (vdef) == SSA_NAME
9370                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9371                     {
9372                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9373                       if (!last_stmt
9374                           || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9375                         last_stmt = vstmt;
9376                     }
9377               }
9378           }
9379       /* This can happen when all children are pre-existing vectors or
9380          constants.  */
9381       if (!last_stmt)
9382         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9383       if (!last_stmt)
9384         {
9385           gcc_assert (seen_vector_def);
9386           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9387         }
9388       else if (is_ctrl_altering_stmt (last_stmt))
9389         {
9390           /* We split regions to vectorize at control altering stmts
9391              with a definition so this must be an external which
9392              we can insert at the start of the region.  */
9393           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9394         }
9395       else if (is_a <bb_vec_info> (vinfo)
9396                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9397                && gimple_could_trap_p (stmt_info->stmt))
9398         {
9399           /* We've constrained possibly trapping operations to all come
9400              from the same basic-block, if vectorized defs would allow earlier
9401              scheduling still force vectorized stmts to the original block.
9402              This is only necessary for BB vectorization since for loop vect
9403              all operations are in a single BB and scalar stmt based
9404              placement doesn't play well with epilogue vectorization.  */
9405           gcc_assert (dominated_by_p (CDI_DOMINATORS,
9406                                       gimple_bb (stmt_info->stmt),
9407                                       gimple_bb (last_stmt)));
9408           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9409         }
9410       else if (is_a <gphi *> (last_stmt))
9411         si = gsi_after_labels (gimple_bb (last_stmt));
9412       else
9413         {
9414           si = gsi_for_stmt (last_stmt);
9415           gsi_next (&si);
9416         }
9417     }
9418
9419   /* Handle purely internal nodes.  */
9420   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9421     {
9422       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
9423          be shared with different SLP nodes (but usually it's the same
9424          operation apart from the case the stmt is only there for denoting
9425          the actual scalar lane defs ...).  So do not call vect_transform_stmt
9426          but open-code it here (partly).  */
9427       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9428       gcc_assert (done);
9429       stmt_vec_info slp_stmt_info;
9430       unsigned int i;
9431       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9432         if (STMT_VINFO_LIVE_P (slp_stmt_info))
9433           {
9434             done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9435                                                 instance, i, true, NULL);
9436             gcc_assert (done);
9437           }
9438     }
9439   else
9440     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9441 }
9442
9443 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9444    For loop vectorization this is done in vectorizable_call, but for SLP
9445    it needs to be deferred until end of vect_schedule_slp, because multiple
9446    SLP instances may refer to the same scalar stmt.  */
9447
9448 static void
9449 vect_remove_slp_scalar_calls (vec_info *vinfo,
9450                               slp_tree node, hash_set<slp_tree> &visited)
9451 {
9452   gimple *new_stmt;
9453   gimple_stmt_iterator gsi;
9454   int i;
9455   slp_tree child;
9456   tree lhs;
9457   stmt_vec_info stmt_info;
9458
9459   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9460     return;
9461
9462   if (visited.add (node))
9463     return;
9464
9465   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9466     vect_remove_slp_scalar_calls (vinfo, child, visited);
9467
9468   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9469     {
9470       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9471       if (!stmt || gimple_bb (stmt) == NULL)
9472         continue;
9473       if (is_pattern_stmt_p (stmt_info)
9474           || !PURE_SLP_STMT (stmt_info))
9475         continue;
9476       lhs = gimple_call_lhs (stmt);
9477       if (lhs)
9478         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9479       else
9480         {
9481           new_stmt = gimple_build_nop ();
9482           unlink_stmt_vdef (stmt_info->stmt);
9483         }
9484       gsi = gsi_for_stmt (stmt);
9485       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9486       if (lhs)
9487         SSA_NAME_DEF_STMT (lhs) = new_stmt;
9488     }
9489 }
9490
9491 static void
9492 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9493 {
9494   hash_set<slp_tree> visited;
9495   vect_remove_slp_scalar_calls (vinfo, node, visited);
9496 }
9497
9498 /* Vectorize the instance root.  */
9499
9500 void
9501 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9502 {
9503   gassign *rstmt = NULL;
9504
9505   if (instance->kind == slp_inst_kind_ctor)
9506     {
9507       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9508         {
9509           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9510           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9511           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9512                                           TREE_TYPE (vect_lhs)))
9513             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9514                                vect_lhs);
9515           rstmt = gimple_build_assign (root_lhs, vect_lhs);
9516         }
9517       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9518         {
9519           int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9520           tree child_def;
9521           int j;
9522           vec<constructor_elt, va_gc> *v;
9523           vec_alloc (v, nelts);
9524
9525           /* A CTOR can handle V16HI composition from VNx8HI so we
9526              do not need to convert vector elements if the types
9527              do not match.  */
9528           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9529             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9530           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9531           tree rtype
9532             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9533           tree r_constructor = build_constructor (rtype, v);
9534           rstmt = gimple_build_assign (lhs, r_constructor);
9535         }
9536     }
9537   else if (instance->kind == slp_inst_kind_bb_reduc)
9538     {
9539       /* Largely inspired by reduction chain epilogue handling in
9540          vect_create_epilog_for_reduction.  */
9541       vec<tree> vec_defs = vNULL;
9542       vect_get_slp_defs (node, &vec_defs);
9543       enum tree_code reduc_code
9544         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9545       /* ???  We actually have to reflect signs somewhere.  */
9546       if (reduc_code == MINUS_EXPR)
9547         reduc_code = PLUS_EXPR;
9548       gimple_seq epilogue = NULL;
9549       /* We may end up with more than one vector result, reduce them
9550          to one vector.  */
9551       tree vec_def = vec_defs[0];
9552       tree vectype = TREE_TYPE (vec_def);
9553       tree compute_vectype = vectype;
9554       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9555                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
9556                                  && operation_can_overflow (reduc_code));
9557       if (pun_for_overflow_p)
9558         {
9559           compute_vectype = unsigned_type_for (vectype);
9560           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9561                                   compute_vectype, vec_def);
9562         }
9563       for (unsigned i = 1; i < vec_defs.length (); ++i)
9564         {
9565           tree def = vec_defs[i];
9566           if (pun_for_overflow_p)
9567             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9568                                 compute_vectype, def);
9569           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9570                                   vec_def, def);
9571         }
9572       vec_defs.release ();
9573       /* ???  Support other schemes than direct internal fn.  */
9574       internal_fn reduc_fn;
9575       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9576           || reduc_fn == IFN_LAST)
9577         gcc_unreachable ();
9578       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9579                                       TREE_TYPE (compute_vectype), vec_def);
9580       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9581         {
9582           tree rem_def = NULL_TREE;
9583           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9584             {
9585               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9586               if (!rem_def)
9587                 rem_def = def;
9588               else
9589                 rem_def = gimple_build (&epilogue, reduc_code,
9590                                         TREE_TYPE (scalar_def),
9591                                         rem_def, def);
9592             }
9593           scalar_def = gimple_build (&epilogue, reduc_code,
9594                                      TREE_TYPE (scalar_def),
9595                                      scalar_def, rem_def);
9596         }
9597       scalar_def = gimple_convert (&epilogue,
9598                                    TREE_TYPE (vectype), scalar_def);
9599       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9600       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9601       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9602       update_stmt (gsi_stmt (rgsi));
9603       return;
9604     }
9605   else
9606     gcc_unreachable ();
9607
9608   gcc_assert (rstmt);
9609
9610   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9611   gsi_replace (&rgsi, rstmt, true);
9612 }
9613
9614 struct slp_scc_info
9615 {
9616   bool on_stack;
9617   int dfs;
9618   int lowlink;
9619 };
9620
9621 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
9622
9623 static void
9624 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9625                    hash_map<slp_tree, slp_scc_info> &scc_info,
9626                    int &maxdfs, vec<slp_tree> &stack)
9627 {
9628   bool existed_p;
9629   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9630   gcc_assert (!existed_p);
9631   info->dfs = maxdfs;
9632   info->lowlink = maxdfs;
9633   maxdfs++;
9634
9635   /* Leaf.  */
9636   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9637     {
9638       info->on_stack = false;
9639       vect_schedule_slp_node (vinfo, node, instance);
9640       return;
9641     }
9642
9643   info->on_stack = true;
9644   stack.safe_push (node);
9645
9646   unsigned i;
9647   slp_tree child;
9648   /* DFS recurse.  */
9649   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9650     {
9651       if (!child)
9652         continue;
9653       slp_scc_info *child_info = scc_info.get (child);
9654       if (!child_info)
9655         {
9656           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9657           /* Recursion might have re-allocated the node.  */
9658           info = scc_info.get (node);
9659           child_info = scc_info.get (child);
9660           info->lowlink = MIN (info->lowlink, child_info->lowlink);
9661         }
9662       else if (child_info->on_stack)
9663         info->lowlink = MIN (info->lowlink, child_info->dfs);
9664     }
9665   if (info->lowlink != info->dfs)
9666     return;
9667
9668   auto_vec<slp_tree, 4> phis_to_fixup;
9669
9670   /* Singleton.  */
9671   if (stack.last () == node)
9672     {
9673       stack.pop ();
9674       info->on_stack = false;
9675       vect_schedule_slp_node (vinfo, node, instance);
9676       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9677           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9678         phis_to_fixup.quick_push (node);
9679     }
9680   else
9681     {
9682       /* SCC.  */
9683       int last_idx = stack.length () - 1;
9684       while (stack[last_idx] != node)
9685         last_idx--;
9686       /* We can break the cycle at PHIs who have at least one child
9687          code generated.  Then we could re-start the DFS walk until
9688          all nodes in the SCC are covered (we might have new entries
9689          for only back-reachable nodes).  But it's simpler to just
9690          iterate and schedule those that are ready.  */
9691       unsigned todo = stack.length () - last_idx;
9692       do
9693         {
9694           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9695             {
9696               slp_tree entry = stack[idx];
9697               if (!entry)
9698                 continue;
9699               bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9700                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9701               bool ready = !phi;
9702               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9703                   if (!child)
9704                     {
9705                       gcc_assert (phi);
9706                       ready = true;
9707                       break;
9708                     }
9709                   else if (scc_info.get (child)->on_stack)
9710                     {
9711                       if (!phi)
9712                         {
9713                           ready = false;
9714                           break;
9715                         }
9716                     }
9717                   else
9718                     {
9719                       if (phi)
9720                         {
9721                           ready = true;
9722                           break;
9723                         }
9724                     }
9725               if (ready)
9726                 {
9727                   vect_schedule_slp_node (vinfo, entry, instance);
9728                   scc_info.get (entry)->on_stack = false;
9729                   stack[idx] = NULL;
9730                   todo--;
9731                   if (phi)
9732                     phis_to_fixup.safe_push (entry);
9733                 }
9734             }
9735         }
9736       while (todo != 0);
9737
9738       /* Pop the SCC.  */
9739       stack.truncate (last_idx);
9740     }
9741
9742   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
9743   slp_tree phi_node;
9744   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9745     {
9746       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9747       edge_iterator ei;
9748       edge e;
9749       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9750         {
9751           unsigned dest_idx = e->dest_idx;
9752           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9753           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9754             continue;
9755           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9756           /* Simply fill all args.  */
9757           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9758               != vect_first_order_recurrence)
9759             for (unsigned i = 0; i < n; ++i)
9760               {
9761                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9762                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9763                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9764                              e, gimple_phi_arg_location (phi, dest_idx));
9765               }
9766           else
9767             {
9768               /* Unless it is a first order recurrence which needs
9769                  args filled in for both the PHI node and the permutes.  */
9770               gimple *perm
9771                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9772               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9773               add_phi_arg (as_a <gphi *> (rphi),
9774                            vect_get_slp_vect_def (child, n - 1),
9775                            e, gimple_phi_arg_location (phi, dest_idx));
9776               for (unsigned i = 0; i < n; ++i)
9777                 {
9778                   gimple *perm
9779                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9780                   if (i > 0)
9781                     gimple_assign_set_rhs1 (perm,
9782                                             vect_get_slp_vect_def (child, i - 1));
9783                   gimple_assign_set_rhs2 (perm,
9784                                           vect_get_slp_vect_def (child, i));
9785                   update_stmt (perm);
9786                 }
9787             }
9788         }
9789     }
9790 }
9791
9792 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
9793
9794 void
9795 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9796 {
9797   slp_instance instance;
9798   unsigned int i;
9799
9800   hash_map<slp_tree, slp_scc_info> scc_info;
9801   int maxdfs = 0;
9802   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9803     {
9804       slp_tree node = SLP_INSTANCE_TREE (instance);
9805       if (dump_enabled_p ())
9806         {
9807           dump_printf_loc (MSG_NOTE, vect_location,
9808                            "Vectorizing SLP tree:\n");
9809           /* ???  Dump all?  */
9810           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9811             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9812                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9813           vect_print_slp_graph (MSG_NOTE, vect_location,
9814                                 SLP_INSTANCE_TREE (instance));
9815         }
9816       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9817          have a PHI be the node breaking the cycle.  */
9818       auto_vec<slp_tree> stack;
9819       if (!scc_info.get (node))
9820         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9821
9822       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9823         vectorize_slp_instance_root_stmt (node, instance);
9824
9825       if (dump_enabled_p ())
9826         dump_printf_loc (MSG_NOTE, vect_location,
9827                          "vectorizing stmts using SLP.\n");
9828     }
9829
9830   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9831     {
9832       slp_tree root = SLP_INSTANCE_TREE (instance);
9833       stmt_vec_info store_info;
9834       unsigned int j;
9835
9836       /* Remove scalar call stmts.  Do not do this for basic-block
9837          vectorization as not all uses may be vectorized.
9838          ???  Why should this be necessary?  DCE should be able to
9839          remove the stmts itself.
9840          ???  For BB vectorization we can as well remove scalar
9841          stmts starting from the SLP tree root if they have no
9842          uses.  */
9843       if (is_a <loop_vec_info> (vinfo))
9844         vect_remove_slp_scalar_calls (vinfo, root);
9845
9846       /* Remove vectorized stores original scalar stmts.  */
9847       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9848         {
9849           if (!STMT_VINFO_DATA_REF (store_info)
9850               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9851             break;
9852
9853           store_info = vect_orig_stmt (store_info);
9854           /* Free the attached stmt_vec_info and remove the stmt.  */
9855           vinfo->remove_stmt (store_info);
9856
9857           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9858              to not crash in vect_free_slp_tree later.  */
9859           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9860             SLP_TREE_REPRESENTATIVE (root) = NULL;
9861         }
9862     }
9863 }