gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP-only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
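/* A minimal sketch of the allocation and lifetime scheme above
   (hypothetical driver code, for illustration only): every _slp_tree is
   carved out of slp_tree_pool and threaded onto the slp_first_node list
   by the constructor, so vect_slp_fini can reclaim whatever is still live:

     vect_slp_init ();
     slp_tree n = new _slp_tree;   // allocate_raw + link into the list
     vect_free_slp_tree (n);       // recursively drops children, deletes N
                                   // once SLP_TREE_REF_COUNT reaches zero
     vect_slp_fini ();             // deletes any remaining nodes, then
                                   // releases the pool itself.  */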
194 /* Return a location suitable for dumps related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
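/* Illustration of the structure above (hypothetical two-lane group, not
   from the source): for the scalar stmts { x0 = a0 + b0, x1 = a1 + b1 }
   vect_create_oprnd_info (2, 2) yields two slots; during matching, slot 0
   accumulates the defs and ops of { a0, a1 } and slot 1 those of
   { b0, b1 }, with first_dt/first_op_type recorded from lane 0 and then
   checked against the remaining lanes.  */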
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
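/* Worked example for the search above (assuming a target with 128-bit
   vectors such as V16QI/V4SI): for count == 4 and a 1-byte elt_type,
   elt_bytes is 4, so the first iteration tries SImode.  vector_type
   becomes V4SI, half_nelts is 2, and sel1/sel2 describe the low/high
   interleaving permutes { 0, 4, 1, 5, ... } and { 2, 6, 3, 7, ... }.
   If the target supports both permutes the function succeeds with
   *nvectors_out == 1 and *vector_type_out == V4SI; otherwise elt_bytes
   is halved and the search retries with twice as many intermediate
   vectors.  */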
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg0_map[] = { 1, 0 };
509 static const int arg1_map[] = { 1, 1 };
510 static const int arg2_map[] = { 1, 2 };
511 static const int arg1_arg4_map[] = { 2, 1, 4 };
512 static const int arg3_arg2_map[] = { 2, 3, 2 };
513 static const int op1_op0_map[] = { 2, 1, 0 };
514 static const int off_map[] = { 1, -3 };
515 static const int off_op0_map[] = { 2, -3, 0 };
516 static const int off_arg2_map[] = { 2, -3, 2 };
517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518 static const int mask_call_maps[6][7] = {
519 { 1, 1, },
520 { 2, 1, 2, },
521 { 3, 1, 2, 3, },
522 { 4, 1, 2, 3, 4, },
523 { 5, 1, 2, 3, 4, 5, },
524 { 6, 1, 2, 3, 4, 5, 6 },
527 /* For most SLP statements, there is a one-to-one mapping between
528 gimple arguments and child nodes. If that is not true for STMT,
529 return an array that contains:
531 - the number of child nodes, followed by
532 - for each child node, the index of the argument associated with that node.
533 The special index -1 is the first operand of an embedded comparison and
534 the special index -2 is the second operand of an embedded comparison.
535 The special index -3 is the offset of a gather as analyzed by
536 vect_check_gather_scatter.
538 SWAP is as for vect_get_and_check_slp_defs. */
540 static const int *
541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 unsigned char swap = 0)
544 if (auto assign = dyn_cast<const gassign *> (stmt))
546 if (gimple_assign_rhs_code (assign) == COND_EXPR
547 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 return cond_expr_maps[swap];
549 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 && swap)
551 return op1_op0_map;
552 if (gather_scatter_p)
553 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 ? off_op0_map : off_map);
556 gcc_assert (!swap);
557 if (auto call = dyn_cast<const gcall *> (stmt))
559 if (gimple_call_internal_p (call))
560 switch (gimple_call_internal_fn (call))
562 case IFN_MASK_LOAD:
563 return gather_scatter_p ? off_arg2_map : arg2_map;
565 case IFN_GATHER_LOAD:
566 return arg1_map;
568 case IFN_MASK_GATHER_LOAD:
569 case IFN_MASK_LEN_GATHER_LOAD:
570 return arg1_arg4_map;
572 case IFN_MASK_STORE:
573 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
575 case IFN_MASK_CALL:
577 unsigned nargs = gimple_call_num_args (call);
578 if (nargs >= 2 && nargs <= 7)
579 return mask_call_maps[nargs-2];
580 else
581 return nullptr;
584 case IFN_CLZ:
585 case IFN_CTZ:
586 return arg0_map;
588 default:
589 break;
592 return nullptr;
595 /* Return the SLP node child index for operand OP of STMT. */
598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 bool gather_scatter_p)
601 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 if (!opmap)
603 return op;
604 for (int i = 1; i < 1 + opmap[0]; ++i)
605 if (opmap[i] == op)
606 return i - 1;
607 gcc_unreachable ();
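/* Example of the operand-map encoding used above: arg3_arg2_map is
   { 2, 3, 2 }, so an IFN_MASK_STORE gets two SLP children, child 0 built
   from call argument 3 (the stored value) and child 1 from argument 2
   (the mask); vect_slp_child_index_for_operand (stmt, 2, false) therefore
   returns 1 for it.  Likewise off_map { 1, -3 } requests a single child
   built from the gather/scatter offset analyzed by
   vect_check_gather_scatter.  */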
610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 they are of a valid type and that they match the defs of the first stmt of
612 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 indicates swap is required for cond_expr stmts. Specifically, SWAP
615 is 1 if STMT is cond and operands of comparison need to be swapped;
616 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
618 If there was a fatal error return -1; if the error could be corrected by
619 swapping operands of father node of this one, return 1; if everything is
620 ok return 0. */
621 static int
622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 bool *skip_args,
624 vec<stmt_vec_info> stmts, unsigned stmt_num,
625 vec<slp_oprnd_info> *oprnds_info)
627 stmt_vec_info stmt_info = stmts[stmt_num];
628 tree oprnd;
629 unsigned int i, number_of_oprnds;
630 enum vect_def_type dt = vect_uninitialized_def;
631 slp_oprnd_info oprnd_info;
632 gather_scatter_info gs_info;
633 unsigned int gs_op = -1u;
634 unsigned int commutative_op = -1U;
635 bool first = stmt_num == 0;
637 if (!is_a<gcall *> (stmt_info->stmt)
638 && !is_a<gassign *> (stmt_info->stmt)
639 && !is_a<gphi *> (stmt_info->stmt))
640 return -1;
642 number_of_oprnds = gimple_num_args (stmt_info->stmt);
643 const int *map
644 = vect_get_operand_map (stmt_info->stmt,
645 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 if (map)
647 number_of_oprnds = *map++;
648 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
650 if (gimple_call_internal_p (stmt))
652 internal_fn ifn = gimple_call_internal_fn (stmt);
653 commutative_op = first_commutative_argument (ifn);
656 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
658 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
659 commutative_op = 0;
662 bool swapped = (swap != 0);
663 bool backedge = false;
664 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 for (i = 0; i < number_of_oprnds; i++)
667 oprnd_info = (*oprnds_info)[i];
668 int opno = map ? map[i] : int (i);
669 if (opno == -3)
671 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 if (!is_a <loop_vec_info> (vinfo)
673 || !vect_check_gather_scatter (stmt_info,
674 as_a <loop_vec_info> (vinfo),
675 first ? &oprnd_info->first_gs_info
676 : &gs_info))
677 return -1;
679 if (first)
681 oprnd_info->first_gs_p = true;
682 oprnd = oprnd_info->first_gs_info.offset;
684 else
686 gs_op = i;
687 oprnd = gs_info.offset;
690 else if (opno < 0)
691 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 else
694 oprnd = gimple_arg (stmt_info->stmt, opno);
695 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
697 edge e = gimple_phi_arg_edge (stmt, opno);
698 backedge = (is_a <bb_vec_info> (vinfo)
699 ? e->flags & EDGE_DFS_BACK
700 : dominated_by_p (CDI_DOMINATORS, e->src,
701 gimple_bb (stmt_info->stmt)));
704 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 oprnd = TREE_OPERAND (oprnd, 0);
707 stmt_vec_info def_stmt_info;
708 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: can't analyze def for %T\n",
713 oprnd);
715 return -1;
718 if (skip_args[i])
720 oprnd_info->def_stmts.quick_push (NULL);
721 oprnd_info->ops.quick_push (NULL_TREE);
722 oprnd_info->first_dt = vect_uninitialized_def;
723 continue;
726 oprnd_info->def_stmts.quick_push (def_stmt_info);
727 oprnd_info->ops.quick_push (oprnd);
729 if (def_stmt_info
730 && is_pattern_stmt_p (def_stmt_info))
732 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 != def_stmt_info)
734 oprnd_info->any_pattern = true;
735 else
736 /* If we promote this to external, use the original stmt def. */
737 oprnd_info->ops.last ()
738 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
741 /* If there's an extern def on a backedge make sure we can
742 code-generate at the region start.
743 ??? This is another case that could be fixed by adjusting
744 how we split the function but at the moment we'd have conflicting
745 goals there. */
746 if (backedge
747 && dts[i] == vect_external_def
748 && is_a <bb_vec_info> (vinfo)
749 && TREE_CODE (oprnd) == SSA_NAME
750 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 && !dominated_by_p (CDI_DOMINATORS,
752 as_a <bb_vec_info> (vinfo)->bbs[0],
753 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 "Build SLP failed: extern def %T only defined "
758 "on backedge\n", oprnd);
759 return -1;
762 if (first)
764 tree type = TREE_TYPE (oprnd);
765 dt = dts[i];
767 /* For the swapping logic below force vect_reduction_def
768 for the reduction op in a SLP reduction group. */
769 if (!STMT_VINFO_DATA_REF (stmt_info)
770 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 && def_stmt_info)
773 dts[i] = dt = vect_reduction_def;
775 /* Check the types of the definition. */
776 switch (dt)
778 case vect_external_def:
779 case vect_constant_def:
780 case vect_internal_def:
781 case vect_reduction_def:
782 case vect_induction_def:
783 case vect_nested_cycle:
784 case vect_first_order_recurrence:
785 break;
787 default:
788 /* FORNOW: Not supported. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Build SLP failed: illegal type of def %T\n",
792 oprnd);
793 return -1;
796 oprnd_info->first_dt = dt;
797 oprnd_info->first_op_type = type;
800 if (first)
801 return 0;
803 /* Now match the operand definition types to that of the first stmt. */
804 for (i = 0; i < number_of_oprnds;)
806 if (skip_args[i])
808 ++i;
809 continue;
812 oprnd_info = (*oprnds_info)[i];
813 dt = dts[i];
814 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 oprnd = oprnd_info->ops[stmt_num];
816 tree type = TREE_TYPE (oprnd);
818 if (!types_compatible_p (oprnd_info->first_op_type, type))
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Build SLP failed: different operand types\n");
823 return 1;
826 if ((gs_op == i) != oprnd_info->first_gs_p)
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 "Build SLP failed: mixed gather and non-gather\n");
831 return 1;
833 else if (gs_op == i)
835 if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 gs_info.base))
838 if (dump_enabled_p ())
839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 "Build SLP failed: different gather base\n");
841 return 1;
843 if (oprnd_info->first_gs_info.scale != gs_info.scale)
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather scale\n");
848 return 1;
852 /* Not first stmt of the group, check that the def-stmt/s match
853 the def-stmt/s of the first stmt. Allow different definition
854 types for reduction chains: the first stmt must be a
855 vect_reduction_def (a phi node), and the rest
856 end in the reduction chain. */
857 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
858 && !(oprnd_info->first_dt == vect_reduction_def
859 && !STMT_VINFO_DATA_REF (stmt_info)
860 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 && def_stmt_info
862 && !STMT_VINFO_DATA_REF (def_stmt_info)
863 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 || (!STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && ((!def_stmt_info
868 || STMT_VINFO_DATA_REF (def_stmt_info)
869 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 != (oprnd_info->first_dt != vect_reduction_def))))
873 /* Try swapping operands if we got a mismatch. For BB
874 vectorization only in case it will clearly improve things. */
875 if (i == commutative_op && !swapped
876 && (!is_a <bb_vec_info> (vinfo)
877 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
878 dts[i+1])
879 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
880 || vect_def_types_match
881 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "trying swapped operands\n");
886 std::swap (dts[i], dts[i+1]);
887 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
888 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 std::swap ((*oprnds_info)[i]->ops[stmt_num],
890 (*oprnds_info)[i+1]->ops[stmt_num]);
891 swapped = true;
892 continue;
895 if (is_a <bb_vec_info> (vinfo)
896 && !oprnd_info->any_pattern)
898 /* Now for commutative ops we should see whether we can
899 make the other operand matching. */
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "treating operand as external\n");
903 oprnd_info->first_dt = dt = vect_external_def;
905 else
907 if (dump_enabled_p ())
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "Build SLP failed: different types\n");
910 return 1;
914 /* Make sure to demote the overall operand to external. */
915 if (dt == vect_external_def)
916 oprnd_info->first_dt = vect_external_def;
917 /* For a SLP reduction chain we want to duplicate the reduction to
918 each of the chain members. That gets us a sane SLP graph (still
919 the stmts are not 100% correct wrt the initial values). */
920 else if ((dt == vect_internal_def
921 || dt == vect_reduction_def)
922 && oprnd_info->first_dt == vect_reduction_def
923 && !STMT_VINFO_DATA_REF (stmt_info)
924 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
925 && !STMT_VINFO_DATA_REF (def_stmt_info)
926 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
927 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
929 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
930 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
933 ++i;
936 /* Swap operands. */
937 if (swapped)
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "swapped operands to match def types in %G",
942 stmt_info->stmt);
945 return 0;
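/* Example of the swapping logic above (hypothetical two-lane group): for
   { x0 = a0 + inv, x1 = inv + a1 } with INV loop-invariant, lane 1 would
   report (external, internal) def types against lane 0's
   (internal, external); since PLUS_EXPR is commutative the operands of
   lane 1 are swapped in OPRNDS_INFO so both lanes agree (for BB SLP the
   swap is only done when it clearly helps), and the swap is reported in
   the dump as "swapped operands to match def types".  */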
948 /* Return true if call statements CALL1 and CALL2 are similar enough
949 to be combined into the same SLP group. */
951 bool
952 compatible_calls_p (gcall *call1, gcall *call2)
954 unsigned int nargs = gimple_call_num_args (call1);
955 if (nargs != gimple_call_num_args (call2))
956 return false;
958 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
959 return false;
961 if (gimple_call_internal_p (call1))
963 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
964 TREE_TYPE (gimple_call_lhs (call2))))
965 return false;
966 for (unsigned int i = 0; i < nargs; ++i)
967 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
968 TREE_TYPE (gimple_call_arg (call2, i))))
969 return false;
971 else
973 if (!operand_equal_p (gimple_call_fn (call1),
974 gimple_call_fn (call2), 0))
975 return false;
977 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
978 return false;
981 /* Check that any unvectorized arguments are equal. */
982 if (const int *map = vect_get_operand_map (call1))
984 unsigned int nkept = *map++;
985 unsigned int mapi = 0;
986 for (unsigned int i = 0; i < nargs; ++i)
987 if (mapi < nkept && map[mapi] == int (i))
988 mapi += 1;
989 else if (!operand_equal_p (gimple_call_arg (call1, i),
990 gimple_call_arg (call2, i)))
991 return false;
994 return true;
997 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
998 caller's attempt to find the vector type in STMT_INFO with the narrowest
999 element type. Return true if VECTYPE is nonnull and if it is valid
1000 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1001 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1002 vect_build_slp_tree. */
1004 static bool
1005 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006 unsigned int group_size,
1007 tree vectype, poly_uint64 *max_nunits)
1009 if (!vectype)
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: unsupported data-type in %G\n",
1014 stmt_info->stmt);
1015 /* Fatal mismatch. */
1016 return false;
1019 /* If populating the vector type requires unrolling then fail
1020 before adjusting *max_nunits for basic-block vectorization. */
1021 if (is_a <bb_vec_info> (vinfo)
1022 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "Build SLP failed: unrolling required "
1027 "in basic block SLP\n");
1028 /* Fatal mismatch. */
1029 return false;
1032 /* In case of multiple types we need to detect the smallest type. */
1033 vect_update_max_nunits (max_nunits, vectype);
1034 return true;
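/* Example of the unrolling check above: for basic-block SLP a group of
   4 stmts cannot use a vector type with 8 lanes, since group_size (4) is
   not a multiple of TYPE_VECTOR_SUBPARTS (8) and unrolling is not an
   option outside of loops; the same group with a 4- or 2-lane vector type
   passes and only updates *MAX_NUNITS.  */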
1037 /* Verify if the scalar stmts STMTS are isomorphic, require data
1038 permutation or are of unsupported types of operation. Return
1039 true if they are, otherwise return false and indicate in *MATCHES
1040 which stmts are not isomorphic to the first one. If MATCHES[0]
1041 is false then this indicates the comparison could not be
1042 carried out or the stmts will never be vectorized by SLP.
1044 Note COND_EXPR is possibly isomorphic to another one after swapping its
1045 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1046 the first stmt by swapping the two operands of comparison; set SWAP[i]
1047 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1048 of comparison. Taking A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1049 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1051 static bool
1052 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1053 vec<stmt_vec_info> stmts, unsigned int group_size,
1054 poly_uint64 *max_nunits, bool *matches,
1055 bool *two_operators, tree *node_vectype)
1057 unsigned int i;
1058 stmt_vec_info first_stmt_info = stmts[0];
1059 code_helper first_stmt_code = ERROR_MARK;
1060 code_helper alt_stmt_code = ERROR_MARK;
1061 code_helper rhs_code = ERROR_MARK;
1062 code_helper first_cond_code = ERROR_MARK;
1063 tree lhs;
1064 bool need_same_oprnds = false;
1065 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1066 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1067 bool first_stmt_ldst_p = false, ldst_p = false;
1068 bool first_stmt_phi_p = false, phi_p = false;
1069 bool maybe_soft_fail = false;
1070 tree soft_fail_nunits_vectype = NULL_TREE;
1072 /* For every stmt in NODE find its def stmt/s. */
1073 stmt_vec_info stmt_info;
1074 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1076 gimple *stmt = stmt_info->stmt;
1077 swap[i] = 0;
1078 matches[i] = false;
1080 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1083 /* Fail to vectorize statements marked as unvectorizable, throw
1084 or are volatile. */
1085 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1086 || stmt_can_throw_internal (cfun, stmt)
1087 || gimple_has_volatile_ops (stmt))
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: unvectorizable statement %G",
1092 stmt);
1093 /* ??? For BB vectorization we want to commutate operands in a way
1094 to shuffle all unvectorizable defs into one operand and have
1095 the other still vectorized. The following doesn't reliably
1096 work for this though but it's the easiest we can do here. */
1097 if (is_a <bb_vec_info> (vinfo) && i != 0)
1098 continue;
1099 /* Fatal mismatch. */
1100 matches[0] = false;
1101 return false;
1104 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1105 lhs = gimple_get_lhs (stmt);
1106 if (lhs == NULL_TREE
1107 && (!call_stmt
1108 || !gimple_call_internal_p (stmt)
1109 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "Build SLP failed: not GIMPLE_ASSIGN nor "
1114 "GIMPLE_CALL %G", stmt);
1115 if (is_a <bb_vec_info> (vinfo) && i != 0)
1116 continue;
1117 /* Fatal mismatch. */
1118 matches[0] = false;
1119 return false;
1122 tree nunits_vectype;
1123 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1124 &nunits_vectype, group_size))
1126 if (is_a <bb_vec_info> (vinfo) && i != 0)
1127 continue;
1128 /* Fatal mismatch. */
1129 matches[0] = false;
1130 return false;
1132 /* Record nunits required but continue analysis, producing matches[]
1133 as if nunits was not an issue. This allows splitting of groups
1134 to happen. */
1135 if (nunits_vectype
1136 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1137 nunits_vectype, max_nunits))
1139 gcc_assert (is_a <bb_vec_info> (vinfo));
1140 maybe_soft_fail = true;
1141 soft_fail_nunits_vectype = nunits_vectype;
1144 gcc_assert (vectype);
1146 if (call_stmt)
1148 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1149 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1150 rhs_code = cfn;
1151 else
1152 rhs_code = CALL_EXPR;
1154 if (cfn == CFN_MASK_LOAD
1155 || cfn == CFN_GATHER_LOAD
1156 || cfn == CFN_MASK_GATHER_LOAD
1157 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1158 ldst_p = true;
1159 else if (cfn == CFN_MASK_STORE)
1161 ldst_p = true;
1162 rhs_code = CFN_MASK_STORE;
1164 else if ((cfn != CFN_LAST
1165 && cfn != CFN_MASK_CALL
1166 && internal_fn_p (cfn)
1167 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1168 || gimple_call_tail_p (call_stmt)
1169 || gimple_call_noreturn_p (call_stmt)
1170 || gimple_call_chain (call_stmt))
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "Build SLP failed: unsupported call type %G",
1175 (gimple *) call_stmt);
1176 if (is_a <bb_vec_info> (vinfo) && i != 0)
1177 continue;
1178 /* Fatal mismatch. */
1179 matches[0] = false;
1180 return false;
1183 else if (gimple_code (stmt) == GIMPLE_PHI)
1185 rhs_code = ERROR_MARK;
1186 phi_p = true;
1188 else
1190 rhs_code = gimple_assign_rhs_code (stmt);
1191 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1194 /* Check the operation. */
1195 if (i == 0)
1197 *node_vectype = vectype;
1198 first_stmt_code = rhs_code;
1199 first_stmt_ldst_p = ldst_p;
1200 first_stmt_phi_p = phi_p;
1202 /* Shift arguments should be equal in all the packed stmts for a
1203 vector shift with scalar shift operand. */
1204 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1205 || rhs_code == LROTATE_EXPR
1206 || rhs_code == RROTATE_EXPR)
1208 /* First see if we have a vector/vector shift. */
1209 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1211 /* No vector/vector shift, try for a vector/scalar shift. */
1212 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216 "Build SLP failed: "
1217 "op not supported by target.\n");
1218 if (is_a <bb_vec_info> (vinfo) && i != 0)
1219 continue;
1220 /* Fatal mismatch. */
1221 matches[0] = false;
1222 return false;
1224 need_same_oprnds = true;
1225 first_op1 = gimple_assign_rhs2 (stmt);
1228 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1233 else if (!ldst_p
1234 && rhs_code == BIT_FIELD_REF)
1236 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1237 if (!is_a <bb_vec_info> (vinfo)
1238 || TREE_CODE (vec) != SSA_NAME
1239 /* When the element types are not compatible we pun the
1240 source to the target vectype which requires equal size. */
1241 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1242 || !types_compatible_p (TREE_TYPE (vectype),
1243 TREE_TYPE (TREE_TYPE (vec))))
1244 && !operand_equal_p (TYPE_SIZE (vectype),
1245 TYPE_SIZE (TREE_TYPE (vec)))))
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "Build SLP failed: "
1250 "BIT_FIELD_REF not supported\n");
1251 /* Fatal mismatch. */
1252 matches[0] = false;
1253 return false;
1256 else if (rhs_code == CFN_DIV_POW2)
1258 need_same_oprnds = true;
1259 first_op1 = gimple_call_arg (call_stmt, 1);
1262 else
1264 if (first_stmt_code != rhs_code
1265 && alt_stmt_code == ERROR_MARK)
1266 alt_stmt_code = rhs_code;
1267 if ((first_stmt_code != rhs_code
1268 && (first_stmt_code != IMAGPART_EXPR
1269 || rhs_code != REALPART_EXPR)
1270 && (first_stmt_code != REALPART_EXPR
1271 || rhs_code != IMAGPART_EXPR)
1272 /* Handle mismatches in plus/minus by computing both
1273 and merging the results. */
1274 && !((first_stmt_code == PLUS_EXPR
1275 || first_stmt_code == MINUS_EXPR)
1276 && (alt_stmt_code == PLUS_EXPR
1277 || alt_stmt_code == MINUS_EXPR)
1278 && rhs_code == alt_stmt_code)
1279 && !(first_stmt_code.is_tree_code ()
1280 && rhs_code.is_tree_code ()
1281 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1282 == tcc_comparison)
1283 && (swap_tree_comparison (tree_code (first_stmt_code))
1284 == tree_code (rhs_code)))
1285 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1286 && (first_stmt_code == ARRAY_REF
1287 || first_stmt_code == BIT_FIELD_REF
1288 || first_stmt_code == INDIRECT_REF
1289 || first_stmt_code == COMPONENT_REF
1290 || first_stmt_code == MEM_REF)
1291 && (rhs_code == ARRAY_REF
1292 || rhs_code == BIT_FIELD_REF
1293 || rhs_code == INDIRECT_REF
1294 || rhs_code == COMPONENT_REF
1295 || rhs_code == MEM_REF)))
1296 || (ldst_p
1297 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1299 || (ldst_p
1300 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1302 || first_stmt_ldst_p != ldst_p
1303 || first_stmt_phi_p != phi_p)
1305 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308 "Build SLP failed: different operation "
1309 "in stmt %G", stmt);
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "original stmt %G", first_stmt_info->stmt);
1313 /* Mismatch. */
1314 continue;
1317 if (!ldst_p
1318 && first_stmt_code == BIT_FIELD_REF
1319 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1320 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1322 if (dump_enabled_p ())
1323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324 "Build SLP failed: different BIT_FIELD_REF "
1325 "arguments in %G", stmt);
1326 /* Mismatch. */
1327 continue;
1330 if (call_stmt
1331 && first_stmt_code != CFN_MASK_LOAD
1332 && first_stmt_code != CFN_MASK_STORE)
1334 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1335 call_stmt))
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "Build SLP failed: different calls in %G",
1340 stmt);
1341 /* Mismatch. */
1342 continue;
1346 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1347 && (gimple_bb (first_stmt_info->stmt)
1348 != gimple_bb (stmt_info->stmt)))
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352 "Build SLP failed: different BB for PHI "
1353 "or possibly trapping operation in %G", stmt);
1354 /* Mismatch. */
1355 continue;
1358 if (need_same_oprnds)
1360 tree other_op1 = gimple_arg (stmt, 1);
1361 if (!operand_equal_p (first_op1, other_op1, 0))
1363 if (dump_enabled_p ())
1364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365 "Build SLP failed: different shift "
1366 "arguments in %G", stmt);
1367 /* Mismatch. */
1368 continue;
1372 if (!types_compatible_p (vectype, *node_vectype))
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 "Build SLP failed: different vector type "
1377 "in %G", stmt);
1378 /* Mismatch. */
1379 continue;
1383 /* Grouped store or load. */
1384 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1386 gcc_assert (ldst_p);
1387 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1389 /* Store. */
1390 gcc_assert (rhs_code == CFN_MASK_STORE
1391 || REFERENCE_CLASS_P (lhs)
1392 || DECL_P (lhs));
1394 else
1396 /* Load. */
1397 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1398 if (prev_first_load)
1400 /* Check that there are no loads from different interleaving
1401 chains in the same node. */
1402 if (prev_first_load != first_load)
1404 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406 vect_location,
1407 "Build SLP failed: different "
1408 "interleaving chains in one node %G",
1409 stmt);
1410 /* Mismatch. */
1411 continue;
1414 else
1415 prev_first_load = first_load;
1418 /* Non-grouped store or load. */
1419 else if (ldst_p)
1421 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1422 && rhs_code != CFN_GATHER_LOAD
1423 && rhs_code != CFN_MASK_GATHER_LOAD
1424 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1425 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426 /* Non-grouped loads are handled as externals for BB
1427 vectorization. For loop vectorization we can handle
1428 splats the same way we handle single-element interleaving. */
1429 && (is_a <bb_vec_info> (vinfo)
1430 || stmt_info != first_stmt_info))
1432 /* Not grouped load. */
1433 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "Build SLP failed: not grouped load %G", stmt);
1437 if (i != 0)
1438 continue;
1439 /* Fatal mismatch. */
1440 matches[0] = false;
1441 return false;
1444 /* Not memory operation. */
1445 else
1447 if (!phi_p
1448 && rhs_code.is_tree_code ()
1449 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453 && rhs_code != VIEW_CONVERT_EXPR
1454 && rhs_code != CALL_EXPR
1455 && rhs_code != BIT_FIELD_REF)
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459 "Build SLP failed: operation unsupported %G",
1460 stmt);
1461 if (is_a <bb_vec_info> (vinfo) && i != 0)
1462 continue;
1463 /* Fatal mismatch. */
1464 matches[0] = false;
1465 return false;
1468 if (rhs_code == COND_EXPR)
1470 tree cond_expr = gimple_assign_rhs1 (stmt);
1471 enum tree_code cond_code = TREE_CODE (cond_expr);
1472 enum tree_code swap_code = ERROR_MARK;
1473 enum tree_code invert_code = ERROR_MARK;
1475 if (i == 0)
1476 first_cond_code = TREE_CODE (cond_expr);
1477 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1479 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480 swap_code = swap_tree_comparison (cond_code);
1481 invert_code = invert_tree_comparison (cond_code, honor_nans);
1484 if (first_cond_code == cond_code)
1486 /* Isomorphic can be achieved by swapping. */
1487 else if (first_cond_code == swap_code)
1488 swap[i] = 1;
1489 /* Isomorphic can be achieved by inverting. */
1490 else if (first_cond_code == invert_code)
1491 swap[i] = 2;
1492 else
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496 "Build SLP failed: different"
1497 " operation %G", stmt);
1498 /* Mismatch. */
1499 continue;
1503 if (rhs_code.is_tree_code ()
1504 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505 && (swap_tree_comparison ((tree_code)first_stmt_code)
1506 == (tree_code)rhs_code))
1507 swap[i] = 1;
1510 matches[i] = true;
1513 for (i = 0; i < group_size; ++i)
1514 if (!matches[i])
1515 return false;
1517 /* If we allowed a two-operation SLP node verify the target can cope
1518 with the permute we are going to use. */
1519 if (alt_stmt_code != ERROR_MARK
1520 && (!alt_stmt_code.is_tree_code ()
1521 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1524 *two_operators = true;
1527 if (maybe_soft_fail)
1529 unsigned HOST_WIDE_INT const_nunits;
1530 if (!TYPE_VECTOR_SUBPARTS
1531 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532 || const_nunits > group_size)
1533 matches[0] = false;
1534 else
1536 /* With constant vector elements simulate a mismatch at the
1537 point we need to split. */
1538 unsigned tail = group_size & (const_nunits - 1);
1539 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1541 return false;
1544 return true;
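/* Example of how MATCHES drives group splitting: with group_size == 4
   and lanes 2 and 3 failing to match lane 0, vect_build_slp_tree_1
   returns false with matches == { true, true, false, false }, letting the
   callers split the group after the second stmt and retry discovery on
   the halves.  The maybe_soft_fail path above simulates such a mismatch
   at a vector-size boundary when the vector type would otherwise force
   unrolling in a basic block.  */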
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548 Note we never remove apart from at destruction time so we do not
1549 need a special value for deleted that differs from empty. */
1550 struct bst_traits
1552 typedef vec <stmt_vec_info> value_type;
1553 typedef vec <stmt_vec_info> compare_type;
1554 static inline hashval_t hash (value_type);
1555 static inline bool equal (value_type existing, value_type candidate);
1556 static inline bool is_empty (value_type x) { return !x.exists (); }
1557 static inline bool is_deleted (value_type x) { return !x.exists (); }
1558 static const bool empty_zero_p = true;
1559 static inline void mark_empty (value_type &x) { x.release (); }
1560 static inline void mark_deleted (value_type &x) { x.release (); }
1561 static inline void remove (value_type &x) { x.release (); }
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1566 inchash::hash h;
1567 for (unsigned i = 0; i < x.length (); ++i)
1568 h.add_int (gimple_uid (x[i]->stmt));
1569 return h.end ();
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1574 if (existing.length () != candidate.length ())
1575 return false;
1576 for (unsigned i = 0; i < existing.length (); ++i)
1577 if (existing[i] != candidate[i])
1578 return false;
1579 return true;
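/* The map keyed by these traits memoizes SLP discovery per ordered stmt
   sequence: the hash mixes the gimple UIDs lane by lane and equality is
   elementwise, so e.g. { S1, S2 } and { S2, S1 } are distinct keys and a
   permuted lane order is rediscovered rather than reusing (or being
   blocked by) the cached result for the other order.  */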
1582 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583 but then vec::insert does memmove and that's not compatible with
1584 std::pair. */
1585 struct chain_op_t
1587 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588 : code (code_), dt (dt_), op (op_) {}
1589 tree_code code;
1590 vect_def_type dt;
1591 tree op;
1594 /* Comparator for sorting associatable chains. */
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1599 auto *op1 = (const chain_op_t *) op1_;
1600 auto *op2 = (const chain_op_t *) op2_;
1601 if (op1->dt != op2->dt)
1602 return (int)op1->dt - (int)op2->dt;
1603 return (int)op1->code - (int)op2->code;
1606 /* Linearize the associatable expression chain at START with the
1607 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608 filling CHAIN with the result and using WORKLIST as intermediate storage.
1609 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1611 stmts, starting with START. */
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615 vec<std::pair<tree_code, gimple *> > &worklist,
1616 vec<chain_op_t> &chain,
1617 enum tree_code code, gimple *start,
1618 gimple *&code_stmt, gimple *&alt_code_stmt,
1619 vec<gimple *> *chain_stmts)
1621 /* For each lane linearize the addition/subtraction (or other
1622 uniform associatable operation) expression tree. */
1623 worklist.safe_push (std::make_pair (code, start));
1624 while (!worklist.is_empty ())
1626 auto entry = worklist.pop ();
1627 gassign *stmt = as_a <gassign *> (entry.second);
1628 enum tree_code in_code = entry.first;
1629 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1631 if (!code_stmt
1632 && gimple_assign_rhs_code (stmt) == code)
1633 code_stmt = stmt;
1634 else if (!alt_code_stmt
1635 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636 alt_code_stmt = stmt;
1637 if (chain_stmts)
1638 chain_stmts->safe_push (stmt);
1639 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1641 tree op = gimple_op (stmt, opnum);
1642 vect_def_type dt;
1643 stmt_vec_info def_stmt_info;
1644 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645 gcc_assert (res);
1646 if (dt == vect_internal_def
1647 && is_pattern_stmt_p (def_stmt_info))
1648 op = gimple_get_lhs (def_stmt_info->stmt);
1649 gimple *use_stmt;
1650 use_operand_p use_p;
1651 if (dt == vect_internal_def
1652 && single_imm_use (op, &use_p, &use_stmt)
1653 && is_gimple_assign (def_stmt_info->stmt)
1654 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655 || (code == PLUS_EXPR
1656 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657 == MINUS_EXPR))))
1659 tree_code op_def_code = this_code;
1660 if (op_def_code == MINUS_EXPR && opnum == 1)
1661 op_def_code = PLUS_EXPR;
1662 if (in_code == MINUS_EXPR)
1663 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664 worklist.safe_push (std::make_pair (op_def_code,
1665 def_stmt_info->stmt));
1667 else
1669 tree_code op_def_code = this_code;
1670 if (op_def_code == MINUS_EXPR && opnum == 1)
1671 op_def_code = PLUS_EXPR;
1672 if (in_code == MINUS_EXPR)
1673 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674 chain.safe_push (chain_op_t (op_def_code, dt, op));
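/* Worked example for the linearization above (hypothetical lane, assuming
   the intermediate a - b result has a single use and a, b, c are not
   themselves part of the chain): for x = (a - b) + c, starting from the
   PLUS_EXPR stmt the chain is filled with
   { (PLUS, c), (PLUS, a), (MINUS, b) }, i.e. the expression reassociated
   as a + c - b; CODE_STMT points at the PLUS_EXPR stmt and ALT_CODE_STMT
   at the MINUS_EXPR stmt.  */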
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681 simple_hashmap_traits <bst_traits, slp_tree> >
1682 scalar_stmts_to_slp_tree_map_t;
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 vec<stmt_vec_info> stmts, unsigned int group_size,
1687 poly_uint64 *max_nunits,
1688 bool *matches, unsigned *limit, unsigned *tree_size,
1689 scalar_stmts_to_slp_tree_map_t *bst_map);
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map)
1698 if (slp_tree *leader = bst_map->get (stmts))
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702 !(*leader)->failed ? "" : "failed ",
1703 (void *) *leader);
1704 if (!(*leader)->failed)
1706 SLP_TREE_REF_COUNT (*leader)++;
1707 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708 stmts.release ();
1709 return *leader;
1711 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712 return NULL;
1715 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716 so we can pick up backedge destinations during discovery. */
1717 slp_tree res = new _slp_tree;
1718 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719 SLP_TREE_SCALAR_STMTS (res) = stmts;
1720 bst_map->put (stmts.copy (), res);
1722 if (*limit == 0)
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "SLP discovery limit exceeded\n");
1727 /* Mark the node invalid so we can detect those when still in use
1728 as backedge destinations. */
1729 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731 res->failed = XNEWVEC (bool, group_size);
1732 memset (res->failed, 0, sizeof (bool) * group_size);
1733 memset (matches, 0, sizeof (bool) * group_size);
1734 return NULL;
1736 --*limit;
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "starting SLP discovery for node %p\n", (void *) res);
1742 poly_uint64 this_max_nunits = 1;
1743 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744 &this_max_nunits,
1745 matches, limit, tree_size, bst_map);
1746 if (!res_)
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "SLP discovery for node %p failed\n", (void *) res);
1751 /* Mark the node invalid so we can detect those when still in use
1752 as backedge destinations. */
1753 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755 res->failed = XNEWVEC (bool, group_size);
1756 if (flag_checking)
1758 unsigned i;
1759 for (i = 0; i < group_size; ++i)
1760 if (!matches[i])
1761 break;
1762 gcc_assert (i < group_size);
1764 memcpy (res->failed, matches, sizeof (bool) * group_size);
1766 else
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "SLP discovery for node %p succeeded\n",
1771 (void *) res);
1772 gcc_assert (res_ == res);
1773 res->max_nunits = this_max_nunits;
1774 vect_update_max_nunits (max_nunits, this_max_nunits);
1775 /* Keep a reference for the bst_map use. */
1776 SLP_TREE_REF_COUNT (res)++;
1778 return res_;
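/* Note on the caching protocol above: the stub node is entered into
   BST_MAP before recursing, so a back-edge reaching the same stmt set
   during discovery finds and reuses the in-construction node instead of
   recursing forever; on failure the per-lane MATCHES vector is stashed in
   res->failed, so later attempts on the same stmt set fail fast with the
   same lane information.  */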
1781 /* Helper for building an associated SLP node chain. */
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785 slp_tree op0, slp_tree op1,
1786 stmt_vec_info oper1, stmt_vec_info oper2,
1787 vec<std::pair<unsigned, unsigned> > lperm)
1789 unsigned group_size = SLP_TREE_LANES (op1);
1791 slp_tree child1 = new _slp_tree;
1792 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793 SLP_TREE_VECTYPE (child1) = vectype;
1794 SLP_TREE_LANES (child1) = group_size;
1795 SLP_TREE_CHILDREN (child1).create (2);
1796 SLP_TREE_CHILDREN (child1).quick_push (op0);
1797 SLP_TREE_CHILDREN (child1).quick_push (op1);
1798 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1800 slp_tree child2 = new _slp_tree;
1801 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802 SLP_TREE_VECTYPE (child2) = vectype;
1803 SLP_TREE_LANES (child2) = group_size;
1804 SLP_TREE_CHILDREN (child2).create (2);
1805 SLP_TREE_CHILDREN (child2).quick_push (op0);
1806 SLP_TREE_REF_COUNT (op0)++;
1807 SLP_TREE_CHILDREN (child2).quick_push (op1);
1808 SLP_TREE_REF_COUNT (op1)++;
1809 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1811 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813 SLP_TREE_VECTYPE (perm) = vectype;
1814 SLP_TREE_LANES (perm) = group_size;
1815 /* ??? We should set this NULL but that's not expected. */
1816 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818 SLP_TREE_CHILDREN (perm).quick_push (child1);
1819 SLP_TREE_CHILDREN (perm).quick_push (child2);
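/* Example of the two-operator expansion above (hypothetical two-lane
   node): for scalar stmts { a0 + b0, a1 - b1 } the helper builds CHILD1
   and CHILD2 both reading OP0 and OP1, with OPER1 and OPER2 (here a
   PLUS_EXPR and a MINUS_EXPR stmt) as their representatives, and makes
   PERM a VEC_PERM_EXPR node whose lane permutation { (0, 0), (1, 1) }
   selects lane 0 from CHILD1 and lane 1 from CHILD2.  */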
1822 /* Recursively build an SLP tree starting from NODE.
1823 Fail (and return NULL) if def-stmts are not
1824 isomorphic, require data permutation or are of unsupported types of
1825 operation. Otherwise, return the built SLP node.
1826 MATCHES records for each scalar stmt whether it matched
1827 the first stmt of the group. */
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831 vec<stmt_vec_info> stmts, unsigned int group_size,
1832 poly_uint64 *max_nunits,
1833 bool *matches, unsigned *limit, unsigned *tree_size,
1834 scalar_stmts_to_slp_tree_map_t *bst_map)
1836 unsigned nops, i, this_tree_size = 0;
1837 poly_uint64 this_max_nunits = *max_nunits;
1839 matches[0] = false;
1841 stmt_vec_info stmt_info = stmts[0];
1842 if (!is_a<gcall *> (stmt_info->stmt)
1843 && !is_a<gassign *> (stmt_info->stmt)
1844 && !is_a<gphi *> (stmt_info->stmt))
1845 return NULL;
1847 nops = gimple_num_args (stmt_info->stmt);
1848 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849 STMT_VINFO_GATHER_SCATTER_P
1850 (stmt_info)))
1851 nops = map[0];
1853 /* If the SLP node is a PHI (induction or reduction), terminate
1854 the recursion. */
1855 bool *skip_args = XALLOCAVEC (bool, nops);
1856 memset (skip_args, 0, sizeof (bool) * nops);
1857 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1860 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862 group_size);
1863 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864 max_nunits))
1865 return NULL;
1867 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868 if (def_type == vect_induction_def)
1870 /* Induction PHIs are not cycles but walk the initial
1871 value. Only for inner loops though, for outer loops
1872 we need to pick up the value from the actual PHIs
1873 to more easily support peeling and epilogue vectorization. */
1874 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875 if (!nested_in_vect_loop_p (loop, stmt_info))
1876 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877 else
1878 loop = loop->inner;
1879 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1881 else if (def_type == vect_reduction_def
1882 || def_type == vect_double_reduction_def
1883 || def_type == vect_nested_cycle
1884 || def_type == vect_first_order_recurrence)
1886 /* Else def types have to match. */
1887 stmt_vec_info other_info;
1888 bool all_same = true;
1889 FOR_EACH_VEC_ELT (stmts, i, other_info)
1891 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892 return NULL;
1893 if (other_info != stmt_info)
1894 all_same = false;
1896 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897 /* Reduction initial values are not explicitly represented. */
1898 if (def_type != vect_first_order_recurrence
1899 && !nested_in_vect_loop_p (loop, stmt_info))
1900 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901 /* Reduction chain backedge defs are filled manually.
1902 ??? Need a better way to identify a SLP reduction chain PHI.
1903 Or a better overall way to SLP match those. */
1904 if (all_same && def_type == vect_reduction_def)
1905 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1907 else if (def_type != vect_internal_def)
1908 return NULL;
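/* Verify the statements are isomorphic, recording the common vector type
   and whether the group mixes two operation codes (e.g. PLUS and MINUS). */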
1912 bool two_operators = false;
1913 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914 tree vectype = NULL_TREE;
1915 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916 &this_max_nunits, matches, &two_operators,
1917 &vectype))
1918 return NULL;
1920 /* If the SLP node is a load, terminate the recursion unless masked. */
1921 if (STMT_VINFO_DATA_REF (stmt_info)
1922 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1924 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1925 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1926 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1927 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1928 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1929 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1930 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1931 else
1933 *max_nunits = this_max_nunits;
1934 (*tree_size)++;
1935 node = vect_create_new_slp_node (node, stmts, 0);
1936 SLP_TREE_VECTYPE (node) = vectype;
1937 /* And compute the load permutation. Whether it is actually
1938 a permutation depends on the unrolling factor which is
1939 decided later. */
1940 vec<unsigned> load_permutation;
1941 int j;
1942 stmt_vec_info load_info;
1943 load_permutation.create (group_size);
1944 stmt_vec_info first_stmt_info
1945 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1946 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1948 int load_place;
1949 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1950 load_place = vect_get_place_in_interleaving_chain
1951 (load_info, first_stmt_info);
1952 else
1953 load_place = 0;
1954 gcc_assert (load_place != -1);
1955 load_permutation.safe_push (load_place);
1957 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1958 return node;
1961 else if (gimple_assign_single_p (stmt_info->stmt)
1962 && !gimple_vuse (stmt_info->stmt)
1963 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1965 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1966 the same SSA name vector of a type compatible with vectype. */
1967 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1968 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1969 stmt_vec_info estmt_info;
1970 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1972 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1973 tree bfref = gimple_assign_rhs1 (estmt);
1974 HOST_WIDE_INT lane;
1975 if (!known_eq (bit_field_size (bfref),
1976 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1977 || !constant_multiple_p (bit_field_offset (bfref),
1978 bit_field_size (bfref), &lane))
1980 lperm.release ();
1981 matches[0] = false;
1982 return NULL;
1984 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1986 slp_tree vnode = vect_create_new_slp_node (vNULL);
1987 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1988 /* ??? We record vectype here but we hide eventually necessary
1989 punning and instead rely on code generation to materialize
1990 VIEW_CONVERT_EXPRs as necessary. We instead should make
1991 this explicit somehow. */
1992 SLP_TREE_VECTYPE (vnode) = vectype;
1993 else
1995 /* For different size but compatible elements we can still
1996 use VEC_PERM_EXPR without punning. */
1997 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1998 && types_compatible_p (TREE_TYPE (vectype),
1999 TREE_TYPE (TREE_TYPE (vec))));
2000 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2002 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2003 unsigned HOST_WIDE_INT const_nunits;
2004 if (nunits.is_constant (&const_nunits))
2005 SLP_TREE_LANES (vnode) = const_nunits;
2006 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2007 /* We are always building a permutation node even if it is an identity
2008 permute to shield the rest of the vectorizer from the odd node
2009 representing an actual vector without any scalar ops.
2010 ??? We could hide it completely by making the permute node
2011 external? */
2012 node = vect_create_new_slp_node (node, stmts, 1);
2013 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2014 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2015 SLP_TREE_VECTYPE (node) = vectype;
2016 SLP_TREE_CHILDREN (node).quick_push (vnode);
2017 return node;
2019 /* When discovery reaches an associatable operation see whether we can
2020 improve that to match up lanes in a way superior to the operand
2021 swapping code which at most looks at two defs.
2022 ??? For BB vectorization we cannot do the brute-force search
2023 for matching as we can succeed by means of builds from scalars
2024 and have no good way to "cost" one build against another. */
2025 else if (is_a <loop_vec_info> (vinfo)
2026 /* ??? We don't handle !vect_internal_def defs below. */
2027 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2028 && is_gimple_assign (stmt_info->stmt)
2029 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2030 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2031 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2032 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2033 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2035 /* See if we have a chain of (mixed) adds or subtracts or other
2036 associatable ops. */
2037 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2038 if (code == MINUS_EXPR)
2039 code = PLUS_EXPR;
2040 stmt_vec_info other_op_stmt_info = NULL;
2041 stmt_vec_info op_stmt_info = NULL;
2042 unsigned chain_len = 0;
2043 auto_vec<chain_op_t> chain;
2044 auto_vec<std::pair<tree_code, gimple *> > worklist;
2045 auto_vec<vec<chain_op_t> > chains (group_size);
2046 auto_vec<slp_tree, 4> children;
2047 bool hard_fail = true;
2048 for (unsigned lane = 0; lane < group_size; ++lane)
2050 /* For each lane linearize the addition/subtraction (or other
2051 uniform associatable operation) expression tree. */
2052 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2053 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2054 stmts[lane]->stmt, op_stmt, other_op_stmt,
2055 NULL);
2056 if (!op_stmt_info && op_stmt)
2057 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2058 if (!other_op_stmt_info && other_op_stmt)
2059 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2060 if (chain.length () == 2)
2062 /* In a chain of just two elements resort to the regular
2063 operand swapping scheme. If we run into a length
2064 mismatch still hard-FAIL. */
2065 if (chain_len == 0)
2066 hard_fail = false;
2067 else
2069 matches[lane] = false;
2070 /* ??? We might want to process the other lanes, but
2071 make sure to not give false matching hints to the
2072 caller for lanes we did not process. */
2073 if (lane != group_size - 1)
2074 matches[0] = false;
2076 break;
2078 else if (chain_len == 0)
2079 chain_len = chain.length ();
2080 else if (chain.length () != chain_len)
2082 /* ??? Here we could slip in magic to compensate with
2083 neutral operands. */
2084 matches[lane] = false;
2085 if (lane != group_size - 1)
2086 matches[0] = false;
2087 break;
2089 chains.quick_push (chain.copy ());
2090 chain.truncate (0);
2092 if (chains.length () == group_size)
2094 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2095 if (!op_stmt_info)
2097 hard_fail = false;
2098 goto out;
2100 /* Now we have a set of chains with the same length. */
2101 /* 1. pre-sort according to def_type and operation. */
2102 for (unsigned lane = 0; lane < group_size; ++lane)
2103 chains[lane].stablesort (dt_sort_cmp, vinfo);
2104 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_NOTE, vect_location,
2107 "pre-sorted chains of %s\n",
2108 get_tree_code_name (code));
2109 for (unsigned lane = 0; lane < group_size; ++lane)
2111 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2112 dump_printf (MSG_NOTE, "%s %T ",
2113 get_tree_code_name (chains[lane][opnum].code),
2114 chains[lane][opnum].op);
2115 dump_printf (MSG_NOTE, "\n");
2118 /* 2. try to build children nodes, associating as necessary. */
2119 for (unsigned n = 0; n < chain_len; ++n)
2121 vect_def_type dt = chains[0][n].dt;
2122 unsigned lane;
2123 for (lane = 0; lane < group_size; ++lane)
2124 if (chains[lane][n].dt != dt)
2126 if (dt == vect_constant_def
2127 && chains[lane][n].dt == vect_external_def)
2128 dt = vect_external_def;
2129 else if (dt == vect_external_def
2130 && chains[lane][n].dt == vect_constant_def)
2132 else
2133 break;
2135 if (lane != group_size)
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "giving up on chain due to mismatched "
2140 "def types\n");
2141 matches[lane] = false;
2142 if (lane != group_size - 1)
2143 matches[0] = false;
2144 goto out;
2146 if (dt == vect_constant_def
2147 || dt == vect_external_def)
2149 /* Check whether we can build the invariant. If we can't
2150 we never will be able to. */
2151 tree type = TREE_TYPE (chains[0][n].op);
2152 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2153 && (TREE_CODE (type) == BOOLEAN_TYPE
2154 || !can_duplicate_and_interleave_p (vinfo, group_size,
2155 type)))
2157 matches[0] = false;
2158 goto out;
2160 vec<tree> ops;
2161 ops.create (group_size);
2162 for (lane = 0; lane < group_size; ++lane)
2163 ops.quick_push (chains[lane][n].op);
2164 slp_tree child = vect_create_new_slp_node (ops);
2165 SLP_TREE_DEF_TYPE (child) = dt;
2166 children.safe_push (child);
2168 else if (dt != vect_internal_def)
2170 /* Not sure, we might need something special.
2171 gcc.dg/vect/pr96854.c,
2172 gfortran.dg/vect/fast-math-pr37021.f90
2173 and gfortran.dg/vect/pr61171.f trigger. */
2174 /* Soft-fail for now. */
2175 hard_fail = false;
2176 goto out;
2178 else
2180 vec<stmt_vec_info> op_stmts;
2181 op_stmts.create (group_size);
2182 slp_tree child = NULL;
2183 /* Brute-force our way. We have to consider a lane
2184 failing after fixing an earlier fail up in the
2185 SLP discovery recursion. So track the current
2186 permute per lane. */
2187 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2188 memset (perms, 0, sizeof (unsigned) * group_size);
2191 op_stmts.truncate (0);
2192 for (lane = 0; lane < group_size; ++lane)
2193 op_stmts.quick_push
2194 (vinfo->lookup_def (chains[lane][n].op));
2195 child = vect_build_slp_tree (vinfo, op_stmts,
2196 group_size, &this_max_nunits,
2197 matches, limit,
2198 &this_tree_size, bst_map);
2199 /* ??? We're likely getting too many fatal mismatches
2200 here so maybe we want to ignore them (but then we
2201 have no idea which lanes fatally mismatched). */
2202 if (child || !matches[0])
2203 break;
2204 /* Swap another lane we have not yet matched up into
2205 lanes that did not match. If we run out of
2206 permute possibilities for a lane terminate the
2207 search. */
2208 bool term = false;
2209 for (lane = 1; lane < group_size; ++lane)
2210 if (!matches[lane])
2212 if (n + perms[lane] + 1 == chain_len)
2214 term = true;
2215 break;
2217 std::swap (chains[lane][n],
2218 chains[lane][n + perms[lane] + 1]);
2219 perms[lane]++;
2221 if (term)
2222 break;
2224 while (1);
2225 if (!child)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location,
2229 "failed to match up op %d\n", n);
2230 op_stmts.release ();
2231 if (lane != group_size - 1)
2232 matches[0] = false;
2233 else
2234 matches[lane] = false;
2235 goto out;
2237 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "matched up op %d to\n", n);
2241 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2243 children.safe_push (child);
2246 /* 3. build SLP nodes to combine the chain. */
2247 for (unsigned lane = 0; lane < group_size; ++lane)
2248 if (chains[lane][0].code != code)
2250 /* See if there's any alternate all-PLUS entry. */
2251 unsigned n;
2252 for (n = 1; n < chain_len; ++n)
2254 for (lane = 0; lane < group_size; ++lane)
2255 if (chains[lane][n].code != code)
2256 break;
2257 if (lane == group_size)
2258 break;
2260 if (n != chain_len)
2262 /* Swap that in at first position. */
2263 std::swap (children[0], children[n]);
2264 for (lane = 0; lane < group_size; ++lane)
2265 std::swap (chains[lane][0], chains[lane][n]);
2267 else
2269 /* ??? When this triggers and we end up with two
2270 vect_constant/external_def up-front things break (ICE)
2271 spectacularly finding an insertion place for the
2272 all-constant op. We should have a fully
2273 vect_internal_def operand though(?) so we can swap
2274 that into first place and then prepend the all-zero
2275 constant. */
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_NOTE, vect_location,
2278 "inserting constant zero to compensate "
2279 "for (partially) negated first "
2280 "operand\n");
2281 chain_len++;
2282 for (lane = 0; lane < group_size; ++lane)
2283 chains[lane].safe_insert
2284 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2285 vec<tree> zero_ops;
2286 zero_ops.create (group_size);
2287 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2288 for (lane = 1; lane < group_size; ++lane)
2289 zero_ops.quick_push (zero_ops[0]);
2290 slp_tree zero = vect_create_new_slp_node (zero_ops);
2291 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2292 children.safe_insert (0, zero);
2294 break;
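/* Combine the children pairwise, left to right, into a chain of
   two-operand SLP nodes; the last one becomes NODE itself. */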
2296 for (unsigned i = 1; i < children.length (); ++i)
2298 slp_tree op0 = children[i - 1];
2299 slp_tree op1 = children[i];
2300 bool this_two_op = false;
2301 for (unsigned lane = 0; lane < group_size; ++lane)
2302 if (chains[lane][i].code != chains[0][i].code)
2304 this_two_op = true;
2305 break;
2307 slp_tree child;
2308 if (i == children.length () - 1)
2309 child = vect_create_new_slp_node (node, stmts, 2);
2310 else
2311 child = vect_create_new_slp_node (2, ERROR_MARK);
2312 if (this_two_op)
2314 vec<std::pair<unsigned, unsigned> > lperm;
2315 lperm.create (group_size);
2316 for (unsigned lane = 0; lane < group_size; ++lane)
2317 lperm.quick_push (std::make_pair
2318 (chains[lane][i].code != chains[0][i].code, lane));
2319 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2320 (chains[0][i].code == code
2321 ? op_stmt_info
2322 : other_op_stmt_info),
2323 (chains[0][i].code == code
2324 ? other_op_stmt_info
2325 : op_stmt_info),
2326 lperm);
2328 else
2330 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2331 SLP_TREE_VECTYPE (child) = vectype;
2332 SLP_TREE_LANES (child) = group_size;
2333 SLP_TREE_CHILDREN (child).quick_push (op0);
2334 SLP_TREE_CHILDREN (child).quick_push (op1);
2335 SLP_TREE_REPRESENTATIVE (child)
2336 = (chains[0][i].code == code
2337 ? op_stmt_info : other_op_stmt_info);
2339 children[i] = child;
2341 *tree_size += this_tree_size + 1;
2342 *max_nunits = this_max_nunits;
2343 while (!chains.is_empty ())
2344 chains.pop ().release ();
2345 return node;
2347 out:
2348 while (!children.is_empty ())
2349 vect_free_slp_tree (children.pop ());
2350 while (!chains.is_empty ())
2351 chains.pop ().release ();
2352 /* Hard-fail, otherwise we might run into quadratic processing of the
2353 chains starting one stmt into the chain again. */
2354 if (hard_fail)
2355 return NULL;
2356 /* Fall thru to normal processing. */
2359 /* Get at the operands, verifying they are compatible. */
2360 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2361 slp_oprnd_info oprnd_info;
2362 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2364 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2365 stmts, i, &oprnds_info);
2366 if (res != 0)
2367 matches[(res == -1) ? 0 : i] = false;
2368 if (!matches[0])
2369 break;
2371 for (i = 0; i < group_size; ++i)
2372 if (!matches[i])
2374 vect_free_oprnd_info (oprnds_info);
2375 return NULL;
2377 swap = NULL;
2379 auto_vec<slp_tree, 4> children;
2381 stmt_info = stmts[0];
2383 /* Create SLP_TREE nodes for the definition node/s. */
2384 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2386 slp_tree child = nullptr;
2387 unsigned int j;
2389 /* We're skipping certain operands from processing, for example
2390 outer loop reduction initial defs. */
2391 if (skip_args[i])
2393 children.safe_push (NULL);
2394 continue;
2397 if (oprnd_info->first_dt == vect_uninitialized_def)
2399 /* COND_EXPRs may have one operand too many when the condition
2400 is an SSA name. */
2401 gcc_assert (i == 3 && nops == 4);
2402 continue;
2405 if (is_a <bb_vec_info> (vinfo)
2406 && oprnd_info->first_dt == vect_internal_def
2407 && !oprnd_info->any_pattern)
2409 /* For BB vectorization, if all defs are the same do not
2410 bother to continue the build along the single-lane
2411 graph but use a splat of the scalar value. */
2412 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2413 for (j = 1; j < group_size; ++j)
2414 if (oprnd_info->def_stmts[j] != first_def)
2415 break;
2416 if (j == group_size
2417 /* But avoid doing this for loads where we may be
2418 able to CSE things, unless the stmt is not
2419 vectorizable. */
2420 && (!STMT_VINFO_VECTORIZABLE (first_def)
2421 || !gimple_vuse (first_def->stmt)))
2423 if (dump_enabled_p ())
2424 dump_printf_loc (MSG_NOTE, vect_location,
2425 "Using a splat of the uniform operand %G",
2426 first_def->stmt);
2427 oprnd_info->first_dt = vect_external_def;
2431 if (oprnd_info->first_dt == vect_external_def
2432 || oprnd_info->first_dt == vect_constant_def)
2434 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2436 tree op0;
2437 tree uniform_val = op0 = oprnd_info->ops[0];
2438 for (j = 1; j < oprnd_info->ops.length (); ++j)
2439 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2441 uniform_val = NULL_TREE;
2442 break;
2444 if (!uniform_val
2445 && !can_duplicate_and_interleave_p (vinfo,
2446 oprnd_info->ops.length (),
2447 TREE_TYPE (op0)))
2449 matches[j] = false;
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "Build SLP failed: invalid type of def "
2453 "for variable-length SLP %T\n", op0);
2454 goto fail;
2457 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2458 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2459 oprnd_info->ops = vNULL;
2460 children.safe_push (invnode);
2461 continue;
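/* Otherwise recurse to build an SLP subtree for the operand's
   definition statements. */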
2464 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2465 group_size, &this_max_nunits,
2466 matches, limit,
2467 &this_tree_size, bst_map)) != NULL)
2469 oprnd_info->def_stmts = vNULL;
2470 children.safe_push (child);
2471 continue;
2474 /* If the SLP build for operand zero failed and operand zero
2475 and one can be commuted try that for the scalar stmts
2476 that failed the match. */
2477 if (i == 0
2478 /* A first scalar stmt mismatch signals a fatal mismatch. */
2479 && matches[0]
2480 /* ??? For COND_EXPRs we can swap the comparison operands
2481 as well as the arms under some constraints. */
2482 && nops == 2
2483 && oprnds_info[1]->first_dt == vect_internal_def
2484 && is_gimple_assign (stmt_info->stmt)
2485 /* Swapping operands for reductions breaks assumptions later on. */
2486 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2487 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2489 /* See whether we can swap the matching or the non-matching
2490 stmt operands. */
2491 bool swap_not_matching = true;
2494 for (j = 0; j < group_size; ++j)
2496 if (matches[j] != !swap_not_matching)
2497 continue;
2498 stmt_vec_info stmt_info = stmts[j];
2499 /* Verify if we can swap operands of this stmt. */
2500 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2501 if (!stmt
2502 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2504 if (!swap_not_matching)
2505 goto fail;
2506 swap_not_matching = false;
2507 break;
2511 while (j != group_size);
2513 /* Swap mismatched definition stmts. */
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Re-trying with swapped operands of stmts ");
2517 for (j = 0; j < group_size; ++j)
2518 if (matches[j] == !swap_not_matching)
2520 std::swap (oprnds_info[0]->def_stmts[j],
2521 oprnds_info[1]->def_stmts[j]);
2522 std::swap (oprnds_info[0]->ops[j],
2523 oprnds_info[1]->ops[j]);
2524 if (dump_enabled_p ())
2525 dump_printf (MSG_NOTE, "%d ", j);
2527 if (dump_enabled_p ())
2528 dump_printf (MSG_NOTE, "\n");
2529 /* After swapping some operands we lost track whether an
2530 operand has any pattern defs so be conservative here. */
2531 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2532 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2533 /* And try again with scratch 'matches' ... */
2534 bool *tem = XALLOCAVEC (bool, group_size);
2535 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2536 group_size, &this_max_nunits,
2537 tem, limit,
2538 &this_tree_size, bst_map)) != NULL)
2540 oprnd_info->def_stmts = vNULL;
2541 children.safe_push (child);
2542 continue;
2545 fail:
2547 /* If the SLP build failed and we analyze a basic-block
2548 simply treat nodes we fail to build as externally defined
2549 (and thus build vectors from the scalar defs).
2550 The cost model will reject outright expensive cases.
2551 ??? This doesn't treat cases where permutation ultimately
2552 fails (or we don't try permutation below). Ideally we'd
2553 even compute a permutation that will end up with the maximum
2554 SLP tree size... */
2555 if (is_a <bb_vec_info> (vinfo)
2556 /* ??? Rejecting patterns this way doesn't work. We'd have to
2557 do extra work to cancel the pattern so the uses see the
2558 scalar version. */
2559 && !is_pattern_stmt_p (stmt_info)
2560 && !oprnd_info->any_pattern)
2562 /* But if there's a leading vector sized set of matching stmts
2563 fail here so we can split the group. This matches the condition
2564 vect_analyze_slp_instance uses. */
2565 /* ??? We might want to split here and combine the results to support
2566 multiple vector sizes better. */
2567 for (j = 0; j < group_size; ++j)
2568 if (!matches[j])
2569 break;
2570 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_NOTE, vect_location,
2574 "Building vector operands from scalars\n");
2575 this_tree_size++;
2576 child = vect_create_new_slp_node (oprnd_info->ops);
2577 children.safe_push (child);
2578 oprnd_info->ops = vNULL;
2579 continue;
2583 gcc_assert (child == NULL);
2584 FOR_EACH_VEC_ELT (children, j, child)
2585 if (child)
2586 vect_free_slp_tree (child);
2587 vect_free_oprnd_info (oprnds_info);
2588 return NULL;
2591 vect_free_oprnd_info (oprnds_info);
2593 /* If all children of this node are built up from uniform scalars, or
2594 more than one child requires a possibly expensive vector construction,
2595 then just throw the node away, causing it to be built up from scalars
2596 instead. The exception is the SLP node for the vector store. */
2597 if (is_a <bb_vec_info> (vinfo)
2598 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2599 /* ??? Rejecting patterns this way doesn't work. We'd have to
2600 do extra work to cancel the pattern so the uses see the
2601 scalar version. */
2602 && !is_pattern_stmt_p (stmt_info))
2604 slp_tree child;
2605 unsigned j;
2606 bool all_uniform_p = true;
2607 unsigned n_vector_builds = 0;
2608 FOR_EACH_VEC_ELT (children, j, child)
2610 if (!child)
2612 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2613 all_uniform_p = false;
2614 else if (!vect_slp_tree_uniform_p (child))
2616 all_uniform_p = false;
2617 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2618 n_vector_builds++;
2621 if (all_uniform_p
2622 || n_vector_builds > 1
2623 || (n_vector_builds == children.length ()
2624 && is_a <gphi *> (stmt_info->stmt)))
2626 /* Roll back. */
2627 matches[0] = false;
2628 FOR_EACH_VEC_ELT (children, j, child)
2629 if (child)
2630 vect_free_slp_tree (child);
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "Building parent vector operands from "
2635 "scalars instead\n");
2636 return NULL;
2640 *tree_size += this_tree_size + 1;
2641 *max_nunits = this_max_nunits;
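/* For a group mixing two operation codes (e.g. { a0+b0, a1-b1, a2+b2, a3-b3 })
   build one node for each operation over the same children and blend their
   lanes with a VEC_PERM_EXPR node. */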
2643 if (two_operators)
2645 /* ??? We'd likely want to either cache in bst_map something like
2646 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2647 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2648 explicit stmts to put in so the keying on 'stmts' doesn't
2649 work (but we have the same issue with nodes that use 'ops'). */
2650 slp_tree one = new _slp_tree;
2651 slp_tree two = new _slp_tree;
2652 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2653 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2654 SLP_TREE_VECTYPE (one) = vectype;
2655 SLP_TREE_VECTYPE (two) = vectype;
2656 SLP_TREE_CHILDREN (one).safe_splice (children);
2657 SLP_TREE_CHILDREN (two).safe_splice (children);
2658 slp_tree child;
2659 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2660 SLP_TREE_REF_COUNT (child)++;
2662 /* Here we record the original defs since this
2663 node represents the final lane configuration. */
2664 node = vect_create_new_slp_node (node, stmts, 2);
2665 SLP_TREE_VECTYPE (node) = vectype;
2666 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2667 SLP_TREE_CHILDREN (node).quick_push (one);
2668 SLP_TREE_CHILDREN (node).quick_push (two);
2669 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2670 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2671 enum tree_code ocode = ERROR_MARK;
2672 stmt_vec_info ostmt_info;
2673 unsigned j = 0;
2674 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2676 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2677 if (gimple_assign_rhs_code (ostmt) != code0)
2679 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2680 ocode = gimple_assign_rhs_code (ostmt);
2681 j = i;
2683 else
2684 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2686 SLP_TREE_CODE (one) = code0;
2687 SLP_TREE_CODE (two) = ocode;
2688 SLP_TREE_LANES (one) = stmts.length ();
2689 SLP_TREE_LANES (two) = stmts.length ();
2690 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2691 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2692 return node;
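/* In the common case build a regular internal SLP node from the group's
   statements and the discovered children. */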
2695 node = vect_create_new_slp_node (node, stmts, nops);
2696 SLP_TREE_VECTYPE (node) = vectype;
2697 SLP_TREE_CHILDREN (node).splice (children);
2698 return node;
2701 /* Dump a single SLP tree NODE. */
2703 static void
2704 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2705 slp_tree node)
2707 unsigned i, j;
2708 slp_tree child;
2709 stmt_vec_info stmt_info;
2710 tree op;
2712 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2713 dump_user_location_t user_loc = loc.get_user_location ();
2714 dump_printf_loc (metadata, user_loc,
2715 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2716 ", refcnt=%u)",
2717 SLP_TREE_DEF_TYPE (node) == vect_external_def
2718 ? " (external)"
2719 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2720 ? " (constant)"
2721 : ""), (void *) node,
2722 estimated_poly_value (node->max_nunits),
2723 SLP_TREE_REF_COUNT (node));
2724 if (SLP_TREE_VECTYPE (node))
2725 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2726 dump_printf (metadata, "\n");
2727 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2729 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2730 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2731 else
2732 dump_printf_loc (metadata, user_loc, "op template: %G",
2733 SLP_TREE_REPRESENTATIVE (node)->stmt);
2735 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2736 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2737 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2738 else
2740 dump_printf_loc (metadata, user_loc, "\t{ ");
2741 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2742 dump_printf (metadata, "%T%s ", op,
2743 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2744 dump_printf (metadata, "}\n");
2746 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2748 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2749 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2750 dump_printf (dump_kind, " %u", j);
2751 dump_printf (dump_kind, " }\n");
2753 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2755 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2756 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2757 dump_printf (dump_kind, " %u[%u]",
2758 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2759 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2760 dump_printf (dump_kind, " }\n");
2762 if (SLP_TREE_CHILDREN (node).is_empty ())
2763 return;
2764 dump_printf_loc (metadata, user_loc, "\tchildren");
2765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2766 dump_printf (dump_kind, " %p", (void *)child);
2767 dump_printf (dump_kind, "\n");
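/* Debug helper to dump a single SLP tree NODE, usable from the debugger. */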
2770 DEBUG_FUNCTION void
2771 debug (slp_tree node)
2773 debug_dump_context ctx;
2774 vect_print_slp_tree (MSG_NOTE,
2775 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2776 node);
2779 /* Recursive helper for the dot producer below. */
2781 static void
2782 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2784 if (visited.add (node))
2785 return;
2787 fprintf (f, "\"%p\" [label=\"", (void *)node);
2788 vect_print_slp_tree (MSG_NOTE,
2789 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2790 node);
2791 fprintf (f, "\"];\n");
2794 for (slp_tree child : SLP_TREE_CHILDREN (node))
2795 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2797 for (slp_tree child : SLP_TREE_CHILDREN (node))
2798 if (child)
2799 dot_slp_tree (f, child, visited);
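/* Write a dot graph of the SLP graph rooted in NODE to the file FNAME. */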
2802 DEBUG_FUNCTION void
2803 dot_slp_tree (const char *fname, slp_tree node)
2805 FILE *f = fopen (fname, "w");
2806 fprintf (f, "digraph {\n");
2807 fflush (f);
2809 debug_dump_context ctx (f);
2810 hash_set<slp_tree> visited;
2811 dot_slp_tree (f, node, visited);
2813 fflush (f);
2814 fprintf (f, "}\n");
2815 fclose (f);
2818 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2820 static void
2821 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2822 slp_tree node, hash_set<slp_tree> &visited)
2824 unsigned i;
2825 slp_tree child;
2827 if (visited.add (node))
2828 return;
2830 vect_print_slp_tree (dump_kind, loc, node);
2832 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2833 if (child)
2834 vect_print_slp_graph (dump_kind, loc, child, visited);
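/* Dump the SLP graph rooted in ENTRY using DUMP_KIND. */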
2837 static void
2838 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2839 slp_tree entry)
2841 hash_set<slp_tree> visited;
2842 vect_print_slp_graph (dump_kind, loc, entry, visited);
2845 /* Mark the tree rooted at NODE with PURE_SLP. */
2847 static void
2848 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2850 int i;
2851 stmt_vec_info stmt_info;
2852 slp_tree child;
2854 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2855 return;
2857 if (visited.add (node))
2858 return;
2860 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2861 STMT_SLP_TYPE (stmt_info) = pure_slp;
2863 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2864 if (child)
2865 vect_mark_slp_stmts (child, visited);
2868 static void
2869 vect_mark_slp_stmts (slp_tree node)
2871 hash_set<slp_tree> visited;
2872 vect_mark_slp_stmts (node, visited);
2875 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2877 static void
2878 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2880 int i;
2881 stmt_vec_info stmt_info;
2882 slp_tree child;
2884 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2885 return;
2887 if (visited.add (node))
2888 return;
2890 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2892 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2893 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2894 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2898 if (child)
2899 vect_mark_slp_stmts_relevant (child, visited);
2902 static void
2903 vect_mark_slp_stmts_relevant (slp_tree node)
2905 hash_set<slp_tree> visited;
2906 vect_mark_slp_stmts_relevant (node, visited);
2910 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2912 static void
2913 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2914 hash_set<slp_tree> &visited)
2916 if (!node || visited.add (node))
2917 return;
2919 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2920 return;
2922 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2924 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2925 if (STMT_VINFO_DATA_REF (stmt_info)
2926 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2927 loads.safe_push (node);
2930 unsigned i;
2931 slp_tree child;
2932 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2933 vect_gather_slp_loads (loads, child, visited);
2937 /* Find the last scalar stmt in NODE. */
2939 stmt_vec_info
2940 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2942 stmt_vec_info last = NULL;
2943 stmt_vec_info stmt_vinfo;
2945 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2947 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2948 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2951 return last;
2954 /* Find the first stmt in NODE. */
2956 stmt_vec_info
2957 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2959 stmt_vec_info first = NULL;
2960 stmt_vec_info stmt_vinfo;
2962 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2964 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2965 if (!first
2966 || get_later_stmt (stmt_vinfo, first) == first)
2967 first = stmt_vinfo;
2970 return first;
2973 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2974 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2975 (also containing the first GROUP1_SIZE stmts, since stores are
2976 consecutive), the second containing the remainder.
2977 Return the first stmt in the second group. */
2979 static stmt_vec_info
2980 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2982 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2983 gcc_assert (group1_size > 0);
2984 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2985 gcc_assert (group2_size > 0);
2986 DR_GROUP_SIZE (first_vinfo) = group1_size;
2988 stmt_vec_info stmt_info = first_vinfo;
2989 for (unsigned i = group1_size; i > 1; i--)
2991 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2992 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2994 /* STMT is now the last element of the first group. */
2995 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2996 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2998 DR_GROUP_SIZE (group2) = group2_size;
2999 for (stmt_info = group2; stmt_info;
3000 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3002 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3003 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3006 /* For the second group, the DR_GROUP_GAP is that before the original group,
3007 plus skipping over the first vector. */
3008 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3010 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3011 DR_GROUP_GAP (first_vinfo) += group2_size;
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3015 group1_size, group2_size);
3017 return group2;
3020 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3021 statements and a vector of NUNITS elements. */
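/* For example, with NUNITS = 4 and GROUP_SIZE = 6 the common multiple is 12
   and the unrolling factor is 2. */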
3023 static poly_uint64
3024 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3026 return exact_div (common_multiple (nunits, group_size), group_size);
3029 /* Helper that checks to see if a node is a load node. */
3031 static inline bool
3032 vect_is_slp_load_node (slp_tree root)
3034 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3035 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3036 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3040 /* Helper function of optimize_load_redistribution that performs the operation
3041 recursively. */
3043 static slp_tree
3044 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3045 vec_info *vinfo, unsigned int group_size,
3046 hash_map<slp_tree, slp_tree> *load_map,
3047 slp_tree root)
3049 if (slp_tree *leader = load_map->get (root))
3050 return *leader;
3052 slp_tree node;
3053 unsigned i;
3055 /* For now, we don't know anything about externals so do not do anything. */
3056 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3057 return NULL;
3058 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3060 /* First convert this node into a load node and add it to the leaves
3061 list, turning the lane permute into a load permute. If it's
3062 unneeded it will be elided later. */
3063 vec<stmt_vec_info> stmts;
3064 stmts.create (SLP_TREE_LANES (root));
3065 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3066 for (unsigned j = 0; j < lane_perm.length (); j++)
3068 std::pair<unsigned, unsigned> perm = lane_perm[j];
3069 node = SLP_TREE_CHILDREN (root)[perm.first];
3071 if (!vect_is_slp_load_node (node)
3072 || SLP_TREE_CHILDREN (node).exists ())
3074 stmts.release ();
3075 goto next;
3078 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3081 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "converting stmts on permute node %p\n",
3084 (void *) root);
3086 bool *matches = XALLOCAVEC (bool, group_size);
3087 poly_uint64 max_nunits = 1;
3088 unsigned tree_size = 0, limit = 1;
3089 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3090 matches, &limit, &tree_size, bst_map);
3091 if (!node)
3092 stmts.release ();
3094 load_map->put (root, node);
3095 return node;
3098 next:
3099 load_map->put (root, NULL);
3101 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3103 slp_tree value
3104 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3105 node);
3106 if (value)
3108 SLP_TREE_REF_COUNT (value)++;
3109 SLP_TREE_CHILDREN (root)[i] = value;
3110 /* ??? We know the original leaves of the replaced nodes will
3111 be referenced by bst_map, only the permutes created by
3112 pattern matching are not. */
3113 if (SLP_TREE_REF_COUNT (node) == 1)
3114 load_map->remove (node);
3115 vect_free_slp_tree (node);
3119 return NULL;
3122 /* Temporary workaround for loads not being CSEd during SLP build. This
3123 function will traverse the SLP tree rooted in ROOT and find
3124 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3125 same DR such that the final operation is equal to a permuted load. Such
3126 NODES are then directly converted into LOADS themselves. The nodes are
3127 CSEd using BST_MAP. */
3129 static void
3130 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3131 vec_info *vinfo, unsigned int group_size,
3132 hash_map<slp_tree, slp_tree> *load_map,
3133 slp_tree root)
3135 slp_tree node;
3136 unsigned i;
3138 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3140 slp_tree value
3141 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3142 node);
3143 if (value)
3145 SLP_TREE_REF_COUNT (value)++;
3146 SLP_TREE_CHILDREN (root)[i] = value;
3147 /* ??? We know the original leaves of the replaced nodes will
3148 be referenced by bst_map, only the permutes created by
3149 pattern matching are not. */
3150 if (SLP_TREE_REF_COUNT (node) == 1)
3151 load_map->remove (node);
3152 vect_free_slp_tree (node);
3157 /* Helper function of vect_match_slp_patterns.
3159 Attempts to match patterns against the slp tree rooted in REF_NODE using
3160 VINFO. Patterns are matched in post-order traversal.
3162 If matching is successful the value in REF_NODE is updated in place
3163 and true is returned, otherwise false is returned. */
3165 static bool
3166 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3167 slp_tree_to_load_perm_map_t *perm_cache,
3168 slp_compat_nodes_map_t *compat_cache,
3169 hash_set<slp_tree> *visited)
3171 unsigned i;
3172 slp_tree node = *ref_node;
3173 bool found_p = false;
3174 if (!node || visited->add (node))
3175 return false;
3177 slp_tree child;
3178 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3179 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3180 vinfo, perm_cache, compat_cache,
3181 visited);
3183 for (unsigned x = 0; x < num__slp_patterns; x++)
3185 vect_pattern *pattern
3186 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3187 if (pattern)
3189 pattern->build (vinfo);
3190 delete pattern;
3191 found_p = true;
3195 return found_p;
3198 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
3199 VINFO. The tree is modified in place.
3201 Return true if any pattern matched; patterns are tried in order and
3202 multiple patterns may match. */
3204 static bool
3205 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3206 hash_set<slp_tree> *visited,
3207 slp_tree_to_load_perm_map_t *perm_cache,
3208 slp_compat_nodes_map_t *compat_cache)
3210 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3211 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3213 if (dump_enabled_p ())
3214 dump_printf_loc (MSG_NOTE, vect_location,
3215 "Analyzing SLP tree %p for patterns\n",
3216 (void *) SLP_INSTANCE_TREE (instance));
3218 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3219 visited);
3222 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3223 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3224 Return true if we could use IFN_STORE_LANES instead and if that appears
3225 to be the better approach. */
3227 static bool
3228 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3229 unsigned int group_size,
3230 unsigned int new_group_size)
3232 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3233 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3234 if (!vectype)
3235 return false;
3236 /* Allow the split if one of the two new groups would operate on full
3237 vectors *within* rather than across one scalar loop iteration.
3238 This is purely a heuristic, but it should work well for group
3239 sizes of 3 and 4, where the possible splits are:
3241 3->2+1: OK if the vector has exactly two elements
3242 4->2+2: Likewise
3243 4->3+1: Less clear-cut. */
3244 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3245 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3246 return false;
3247 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3250 /* Analyze an SLP instance starting from a group of grouped stores. Call
3251 vect_build_slp_tree to build a tree of packed stmts if possible.
3252 Return FALSE if it's impossible to SLP any stmt in the loop. */
3254 static bool
3255 vect_analyze_slp_instance (vec_info *vinfo,
3256 scalar_stmts_to_slp_tree_map_t *bst_map,
3257 stmt_vec_info stmt_info, slp_instance_kind kind,
3258 unsigned max_tree_size, unsigned *limit);
3260 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3261 of KIND. Return true if successful. */
3263 static bool
3264 vect_build_slp_instance (vec_info *vinfo,
3265 slp_instance_kind kind,
3266 vec<stmt_vec_info> &scalar_stmts,
3267 vec<stmt_vec_info> &root_stmt_infos,
3268 vec<tree> &remain,
3269 unsigned max_tree_size, unsigned *limit,
3270 scalar_stmts_to_slp_tree_map_t *bst_map,
3271 /* ??? We need stmt_info for group splitting. */
3272 stmt_vec_info stmt_info_)
3274 if (kind == slp_inst_kind_ctor)
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location,
3278 "Analyzing vectorizable constructor: %G\n",
3279 root_stmt_infos[0]->stmt);
3282 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_NOTE, vect_location,
3285 "Starting SLP discovery for\n");
3286 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 " %G", scalar_stmts[i]->stmt);
3291 /* When a BB reduction doesn't have an even number of lanes
3292 strip it down, treating the remaining lane as scalar.
3293 ??? Selecting the optimal set of lanes to vectorize would be nice
3294 but SLP build for all lanes will fail quickly because we think
3295 we're going to need unrolling. */
3296 if (kind == slp_inst_kind_bb_reduc
3297 && (scalar_stmts.length () & 1))
3298 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3300 /* Build the tree for the SLP instance. */
3301 unsigned int group_size = scalar_stmts.length ();
3302 bool *matches = XALLOCAVEC (bool, group_size);
3303 poly_uint64 max_nunits = 1;
3304 unsigned tree_size = 0;
3305 unsigned i;
3306 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3307 &max_nunits, matches, limit,
3308 &tree_size, bst_map);
3309 if (node != NULL)
3311 /* Calculate the unrolling factor based on the smallest type. */
3312 poly_uint64 unrolling_factor
3313 = calculate_unrolling_factor (max_nunits, group_size);
3315 if (maybe_ne (unrolling_factor, 1U)
3316 && is_a <bb_vec_info> (vinfo))
3318 unsigned HOST_WIDE_INT const_max_nunits;
3319 if (!max_nunits.is_constant (&const_max_nunits)
3320 || const_max_nunits > group_size)
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3324 "Build SLP failed: store group "
3325 "size not a multiple of the vector size "
3326 "in basic block SLP\n");
3327 vect_free_slp_tree (node);
3328 return false;
3330 /* Fatal mismatch. */
3331 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE, vect_location,
3333 "SLP discovery succeeded but node needs "
3334 "splitting\n");
3335 memset (matches, true, group_size);
3336 matches[group_size / const_max_nunits * const_max_nunits] = false;
3337 vect_free_slp_tree (node);
3339 else
3341 /* Create a new SLP instance. */
3342 slp_instance new_instance = XNEW (class _slp_instance);
3343 SLP_INSTANCE_TREE (new_instance) = node;
3344 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3345 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3346 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3347 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3348 SLP_INSTANCE_KIND (new_instance) = kind;
3349 new_instance->reduc_phis = NULL;
3350 new_instance->cost_vec = vNULL;
3351 new_instance->subgraph_entries = vNULL;
3353 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE, vect_location,
3355 "SLP size %u vs. limit %u.\n",
3356 tree_size, max_tree_size);
3358 /* Fixup SLP reduction chains. */
3359 if (kind == slp_inst_kind_reduc_chain)
3361 /* If this is a reduction chain with a conversion in front
3362 amend the SLP tree with a node for that. */
3363 gimple *scalar_def
3364 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3365 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3367 /* Get at the conversion stmt - we know it's the single use
3368 of the last stmt of the reduction chain. */
3369 use_operand_p use_p;
3370 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3371 &use_p, &scalar_def);
3372 gcc_assert (r);
3373 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3374 next_info = vect_stmt_to_vectorize (next_info);
3375 scalar_stmts = vNULL;
3376 scalar_stmts.create (group_size);
3377 for (unsigned i = 0; i < group_size; ++i)
3378 scalar_stmts.quick_push (next_info);
3379 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3380 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3381 SLP_TREE_CHILDREN (conv).quick_push (node);
3382 SLP_INSTANCE_TREE (new_instance) = conv;
3383 /* We also have to fake this conversion stmt as SLP reduction
3384 group so we don't have to mess with too much code
3385 elsewhere. */
3386 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3387 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3389 /* Fill the backedge child of the PHI SLP node. The
3390 general matching code cannot find it because the
3391 scalar code does not reflect how we vectorize the
3392 reduction. */
3393 use_operand_p use_p;
3394 imm_use_iterator imm_iter;
3395 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3396 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3397 gimple_get_lhs (scalar_def))
3398 /* There are exactly two non-debug uses, the reduction
3399 PHI and the loop-closed PHI node. */
3400 if (!is_gimple_debug (USE_STMT (use_p))
3401 && gimple_bb (USE_STMT (use_p)) == loop->header)
3403 auto_vec<stmt_vec_info, 64> phis (group_size);
3404 stmt_vec_info phi_info
3405 = vinfo->lookup_stmt (USE_STMT (use_p));
3406 for (unsigned i = 0; i < group_size; ++i)
3407 phis.quick_push (phi_info);
3408 slp_tree *phi_node = bst_map->get (phis);
3409 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3410 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3411 = SLP_INSTANCE_TREE (new_instance);
3412 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3416 vinfo->slp_instances.safe_push (new_instance);
3418 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3419 the number of scalar stmts in the root in a few places.
3420 Verify that assumption holds. */
3421 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3422 .length () == group_size);
3424 if (dump_enabled_p ())
3426 dump_printf_loc (MSG_NOTE, vect_location,
3427 "Final SLP tree for instance %p:\n",
3428 (void *) new_instance);
3429 vect_print_slp_graph (MSG_NOTE, vect_location,
3430 SLP_INSTANCE_TREE (new_instance));
3433 return true;
3436 else
3438 /* Failed to SLP. */
3439 /* Free the allocated memory. */
3440 scalar_stmts.release ();
3443 stmt_vec_info stmt_info = stmt_info_;
3444 /* Try to break the group up into pieces. */
3445 if (kind == slp_inst_kind_store)
3447 /* ??? We could delay all the actual splitting of store-groups
3448 until after SLP discovery of the original group completed.
3449 Then we can recurse to vect_build_slp_instance directly. */
3450 for (i = 0; i < group_size; i++)
3451 if (!matches[i])
3452 break;
3454 /* For basic block SLP, try to break the group up into multiples of
3455 a vector size. */
3456 if (is_a <bb_vec_info> (vinfo)
3457 && (i > 1 && i < group_size))
3459 tree scalar_type
3460 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3461 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3462 1 << floor_log2 (i));
3463 unsigned HOST_WIDE_INT const_nunits;
3464 if (vectype
3465 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3467 /* Split into two groups at the first vector boundary. */
3468 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3469 unsigned group1_size = i & ~(const_nunits - 1);
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "Splitting SLP group at stmt %u\n", i);
3474 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3475 group1_size);
3476 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3477 kind, max_tree_size,
3478 limit);
3479 /* Split the rest at the failure point and possibly
3480 re-analyze the remaining matching part if it has
3481 at least two lanes. */
3482 if (group1_size < i
3483 && (i + 1 < group_size
3484 || i - group1_size > 1))
3486 stmt_vec_info rest2 = rest;
3487 rest = vect_split_slp_store_group (rest, i - group1_size);
3488 if (i - group1_size > 1)
3489 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3490 kind, max_tree_size,
3491 limit);
3493 /* Re-analyze the non-matching tail if it has at least
3494 two lanes. */
3495 if (i + 1 < group_size)
3496 res |= vect_analyze_slp_instance (vinfo, bst_map,
3497 rest, kind, max_tree_size,
3498 limit);
3499 return res;
3503 /* For loop vectorization split into arbitrary pieces of size > 1. */
3504 if (is_a <loop_vec_info> (vinfo)
3505 && (i > 1 && i < group_size)
3506 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3508 unsigned group1_size = i;
3510 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_NOTE, vect_location,
3512 "Splitting SLP group at stmt %u\n", i);
3514 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3515 group1_size);
3516 /* Loop vectorization cannot handle gaps in stores, make sure
3517 the split group appears as strided. */
3518 STMT_VINFO_STRIDED_P (rest) = 1;
3519 DR_GROUP_GAP (rest) = 0;
3520 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3521 DR_GROUP_GAP (stmt_info) = 0;
3523 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3524 kind, max_tree_size, limit);
3525 if (i + 1 < group_size)
3526 res |= vect_analyze_slp_instance (vinfo, bst_map,
3527 rest, kind, max_tree_size, limit);
3529 return res;
3532 /* Even though the first vector did not all match, we might be able to SLP
3533 (some) of the remainder. FORNOW ignore this possibility. */
3536 /* Failed to SLP. */
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3539 return false;
3543 /* Analyze an SLP instance starting from a group of grouped stores. Call
3544 vect_build_slp_tree to build a tree of packed stmts if possible.
3545 Return FALSE if it's impossible to SLP any stmt in the loop. */
3547 static bool
3548 vect_analyze_slp_instance (vec_info *vinfo,
3549 scalar_stmts_to_slp_tree_map_t *bst_map,
3550 stmt_vec_info stmt_info,
3551 slp_instance_kind kind,
3552 unsigned max_tree_size, unsigned *limit)
3554 unsigned int i;
3555 vec<stmt_vec_info> scalar_stmts;
3557 if (is_a <bb_vec_info> (vinfo))
3558 vect_location = stmt_info->stmt;
3560 stmt_vec_info next_info = stmt_info;
3561 if (kind == slp_inst_kind_store)
3563 /* Collect the stores and store them in scalar_stmts. */
3564 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3565 while (next_info)
3567 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3568 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3571 else if (kind == slp_inst_kind_reduc_chain)
3573 /* Collect the reduction stmts and store them in scalar_stmts. */
3574 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3575 while (next_info)
3577 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3580 /* Mark the first element of the reduction chain as reduction to properly
3581 transform the node. In the reduction analysis phase only the last
3582 element of the chain is marked as reduction. */
3583 STMT_VINFO_DEF_TYPE (stmt_info)
3584 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3585 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3586 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3588 else if (kind == slp_inst_kind_reduc_group)
3590 /* Collect reduction statements. */
3591 const vec<stmt_vec_info> &reductions
3592 = as_a <loop_vec_info> (vinfo)->reductions;
3593 scalar_stmts.create (reductions.length ());
3594 for (i = 0; reductions.iterate (i, &next_info); i++)
3595 if ((STMT_VINFO_RELEVANT_P (next_info)
3596 || STMT_VINFO_LIVE_P (next_info))
3597 /* ??? Make sure we didn't skip a conversion around a reduction
3598 path. In that case we'd have to reverse engineer that conversion
3599 stmt following the chain using reduc_idx and from the PHI
3600 using reduc_def. */
3601 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3602 scalar_stmts.quick_push (next_info);
3603 /* If less than two were relevant/live there's nothing to SLP. */
3604 if (scalar_stmts.length () < 2)
3605 return false;
3607 else
3608 gcc_unreachable ();
3610 vec<stmt_vec_info> roots = vNULL;
3611 vec<tree> remain = vNULL;
3612 /* Build the tree for the SLP instance. */
3613 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3614 roots, remain,
3615 max_tree_size, limit, bst_map,
3616 kind == slp_inst_kind_store
3617 ? stmt_info : NULL);
3619 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3620 where we should do store group splitting. */
3622 return res;
3625 /* Check if there are stmts in the loop that can be vectorized using SLP.  Build SLP
3626 trees of packed scalar stmts if SLP is possible. */
3628 opt_result
3629 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3631 unsigned int i;
3632 stmt_vec_info first_element;
3633 slp_instance instance;
3635 DUMP_VECT_SCOPE ("vect_analyze_slp");
3637 unsigned limit = max_tree_size;
3639 scalar_stmts_to_slp_tree_map_t *bst_map
3640 = new scalar_stmts_to_slp_tree_map_t ();
3642 /* Find SLP sequences starting from groups of grouped stores. */
3643 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3644 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3645 slp_inst_kind_store, max_tree_size, &limit);
3647 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3649 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3651 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3652 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3653 bb_vinfo->roots[i].stmts,
3654 bb_vinfo->roots[i].roots,
3655 bb_vinfo->roots[i].remain,
3656 max_tree_size, &limit, bst_map, NULL))
3658 bb_vinfo->roots[i].stmts = vNULL;
3659 bb_vinfo->roots[i].roots = vNULL;
3660 bb_vinfo->roots[i].remain = vNULL;
3665 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3667 /* Find SLP sequences starting from reduction chains. */
3668 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3669 if (! STMT_VINFO_RELEVANT_P (first_element)
3670 && ! STMT_VINFO_LIVE_P (first_element))
3672 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3673 slp_inst_kind_reduc_chain,
3674 max_tree_size, &limit))
3676 /* Dissolve reduction chain group. */
3677 stmt_vec_info vinfo = first_element;
3678 stmt_vec_info last = NULL;
3679 while (vinfo)
3681 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3682 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3683 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3684 last = vinfo;
3685 vinfo = next;
3687 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3688 /* It can be still vectorized as part of an SLP reduction. */
3689 loop_vinfo->reductions.safe_push (last);
3692 /* Find SLP sequences starting from groups of reductions. */
3693 if (loop_vinfo->reductions.length () > 1)
3694 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3695 slp_inst_kind_reduc_group, max_tree_size,
3696 &limit);
3699 hash_set<slp_tree> visited_patterns;
3700 slp_tree_to_load_perm_map_t perm_cache;
3701 slp_compat_nodes_map_t compat_cache;
3703 /* See if any patterns can be found in the SLP tree. */
3704 bool pattern_found = false;
3705 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3706 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3707 &visited_patterns, &perm_cache,
3708 &compat_cache);
3710 /* If any were found optimize permutations of loads. */
3711 if (pattern_found)
3713 hash_map<slp_tree, slp_tree> load_map;
3714 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3716 slp_tree root = SLP_INSTANCE_TREE (instance);
3717 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3718 &load_map, root);
3724 /* The map keeps a reference on SLP nodes built, release that. */
3725 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3726 it != bst_map->end (); ++it)
3727 if ((*it).second)
3728 vect_free_slp_tree ((*it).second);
3729 delete bst_map;
3731 if (pattern_found && dump_enabled_p ())
3733 dump_printf_loc (MSG_NOTE, vect_location,
3734 "Pattern matched SLP tree\n");
3735 hash_set<slp_tree> visited;
3736 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3737 vect_print_slp_graph (MSG_NOTE, vect_location,
3738 SLP_INSTANCE_TREE (instance), visited);
3741 return opt_result::success ();
3744 /* Estimates the cost of inserting layout changes into the SLP graph.
3745 It can also say that the insertion is impossible. */
3747 struct slpg_layout_cost
3749 slpg_layout_cost () = default;
3750 slpg_layout_cost (sreal, bool);
3752 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3753 bool is_possible () const { return depth != sreal::max (); }
3755 bool operator== (const slpg_layout_cost &) const;
3756 bool operator!= (const slpg_layout_cost &) const;
3758 bool is_better_than (const slpg_layout_cost &, bool) const;
3760 void add_parallel_cost (const slpg_layout_cost &);
3761 void add_serial_cost (const slpg_layout_cost &);
3762 void split (unsigned int);
3764 /* The longest sequence of layout changes needed during any traversal
3765 of the partition dag, weighted by execution frequency.
3767 This is the most important metric when optimizing for speed, since
3768 it helps to ensure that we keep the number of operations on
3769 critical paths to a minimum. */
3770 sreal depth = 0;
3772 /* An estimate of the total number of operations needed. It is weighted by
3773 execution frequency when optimizing for speed but not when optimizing for
3774 size. In order to avoid double-counting, a node with a fanout of N will
3775 distribute 1/N of its total cost to each successor.
3777 This is the most important metric when optimizing for size, since
3778 it helps to keep the total number of operations to a minimum.  */
3779 sreal total = 0;
3782 /* Construct costs for a node with weight WEIGHT. A higher weight
3783 indicates more frequent execution. IS_FOR_SIZE is true if we are
3784 optimizing for size rather than speed. */
3786 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3787 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
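/* Purely illustrative example (not taken from the code): a node with
   weight 4 is given { depth = 4, total = 4 } when optimizing for speed,
   but { depth = 4, total = 1 } when optimizing for size, since for size
   the total counts operations without weighting them by frequency.  */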
3791 bool
3792 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3794 return depth == other.depth && total == other.total;
3797 bool
3798 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3800 return !operator== (other);
3803 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3804 true if we are optimizing for size rather than speed. */
3806 bool
3807 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3808 bool is_for_size) const
3810 if (is_for_size)
3812 if (total != other.total)
3813 return total < other.total;
3814 return depth < other.depth;
3816 else
3818 if (depth != other.depth)
3819 return depth < other.depth;
3820 return total < other.total;
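/* Illustrative example: { depth = 2, total = 5 } is better than
   { depth = 3, total = 2 } when optimizing for speed (lower depth wins)
   but not when optimizing for size (lower total wins).  */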
3824 /* Increase the costs to account for something with cost INPUT_COST
3825 happening in parallel with the current costs. */
3827 void
3828 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3830 depth = std::max (depth, input_cost.depth);
3831 total += input_cost.total;
3834 /* Increase the costs to account for something with cost INPUT_COST
3835 happening in series with the current costs. */
3837 void
3838 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3840 depth += other.depth;
3841 total += other.total;
3844 /* Split the total cost among TIMES successors or predecessors. */
3846 void
3847 slpg_layout_cost::split (unsigned int times)
3849 if (times > 1)
3850 total /= times;
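/* Worked example (for illustration only): combining inputs
   { depth = 2, total = 3 } and { depth = 1, total = 1 } in parallel gives
   { depth = 2, total = 4 }; adding { depth = 1, total = 1 } serially on
   top of that gives { depth = 3, total = 5 }; split (2) then halves the
   total to 2.5 while leaving the depth at 3, so that each of two
   consumers accounts for half of the operations but still sees the full
   critical-path length.  */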
3853 /* Information about one node in the SLP graph, for use during
3854 vect_optimize_slp_pass. */
3856 struct slpg_vertex
3858 slpg_vertex (slp_tree node_) : node (node_) {}
3860 /* The node itself. */
3861 slp_tree node;
3863 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3864 partitions are flexible; they can have whichever layout consumers
3865 want them to have. */
3866 int partition = -1;
3868 /* The number of nodes that directly use the result of this one
3869 (i.e. the number of nodes that count this one as a child). */
3870 unsigned int out_degree = 0;
3872 /* The execution frequency of the node. */
3873 sreal weight = 0;
3875 /* The total execution frequency of all nodes that directly use the
3876 result of this one. */
3877 sreal out_weight = 0;
3880 /* Information about one partition of the SLP graph, for use during
3881 vect_optimize_slp_pass. */
3883 struct slpg_partition_info
3885 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3886 of m_partitioned_nodes. */
3887 unsigned int node_begin = 0;
3888 unsigned int node_end = 0;
3890 /* Which layout we've chosen to use for this partition, or -1 if
3891 we haven't picked one yet. */
3892 int layout = -1;
3894 /* The number of predecessors and successors in the partition dag.
3895 The predecessors always have lower partition numbers and the
3896 successors always have higher partition numbers.
3898 Note that the directions of these edges are not necessarily the
3899 same as in the data flow graph. For example, if an SCC has separate
3900 partitions for an inner loop and an outer loop, the inner loop's
3901 partition will have at least two incoming edges from the outer loop's
3902 partition: one for a live-in value and one for a live-out value.
3903 In data flow terms, one of these edges would also be from the outer loop
3904 to the inner loop, but the other would be in the opposite direction. */
3905 unsigned int in_degree = 0;
3906 unsigned int out_degree = 0;
3909 /* Information about the costs of using a particular layout for a
3910 particular partition. It can also say that the combination is
3911 impossible. */
3913 struct slpg_partition_layout_costs
3915 bool is_possible () const { return internal_cost.is_possible (); }
3916 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3918 /* The costs inherited from predecessor partitions. */
3919 slpg_layout_cost in_cost;
3921 /* The inherent cost of the layout within the node itself. For example,
3922 this is nonzero for a load if choosing a particular layout would require
3923 the load to permute the loaded elements. It is nonzero for a
3924 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3925 to full-vector moves. */
3926 slpg_layout_cost internal_cost;
3928 /* The costs inherited from successor partitions. */
3929 slpg_layout_cost out_cost;
3932 /* This class tries to optimize the layout of vectors in order to avoid
3933 unnecessary shuffling.  At the moment, the set of possible layouts is
3934 restricted to bijective permutations.
3936 The goal of the pass depends on whether we're optimizing for size or
3937 for speed. When optimizing for size, the goal is to reduce the overall
3938 number of layout changes (including layout changes implied by things
3939 like load permutations). When optimizing for speed, the goal is to
3940 reduce the maximum latency attributable to layout changes on any
3941 non-cyclical path through the data flow graph.
3943 For example, when optimizing a loop nest for speed, we will prefer
3944 to make layout changes outside of a loop rather than inside of a loop,
3945 and will prefer to make layout changes in parallel rather than serially,
3946 even if that increases the overall number of layout changes.
3948 The high-level procedure is:
3950 (1) Build a graph in which edges go from uses (parents) to definitions
3951 (children).
3953 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3955 (3) When optimizing for speed, partition the nodes in each SCC based
3956 on their containing cfg loop. When optimizing for size, treat
3957 each SCC as a single partition.
3959 This gives us a dag of partitions. The goal is now to assign a
3960 layout to each partition.
3962 (4) Construct a set of vector layouts that are worth considering.
3963 Record which nodes must keep their current layout.
3965 (5) Perform a forward walk over the partition dag (from loads to stores)
3966 accumulating the "forward" cost of using each layout. When visiting
3967 each partition, assign a tentative choice of layout to the partition
3968 and use that choice when calculating the cost of using a different
3969 layout in successor partitions.
3971 (6) Perform a backward walk over the partition dag (from stores to loads),
3972 accumulating the "backward" cost of using each layout. When visiting
3973 each partition, make a final choice of layout for that partition based
3974 on the accumulated forward costs (from (5)) and backward costs
3975 (from (6)).
3977 (7) Apply the chosen layouts to the SLP graph.
3979 For example, consider the SLP statements:
3981 S1: a_1 = load
3982 loop:
3983 S2: a_2 = PHI<a_1, a_3>
3984 S3: b_1 = load
3985 S4: a_3 = a_2 + b_1
3986 exit:
3987 S5: a_4 = PHI<a_3>
3988 S6: store a_4
3990 S2 and S4 form an SCC and are part of the same loop. Every other
3991 statement is in a singleton SCC. In this example there is a one-to-one
3992 mapping between SCCs and partitions and the partition dag looks like this:
3994           S1     S3
                \     /
3996             S2+S4
                   |
                  S5
                   |
                  S6
4002 S2, S3 and S4 will have a higher execution frequency than the other
4003 statements, so when optimizing for speed, the goal is to avoid any
4004 layout changes:
4006 - within S3
4007 - within S2+S4
4008 - on the S3->S2+S4 edge
4010 For example, if S3 was originally a reversing load, the goal of the
4011 pass is to make it an unreversed load and change the layout on the
4012 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4013 on S1->S2+S4 and S5->S6 would also be acceptable.)
4015 The difference between SCCs and partitions becomes important if we
4016 add an outer loop:
4018 S1: a_1 = ...
4019 loop1:
4020 S2: a_2 = PHI<a_1, a_6>
4021 S3: b_1 = load
4022 S4: a_3 = a_2 + b_1
4023 loop2:
4024 S5: a_4 = PHI<a_3, a_5>
4025 S6: c_1 = load
4026 S7: a_5 = a_4 + c_1
4027 exit2:
4028 S8: a_6 = PHI<a_5>
4029 S9: store a_6
4030 exit1:
4032 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4033 for speed, we usually do not want restrictions in the outer loop to "infect"
4034 the decision for the inner loop. For example, if an outer-loop node
4035 in the SCC contains a statement with a fixed layout, that should not
4036 prevent the inner loop from using a different layout. Conversely,
4037 the inner loop should not dictate a layout to the outer loop: if the
4038 outer loop does a lot of computation, then it may not be efficient to
4039 do all of that computation in the inner loop's preferred layout.
4041 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4042 and S5+S7 (inner). We also try to arrange partitions so that:
4044 - the partition for an outer loop comes before the partition for
4045 an inner loop
4047 - if a sibling loop A dominates a sibling loop B, A's partition
4048 comes before B's
4050 This gives the following partition dag for the example above:
4052           S1          S3
                |           |
4054         S2+S4+S8      S6
4055            |    \\    /
4056            |    S5+S7
                |
               S9
4060 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4061 one for a reversal of the edge S7->S8.
4063 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4064 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4065 preferred layout against the cost of changing the layout on entry to the
4066 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4068 Although this works well when optimizing for speed, it has the downside
4069 when optimizing for size that the choice of layout for S5+S7 is completely
4070 independent of S9, which lessens the chance of reducing the overall number
4071 of permutations. We therefore do not partition SCCs when optimizing
4072 for size.
4074 To give a concrete example of the difference between optimizing
4075 for size and speed, consider:
4077 a[0] = (b[1] << c[3]) - d[1];
4078 a[1] = (b[0] << c[2]) - d[0];
4079 a[2] = (b[3] << c[1]) - d[3];
4080 a[3] = (b[2] << c[0]) - d[2];
4082 There are three different layouts here: one for a, one for b and d,
4083 and one for c. When optimizing for speed it is better to permute each
4084 of b, c and d into the order required by a, since those permutations
4085 happen in parallel. But when optimizing for size, it is better to:
4087 - permute c into the same order as b
4088 - do the arithmetic
4089 - permute the result into the order required by a
4091 This gives 2 permutations rather than 3. */
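/* To put illustrative numbers on the trade-off above (ignoring weights):
   with the speed strategy the three permutations of b, c and d happen in
   parallel, giving a layout cost of roughly { depth = 1, total = 3 },
   whereas with the size strategy the permutation of c and the permutation
   of the result are serial, giving roughly { depth = 2, total = 2 }.
   slpg_layout_cost::is_better_than prefers the former when optimizing for
   speed and the latter when optimizing for size.  */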
4093 class vect_optimize_slp_pass
4095 public:
4096 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4097 void run ();
4099 private:
4100 /* Graph building. */
4101 struct loop *containing_loop (slp_tree);
4102 bool is_cfg_latch_edge (graph_edge *);
4103 void build_vertices (hash_set<slp_tree> &, slp_tree);
4104 void build_vertices ();
4105 void build_graph ();
4107 /* Partitioning. */
4108 void create_partitions ();
4109 template<typename T> void for_each_partition_edge (unsigned int, T);
4111 /* Layout selection. */
4112 bool is_compatible_layout (slp_tree, unsigned int);
4113 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4114 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4115 unsigned int);
4116 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4117 int, unsigned int);
4118 int internal_node_cost (slp_tree, int, unsigned int);
4119 void start_choosing_layouts ();
4121 /* Cost propagation. */
4122 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4123 unsigned int, unsigned int);
4124 slpg_layout_cost total_in_cost (unsigned int);
4125 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4126 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4127 void forward_pass ();
4128 void backward_pass ();
4130 /* Rematerialization. */
4131 slp_tree get_result_with_layout (slp_tree, unsigned int);
4132 void materialize ();
4134 /* Clean-up. */
4135 void remove_redundant_permutations ();
4137 void dump ();
4139 vec_info *m_vinfo;
4141 /* True if we should optimize the graph for size, false if we should
4142 optimize it for speed. (It wouldn't be easy to make this decision
4143 more locally.) */
4144 bool m_optimize_size;
4146 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4147 In other words, a node's predecessors are its slp_tree parents and
4148 a node's successors are its slp_tree children. */
4149 graph *m_slpg = nullptr;
4151 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4152 auto_vec<slpg_vertex> m_vertices;
4154 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4155 and loads. */
4156 auto_vec<int> m_leafs;
4158 /* This array has one entry for every vector layout that we're considering.
4159 Element 0 is null and indicates "no change". Other entries describe
4160 permutations that are inherent in the current graph and that we would
4161 like to reverse if possible.
4163 For example, a permutation { 1, 2, 3, 0 } means that something has
4164 effectively been permuted in that way, such as a load group
4165 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4166 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4167 in order to put things "back" in order. */
4168 auto_vec<vec<unsigned> > m_perms;
4170 /* A partitioning of the nodes for which a layout must be chosen.
4171 Each partition represents an <SCC, cfg loop> pair; that is,
4172 nodes in different SCCs belong to different partitions, and nodes
4173 within an SCC can be further partitioned according to a containing
4174 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4176 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4177 from leaves (such as loads) to roots (such as stores).
4179 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4180 auto_vec<slpg_partition_info> m_partitions;
4182 /* The list of all nodes for which a layout must be chosen. Nodes for
4183 partition P come before the nodes for partition P+1. Nodes within a
4184 partition are in reverse postorder. */
4185 auto_vec<unsigned int> m_partitioned_nodes;
4187 /* Index P * num-layouts + L contains the cost of using layout L
4188 for partition P. */
4189 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4191 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4192 original output of node N adjusted to have layout L. */
4193 auto_vec<slp_tree> m_node_layouts;
4196 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4197 Also record whether we should optimize anything for speed rather
4198 than size. */
4200 void
4201 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4202 slp_tree node)
4204 unsigned i;
4205 slp_tree child;
4207 if (visited.add (node))
4208 return;
4210 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4212 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4213 if (optimize_bb_for_speed_p (bb))
4214 m_optimize_size = false;
4217 node->vertex = m_vertices.length ();
4218 m_vertices.safe_push (slpg_vertex (node));
4220 bool leaf = true;
4221 bool force_leaf = false;
4222 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4223 if (child)
4225 leaf = false;
4226 build_vertices (visited, child);
4228 else
4229 force_leaf = true;
4230 /* Since SLP discovery works along use-def edges, all cycles have an
4231 entry - but there's the exception of cycles where we do not handle
4232 the entry explicitly (but with a NULL SLP node), like some reductions
4233 and inductions. Force those SLP PHIs to act as leafs to make them
4234 backwards reachable. */
4235 if (leaf || force_leaf)
4236 m_leafs.safe_push (node->vertex);
4239 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4241 void
4242 vect_optimize_slp_pass::build_vertices ()
4244 hash_set<slp_tree> visited;
4245 unsigned i;
4246 slp_instance instance;
4247 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4248 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4251 /* Apply (reverse) bijective PERM to VEC. */
4253 template <class T>
4254 static void
4255 vect_slp_permute (vec<unsigned> perm,
4256 vec<T> &vec, bool reverse)
4258 auto_vec<T, 64> saved;
4259 saved.create (vec.length ());
4260 for (unsigned i = 0; i < vec.length (); ++i)
4261 saved.quick_push (vec[i]);
4263 if (reverse)
4265 for (unsigned i = 0; i < vec.length (); ++i)
4266 vec[perm[i]] = saved[i];
4267 for (unsigned i = 0; i < vec.length (); ++i)
4268 gcc_assert (vec[perm[i]] == saved[i]);
4270 else
4272 for (unsigned i = 0; i < vec.length (); ++i)
4273 vec[i] = saved[perm[i]];
4274 for (unsigned i = 0; i < vec.length (); ++i)
4275 gcc_assert (vec[i] == saved[perm[i]]);
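/* Illustrative example: with PERM = { 1, 2, 3, 0 } and
   VEC = { a, b, c, d }, the forward application (REVERSE == false)
   produces { b, c, d, a }, while the reverse application (REVERSE == true)
   produces { d, a, b, c }, i.e. it undoes the forward application.  */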
4279 /* Return the cfg loop that contains NODE. */
4281 struct loop *
4282 vect_optimize_slp_pass::containing_loop (slp_tree node)
4284 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4285 if (!rep)
4286 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4287 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4290 /* Return true if UD (an edge from a use to a definition) is associated
4291 with a loop latch edge in the cfg. */
4293 bool
4294 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4296 slp_tree use = m_vertices[ud->src].node;
4297 slp_tree def = m_vertices[ud->dest].node;
4298 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4299 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4300 return false;
4302 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4303 return (is_a<gphi *> (use_rep->stmt)
4304 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4305 && containing_loop (def) == containing_loop (use));
4308 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4309 a nonnull data field. */
4311 void
4312 vect_optimize_slp_pass::build_graph ()
4314 m_optimize_size = true;
4315 build_vertices ();
4317 m_slpg = new_graph (m_vertices.length ());
4318 for (slpg_vertex &v : m_vertices)
4319 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4320 if (child)
4322 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4323 if (is_cfg_latch_edge (ud))
4324 ud->data = this;
4328 /* Return true if E corresponds to a loop latch edge in the cfg. */
4330 static bool
4331 skip_cfg_latch_edges (graph_edge *e)
4333 return e->data;
4336 /* Create the node partitions. */
4338 void
4339 vect_optimize_slp_pass::create_partitions ()
4341 /* Calculate a postorder of the graph, ignoring edges that correspond
4342 to natural latch edges in the cfg. Reading the vector from the end
4343 to the beginning gives the reverse postorder. */
4344 auto_vec<int> initial_rpo;
4345 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4346 false, NULL, skip_cfg_latch_edges);
4347 gcc_assert (initial_rpo.length () == m_vertices.length ());
4349 /* Calculate the strongly connected components of the graph. */
4350 auto_vec<int> scc_grouping;
4351 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4353 /* Create a new index order in which all nodes from the same SCC are
4354 consecutive. Use scc_pos to record the index of the first node in
4355 each SCC. */
4356 auto_vec<unsigned int> scc_pos (num_sccs);
4357 int last_component = -1;
4358 unsigned int node_count = 0;
4359 for (unsigned int node_i : scc_grouping)
4361 if (last_component != m_slpg->vertices[node_i].component)
4363 last_component = m_slpg->vertices[node_i].component;
4364 gcc_assert (last_component == int (scc_pos.length ()));
4365 scc_pos.quick_push (node_count);
4367 node_count += 1;
4369 gcc_assert (node_count == initial_rpo.length ()
4370 && last_component + 1 == int (num_sccs));
4372 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4373 inside each SCC following the RPO we calculated above. The fact that
4374 we ignored natural latch edges when calculating the RPO should ensure
4375 that, for natural loop nests:
4377 - the first node that we encounter in a cfg loop is the loop header phi
4378 - the loop header phis are in dominance order
4380 Arranging for this is an optimization (see below) rather than a
4381 correctness issue. Unnatural loops with a tangled mess of backedges
4382 will still work correctly, but might give poorer results.
4384 Also update scc_pos so that it gives 1 + the index of the last node
4385 in the SCC. */
4386 m_partitioned_nodes.safe_grow (node_count);
4387 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4389 unsigned int node_i = initial_rpo[old_i];
4390 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4391 m_partitioned_nodes[new_i] = node_i;
4394 /* When optimizing for speed, partition each SCC based on the containing
4395 cfg loop. The order we constructed above should ensure that, for natural
4396 cfg loops, we'll create sub-SCC partitions for outer loops before
4397 the corresponding sub-SCC partitions for inner loops. Similarly,
4398 when one sibling loop A dominates another sibling loop B, we should
4399 create a sub-SCC partition for A before a sub-SCC partition for B.
4401 As above, nothing depends for correctness on whether this achieves
4402 a natural nesting, but we should get better results when it does. */
4403 m_partitions.reserve (m_vertices.length ());
4404 unsigned int next_partition_i = 0;
4405 hash_map<struct loop *, int> loop_partitions;
4406 unsigned int rpo_begin = 0;
4407 unsigned int num_partitioned_nodes = 0;
4408 for (unsigned int rpo_end : scc_pos)
4410 loop_partitions.empty ();
4411 unsigned int partition_i = next_partition_i;
4412 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4414 /* Handle externals and constants optimistically throughout.
4415 But treat existing vectors as fixed since we do not handle
4416 permuting them. */
4417 unsigned int node_i = m_partitioned_nodes[rpo_i];
4418 auto &vertex = m_vertices[node_i];
4419 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4420 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4421 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4422 vertex.partition = -1;
4423 else
4425 bool existed;
4426 if (m_optimize_size)
4427 existed = next_partition_i > partition_i;
4428 else
4430 struct loop *loop = containing_loop (vertex.node);
4431 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4432 if (!existed)
4433 entry = next_partition_i;
4434 partition_i = entry;
4436 if (!existed)
4438 m_partitions.quick_push (slpg_partition_info ());
4439 next_partition_i += 1;
4441 vertex.partition = partition_i;
4442 num_partitioned_nodes += 1;
4443 m_partitions[partition_i].node_end += 1;
4446 rpo_begin = rpo_end;
4449 /* Assign ranges of consecutive node indices to each partition,
4450 in partition order. Start with node_end being the same as
4451 node_begin so that the next loop can use it as a counter. */
4452 unsigned int node_begin = 0;
4453 for (auto &partition : m_partitions)
4455 partition.node_begin = node_begin;
4456 node_begin += partition.node_end;
4457 partition.node_end = partition.node_begin;
4459 gcc_assert (node_begin == num_partitioned_nodes);
4461 /* Finally build the list of nodes in partition order. */
4462 m_partitioned_nodes.truncate (num_partitioned_nodes);
4463 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4465 int partition_i = m_vertices[node_i].partition;
4466 if (partition_i >= 0)
4468 unsigned int order_i = m_partitions[partition_i].node_end++;
4469 m_partitioned_nodes[order_i] = node_i;
4474 /* Look for edges from earlier partitions into node NODE_I and edges from
4475 node NODE_I into later partitions. Call:
4477 FN (ud, other_node_i)
4479 for each such use-to-def edge ud, where other_node_i is the node at the
4480 other end of the edge. */
4482 template<typename T>
4483 void
4484 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4486 int partition_i = m_vertices[node_i].partition;
4487 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4488 pred; pred = pred->pred_next)
4490 int src_partition_i = m_vertices[pred->src].partition;
4491 if (src_partition_i >= 0 && src_partition_i != partition_i)
4492 fn (pred, pred->src);
4494 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4495 succ; succ = succ->succ_next)
4497 int dest_partition_i = m_vertices[succ->dest].partition;
4498 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4499 fn (succ, succ->dest);
4503 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4504 that NODE would operate on. This test is independent of NODE's actual
4505 operation. */
4507 bool
4508 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4509 unsigned int layout_i)
4511 if (layout_i == 0)
4512 return true;
4514 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4515 return false;
4517 return true;
4520 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4521 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4522 layouts is incompatible with NODE or if the change is not possible for
4523 some other reason.
4525 The properties taken from NODE include the number of lanes and the
4526 vector type. The actual operation doesn't matter. */
4528 int
4529 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4530 unsigned int from_layout_i,
4531 unsigned int to_layout_i)
4533 if (!is_compatible_layout (node, from_layout_i)
4534 || !is_compatible_layout (node, to_layout_i))
4535 return -1;
4537 if (from_layout_i == to_layout_i)
4538 return 0;
4540 auto_vec<slp_tree, 1> children (1);
4541 children.quick_push (node);
4542 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4543 if (from_layout_i > 0)
4544 for (unsigned int i : m_perms[from_layout_i])
4545 perm.quick_push ({ 0, i });
4546 else
4547 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4548 perm.quick_push ({ 0, i });
4549 if (to_layout_i > 0)
4550 vect_slp_permute (m_perms[to_layout_i], perm, true);
4551 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4552 children, false);
4553 if (count >= 0)
4554 return MAX (count, 1);
4556 /* ??? In principle we could try changing via layout 0, giving two
4557 layout changes rather than 1. Doing that would require
4558 corresponding support in get_result_with_layout. */
4559 return -1;
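/* A hypothetical sketch of the mechanism above: if m_perms[1] were
   { 1, 0, 3, 2 }, the cost of going from layout 1 to layout 0 on a
   4-lane node would be computed by building the lane permutation
   { {0,1}, {0,0}, {0,3}, {0,2} } over a single child and asking
   vectorizable_slp_permutation_1 how many permutation instructions that
   needs; a supported result is clamped to at least 1, while an
   unsupported one yields -1.  */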
4562 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4564 inline slpg_partition_layout_costs &
4565 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4566 unsigned int layout_i)
4568 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4571 /* Change PERM in one of two ways:
4573 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4574 chosen for child I of NODE.
4576 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4578 In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
4580 void
4581 vect_optimize_slp_pass::
4582 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4583 int in_layout_i, unsigned int out_layout_i)
4585 for (auto &entry : perm)
4587 int this_in_layout_i = in_layout_i;
4588 if (this_in_layout_i < 0)
4590 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4591 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4592 this_in_layout_i = m_partitions[in_partition_i].layout;
4594 if (this_in_layout_i > 0)
4595 entry.second = m_perms[this_in_layout_i][entry.second];
4597 if (out_layout_i > 0)
4598 vect_slp_permute (m_perms[out_layout_i], perm, true);
4601 /* Check whether the target allows NODE to be rearranged so that the node's
4602 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4603 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4605 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4606 NODE can adapt to the layout changes that have (perhaps provisionally)
4607 been chosen for NODE's children, so that no extra permutations are
4608 needed on either the input or the output of NODE.
4610 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4611 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4613 IN_LAYOUT_I has no meaning for other types of node.
4615 Keeping the node as-is is always valid. If the target doesn't appear
4616 to support the node as-is, but might realistically support other layouts,
4617 then layout 0 instead has the cost of a worst-case permutation. On the
4618 one hand, this ensures that every node has at least one valid layout,
4619 avoiding what would otherwise be an awkward special case. On the other,
4620 it still encourages the pass to change an invalid pre-existing layout
4621 choice into a valid one. */
4623 int
4624 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4625 unsigned int out_layout_i)
4627 const int fallback_cost = 1;
4629 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4631 auto_lane_permutation_t tmp_perm;
4632 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4634 /* Check that the child nodes support the chosen layout. Checking
4635 the first child is enough, since any second child would have the
4636 same shape. */
4637 auto first_child = SLP_TREE_CHILDREN (node)[0];
4638 if (in_layout_i > 0
4639 && !is_compatible_layout (first_child, in_layout_i))
4640 return -1;
4642 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4643 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4644 node, tmp_perm,
4645 SLP_TREE_CHILDREN (node),
4646 false);
4647 if (count < 0)
4649 if (in_layout_i == 0 && out_layout_i == 0)
4651 /* Use the fallback cost if the node could in principle support
4652 some nonzero layout for both the inputs and the outputs.
4653 Otherwise assume that the node will be rejected later
4654 and rebuilt from scalars. */
4655 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4656 return fallback_cost;
4657 return 0;
4659 return -1;
4662 /* We currently have no way of telling whether the new layout is cheaper
4663 or more expensive than the old one. But at least in principle,
4664 it should be worth making zero permutations (whole-vector shuffles)
4665 cheaper than real permutations, in case the pass is able to remove
4666 the latter. */
4667 return count == 0 ? 0 : 1;
4670 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4671 if (rep
4672 && STMT_VINFO_DATA_REF (rep)
4673 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4674 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4676 auto_load_permutation_t tmp_perm;
4677 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4678 if (out_layout_i > 0)
4679 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4681 poly_uint64 vf = 1;
4682 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4683 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4684 unsigned int n_perms;
4685 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4686 nullptr, vf, true, false, &n_perms))
4688 auto rep = SLP_TREE_REPRESENTATIVE (node);
4689 if (out_layout_i == 0)
4691 /* Use the fallback cost if the load is an N-to-N permutation.
4692 Otherwise assume that the node will be rejected later
4693 and rebuilt from scalars. */
4694 if (STMT_VINFO_GROUPED_ACCESS (rep)
4695 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4696 == SLP_TREE_LANES (node)))
4697 return fallback_cost;
4698 return 0;
4700 return -1;
4703 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4704 return n_perms == 0 ? 0 : 1;
4707 return 0;
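/* A hypothetical illustration of the load case above: for a grouped load
   whose SLP_TREE_LOAD_PERMUTATION is { 1, 0, 3, 2 } and whose group size
   equals the number of lanes, an output layout whose m_perms entry is also
   { 1, 0, 3, 2 } reverses the load permutation back to { 0, 1, 2, 3 }; if
   vect_transform_slp_perm_load_1 then reports that no permutations are
   needed, that layout costs 0, whereas layout 0 keeps the real permutation
   and costs 1.  */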
4710 /* Decide which element layouts we should consider using. Calculate the
4711 weights associated with inserting layout changes on partition edges.
4712 Also mark partitions that cannot change layout, by setting their
4713 layout to zero. */
4715 void
4716 vect_optimize_slp_pass::start_choosing_layouts ()
4718 /* Used to assign unique permutation indices. */
4719 using perm_hash = unbounded_hashmap_traits<
4720 vec_free_hash_base<int_hash_base<unsigned>>,
4721 int_hash<int, -1, -2>
4723 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4725 /* Layout 0 is "no change". */
4726 m_perms.safe_push (vNULL);
4728 /* Create layouts from existing permutations. */
4729 auto_load_permutation_t tmp_perm;
4730 for (unsigned int node_i : m_partitioned_nodes)
4732 /* Leafs also double as entries to the reverse graph. Allow the
4733 layout of those to be changed. */
4734 auto &vertex = m_vertices[node_i];
4735 auto &partition = m_partitions[vertex.partition];
4736 if (!m_slpg->vertices[node_i].succ)
4737 partition.layout = 0;
4739 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4740 slp_tree node = vertex.node;
4741 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4742 slp_tree child;
4743 unsigned HOST_WIDE_INT imin, imax = 0;
4744 bool any_permute = false;
4745 tmp_perm.truncate (0);
4746 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4748 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4749 unpermuted, record a layout that reverses this permutation.
4751 We would need more work to cope with loads that are internally
4752 permuted and also have inputs (such as masks for
4753 IFN_MASK_LOADs). */
4754 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4755 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4757 partition.layout = -1;
4758 continue;
4760 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4761 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4762 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4764 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4765 && SLP_TREE_CHILDREN (node).length () == 1
4766 && (child = SLP_TREE_CHILDREN (node)[0])
4767 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4768 .is_constant (&imin)))
4770 /* If the child has the same vector size as this node,
4771 reversing the permutation can make the permutation a no-op.
4772 In other cases it can change a true permutation into a
4773 full-vector extract. */
4774 tmp_perm.reserve (SLP_TREE_LANES (node));
4775 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4776 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4778 else
4779 continue;
4781 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4783 unsigned idx = tmp_perm[j];
4784 imin = MIN (imin, idx);
4785 imax = MAX (imax, idx);
4786 if (idx - tmp_perm[0] != j)
4787 any_permute = true;
4789 /* If the span doesn't match we'd disrupt VF computation, avoid
4790 that for now. */
4791 if (imax - imin + 1 != SLP_TREE_LANES (node))
4792 continue;
4793 /* If there's no permute, there's no need to split one out.  In this case
4794 we can consider turning a load into a permuted load, if that
4795 turns out to be cheaper than alternatives. */
4796 if (!any_permute)
4798 partition.layout = -1;
4799 continue;
4802 /* For now only handle true permutes, like
4803 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4804 when permuting constants and invariants, keeping the permute
4805 bijective. */
4806 auto_sbitmap load_index (SLP_TREE_LANES (node));
4807 bitmap_clear (load_index);
4808 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4809 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4810 unsigned j;
4811 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4812 if (!bitmap_bit_p (load_index, j))
4813 break;
4814 if (j != SLP_TREE_LANES (node))
4815 continue;
4817 vec<unsigned> perm = vNULL;
4818 perm.safe_grow (SLP_TREE_LANES (node), true);
4819 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4820 perm[j] = tmp_perm[j] - imin;
4822 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4824 /* Continue to use existing layouts, but don't add any more. */
4825 int *entry = layout_ids.get (perm);
4826 partition.layout = entry ? *entry : 0;
4827 perm.release ();
4829 else
4831 bool existed;
4832 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4833 if (existed)
4834 perm.release ();
4835 else
4837 layout_i = m_perms.length ();
4838 m_perms.safe_push (perm);
4840 partition.layout = layout_i;
4844 /* Initially assume that every layout is possible and has zero cost
4845 in every partition. */
4846 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4847 * m_perms.length ());
4849 /* We have to mark outgoing permutations facing non-associating-reduction
4850 graph entries that are not represented as to be materialized.
4851 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4852 for (slp_instance instance : m_vinfo->slp_instances)
4853 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4855 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4856 m_partitions[m_vertices[node_i].partition].layout = 0;
4858 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4860 stmt_vec_info stmt_info
4861 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4862 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4863 if (needs_fold_left_reduction_p (TREE_TYPE
4864 (gimple_get_lhs (stmt_info->stmt)),
4865 STMT_VINFO_REDUC_CODE (reduc_info)))
4867 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4868 m_partitions[m_vertices[node_i].partition].layout = 0;
4872 /* Check which layouts each node and partition can handle. Calculate the
4873 weights associated with inserting layout changes on edges. */
4874 for (unsigned int node_i : m_partitioned_nodes)
4876 auto &vertex = m_vertices[node_i];
4877 auto &partition = m_partitions[vertex.partition];
4878 slp_tree node = vertex.node;
4880 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4882 vertex.weight = vect_slp_node_weight (node);
4884 /* We do not handle stores with a permutation, so all
4885 incoming permutations must have been materialized.
4887 We also don't handle masked grouped loads, which lack a
4888 permutation vector. In this case the memory locations
4889 form an implicit second input to the loads, on top of the
4890 explicit mask input, and the memory input's layout cannot
4891 be changed.
4893 On the other hand, we do support permuting gather loads and
4894 masked gather loads, where each scalar load is independent
4895 of the others. This can be useful if the address/index input
4896 benefits from permutation. */
4897 if (STMT_VINFO_DATA_REF (rep)
4898 && STMT_VINFO_GROUPED_ACCESS (rep)
4899 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4900 partition.layout = 0;
4902 /* We cannot change the layout of an operation that is
4903 not lane-independent.  Note this is an explicit
4904 negative list since that's much shorter than the respective
4905 positive one but it's critical to keep maintaining it. */
4906 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4907 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4909 case CFN_COMPLEX_ADD_ROT90:
4910 case CFN_COMPLEX_ADD_ROT270:
4911 case CFN_COMPLEX_MUL:
4912 case CFN_COMPLEX_MUL_CONJ:
4913 case CFN_VEC_ADDSUB:
4914 case CFN_VEC_FMADDSUB:
4915 case CFN_VEC_FMSUBADD:
4916 partition.layout = 0;
4917 default:;
4921 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4923 auto &other_vertex = m_vertices[other_node_i];
4925 /* Count the number of edges from earlier partitions and the number
4926 of edges to later partitions. */
4927 if (other_vertex.partition < vertex.partition)
4928 partition.in_degree += 1;
4929 else
4930 partition.out_degree += 1;
4932 /* If the current node uses the result of OTHER_NODE_I, accumulate
4933 the effects of that. */
4934 if (ud->src == int (node_i))
4936 other_vertex.out_weight += vertex.weight;
4937 other_vertex.out_degree += 1;
4940 for_each_partition_edge (node_i, process_edge);
4944 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4945 its current (provisional) choice of layout. The inputs do not necessarily
4946 have the same layout as each other. */
4948 slpg_layout_cost
4949 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4951 auto &vertex = m_vertices[node_i];
4952 slpg_layout_cost cost;
4953 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4955 auto &other_vertex = m_vertices[other_node_i];
4956 if (other_vertex.partition < vertex.partition)
4958 auto &other_partition = m_partitions[other_vertex.partition];
4959 auto &other_costs = partition_layout_costs (other_vertex.partition,
4960 other_partition.layout);
4961 slpg_layout_cost this_cost = other_costs.in_cost;
4962 this_cost.add_serial_cost (other_costs.internal_cost);
4963 this_cost.split (other_partition.out_degree);
4964 cost.add_parallel_cost (this_cost);
4967 for_each_partition_edge (node_i, add_cost);
4968 return cost;
4971 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4972 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4973 slpg_layout_cost::impossible () if the change isn't possible. */
4975 slpg_layout_cost
4976 vect_optimize_slp_pass::
4977 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4978 unsigned int layout2_i)
4980 auto &def_vertex = m_vertices[ud->dest];
4981 auto &use_vertex = m_vertices[ud->src];
4982 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4983 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4984 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4985 use_layout_i);
4986 if (factor < 0)
4987 return slpg_layout_cost::impossible ();
4989 /* We have a choice of putting the layout change at the site of the
4990 definition or at the site of the use. Prefer the former when
4991 optimizing for size or when the execution frequency of the
4992 definition is no greater than the combined execution frequencies of
4993 the uses. When putting the layout change at the site of the definition,
4994 divvy up the cost among all consumers. */
4995 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4997 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4998 cost.split (def_vertex.out_degree);
4999 return cost;
5001 return { use_vertex.weight * factor, m_optimize_size };
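/* For instance (numbers purely illustrative): if the definition has
   weight 1, out_degree 2 and out_weight 20, and the change factor is 1,
   the layout change is placed at the definition (1 <= 20) and its cost is
   split between the two consumers; if instead the definition sat in a hot
   loop with weight 100 and its single use had weight 1, placing the change
   at the use would be cheaper, so the cost returned is based on the use's
   weight.  */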
5004 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5005 partition; FROM_NODE_I could be the definition node or the use node.
5006 The node at the other end of the link wants to use layout TO_LAYOUT_I.
5007 Return the cost of any necessary fix-ups on edge UD, or return
5008 slpg_layout_cost::impossible () if the change isn't possible.
5010 At this point, FROM_NODE_I's partition has chosen the cheapest
5011 layout based on the information available so far, but this choice
5012 is only provisional. */
5014 slpg_layout_cost
5015 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5016 unsigned int to_layout_i)
5018 auto &from_vertex = m_vertices[from_node_i];
5019 unsigned int from_partition_i = from_vertex.partition;
5020 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5021 gcc_assert (from_partition.layout >= 0);
5023 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5024 with its current layout preference. */
5025 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5026 auto edge_cost = edge_layout_cost (ud, from_node_i,
5027 from_partition.layout, to_layout_i);
5028 if (edge_cost.is_possible ())
5030 auto &from_costs = partition_layout_costs (from_partition_i,
5031 from_partition.layout);
5032 cost = from_costs.in_cost;
5033 cost.add_serial_cost (from_costs.internal_cost);
5034 cost.split (from_partition.out_degree);
5035 cost.add_serial_cost (edge_cost);
5038 /* Take the minimum of that cost and the cost that applies if
5039 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5040 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5041 to_layout_i);
5042 if (direct_layout_costs.is_possible ())
5044 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5045 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5046 direct_cost.split (from_partition.out_degree);
5047 if (!cost.is_possible ()
5048 || direct_cost.is_better_than (cost, m_optimize_size))
5049 cost = direct_cost;
5052 return cost;
5055 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5056 partition; TO_NODE_I could be the definition node or the use node.
5057 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5058 return the cost of any necessary fix-ups on edge UD, or
5059 slpg_layout_cost::impossible () if the choice cannot be made.
5061 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5063 slpg_layout_cost
5064 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5065 unsigned int from_layout_i)
5067 auto &to_vertex = m_vertices[to_node_i];
5068 unsigned int to_partition_i = to_vertex.partition;
5069 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5070 gcc_assert (to_partition.layout >= 0);
5072 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5073 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5074 any other inputs keep their current choice of layout. */
5075 auto &to_costs = partition_layout_costs (to_partition_i,
5076 to_partition.layout);
5077 if (ud->src == int (to_node_i)
5078 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5080 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5081 auto old_layout = from_partition.layout;
5082 from_partition.layout = from_layout_i;
5083 int factor = internal_node_cost (to_vertex.node, -1,
5084 to_partition.layout);
5085 from_partition.layout = old_layout;
5086 if (factor >= 0)
5088 slpg_layout_cost cost = to_costs.out_cost;
5089 cost.add_serial_cost ({ to_vertex.weight * factor,
5090 m_optimize_size });
5091 cost.split (to_partition.in_degree);
5092 return cost;
5096 /* Compute the cost if we insert any necessary layout change on edge UD. */
5097 auto edge_cost = edge_layout_cost (ud, to_node_i,
5098 to_partition.layout, from_layout_i);
5099 if (edge_cost.is_possible ())
5101 slpg_layout_cost cost = to_costs.out_cost;
5102 cost.add_serial_cost (to_costs.internal_cost);
5103 cost.split (to_partition.in_degree);
5104 cost.add_serial_cost (edge_cost);
5105 return cost;
5108 return slpg_layout_cost::impossible ();
5111 /* Make a forward pass through the partitions, accumulating input costs.
5112 Make a tentative (provisional) choice of layout for each partition,
5113 ensuring that this choice still allows later partitions to keep
5114 their original layout. */
5116 void
5117 vect_optimize_slp_pass::forward_pass ()
5119 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5120 ++partition_i)
5122 auto &partition = m_partitions[partition_i];
5124 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5125 the incoming cost that would apply if every predecessor partition
5126 keeps its current layout. This is used within the loop below. */
5127 slpg_layout_cost in_cost;
5128 slp_tree single_node = nullptr;
5129 if (partition.node_end == partition.node_begin + 1)
5131 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5132 single_node = m_vertices[node_i].node;
5133 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5134 in_cost = total_in_cost (node_i);
5137 /* Go through the possible layouts. Decide which ones are valid
5138 for this partition and record which of the valid layouts has
5139 the lowest cost. */
5140 unsigned int min_layout_i = 0;
5141 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5142 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5144 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5145 if (!layout_costs.is_possible ())
5146 continue;
5148 /* If the recorded layout is already 0 then the layout cannot
5149 change. */
5150 if (partition.layout == 0 && layout_i != 0)
5152 layout_costs.mark_impossible ();
5153 continue;
5156 bool is_possible = true;
5157 for (unsigned int order_i = partition.node_begin;
5158 order_i < partition.node_end; ++order_i)
5160 unsigned int node_i = m_partitioned_nodes[order_i];
5161 auto &vertex = m_vertices[node_i];
5163 /* Reject the layout if it is individually incompatible
5164 with any node in the partition. */
5165 if (!is_compatible_layout (vertex.node, layout_i))
5167 is_possible = false;
5168 break;
5171 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5173 auto &other_vertex = m_vertices[other_node_i];
5174 if (other_vertex.partition < vertex.partition)
5176 /* Accumulate the incoming costs from earlier
5177 partitions, plus the cost of any layout changes
5178 on UD itself. */
5179 auto cost = forward_cost (ud, other_node_i, layout_i);
5180 if (!cost.is_possible ())
5181 is_possible = false;
5182 else
5183 layout_costs.in_cost.add_parallel_cost (cost);
5185 else
5186 /* Reject the layout if it would make layout 0 impossible
5187 for later partitions. This amounts to testing that the
5188 target supports reversing the layout change on edges
5189 to later partitions.
5191 In principle, it might be possible to push a layout
5192 change all the way down a graph, so that it never
5193 needs to be reversed and so that the target doesn't
5194 need to support the reverse operation. But it would
5195 be awkward to bail out if we hit a partition that
5196 does not support the new layout, especially since
5197 we are not dealing with a lattice. */
5198 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5199 layout_i).is_possible ();
5201 for_each_partition_edge (node_i, add_cost);
5203 /* Accumulate the cost of using LAYOUT_I within NODE,
5204 both for the inputs and the outputs. */
5205 int factor = internal_node_cost (vertex.node, layout_i,
5206 layout_i);
5207 if (factor < 0)
5209 is_possible = false;
5210 break;
5212 else if (factor)
5213 layout_costs.internal_cost.add_serial_cost
5214 ({ vertex.weight * factor, m_optimize_size });
5216 if (!is_possible)
5218 layout_costs.mark_impossible ();
5219 continue;
5222 /* Combine the incoming and partition-internal costs. */
5223 slpg_layout_cost combined_cost = layout_costs.in_cost;
5224 combined_cost.add_serial_cost (layout_costs.internal_cost);
5226 /* If this partition consists of a single VEC_PERM_EXPR, see
5227 if the VEC_PERM_EXPR can be changed to support output layout
5228 LAYOUT_I while keeping all the provisional choices of input
5229 layout. */
5230 if (single_node
5231 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5233 int factor = internal_node_cost (single_node, -1, layout_i);
5234 if (factor >= 0)
5236 auto weight = m_vertices[single_node->vertex].weight;
5237 slpg_layout_cost internal_cost
5238 = { weight * factor, m_optimize_size };
5240 slpg_layout_cost alt_cost = in_cost;
5241 alt_cost.add_serial_cost (internal_cost);
5242 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5244 combined_cost = alt_cost;
5245 layout_costs.in_cost = in_cost;
5246 layout_costs.internal_cost = internal_cost;
5251 /* Record the layout with the lowest cost. Prefer layout 0 in
5252 the event of a tie between it and another layout. */
5253 if (!min_layout_cost.is_possible ()
5254 || combined_cost.is_better_than (min_layout_cost,
5255 m_optimize_size))
5257 min_layout_i = layout_i;
5258 min_layout_cost = combined_cost;
5262 /* This loop's handling of earlier partitions should ensure that
5263 choosing the original layout for the current partition is no
5264 less valid than it was in the original graph, even with the
5265 provisional layout choices for those earlier partitions. */
5266 gcc_assert (min_layout_cost.is_possible ());
5267 partition.layout = min_layout_i;
5271 /* Make a backward pass through the partitions, accumulating output costs.
5272 Make a final choice of layout for each partition. */
5274 void
5275 vect_optimize_slp_pass::backward_pass ()
5277 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5279 auto &partition = m_partitions[partition_i];
5281 unsigned int min_layout_i = 0;
5282 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5283 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5285 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5286 if (!layout_costs.is_possible ())
5287 continue;
5289 /* Accumulate the costs from successor partitions. */
5290 bool is_possible = true;
5291 for (unsigned int order_i = partition.node_begin;
5292 order_i < partition.node_end; ++order_i)
5294 unsigned int node_i = m_partitioned_nodes[order_i];
5295 auto &vertex = m_vertices[node_i];
5296 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5298 auto &other_vertex = m_vertices[other_node_i];
5299 auto &other_partition = m_partitions[other_vertex.partition];
5300 if (other_vertex.partition > vertex.partition)
5302 /* Accumulate the incoming costs from later
5303 partitions, plus the cost of any layout changes
5304 on UD itself. */
5305 auto cost = backward_cost (ud, other_node_i, layout_i);
5306 if (!cost.is_possible ())
5307 is_possible = false;
5308 else
5309 layout_costs.out_cost.add_parallel_cost (cost);
5311 else
5312 /* Make sure that earlier partitions can (if necessary
5313 or beneficial) keep the layout that they chose in
5314 the forward pass. This ensures that there is at
5315 least one valid choice of layout. */
5316 is_possible &= edge_layout_cost (ud, other_node_i,
5317 other_partition.layout,
5318 layout_i).is_possible ();
5320 for_each_partition_edge (node_i, add_cost);
5322 if (!is_possible)
5324 layout_costs.mark_impossible ();
5325 continue;
5328 /* Locally combine the costs from the forward and backward passes.
5329 (This combined cost is not passed on, since that would lead
5330 to double counting.) */
5331 slpg_layout_cost combined_cost = layout_costs.in_cost;
5332 combined_cost.add_serial_cost (layout_costs.internal_cost);
5333 combined_cost.add_serial_cost (layout_costs.out_cost);
5335 /* Record the layout with the lowest cost. Prefer layout 0 in
5336 the event of a tie between it and another layout. */
5337 if (!min_layout_cost.is_possible ()
5338 || combined_cost.is_better_than (min_layout_cost,
5339 m_optimize_size))
5341 min_layout_i = layout_i;
5342 min_layout_cost = combined_cost;
5346 gcc_assert (min_layout_cost.is_possible ());
5347 partition.layout = min_layout_i;
5351 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5352 NODE already has the layout that was selected for its partition. */
5354 slp_tree
5355 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5356 unsigned int to_layout_i)
5358 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5359 slp_tree result = m_node_layouts[result_i];
5360 if (result)
5361 return result;
5363 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5364 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5365 /* We can't permute vector defs in place. */
5366 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5368 /* If the vector is uniform or unchanged, there's nothing to do. */
5369 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5370 result = node;
5371 else
5373 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5374 result = vect_create_new_slp_node (scalar_ops);
5375 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5378 else
5380 unsigned int partition_i = m_vertices[node->vertex].partition;
5381 unsigned int from_layout_i = m_partitions[partition_i].layout;
5382 if (from_layout_i == to_layout_i)
5383 return node;
5385 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5386 permutation instead of a serial one. Leave the new permutation
5387 in TMP_PERM on success. */
5388 auto_lane_permutation_t tmp_perm;
5389 unsigned int num_inputs = 1;
5390 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5392 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5393 if (from_layout_i != 0)
5394 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5395 if (to_layout_i != 0)
5396 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5397 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5398 tmp_perm,
5399 SLP_TREE_CHILDREN (node),
5400 false) >= 0)
5401 num_inputs = SLP_TREE_CHILDREN (node).length ();
5402 else
5403 tmp_perm.truncate (0);
5406 if (dump_enabled_p ())
5408 if (tmp_perm.length () > 0)
5409 dump_printf_loc (MSG_NOTE, vect_location,
5410 "duplicating permutation node %p with"
5411 " layout %d\n",
5412 (void *) node, to_layout_i);
5413 else
5414 dump_printf_loc (MSG_NOTE, vect_location,
5415 "inserting permutation node in place of %p\n",
5416 (void *) node);
5419 unsigned int num_lanes = SLP_TREE_LANES (node);
5420 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5421 if (SLP_TREE_SCALAR_STMTS (node).length ())
5423 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5424 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5425 if (from_layout_i != 0)
5426 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5427 if (to_layout_i != 0)
5428 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5430 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5431 SLP_TREE_LANES (result) = num_lanes;
5432 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5433 result->vertex = -1;
5435 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5436 if (tmp_perm.length ())
5438 lane_perm.safe_splice (tmp_perm);
5439 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5441 else
5443 lane_perm.create (num_lanes);
5444 for (unsigned j = 0; j < num_lanes; ++j)
5445 lane_perm.quick_push ({ 0, j });
5446 if (from_layout_i != 0)
5447 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5448 if (to_layout_i != 0)
5449 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5450 SLP_TREE_CHILDREN (result).safe_push (node);
5452 for (slp_tree child : SLP_TREE_CHILDREN (result))
5453 child->refcnt++;
5455 m_node_layouts[result_i] = result;
5456 return result;
5459 /* Apply the chosen vector layouts to the SLP graph. */
5461 void
5462 vect_optimize_slp_pass::materialize ()
5464 /* We no longer need the costs, so avoid having two O(N * P) arrays
5465 live at the same time. */
5466 m_partition_layout_costs.release ();
5467 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5469 auto_sbitmap fully_folded (m_vertices.length ());
5470 bitmap_clear (fully_folded);
5471 for (unsigned int node_i : m_partitioned_nodes)
5473 auto &vertex = m_vertices[node_i];
5474 slp_tree node = vertex.node;
5475 int layout_i = m_partitions[vertex.partition].layout;
5476 gcc_assert (layout_i >= 0);
5478 /* Rearrange the scalar statements to match the chosen layout. */
5479 if (layout_i > 0)
5480 vect_slp_permute (m_perms[layout_i],
5481 SLP_TREE_SCALAR_STMTS (node), true);
5483 /* Update load and lane permutations. */
5484 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5486 /* First try to absorb the input vector layouts. If that fails,
5487 force the inputs to have layout LAYOUT_I too. We checked that
5488 that was possible before deciding to use nonzero output layouts.
5489 (Note that at this stage we don't really have any guarantee that
5490 the target supports the original VEC_PERM_EXPR.) */
5491 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5492 auto_lane_permutation_t tmp_perm;
5493 tmp_perm.safe_splice (perm);
5494 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5495 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5496 tmp_perm,
5497 SLP_TREE_CHILDREN (node),
5498 false) >= 0)
5500 if (dump_enabled_p ()
5501 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5502 perm.begin ()))
5503 dump_printf_loc (MSG_NOTE, vect_location,
5504 "absorbing input layouts into %p\n",
5505 (void *) node);
5506 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5507 bitmap_set_bit (fully_folded, node_i);
5509 else
5511 /* Not MSG_MISSED because it would make no sense to users. */
5512 if (dump_enabled_p ())
5513 dump_printf_loc (MSG_NOTE, vect_location,
5514 "failed to absorb input layouts into %p\n",
5515 (void *) node);
5516 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5519 else
5521 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5522 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5523 if (layout_i > 0)
5524 /* ??? When we handle non-bijective permutes the idea
5525 is that we can force the load-permutation to be
5526 { min, min + 1, min + 2, ... max }. But then the
5527 scalar defs might no longer match the lane content
5528 which means wrong-code with live lane vectorization.
5529 So we possibly have to have NULL entries for those. */
5530 vect_slp_permute (m_perms[layout_i], load_perm, true);
5534 /* Do this before any nodes disappear, since it involves a walk
5535 over the leaves. */
5536 remove_redundant_permutations ();
5538 /* Replace each child with a correctly laid-out version. */
5539 for (unsigned int node_i : m_partitioned_nodes)
5541 /* Skip nodes that have already been handled above. */
5542 if (bitmap_bit_p (fully_folded, node_i))
5543 continue;
5545 auto &vertex = m_vertices[node_i];
5546 int in_layout_i = m_partitions[vertex.partition].layout;
5547 gcc_assert (in_layout_i >= 0);
5549 unsigned j;
5550 slp_tree child;
5551 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5553 if (!child)
5554 continue;
5556 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5557 if (new_child != child)
5559 vect_free_slp_tree (child);
5560 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5561 new_child->refcnt += 1;
5567 /* Elide load permutations that are not necessary. Such permutations might
5568 be pre-existing, rather than created by the layout optimizations. */
5570 void
5571 vect_optimize_slp_pass::remove_redundant_permutations ()
5573 for (unsigned int node_i : m_leafs)
5575 slp_tree node = m_vertices[node_i].node;
5576 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5577 continue;
5579 /* In basic block vectorization we allow any subchain of an interleaving
5580 chain.
5581 FORNOW: not in loop SLP because of realignment complications. */
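   /* For instance, if the interleaving chain is a[0], a[1], a[2], a[3] and
      the node loads a[1], a[2] in that order, the loads form a subchain and
      the load permutation can be dropped.  */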
5582 if (is_a <bb_vec_info> (m_vinfo))
5584 bool subchain_p = true;
5585 stmt_vec_info next_load_info = NULL;
5586 stmt_vec_info load_info;
5587 unsigned j;
5588 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5590 if (j != 0
5591 && (next_load_info != load_info
5592 || DR_GROUP_GAP (load_info) != 1))
5594 subchain_p = false;
5595 break;
5597 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5599 if (subchain_p)
5601 SLP_TREE_LOAD_PERMUTATION (node).release ();
5602 continue;
5605 else
5607 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5608 stmt_vec_info load_info;
5609 bool this_load_permuted = false;
5610 unsigned j;
5611 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5612 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5614 this_load_permuted = true;
5615 break;
5617 /* When this isn't a grouped access we know it's a single element
5618 and contiguous. */
5619 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5621 if (!this_load_permuted
5622 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5623 || SLP_TREE_LANES (node) == 1))
5624 SLP_TREE_LOAD_PERMUTATION (node).release ();
5625 continue;
5627 stmt_vec_info first_stmt_info
5628 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5629 if (!this_load_permuted
5630 /* The load requires permutation when unrolling exposes
5631 a gap either because the group is larger than the SLP
5632 group-size or because there is a gap between the groups. */
5633 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5634 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5635 && DR_GROUP_GAP (first_stmt_info) == 0)))
5637 SLP_TREE_LOAD_PERMUTATION (node).release ();
5638 continue;
5644 /* Print the partition graph and layout information to the dump file. */
5646 void
5647 vect_optimize_slp_pass::dump ()
5649 dump_printf_loc (MSG_NOTE, vect_location,
5650 "SLP optimize permutations:\n");
5651 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5653 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5654 const char *sep = "";
5655 for (unsigned int idx : m_perms[layout_i])
5657 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5658 sep = ", ";
5660 dump_printf (MSG_NOTE, " }\n");
5662 dump_printf_loc (MSG_NOTE, vect_location,
5663 "SLP optimize partitions:\n");
5664 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5665 ++partition_i)
5667 auto &partition = m_partitions[partition_i];
5668 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5669 dump_printf_loc (MSG_NOTE, vect_location,
5670 " partition %d (layout %d):\n",
5671 partition_i, partition.layout);
5672 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5673 for (unsigned int order_i = partition.node_begin;
5674 order_i < partition.node_end; ++order_i)
5676 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5677 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5678 (void *) vertex.node);
5679 dump_printf_loc (MSG_NOTE, vect_location,
5680 " weight: %f\n",
5681 vertex.weight.to_double ());
5682 if (vertex.out_degree)
5683 dump_printf_loc (MSG_NOTE, vect_location,
5684 " out weight: %f (degree %d)\n",
5685 vertex.out_weight.to_double (),
5686 vertex.out_degree);
5687 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " op: VEC_PERM_EXPR\n");
5690 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5691 dump_printf_loc (MSG_NOTE, vect_location,
5692 " op template: %G", rep->stmt);
5694 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5695 for (unsigned int order_i = partition.node_begin;
5696 order_i < partition.node_end; ++order_i)
5698 unsigned int node_i = m_partitioned_nodes[order_i];
5699 auto &vertex = m_vertices[node_i];
5700 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5702 auto &other_vertex = m_vertices[other_node_i];
5703 if (other_vertex.partition < vertex.partition)
5704 dump_printf_loc (MSG_NOTE, vect_location,
5705 " - %p [%d] --> %p\n",
5706 (void *) other_vertex.node,
5707 other_vertex.partition,
5708 (void *) vertex.node);
5709 else
5710 dump_printf_loc (MSG_NOTE, vect_location,
5711 " - %p --> [%d] %p\n",
5712 (void *) vertex.node,
5713 other_vertex.partition,
5714 (void *) other_vertex.node);
5716 for_each_partition_edge (node_i, print_edge);
5719 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5721 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5722 if (layout_costs.is_possible ())
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " layout %d:%s\n", layout_i,
5726 partition.layout == int (layout_i)
5727 ? " (*)" : "");
5728 slpg_layout_cost combined_cost = layout_costs.in_cost;
5729 combined_cost.add_serial_cost (layout_costs.internal_cost);
5730 combined_cost.add_serial_cost (layout_costs.out_cost);
5731 #define TEMPLATE "{depth: %f, total: %f}"
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 " " TEMPLATE "\n",
5734 layout_costs.in_cost.depth.to_double (),
5735 layout_costs.in_cost.total.to_double ());
5736 dump_printf_loc (MSG_NOTE, vect_location,
5737 " + " TEMPLATE "\n",
5738 layout_costs.internal_cost.depth.to_double (),
5739 layout_costs.internal_cost.total.to_double ());
5740 dump_printf_loc (MSG_NOTE, vect_location,
5741 " + " TEMPLATE "\n",
5742 layout_costs.out_cost.depth.to_double (),
5743 layout_costs.out_cost.total.to_double ());
5744 dump_printf_loc (MSG_NOTE, vect_location,
5745 " = " TEMPLATE "\n",
5746 combined_cost.depth.to_double (),
5747 combined_cost.total.to_double ());
5748 #undef TEMPLATE
5750 else
5751 dump_printf_loc (MSG_NOTE, vect_location,
5752 " layout %d: rejected\n", layout_i);
5757 /* Main entry point for the SLP graph optimization pass. */
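/* The pass proceeds in phases: build the SLP graph and partition it, collect
   the candidate layouts, and, when more than the identity layout exists,
   choose a layout per partition in a forward and a backward pass and
   materialize the choices; otherwise just remove redundant load
   permutations.  */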
5759 void
5760 vect_optimize_slp_pass::run ()
5762 build_graph ();
5763 create_partitions ();
5764 start_choosing_layouts ();
5765 if (m_perms.length () > 1)
5767 forward_pass ();
5768 backward_pass ();
5769 if (dump_enabled_p ())
5770 dump ();
5771 materialize ();
5772 while (!m_perms.is_empty ())
5773 m_perms.pop ().release ();
5775 else
5776 remove_redundant_permutations ();
5777 free_graph (m_slpg);
5780 /* Optimize the SLP graph of VINFO. */
5782 void
5783 vect_optimize_slp (vec_info *vinfo)
5785 if (vinfo->slp_instances.is_empty ())
5786 return;
5787 vect_optimize_slp_pass (vinfo).run ();
5790 /* Gather loads reachable from the individual SLP graph entries. */
5792 void
5793 vect_gather_slp_loads (vec_info *vinfo)
5795 unsigned i;
5796 slp_instance instance;
5797 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5799 hash_set<slp_tree> visited;
5800 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5801 SLP_INSTANCE_TREE (instance), visited);
5806 /* For each possible SLP instance decide whether to SLP it and calculate the
5807 overall unrolling factor needed to SLP the loop. Return TRUE if we decided to SLP at
5808 least one instance. */
5810 bool
5811 vect_make_slp_decision (loop_vec_info loop_vinfo)
5813 unsigned int i;
5814 poly_uint64 unrolling_factor = 1;
5815 const vec<slp_instance> &slp_instances
5816 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5817 slp_instance instance;
5818 int decided_to_slp = 0;
5820 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5822 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5824 /* FORNOW: SLP if you can. */
5825 /* All unroll factors have the form:
5827 GET_MODE_SIZE (vinfo->vector_mode) * X
5829 for some rational X, so they must have a common multiple. */
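      /* For instance, instances with unrolling factors 2 and 8 combine to an
         overall unrolling factor of 8.  */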
5830 unrolling_factor
5831 = force_common_multiple (unrolling_factor,
5832 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5834 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5835 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5836 loop-based vectorization. Such stmts will be marked as HYBRID. */
5837 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5838 decided_to_slp++;
5841 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5843 if (decided_to_slp && dump_enabled_p ())
5845 dump_printf_loc (MSG_NOTE, vect_location,
5846 "Decided to SLP %d instances. Unrolling factor ",
5847 decided_to_slp);
5848 dump_dec (MSG_NOTE, unrolling_factor);
5849 dump_printf (MSG_NOTE, "\n");
5852 return (decided_to_slp > 0);
5855 /* Private data for vect_detect_hybrid_slp. */
5856 struct vdhs_data
5858 loop_vec_info loop_vinfo;
5859 vec<stmt_vec_info> *worklist;
5862 /* Walker for walk_gimple_op. */
5864 static tree
5865 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5867 walk_stmt_info *wi = (walk_stmt_info *)data;
5868 vdhs_data *dat = (vdhs_data *)wi->info;
5870 if (wi->is_lhs)
5871 return NULL_TREE;
5873 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5874 if (!def_stmt_info)
5875 return NULL_TREE;
5876 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5877 if (PURE_SLP_STMT (def_stmt_info))
5879 if (dump_enabled_p ())
5880 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5881 def_stmt_info->stmt);
5882 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5883 dat->worklist->safe_push (def_stmt_info);
5886 return NULL_TREE;
5889 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5890 if so; otherwise push it to WORKLIST. */
5892 static void
5893 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5894 vec<stmt_vec_info> &worklist,
5895 stmt_vec_info stmt_info)
5897 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location,
5899 "Processing hybrid candidate : %G", stmt_info->stmt);
5900 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5901 imm_use_iterator iter2;
5902 ssa_op_iter iter1;
5903 use_operand_p use_p;
5904 def_operand_p def_p;
5905 bool any_def = false;
5906 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5908 any_def = true;
5909 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5911 if (is_gimple_debug (USE_STMT (use_p)))
5912 continue;
5913 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5914 /* An out-of-loop use means this is a loop_vect sink. */
5915 if (!use_info)
5917 if (dump_enabled_p ())
5918 dump_printf_loc (MSG_NOTE, vect_location,
5919 "Found loop_vect sink: %G", stmt_info->stmt);
5920 worklist.safe_push (stmt_info);
5921 return;
5923 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5925 if (dump_enabled_p ())
5926 dump_printf_loc (MSG_NOTE, vect_location,
5927 "Found loop_vect use: %G", use_info->stmt);
5928 worklist.safe_push (stmt_info);
5929 return;
5933 /* No def means this is a loop_vect sink. */
5934 if (!any_def)
5936 if (dump_enabled_p ())
5937 dump_printf_loc (MSG_NOTE, vect_location,
5938 "Found loop_vect sink: %G", stmt_info->stmt);
5939 worklist.safe_push (stmt_info);
5940 return;
5942 if (dump_enabled_p ())
5943 dump_printf_loc (MSG_NOTE, vect_location,
5944 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5945 STMT_SLP_TYPE (stmt_info) = pure_slp;
5948 /* Find stmts that must be both vectorized and SLPed. */
5950 void
5951 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5953 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5955 /* All stmts participating in SLP are marked pure_slp, all other
5956 stmts are loop_vect.
5957 First collect all loop_vect stmts into a worklist.
5958 SLP patterns cause not all original scalar stmts to appear in
5959 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5960 Rectify this here and do a backward walk over the IL only considering
5961 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5962 mark them as pure_slp. */
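   /* For example, a stmt whose only uses are in pure-SLP stmts is marked
      pure_slp below, whereas a stmt with an out-of-loop or loop_vect use is
      kept loop_vect and later causes the pure-SLP defs it consumes to be
      marked hybrid.  */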
5963 auto_vec<stmt_vec_info> worklist;
5964 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5966 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5967 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5968 gsi_next (&gsi))
5970 gphi *phi = gsi.phi ();
5971 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5972 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5973 maybe_push_to_hybrid_worklist (loop_vinfo,
5974 worklist, stmt_info);
5976 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5977 gsi_prev (&gsi))
5979 gimple *stmt = gsi_stmt (gsi);
5980 if (is_gimple_debug (stmt))
5981 continue;
5982 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5983 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5985 for (gimple_stmt_iterator gsi2
5986 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5987 !gsi_end_p (gsi2); gsi_next (&gsi2))
5989 stmt_vec_info patt_info
5990 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5991 if (!STMT_SLP_TYPE (patt_info)
5992 && STMT_VINFO_RELEVANT (patt_info))
5993 maybe_push_to_hybrid_worklist (loop_vinfo,
5994 worklist, patt_info);
5996 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5998 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5999 maybe_push_to_hybrid_worklist (loop_vinfo,
6000 worklist, stmt_info);
6004 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6005 mark any SLP vectorized stmt as hybrid.
6006 ??? We're visiting def stmts N times (once for each non-SLP and
6007 once for each hybrid-SLP use). */
6008 walk_stmt_info wi;
6009 vdhs_data dat;
6010 dat.worklist = &worklist;
6011 dat.loop_vinfo = loop_vinfo;
6012 memset (&wi, 0, sizeof (wi));
6013 wi.info = (void *)&dat;
6014 while (!worklist.is_empty ())
6016 stmt_vec_info stmt_info = worklist.pop ();
6017 /* Since SSA operands are not set up for pattern stmts we need
6018 to use walk_gimple_op. */
6019 wi.is_lhs = 0;
6020 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6021 /* For gather/scatter make sure to walk the offset operand, which
6022 can be a scaling and conversion away. */
6023 gather_scatter_info gs_info;
6024 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6025 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6027 int dummy;
6028 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6034 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6036 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6037 : vec_info (vec_info::bb, shared),
6038 bbs (_bbs),
6039 roots (vNULL)
6041 for (unsigned i = 0; i < bbs.length (); ++i)
6043 if (i != 0)
6044 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6045 gsi_next (&si))
6047 gphi *phi = si.phi ();
6048 gimple_set_uid (phi, 0);
6049 add_stmt (phi);
6051 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6052 !gsi_end_p (gsi); gsi_next (&gsi))
6054 gimple *stmt = gsi_stmt (gsi);
6055 gimple_set_uid (stmt, 0);
6056 if (is_gimple_debug (stmt))
6057 continue;
6058 add_stmt (stmt);
6064 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6065 stmts in the basic block. */
6067 _bb_vec_info::~_bb_vec_info ()
6069 /* Reset region marker. */
6070 for (unsigned i = 0; i < bbs.length (); ++i)
6072 if (i != 0)
6073 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6074 gsi_next (&si))
6076 gphi *phi = si.phi ();
6077 gimple_set_uid (phi, -1);
6079 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6080 !gsi_end_p (gsi); gsi_next (&gsi))
6082 gimple *stmt = gsi_stmt (gsi);
6083 gimple_set_uid (stmt, -1);
6087 for (unsigned i = 0; i < roots.length (); ++i)
6089 roots[i].stmts.release ();
6090 roots[i].roots.release ();
6091 roots[i].remain.release ();
6093 roots.release ();
6096 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6097 given that child nodes have already been processed, and that
6098 their def types currently match their SLP node's def type. */
6100 static bool
6101 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6102 slp_instance node_instance,
6103 stmt_vector_for_cost *cost_vec)
6105 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6107 /* Calculate the number of vector statements to be created for the
6108 scalar stmts in this node. For SLP reductions it is equal to the
6109 number of vector statements in the children (which has already been
6110 calculated by the recursive call). Otherwise it is the number of
6111 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6112 VF divided by the number of elements in a vector. */
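  /* For instance, with a vectorization factor of 8, 4 lanes and a V4SI
     vector type this yields 8 * 4 / 4 = 8 vector statements.  */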
6113 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6114 && !STMT_VINFO_DATA_REF (stmt_info)
6115 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6117 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6118 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6120 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6121 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6122 break;
6125 else
6127 poly_uint64 vf;
6128 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6129 vf = loop_vinfo->vectorization_factor;
6130 else
6131 vf = 1;
6132 unsigned int group_size = SLP_TREE_LANES (node);
6133 tree vectype = SLP_TREE_VECTYPE (node);
6134 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6135 = vect_get_num_vectors (vf * group_size, vectype);
6138 /* Handle purely internal nodes. */
6139 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6141 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6142 return false;
6144 stmt_vec_info slp_stmt_info;
6145 unsigned int i;
6146 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6148 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6149 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6150 node_instance, i,
6151 false, cost_vec))
6152 return false;
6154 return true;
6157 bool dummy;
6158 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6159 node, node_instance, cost_vec);
6162 /* Try to build NODE from scalars, returning true on success.
6163 NODE_INSTANCE is the SLP instance that contains NODE. */
6165 static bool
6166 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6167 slp_instance node_instance)
6169 stmt_vec_info stmt_info;
6170 unsigned int i;
6172 if (!is_a <bb_vec_info> (vinfo)
6173 || node == SLP_INSTANCE_TREE (node_instance)
6174 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6175 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6176 /* Force the mask use to be built from scalars instead. */
6177 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6178 return false;
6180 if (dump_enabled_p ())
6181 dump_printf_loc (MSG_NOTE, vect_location,
6182 "Building vector operands of %p from scalars instead\n",
6183 (void *) node);
6185 /* Don't remove and free the child nodes here, since they could be
6186 referenced by other structures. The analysis and scheduling phases
6187 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6188 unsigned int group_size = SLP_TREE_LANES (node);
6189 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6190 /* Invariants get their vector type from the uses. */
6191 SLP_TREE_VECTYPE (node) = NULL_TREE;
6192 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6193 SLP_TREE_LOAD_PERMUTATION (node).release ();
6194 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6196 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6197 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6199 return true;
6202 /* Return true if all elements of the slice are the same. */
6203 bool
6204 vect_scalar_ops_slice::all_same_p () const
6206 for (unsigned int i = 1; i < length; ++i)
6207 if (!operand_equal_p (op (0), op (i)))
6208 return false;
6209 return true;
6212 hashval_t
6213 vect_scalar_ops_slice_hash::hash (const value_type &s)
6215 hashval_t hash = 0;
6216 for (unsigned i = 0; i < s.length; ++i)
6217 hash = iterative_hash_expr (s.op (i), hash);
6218 return hash;
6221 bool
6222 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6223 const compare_type &s2)
6225 if (s1.length != s2.length)
6226 return false;
6227 for (unsigned i = 0; i < s1.length; ++i)
6228 if (!operand_equal_p (s1.op (i), s2.op (i)))
6229 return false;
6230 return true;
6233 /* Compute the prologue cost for invariant or constant operands represented
6234 by NODE. */
6236 static void
6237 vect_prologue_cost_for_slp (slp_tree node,
6238 stmt_vector_for_cost *cost_vec)
6240 /* There's a special case of an existing vector, which costs nothing. */
6241 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6242 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6243 return;
6244 /* Without looking at the actual initializer a vector of
6245 constants can be implemented as a load from the constant pool.
6246 When all elements are the same we can use a splat. */
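   /* For instance, a constant node { 1, 2, 3, 4 } is costed below as a
      vector_load from the constant pool, an external node { x, x, x, x }
      as a splat (scalar_to_vec), and a mixed external node { a, b, c, d }
      as a vec_construct.  */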
6247 tree vectype = SLP_TREE_VECTYPE (node);
6248 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6249 unsigned HOST_WIDE_INT const_nunits;
6250 unsigned nelt_limit;
6251 auto ops = &SLP_TREE_SCALAR_OPS (node);
6252 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6253 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6254 && ! multiple_p (const_nunits, group_size))
6256 nelt_limit = const_nunits;
6257 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6258 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6259 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6260 starts.quick_push (i * const_nunits);
6262 else
6264 /* If either the vector has variable length or the vectors
6265 are composed of repeated whole groups we only need to
6266 cost construction once. All vectors will be the same. */
6267 nelt_limit = group_size;
6268 starts.quick_push (0);
6270 /* ??? We're just tracking whether vectors in a single node are the same.
6271 Ideally we'd do something more global. */
6272 bool passed = false;
6273 for (unsigned int start : starts)
6275 vect_cost_for_stmt kind;
6276 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6277 kind = vector_load;
6278 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6279 kind = scalar_to_vec;
6280 else
6281 kind = vec_construct;
6282 /* The target cost hook has no idea which part of the SLP node
6283 we are costing so avoid passing it down more than once. Pass
6284 it to the first vec_construct or scalar_to_vec part since for those
6285 the x86 backend tries to account for GPR to XMM register moves. */
6286 record_stmt_cost (cost_vec, 1, kind,
6287 (kind != vector_load && !passed) ? node : nullptr,
6288 vectype, 0, vect_prologue);
6289 if (kind != vector_load)
6290 passed = true;
6294 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6295 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6297 Return true if the operations are supported. */
6299 static bool
6300 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6301 slp_instance node_instance,
6302 hash_set<slp_tree> &visited_set,
6303 vec<slp_tree> &visited_vec,
6304 stmt_vector_for_cost *cost_vec)
6306 int i, j;
6307 slp_tree child;
6309 /* Assume we can code-generate all invariants. */
6310 if (!node
6311 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6312 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6313 return true;
6315 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6317 if (dump_enabled_p ())
6318 dump_printf_loc (MSG_NOTE, vect_location,
6319 "Failed cyclic SLP reference in %p\n", (void *) node);
6320 return false;
6322 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6324 /* If we already analyzed the exact same set of scalar stmts we're done.
6325 We share the generated vector stmts for those. */
6326 if (visited_set.add (node))
6327 return true;
6328 visited_vec.safe_push (node);
6330 bool res = true;
6331 unsigned visited_rec_start = visited_vec.length ();
6332 unsigned cost_vec_rec_start = cost_vec->length ();
6333 bool seen_non_constant_child = false;
6334 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6336 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6337 visited_set, visited_vec,
6338 cost_vec);
6339 if (!res)
6340 break;
6341 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6342 seen_non_constant_child = true;
6344 /* We're having difficulties scheduling nodes with just constant
6345 operands and no scalar stmts since we then cannot compute a stmt
6346 insertion place. */
6347 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6349 if (dump_enabled_p ())
6350 dump_printf_loc (MSG_NOTE, vect_location,
6351 "Cannot vectorize all-constant op node %p\n",
6352 (void *) node);
6353 res = false;
6356 if (res)
6357 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6358 cost_vec);
6359 /* If analysis failed we have to pop all recursive visited nodes
6360 plus ourselves. */
6361 if (!res)
6363 while (visited_vec.length () >= visited_rec_start)
6364 visited_set.remove (visited_vec.pop ());
6365 cost_vec->truncate (cost_vec_rec_start);
6368 /* When the node can be vectorized, cost the invariant nodes it references.
6369 This is not done in DFS order, to allow the referring node's
6370 vectorizable_* calls to nail down the invariant node's vector type
6371 and possibly unshare it if it needs a different vector type than
6372 other referrers. */
6373 if (res)
6374 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6375 if (child
6376 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6377 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6378 /* Perform the usual caching; note that code generation still
6379 code-gens these nodes multiple times, but we expect
6380 to CSE them later. */
6381 && !visited_set.add (child))
6383 visited_vec.safe_push (child);
6384 /* ??? After auditing more code paths make a "default"
6385 and push the vector type from NODE to all children
6386 if it is not already set. */
6387 /* Compute the number of vectors to be generated. */
6388 tree vector_type = SLP_TREE_VECTYPE (child);
6389 if (!vector_type)
6391 /* For shifts with a scalar argument we don't need
6392 to cost or code-generate anything.
6393 ??? Represent this more explicitly. */
6394 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6395 == shift_vec_info_type)
6396 && j == 1);
6397 continue;
6399 unsigned group_size = SLP_TREE_LANES (child);
6400 poly_uint64 vf = 1;
6401 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6402 vf = loop_vinfo->vectorization_factor;
6403 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6404 = vect_get_num_vectors (vf * group_size, vector_type);
6405 /* And cost them. */
6406 vect_prologue_cost_for_slp (child, cost_vec);
6409 /* If this node or any of its children can't be vectorized, try pruning
6410 the tree here rather than felling the whole thing. */
6411 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6413 /* We'll need to revisit this for invariant costing and number
6414 of vectorized stmt setting. */
6415 res = true;
6418 return res;
6421 /* Given a definition DEF, analyze if it will have any live scalar use after
6422 performing SLP vectorization whose information is represented by BB_VINFO,
6423 and record the result in the hash map SCALAR_USE_MAP as a cache for later
6424 fast checks. If recursion DEPTH exceeds a limit, stop the analysis and make
6425 a conservative assumption. Return 0 if there is no scalar use, 1 if there
6426 is, and -1 if recursion was limited. */
6428 static int
6429 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6430 hash_map<tree, int> &scalar_use_map,
6431 int depth = 0)
6433 const int depth_limit = 2;
6434 imm_use_iterator use_iter;
6435 gimple *use_stmt;
6437 if (int *res = scalar_use_map.get (def))
6438 return *res;
6440 int scalar_use = 1;
6442 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6444 if (is_gimple_debug (use_stmt))
6445 continue;
6447 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6449 if (!use_stmt_info)
6450 break;
6452 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6453 continue;
6455 /* Do not step forward when encountering a PHI statement, since it may
6456 involve a cyclic reference and cause infinite recursion. */
6457 if (gimple_code (use_stmt) == GIMPLE_PHI)
6458 break;
6460 /* When pattern recognition is involved, a statement whose definition is
6461 consumed in some pattern may not be included in the final replacement
6462 pattern statements, so it would be skipped when building the SLP graph.
6464 * Original
6465 char a_c = *(char *) a;
6466 char b_c = *(char *) b;
6467 unsigned short a_s = (unsigned short) a_c;
6468 int a_i = (int) a_s;
6469 int b_i = (int) b_c;
6470 int r_i = a_i - b_i;
6472 * After pattern replacement
6473 a_s = (unsigned short) a_c;
6474 a_i = (int) a_s;
6476 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6477 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6479 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6480 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6482 The definitions of a_i(original statement) and b_i(pattern statement)
6483 are related to, but actually not part of widen_minus pattern.
6484 Vectorizing the pattern does not cause these definition statements to
6485 be marked as PURE_SLP. For this case, we need to recursively check
6486 whether their uses are all absorbed into vectorized code. But there
6487 is an exception that some use may participate in a vectorized
6488 operation via an external SLP node containing that use as an element.
6489 The parameter "scalar_use_map" tags such SSA names as having a scalar
6490 use in advance. */
6491 tree lhs = gimple_get_lhs (use_stmt);
6493 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6494 break;
6496 if (depth_limit && depth >= depth_limit)
6497 return -1;
6499 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6500 depth + 1)))
6501 break;
6504 if (end_imm_use_stmt_p (&use_iter))
6505 scalar_use = 0;
6507 /* If recursion is limited, do not cache result for non-root defs. */
6508 if (!depth || scalar_use >= 0)
6510 bool added = scalar_use_map.put (def, scalar_use);
6511 gcc_assert (!added);
6514 return scalar_use;
6517 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6518 region and that can be vectorized using vectorizable_live_operation
6519 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6520 the scalar code computing them to be retained. */
6522 static void
6523 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6524 slp_instance instance,
6525 stmt_vector_for_cost *cost_vec,
6526 hash_map<tree, int> &scalar_use_map,
6527 hash_set<stmt_vec_info> &svisited,
6528 hash_set<slp_tree> &visited)
6530 if (visited.add (node))
6531 return;
6533 unsigned i;
6534 stmt_vec_info stmt_info;
6535 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6536 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6538 if (svisited.contains (stmt_info))
6539 continue;
6540 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6541 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6542 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6543 /* Only the pattern root stmt computes the original scalar value. */
6544 continue;
6545 bool mark_visited = true;
6546 gimple *orig_stmt = orig_stmt_info->stmt;
6547 ssa_op_iter op_iter;
6548 def_operand_p def_p;
6549 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6551 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6552 scalar_use_map))
6554 STMT_VINFO_LIVE_P (stmt_info) = true;
6555 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6556 instance, i, false, cost_vec))
6557 /* ??? So we know we can vectorize the live stmt from one SLP
6558 node. If we cannot do so from all or none consistently
6559 we'd have to record which SLP node (and lane) we want to
6560 use for the live operation. So make sure we can
6561 code-generate from all nodes. */
6562 mark_visited = false;
6563 else
6564 STMT_VINFO_LIVE_P (stmt_info) = false;
6567 /* We have to verify whether we can insert the lane extract
6568 before all uses. The following is a conservative approximation.
6569 We cannot put this into vectorizable_live_operation because
6570 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6571 doesn't work.
6572 Note that while emitting code for loads at the first load should
6573 make this a non-problem, leafs we construct from scalars are
6574 vectorized after the last scalar def.
6575 ??? If we'd actually compute the insert location during
6576 analysis we could use sth less conservative than the last
6577 scalar stmt in the node for the dominance check. */
6578 /* ??? What remains is "live" uses in vector CTORs in the same
6579 SLP graph which is where those uses can end up code-generated
6580 right after their definition instead of close to their original
6581 use. But that would restrict us to code-generate lane-extracts
6582 from the latest stmt in a node. So we compensate for this
6583 during code-generation, simply not replacing uses for those
6584 hopefully rare cases. */
6585 imm_use_iterator use_iter;
6586 gimple *use_stmt;
6587 stmt_vec_info use_stmt_info;
6589 if (STMT_VINFO_LIVE_P (stmt_info))
6590 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6591 if (!is_gimple_debug (use_stmt)
6592 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6593 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6594 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6598 "Cannot determine insertion place for "
6599 "lane extract\n");
6600 STMT_VINFO_LIVE_P (stmt_info) = false;
6601 mark_visited = true;
6604 if (mark_visited)
6605 svisited.add (stmt_info);
6608 slp_tree child;
6609 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6610 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6611 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6612 scalar_use_map, svisited, visited);
6615 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6616 are live outside of the basic-block vectorized region and that can be
6617 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6619 static void
6620 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6622 if (bb_vinfo->slp_instances.is_empty ())
6623 return;
6625 hash_set<stmt_vec_info> svisited;
6626 hash_set<slp_tree> visited;
6627 hash_map<tree, int> scalar_use_map;
6628 auto_vec<slp_tree> worklist;
6630 for (slp_instance instance : bb_vinfo->slp_instances)
6631 if (!visited.add (SLP_INSTANCE_TREE (instance)))
6632 worklist.safe_push (SLP_INSTANCE_TREE (instance));
6636 slp_tree node = worklist.pop ();
6638 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6640 for (tree op : SLP_TREE_SCALAR_OPS (node))
6641 if (TREE_CODE (op) == SSA_NAME)
6642 scalar_use_map.put (op, 1);
6644 else
6646 for (slp_tree child : SLP_TREE_CHILDREN (node))
6647 if (child && !visited.add (child))
6648 worklist.safe_push (child);
6650 } while (!worklist.is_empty ());
6652 visited.empty ();
6654 for (slp_instance instance : bb_vinfo->slp_instances)
6656 vect_location = instance->location ();
6657 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6658 instance, &instance->cost_vec,
6659 scalar_use_map, svisited, visited);
6663 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6665 static bool
6666 vectorizable_bb_reduc_epilogue (slp_instance instance,
6667 stmt_vector_for_cost *cost_vec)
6669 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6670 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6671 if (reduc_code == MINUS_EXPR)
6672 reduc_code = PLUS_EXPR;
6673 internal_fn reduc_fn;
6674 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6675 if (!vectype
6676 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6677 || reduc_fn == IFN_LAST
6678 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6679 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6680 TREE_TYPE (vectype)))
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "not vectorized: basic block reduction epilogue "
6685 "operation unsupported.\n");
6686 return false;
6689 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6690 cost log2 vector operations plus shuffles and one extraction. */
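   /* For instance, for a V8SI reduction this records counts of 3 vector_stmt,
      3 vec_perm and 1 vec_to_scalar operations in the body cost.  */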
6691 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6692 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6693 vectype, 0, vect_body);
6694 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6695 vectype, 0, vect_body);
6696 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6697 vectype, 0, vect_body);
6699 /* Since we replace all stmts of a possibly longer scalar reduction
6700 chain, account for the extra scalar stmts for that. */
6701 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6702 instance->root_stmts[0], 0, vect_body);
6703 return true;
6706 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6707 and recurse to children. */
6709 static void
6710 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6711 hash_set<slp_tree> &visited)
6713 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6714 || visited.add (node))
6715 return;
6717 stmt_vec_info stmt;
6718 unsigned i;
6719 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6720 roots.remove (vect_orig_stmt (stmt));
6722 slp_tree child;
6723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6724 if (child)
6725 vect_slp_prune_covered_roots (child, roots, visited);
6728 /* Analyze statements in SLP instances of VINFO. Return true if the
6729 operations are supported. */
6731 bool
6732 vect_slp_analyze_operations (vec_info *vinfo)
6734 slp_instance instance;
6735 int i;
6737 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6739 hash_set<slp_tree> visited;
6740 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6742 auto_vec<slp_tree> visited_vec;
6743 stmt_vector_for_cost cost_vec;
6744 cost_vec.create (2);
6745 if (is_a <bb_vec_info> (vinfo))
6746 vect_location = instance->location ();
6747 if (!vect_slp_analyze_node_operations (vinfo,
6748 SLP_INSTANCE_TREE (instance),
6749 instance, visited, visited_vec,
6750 &cost_vec)
6751 /* CTOR instances require vectorized defs for the SLP tree root. */
6752 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6753 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6754 != vect_internal_def
6755 /* Make sure we vectorized with the expected type. */
6756 || !useless_type_conversion_p
6757 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6758 (instance->root_stmts[0]->stmt))),
6759 TREE_TYPE (SLP_TREE_VECTYPE
6760 (SLP_INSTANCE_TREE (instance))))))
6761 /* Check we can vectorize the reduction. */
6762 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6763 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6765 slp_tree node = SLP_INSTANCE_TREE (instance);
6766 stmt_vec_info stmt_info;
6767 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6768 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6769 else
6770 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6771 if (dump_enabled_p ())
6772 dump_printf_loc (MSG_NOTE, vect_location,
6773 "removing SLP instance operations starting from: %G",
6774 stmt_info->stmt);
6775 vect_free_slp_instance (instance);
6776 vinfo->slp_instances.ordered_remove (i);
6777 cost_vec.release ();
6778 while (!visited_vec.is_empty ())
6779 visited.remove (visited_vec.pop ());
6781 else
6783 i++;
6784 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6786 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6787 cost_vec.release ();
6789 else
6790 /* For BB vectorization remember the SLP graph entry
6791 cost for later. */
6792 instance->cost_vec = cost_vec;
6796 /* Now look for SLP instances with a root that are covered by other
6797 instances and remove them. */
6798 hash_set<stmt_vec_info> roots;
6799 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6800 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6801 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6802 if (!roots.is_empty ())
6804 visited.empty ();
6805 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6806 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6807 visited);
6808 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6809 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6810 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6812 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6813 if (dump_enabled_p ())
6814 dump_printf_loc (MSG_NOTE, vect_location,
6815 "removing SLP instance operations starting "
6816 "from: %G", root->stmt);
6817 vect_free_slp_instance (instance);
6818 vinfo->slp_instances.ordered_remove (i);
6820 else
6821 ++i;
6824 /* Compute vectorizable live stmts. */
6825 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6826 vect_bb_slp_mark_live_stmts (bb_vinfo);
6828 return !vinfo->slp_instances.is_empty ();
6831 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6832 closing the eventual chain. */
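/* For instance, with a leader map A -> B, B -> C, C -> C, the call for A
   returns C and rewrites the map entries along the chain to A -> C and
   B -> C, much like path compression in a union-find structure.  */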
6834 static slp_instance
6835 get_ultimate_leader (slp_instance instance,
6836 hash_map<slp_instance, slp_instance> &instance_leader)
6838 auto_vec<slp_instance *, 8> chain;
6839 slp_instance *tem;
6840 while (*(tem = instance_leader.get (instance)) != instance)
6842 chain.safe_push (tem);
6843 instance = *tem;
6845 while (!chain.is_empty ())
6846 *chain.pop () = instance;
6847 return instance;
6850 namespace {
6851 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6852 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6853 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6855 INSTANCE_LEADER is as for get_ultimate_leader. */
6857 template<typename T>
6858 bool
6859 vect_map_to_instance (slp_instance instance, T key,
6860 hash_map<T, slp_instance> &key_to_instance,
6861 hash_map<slp_instance, slp_instance> &instance_leader)
6863 bool existed_p;
6864 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6865 if (!existed_p)
6867 else if (key_instance != instance)
6869 /* If we're running into a previously marked key make us the
6870 leader of the current ultimate leader. This keeps the
6871 leader chain acyclic and works even when the current instance
6872 connects two previously independent graph parts. */
6873 slp_instance key_leader
6874 = get_ultimate_leader (key_instance, instance_leader);
6875 if (key_leader != instance)
6876 instance_leader.put (key_leader, instance);
6878 key_instance = instance;
6879 return existed_p;
6883 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6885 static void
6886 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6887 slp_instance instance, slp_tree node,
6888 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6889 hash_map<slp_tree, slp_instance> &node_to_instance,
6890 hash_map<slp_instance, slp_instance> &instance_leader)
6892 stmt_vec_info stmt_info;
6893 unsigned i;
6895 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6896 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6897 instance_leader);
6899 if (vect_map_to_instance (instance, node, node_to_instance,
6900 instance_leader))
6901 return;
6903 slp_tree child;
6904 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6905 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6906 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6907 node_to_instance, instance_leader);
6910 /* Partition the SLP graph into pieces that can be costed independently. */
6912 static void
6913 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6915 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6917 /* First walk the SLP graph assigning each involved scalar stmt a
6918 corresponding SLP graph entry and upon visiting a previously
6919 marked stmt, make the stmt's leader the current SLP graph entry. */
6920 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6921 hash_map<slp_tree, slp_instance> node_to_instance;
6922 hash_map<slp_instance, slp_instance> instance_leader;
6923 slp_instance instance;
6924 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6926 instance_leader.put (instance, instance);
6927 vect_bb_partition_graph_r (bb_vinfo,
6928 instance, SLP_INSTANCE_TREE (instance),
6929 stmt_to_instance, node_to_instance,
6930 instance_leader);
6933 /* Then collect entries to each independent subgraph. */
6934 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6936 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6937 leader->subgraph_entries.safe_push (instance);
6938 if (dump_enabled_p ()
6939 && leader != instance)
6940 dump_printf_loc (MSG_NOTE, vect_location,
6941 "instance %p is leader of %p\n",
6942 (void *) leader, (void *) instance);
6946 /* Compute the set of scalar stmts participating in internal and external
6947 nodes. */
6949 static void
6950 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6951 hash_set<slp_tree> &visited,
6952 hash_set<stmt_vec_info> &vstmts,
6953 hash_set<stmt_vec_info> &estmts)
6955 int i;
6956 stmt_vec_info stmt_info;
6957 slp_tree child;
6959 if (visited.add (node))
6960 return;
6962 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6964 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6965 vstmts.add (stmt_info);
6967 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6968 if (child)
6969 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6970 vstmts, estmts);
6972 else
6973 for (tree def : SLP_TREE_SCALAR_OPS (node))
6975 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6976 if (def_stmt)
6977 estmts.add (def_stmt);
6982 /* Compute the scalar cost of the SLP node NODE and its children
6983 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6984 update LIFE according to uses of NODE. */
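/* LIFE has one entry per SLP lane; a lane marked true means the scalar
   stmt for that lane is kept live because of non-vectorized uses, so
   neither it nor the defs it requires are counted.  */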
6986 static void
6987 vect_bb_slp_scalar_cost (vec_info *vinfo,
6988 slp_tree node, vec<bool, va_heap> *life,
6989 stmt_vector_for_cost *cost_vec,
6990 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6991 hash_set<slp_tree> &visited)
6993 unsigned i;
6994 stmt_vec_info stmt_info;
6995 slp_tree child;
6997 if (visited.add (node))
6998 return;
7000 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7002 ssa_op_iter op_iter;
7003 def_operand_p def_p;
7005 if ((*life)[i])
7006 continue;
7008 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7009 gimple *orig_stmt = orig_stmt_info->stmt;
7011 /* If there is a non-vectorized use of the defs then the scalar
7012 stmt is kept live in which case we do not account it or any
7013 required defs in the SLP children in the scalar cost. This
7014 way we make the vectorization more costly when compared to
7015 the scalar cost. */
7016 if (!STMT_VINFO_LIVE_P (stmt_info))
7018 auto_vec<gimple *, 8> worklist;
7019 hash_set<gimple *> *worklist_visited = NULL;
7020 worklist.quick_push (orig_stmt);
7023 gimple *work_stmt = worklist.pop ();
7024 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7026 imm_use_iterator use_iter;
7027 gimple *use_stmt;
7028 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7029 DEF_FROM_PTR (def_p))
7030 if (!is_gimple_debug (use_stmt))
7032 stmt_vec_info use_stmt_info
7033 = vinfo->lookup_stmt (use_stmt);
7034 if (!use_stmt_info
7035 || !vectorized_scalar_stmts.contains (use_stmt_info))
7037 if (use_stmt_info
7038 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7040 /* For stmts participating in patterns we have
7041 to check its uses recursively. */
7042 if (!worklist_visited)
7043 worklist_visited = new hash_set<gimple *> ();
7044 if (!worklist_visited->add (use_stmt))
7045 worklist.safe_push (use_stmt);
7046 continue;
7048 (*life)[i] = true;
7049 goto next_lane;
7054 while (!worklist.is_empty ());
7055 next_lane:
7056 if (worklist_visited)
7057 delete worklist_visited;
7058 if ((*life)[i])
7059 continue;
7062 /* Count scalar stmts only once. */
7063 if (gimple_visited_p (orig_stmt))
7064 continue;
7065 gimple_set_visited (orig_stmt, true);
7067 vect_cost_for_stmt kind;
7068 if (STMT_VINFO_DATA_REF (orig_stmt_info))
7070 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7071 kind = scalar_load;
7072 else
7073 kind = scalar_store;
7075 else if (vect_nop_conversion_p (orig_stmt_info))
7076 continue;
7077 /* For single-argument PHIs assume coalescing which means zero cost
7078 for the scalar and the vector PHIs. This avoids artificially
7079 favoring the vector path (but may pessimize it in some cases). */
7080 else if (is_a <gphi *> (orig_stmt_info->stmt)
7081 && gimple_phi_num_args
7082 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7083 continue;
7084 else
7085 kind = scalar_stmt;
7086 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7087 SLP_TREE_VECTYPE (node), 0, vect_body);
7090 auto_vec<bool, 20> subtree_life;
7091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7093 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7095 /* Do not directly pass LIFE to the recursive call, copy it to
7096 confine changes in the callee to the current child/subtree. */
7097 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7099 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7100 for (unsigned j = 0;
7101 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7103 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7104 if (perm.first == i)
7105 subtree_life[perm.second] = (*life)[j];
7108 else
7110 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7111 subtree_life.safe_splice (*life);
7113 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7114 vectorized_scalar_stmts, visited);
7115 subtree_life.truncate (0);
7120 /* Comparator for the loop-index sorted cost vectors. */
7122 static int
7123 li_cost_vec_cmp (const void *a_, const void *b_)
7125 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7126 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
7127 if (a->first < b->first)
7128 return -1;
7129 else if (a->first == b->first)
7130 return 0;
7131 return 1;
7134 /* Check if vectorization of the basic block is profitable for the
7135 subgraph denoted by SLP_INSTANCES. */
7137 static bool
7138 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7139 vec<slp_instance> slp_instances,
7140 loop_p orig_loop)
7142 slp_instance instance;
7143 int i;
7144 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7145 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7147 if (dump_enabled_p ())
7149 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7150 hash_set<slp_tree> visited;
7151 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7152 vect_print_slp_graph (MSG_NOTE, vect_location,
7153 SLP_INSTANCE_TREE (instance), visited);
7156 /* Compute the set of scalar stmts we know will go away 'locally' when
7157 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7158 not accurate for nodes promoted extern late or for scalar stmts that
7159 are used both in extern defs and in vectorized defs. */
7160 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7161 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7162 hash_set<slp_tree> visited;
7163 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7165 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7166 SLP_INSTANCE_TREE (instance),
7167 visited,
7168 vectorized_scalar_stmts,
7169 scalar_stmts_in_externs);
7170 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7171 vectorized_scalar_stmts.add (rstmt);
7173   /* Scalar stmts used as defs in external nodes need to be preserved, so
7174 remove them from vectorized_scalar_stmts. */
7175 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7176 vectorized_scalar_stmts.remove (stmt);
7178 /* Calculate scalar cost and sum the cost for the vector stmts
7179 previously collected. */
7180 stmt_vector_for_cost scalar_costs = vNULL;
7181 stmt_vector_for_cost vector_costs = vNULL;
7182 visited.empty ();
7183 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7185 auto_vec<bool, 20> life;
7186 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7187 true);
7188 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7189 record_stmt_cost (&scalar_costs,
7190 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7191 scalar_stmt,
7192 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7193 vect_bb_slp_scalar_cost (bb_vinfo,
7194 SLP_INSTANCE_TREE (instance),
7195 &life, &scalar_costs, vectorized_scalar_stmts,
7196 visited);
7197 vector_costs.safe_splice (instance->cost_vec);
7198 instance->cost_vec.release ();
7201 if (dump_enabled_p ())
7202 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7204 /* When costing non-loop vectorization we need to consider each covered
7205 loop independently and make sure vectorization is profitable. For
7206      now we assume a loop may not be entered or may be executed an arbitrary
7207 number of iterations (??? static information can provide more
7208 precise info here) which means we can simply cost each containing
7209      loop's stmts separately.  */
7211 /* First produce cost vectors sorted by loop index. */
7212 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7213 li_scalar_costs (scalar_costs.length ());
7214 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7215 li_vector_costs (vector_costs.length ());
7216 stmt_info_for_cost *cost;
7217 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7219 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7220 li_scalar_costs.quick_push (std::make_pair (l, cost));
7222   /* Use an arbitrary loop from the scalar costs as the fallback in case the
7223      first vector_costs entry does not have a stmt_info associated with it.  */
7224 unsigned l = li_scalar_costs[0].first;
7225 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7227 /* We inherit from the previous COST, invariants, externals and
7228 extracts immediately follow the cost for the related stmt. */
7229 if (cost->stmt_info)
7230 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7231 li_vector_costs.quick_push (std::make_pair (l, cost));
7233 li_scalar_costs.qsort (li_cost_vec_cmp);
7234 li_vector_costs.qsort (li_cost_vec_cmp);
7236 /* Now cost the portions individually. */
7237 unsigned vi = 0;
7238 unsigned si = 0;
7239 bool profitable = true;
7240 while (si < li_scalar_costs.length ()
7241 && vi < li_vector_costs.length ())
7243 unsigned sl = li_scalar_costs[si].first;
7244 unsigned vl = li_vector_costs[vi].first;
7245 if (sl != vl)
7247 if (dump_enabled_p ())
7248 dump_printf_loc (MSG_NOTE, vect_location,
7249 "Scalar %d and vector %d loop part do not "
7250 "match up, skipping scalar part\n", sl, vl);
7251 /* Skip the scalar part, assuming zero cost on the vector side. */
7254 si++;
7256 while (si < li_scalar_costs.length ()
7257 && li_scalar_costs[si].first == sl);
7258 continue;
7261 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7264 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7265 si++;
7267 while (si < li_scalar_costs.length ()
7268 && li_scalar_costs[si].first == sl);
7269 unsigned dummy;
7270 finish_cost (scalar_target_cost_data, nullptr,
7271 &dummy, &scalar_cost, &dummy);
7273 /* Complete the target-specific vector cost calculation. */
7274 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7277 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7278 vi++;
7280 while (vi < li_vector_costs.length ()
7281 && li_vector_costs[vi].first == vl);
7282 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7283 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7284 delete scalar_target_cost_data;
7285 delete vect_target_cost_data;
7287 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7289 if (dump_enabled_p ())
7291 dump_printf_loc (MSG_NOTE, vect_location,
7292 "Cost model analysis for part in loop %d:\n", sl);
7293 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7294 vec_inside_cost + vec_outside_cost);
7295 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7298       /* Vectorization is profitable if its cost is no more than the cost of the
7299	  scalar version.  Note that we err on the vector side for equal cost because
7300 the cost estimate is otherwise quite pessimistic (constant uses are
7301 free on the scalar side but cost a load on the vector side for
7302 example). */
7303 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7305 profitable = false;
7306 break;
7309 if (profitable && vi < li_vector_costs.length ())
7311 if (dump_enabled_p ())
7312 dump_printf_loc (MSG_NOTE, vect_location,
7313 "Excess vector cost for part in loop %d:\n",
7314 li_vector_costs[vi].first);
7315 profitable = false;
7318 /* Unset visited flag. This is delayed when the subgraph is profitable
7319 and we process the loop for remaining unvectorized if-converted code. */
7320 if (!orig_loop || !profitable)
7321 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7322 gimple_set_visited (cost->stmt_info->stmt, false);
7324 scalar_costs.release ();
7325 vector_costs.release ();
7327 return profitable;
7330 /* qsort comparator for lane defs. */
7332 static int
7333 vld_cmp (const void *a_, const void *b_)
7335 auto *a = (const std::pair<unsigned, tree> *)a_;
7336 auto *b = (const std::pair<unsigned, tree> *)b_;
7337 return a->first - b->first;
7340 /* Return true if USE_STMT is a vector lane insert into VEC (into any vector
7341    if VEC is NULL_TREE) and set *THIS_LANE to the lane number that is set.  */
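/* A minimal sketch with hypothetical SSA names: for a V4SI vector the stmt
     v_2 = BIT_INSERT_EXPR <v_1, s_5, 64>;
   inserts at bit position 64, and with a 32-bit element size *THIS_LANE
   becomes 64 / 32 == 2.  */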
7343 static bool
7344 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7346 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7347 if (!use_ass
7348 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7349 || (vec
7350 ? gimple_assign_rhs1 (use_ass) != vec
7351 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7352 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7353 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7354 || !constant_multiple_p
7355 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7356 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7357 this_lane))
7358 return false;
7359 return true;
7362 /* Find vectorizable constructors, lane-insert chains and reduction chains
7363    and record them as SLP roots in the BB_VINFO->roots array.  */
7365 static void
7366 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7368 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7369 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7370 !gsi_end_p (gsi); gsi_next (&gsi))
7372 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7373 if (!assign)
7374 continue;
7376 tree rhs = gimple_assign_rhs1 (assign);
7377 enum tree_code code = gimple_assign_rhs_code (assign);
7378 use_operand_p use_p;
7379 gimple *use_stmt;
7380 if (code == CONSTRUCTOR)
7382 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7383 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7384 CONSTRUCTOR_NELTS (rhs))
7385 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7386 || uniform_vector_p (rhs))
7387 continue;
7389 unsigned j;
7390 tree val;
7391 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7392 if (TREE_CODE (val) != SSA_NAME
7393 || !bb_vinfo->lookup_def (val))
7394 break;
7395 if (j != CONSTRUCTOR_NELTS (rhs))
7396 continue;
7398 vec<stmt_vec_info> roots = vNULL;
7399 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7400 vec<stmt_vec_info> stmts;
7401 stmts.create (CONSTRUCTOR_NELTS (rhs));
7402 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7403 stmts.quick_push
7404 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7405 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7406 stmts, roots));
7408 else if (code == BIT_INSERT_EXPR
7409 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7410 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7411 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7412 && integer_zerop (gimple_assign_rhs3 (assign))
7413 && useless_type_conversion_p
7414 (TREE_TYPE (TREE_TYPE (rhs)),
7415 TREE_TYPE (gimple_assign_rhs2 (assign)))
7416 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7418 	  /* We start matching at the insert to lane zero, but since the
7419	     inserts need not be ordered we have to search both
7420	     the def and the use chains.  */
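	  /* For example (a sketch with hypothetical SSA names and a V4SI
	     vector), the insert chain may look like
	       x_1 = BIT_INSERT_EXPR <x_0, a_1, 32>;   lane 1
	       x_2 = BIT_INSERT_EXPR <x_1, b_1, 0>;    lane 0  (ASSIGN)
	       x_3 = BIT_INSERT_EXPR <x_2, c_1, 96>;   lane 3
	       x_4 = BIT_INSERT_EXPR <x_3, d_1, 64>;   lane 2
	     where lanes 3 and 2 are found by following single uses of
	     ASSIGN's LHS and lane 1 by walking its RHS def chain.  */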
7421 tree vectype = TREE_TYPE (rhs);
7422 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7423 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7424 auto_sbitmap lanes (nlanes);
7425 bitmap_clear (lanes);
7426 bitmap_set_bit (lanes, 0);
7427 tree def = gimple_assign_lhs (assign);
7428 lane_defs.quick_push
7429 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7430 unsigned lanes_found = 1;
7431 /* Start with the use chains, the last stmt will be the root. */
7432 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7433 vec<stmt_vec_info> roots = vNULL;
7434 roots.safe_push (last);
7437 use_operand_p use_p;
7438 gimple *use_stmt;
7439 if (!single_imm_use (def, &use_p, &use_stmt))
7440 break;
7441 unsigned this_lane;
7442 if (!bb_vinfo->lookup_stmt (use_stmt)
7443 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7444 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7445 break;
7446 if (bitmap_bit_p (lanes, this_lane))
7447 break;
7448 lanes_found++;
7449 bitmap_set_bit (lanes, this_lane);
7450 gassign *use_ass = as_a <gassign *> (use_stmt);
7451 lane_defs.quick_push (std::make_pair
7452 (this_lane, gimple_assign_rhs2 (use_ass)));
7453 last = bb_vinfo->lookup_stmt (use_ass);
7454 roots.safe_push (last);
7455 def = gimple_assign_lhs (use_ass);
7457 while (lanes_found < nlanes);
7458 if (roots.length () > 1)
7459 std::swap(roots[0], roots[roots.length () - 1]);
7460 if (lanes_found < nlanes)
7462 /* Now search the def chain. */
7463 def = gimple_assign_rhs1 (assign);
7466 if (TREE_CODE (def) != SSA_NAME
7467 || !has_single_use (def))
7468 break;
7469 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7470 unsigned this_lane;
7471 if (!bb_vinfo->lookup_stmt (def_stmt)
7472 || !vect_slp_is_lane_insert (def_stmt,
7473 NULL_TREE, &this_lane)
7474 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7475 break;
7476 if (bitmap_bit_p (lanes, this_lane))
7477 break;
7478 lanes_found++;
7479 bitmap_set_bit (lanes, this_lane);
7480 lane_defs.quick_push (std::make_pair
7481 (this_lane,
7482 gimple_assign_rhs2 (def_stmt)));
7483 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7484 def = gimple_assign_rhs1 (def_stmt);
7486 while (lanes_found < nlanes);
7488 if (lanes_found == nlanes)
7490 	      /* Sort lane_defs by the lane index and register the root.  */
7491 lane_defs.qsort (vld_cmp);
7492 vec<stmt_vec_info> stmts;
7493 stmts.create (nlanes);
7494 for (unsigned i = 0; i < nlanes; ++i)
7495 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7496 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7497 stmts, roots));
7499 else
7500 roots.release ();
7502 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7503 && (associative_tree_code (code) || code == MINUS_EXPR)
7504 /* ??? This pessimizes a two-element reduction. PR54400.
7505 ??? In-order reduction could be handled if we only
7506 traverse one operand chain in vect_slp_linearize_chain. */
7507 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7508 /* Ops with constants at the tail can be stripped here. */
7509 && TREE_CODE (rhs) == SSA_NAME
7510 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7511 /* Should be the chain end. */
7512 && (!single_imm_use (gimple_assign_lhs (assign),
7513 &use_p, &use_stmt)
7514 || !is_gimple_assign (use_stmt)
7515 || (gimple_assign_rhs_code (use_stmt) != code
7516 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7517 || (gimple_assign_rhs_code (use_stmt)
7518 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7520 /* We start the match at the end of a possible association
7521 chain. */
7522 auto_vec<chain_op_t> chain;
7523 auto_vec<std::pair<tree_code, gimple *> > worklist;
7524 auto_vec<gimple *> chain_stmts;
7525 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7526 if (code == MINUS_EXPR)
7527 code = PLUS_EXPR;
7528 internal_fn reduc_fn;
7529 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7530 || reduc_fn == IFN_LAST)
7531 continue;
7532 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7533 /* ??? */
7534 code_stmt, alt_code_stmt, &chain_stmts);
7535 if (chain.length () > 1)
7537 /* Sort the chain according to def_type and operation. */
7538 chain.sort (dt_sort_cmp, bb_vinfo);
7539 /* ??? Now we'd want to strip externals and constants
7540 but record those to be handled in the epilogue. */
7541 /* ??? For now do not allow mixing ops or externs/constants. */
7542 bool invalid = false;
7543 unsigned remain_cnt = 0;
7544 for (unsigned i = 0; i < chain.length (); ++i)
7546 if (chain[i].code != code)
7548 invalid = true;
7549 break;
7551 if (chain[i].dt != vect_internal_def
7552 /* Avoid stmts where the def is not the LHS, like
7553 ASMs. */
7554 || (gimple_get_lhs (bb_vinfo->lookup_def
7555 (chain[i].op)->stmt)
7556 != chain[i].op))
7557 remain_cnt++;
7559 if (!invalid && chain.length () - remain_cnt > 1)
7561 vec<stmt_vec_info> stmts;
7562 vec<tree> remain = vNULL;
7563 stmts.create (chain.length ());
7564 if (remain_cnt > 0)
7565 remain.create (remain_cnt);
7566 for (unsigned i = 0; i < chain.length (); ++i)
7568 stmt_vec_info stmt_info;
7569 if (chain[i].dt == vect_internal_def
7570 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7571 gimple_get_lhs (stmt_info->stmt) == chain[i].op))
7572 stmts.quick_push (stmt_info);
7573 else
7574 remain.quick_push (chain[i].op);
7576 vec<stmt_vec_info> roots;
7577 roots.create (chain_stmts.length ());
7578 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7579 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7580 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7581 stmts, roots, remain));
7588 /* Walk the grouped store chains and replace entries with their
7589 pattern variant if any. */
7591 static void
7592 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7594 stmt_vec_info first_element;
7595 unsigned i;
7597 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7599 /* We also have CTORs in this array. */
7600 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7601 continue;
7602 if (STMT_VINFO_IN_PATTERN_P (first_element))
7604 stmt_vec_info orig = first_element;
7605 first_element = STMT_VINFO_RELATED_STMT (first_element);
7606 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7607 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7608 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7609 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7610 vinfo->grouped_stores[i] = first_element;
7612 stmt_vec_info prev = first_element;
7613 while (DR_GROUP_NEXT_ELEMENT (prev))
7615 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7616 if (STMT_VINFO_IN_PATTERN_P (elt))
7618 stmt_vec_info orig = elt;
7619 elt = STMT_VINFO_RELATED_STMT (elt);
7620 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7621 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7622 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7624 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7625 prev = elt;
7630 /* Check if the region described by BB_VINFO can be vectorized, returning
7631 true if so. When returning false, set FATAL to true if the same failure
7632 would prevent vectorization at other vector sizes, false if it is still
7633 worth trying other sizes. N_STMTS is the number of statements in the
7634 region. */
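/* A rough outline of the checks below: analyze data references and their
   accesses, detect SLP roots (constructors, lane-insert chains and
   reduction chains), run pattern recognition, build and optimize the SLP
   trees, verify alignment and dependences per instance, analyze the
   operations and finally partition the graph for costing.  */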
7636 static bool
7637 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7638 vec<int> *dataref_groups)
7640 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7642 slp_instance instance;
7643 int i;
7644 poly_uint64 min_vf = 2;
7646 /* The first group of checks is independent of the vector size. */
7647 fatal = true;
7649 /* Analyze the data references. */
7651 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7653 if (dump_enabled_p ())
7654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7655 "not vectorized: unhandled data-ref in basic "
7656 "block.\n");
7657 return false;
7660 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7662 if (dump_enabled_p ())
7663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664 "not vectorized: unhandled data access in "
7665 "basic block.\n");
7666 return false;
7669 vect_slp_check_for_roots (bb_vinfo);
7671 /* If there are no grouped stores and no constructors in the region
7672 there is no need to continue with pattern recog as vect_analyze_slp
7673 will fail anyway. */
7674 if (bb_vinfo->grouped_stores.is_empty ()
7675 && bb_vinfo->roots.is_empty ())
7677 if (dump_enabled_p ())
7678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7679 "not vectorized: no grouped stores in "
7680 "basic block.\n");
7681 return false;
7684   /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal.  */
7685 fatal = false;
7687 vect_pattern_recog (bb_vinfo);
7689 /* Update store groups from pattern processing. */
7690 vect_fixup_store_groups_with_patterns (bb_vinfo);
7692 /* Check the SLP opportunities in the basic block, analyze and build SLP
7693 trees. */
7694 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7696 if (dump_enabled_p ())
7698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699 "Failed to SLP the basic block.\n");
7700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 "not vectorized: failed to find SLP opportunities "
7702 "in basic block.\n");
7704 return false;
7707 /* Optimize permutations. */
7708 vect_optimize_slp (bb_vinfo);
7710 /* Gather the loads reachable from the SLP graph entries. */
7711 vect_gather_slp_loads (bb_vinfo);
7713 vect_record_base_alignments (bb_vinfo);
7715 /* Analyze and verify the alignment of data references and the
7716 dependence in the SLP instances. */
7717 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7719 vect_location = instance->location ();
7720 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7721 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7723 slp_tree node = SLP_INSTANCE_TREE (instance);
7724 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7725 if (dump_enabled_p ())
7726 dump_printf_loc (MSG_NOTE, vect_location,
7727 "removing SLP instance operations starting from: %G",
7728 stmt_info->stmt);
7729 vect_free_slp_instance (instance);
7730 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7731 continue;
7734 /* Mark all the statements that we want to vectorize as pure SLP and
7735 relevant. */
7736 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7737 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7738 unsigned j;
7739 stmt_vec_info root;
7740 /* Likewise consider instance root stmts as vectorized. */
7741 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7742 STMT_SLP_TYPE (root) = pure_slp;
7744 i++;
7746 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7747 return false;
7749 if (!vect_slp_analyze_operations (bb_vinfo))
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7753 "not vectorized: bad operation in basic block.\n");
7754 return false;
7757 vect_bb_partition_graph (bb_vinfo);
7759 return true;
7762 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7763 basic blocks in BBS, returning true on success.
7764 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7766 static bool
7767 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7768 vec<int> *dataref_groups, unsigned int n_stmts,
7769 loop_p orig_loop)
7771 bb_vec_info bb_vinfo;
7772 auto_vector_modes vector_modes;
7774 /* Autodetect first vector size we try. */
7775 machine_mode next_vector_mode = VOIDmode;
7776 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7777 unsigned int mode_i = 0;
7779 vec_info_shared shared;
7781 machine_mode autodetected_vector_mode = VOIDmode;
7782 while (1)
7784 bool vectorized = false;
7785 bool fatal = false;
7786 bb_vinfo = new _bb_vec_info (bbs, &shared);
7788 bool first_time_p = shared.datarefs.is_empty ();
7789 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7790 if (first_time_p)
7791 bb_vinfo->shared->save_datarefs ();
7792 else
7793 bb_vinfo->shared->check_datarefs ();
7794 bb_vinfo->vector_mode = next_vector_mode;
7796 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7798 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_NOTE, vect_location,
7801 "***** Analysis succeeded with vector mode"
7802 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7803 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7806 bb_vinfo->shared->check_datarefs ();
7808 bool force_clear = false;
7809 auto_vec<slp_instance> profitable_subgraphs;
7810 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7812 if (instance->subgraph_entries.is_empty ())
7813 continue;
7815 dump_user_location_t saved_vect_location = vect_location;
7816 vect_location = instance->location ();
7817 if (!unlimited_cost_model (NULL)
7818 && !vect_bb_vectorization_profitable_p
7819 (bb_vinfo, instance->subgraph_entries, orig_loop))
7821 if (dump_enabled_p ())
7822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7823 "not vectorized: vectorization is not "
7824 "profitable.\n");
7825 vect_location = saved_vect_location;
7826 continue;
7829 vect_location = saved_vect_location;
7830 if (!dbg_cnt (vect_slp))
7832 force_clear = true;
7833 continue;
7836 profitable_subgraphs.safe_push (instance);
7839 /* When we're vectorizing an if-converted loop body make sure
7840 we vectorized all if-converted code. */
7841 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7843 gcc_assert (bb_vinfo->bbs.length () == 1);
7844 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7845 !gsi_end_p (gsi); gsi_next (&gsi))
7847 /* The costing above left us with DCEable vectorized scalar
7848 stmts having the visited flag set on profitable
7849 subgraphs. Do the delayed clearing of the flag here. */
7850 if (gimple_visited_p (gsi_stmt (gsi)))
7852 gimple_set_visited (gsi_stmt (gsi), false);
7853 continue;
7855 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7856 continue;
7858 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7859 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7861 if (!profitable_subgraphs.is_empty ()
7862 && dump_enabled_p ())
7863 dump_printf_loc (MSG_NOTE, vect_location,
7864 "not profitable because of "
7865 "unprofitable if-converted scalar "
7866 "code\n");
7867 profitable_subgraphs.truncate (0);
7872 /* Finally schedule the profitable subgraphs. */
7873 for (slp_instance instance : profitable_subgraphs)
7875 if (!vectorized && dump_enabled_p ())
7876 dump_printf_loc (MSG_NOTE, vect_location,
7877 "Basic block will be vectorized "
7878 "using SLP\n");
7879 vectorized = true;
7881 /* Dump before scheduling as store vectorization will remove
7882 the original stores and mess with the instance tree
7883 so querying its location will eventually ICE. */
7884 if (flag_checking)
7885 for (slp_instance sub : instance->subgraph_entries)
7886 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7887 unsigned HOST_WIDE_INT bytes;
7888 if (dump_enabled_p ())
7889 for (slp_instance sub : instance->subgraph_entries)
7891 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7892 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7893 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7894 sub->location (),
7895 "basic block part vectorized using %wu "
7896 "byte vectors\n", bytes);
7897 else
7898 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7899 sub->location (),
7900 "basic block part vectorized using "
7901 "variable length vectors\n");
7904 dump_user_location_t saved_vect_location = vect_location;
7905 vect_location = instance->location ();
7907 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7909 vect_location = saved_vect_location;
7912 else
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_NOTE, vect_location,
7916 "***** Analysis failed with vector mode %s\n",
7917 GET_MODE_NAME (bb_vinfo->vector_mode));
7920 if (mode_i == 0)
7921 autodetected_vector_mode = bb_vinfo->vector_mode;
7923 if (!fatal)
7924 while (mode_i < vector_modes.length ()
7925 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7927 if (dump_enabled_p ())
7928 dump_printf_loc (MSG_NOTE, vect_location,
7929 "***** The result for vector mode %s would"
7930 " be the same\n",
7931 GET_MODE_NAME (vector_modes[mode_i]));
7932 mode_i += 1;
7935 delete bb_vinfo;
7937 if (mode_i < vector_modes.length ()
7938 && VECTOR_MODE_P (autodetected_vector_mode)
7939 && (related_vector_mode (vector_modes[mode_i],
7940 GET_MODE_INNER (autodetected_vector_mode))
7941 == autodetected_vector_mode)
7942 && (related_vector_mode (autodetected_vector_mode,
7943 GET_MODE_INNER (vector_modes[mode_i]))
7944 == vector_modes[mode_i]))
7946 if (dump_enabled_p ())
7947 dump_printf_loc (MSG_NOTE, vect_location,
7948 "***** Skipping vector mode %s, which would"
7949 " repeat the analysis for %s\n",
7950 GET_MODE_NAME (vector_modes[mode_i]),
7951 GET_MODE_NAME (autodetected_vector_mode));
7952 mode_i += 1;
7955 if (vectorized
7956 || mode_i == vector_modes.length ()
7957 || autodetected_vector_mode == VOIDmode
7958 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7959 vector sizes will fail do not bother iterating. */
7960 || fatal)
7961 return vectorized;
7963 /* Try the next biggest vector size. */
7964 next_vector_mode = vector_modes[mode_i++];
7965 if (dump_enabled_p ())
7966 dump_printf_loc (MSG_NOTE, vect_location,
7967 "***** Re-trying analysis with vector mode %s\n",
7968 GET_MODE_NAME (next_vector_mode));
7973 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7974 true if anything in the basic-block was vectorized. */
7976 static bool
7977 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7979 vec<data_reference_p> datarefs = vNULL;
7980 auto_vec<int> dataref_groups;
7981 int insns = 0;
7982 int current_group = 0;
7984 for (unsigned i = 0; i < bbs.length (); i++)
7986 basic_block bb = bbs[i];
7987 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7988 gsi_next (&gsi))
7990 gimple *stmt = gsi_stmt (gsi);
7991 if (is_gimple_debug (stmt))
7992 continue;
7994 insns++;
7996 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7997 vect_location = stmt;
7999 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8000 &dataref_groups, current_group))
8001 ++current_group;
8003 /* New BBs always start a new DR group. */
8004 ++current_group;
8007 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8010 /* Special entry for the BB vectorizer. Analyze and transform a single
8011    if-converted BB with ORIG_LOOP's body being the not-if-converted
8012 representation. Returns true if anything in the basic-block was
8013 vectorized. */
8015 bool
8016 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8018 auto_vec<basic_block> bbs;
8019 bbs.safe_push (bb);
8020 return vect_slp_bbs (bbs, orig_loop);
8023 /* Main entry for the BB vectorizer.  Analyze and transform the whole
8024    function FUN, returning true if anything was vectorized.  */
8026 bool
8027 vect_slp_function (function *fun)
8029 bool r = false;
8030 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8031 auto_bitmap exit_bbs;
8032 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8033 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8034 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8035 true, rpo, NULL);
8037 /* For the moment split the function into pieces to avoid making
8038 the iteration on the vector mode moot. Split at points we know
8039 to not handle well which is CFG merges (SLP discovery doesn't
8040 handle non-loop-header PHIs) and loop exits. Since pattern
8041 recog requires reverse iteration to visit uses before defs
8042 simply chop RPO into pieces. */
8043 auto_vec<basic_block> bbs;
8044 for (unsigned i = 0; i < n; i++)
8046 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8047 bool split = false;
8049 /* Split when a BB is not dominated by the first block. */
8050 if (!bbs.is_empty ()
8051 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8053 if (dump_enabled_p ())
8054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8055 "splitting region at dominance boundary bb%d\n",
8056 bb->index);
8057 split = true;
8059 /* Split when the loop determined by the first block
8060 is exited. This is because we eventually insert
8061 invariants at region begin. */
8062 else if (!bbs.is_empty ()
8063 && bbs[0]->loop_father != bb->loop_father
8064 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8066 if (dump_enabled_p ())
8067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8068 "splitting region at loop %d exit at bb%d\n",
8069 bbs[0]->loop_father->num, bb->index);
8070 split = true;
8072 else if (!bbs.is_empty ()
8073 && bb->loop_father->header == bb
8074 && bb->loop_father->dont_vectorize)
8076 if (dump_enabled_p ())
8077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8078 "splitting region at dont-vectorize loop %d "
8079 "entry at bb%d\n",
8080 bb->loop_father->num, bb->index);
8081 split = true;
8084 if (split && !bbs.is_empty ())
8086 r |= vect_slp_bbs (bbs, NULL);
8087 bbs.truncate (0);
8090 if (bbs.is_empty ())
8092 /* We need to be able to insert at the head of the region which
8093	     we cannot do for a region starting with a returns-twice call.  */
8094 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8095 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8097 if (dump_enabled_p ())
8098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8099 "skipping bb%d as start of region as it "
8100 "starts with returns-twice call\n",
8101 bb->index);
8102 continue;
8104 /* If the loop this BB belongs to is marked as not to be vectorized
8105 honor that also for BB vectorization. */
8106 if (bb->loop_father->dont_vectorize)
8107 continue;
8110 bbs.safe_push (bb);
8112 	  /* When a stmt ends this block and defines a value, inserting after
8113	     it for a vector containing its definition would require inserting
8114	     on edges.  Avoid this for now.  */
8115 if (gimple *last = *gsi_last_bb (bb))
8116 if (gimple_get_lhs (last)
8117 && is_ctrl_altering_stmt (last))
8119 if (dump_enabled_p ())
8120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8121 "splitting region at control altering "
8122 "definition %G", last);
8123 r |= vect_slp_bbs (bbs, NULL);
8124 bbs.truncate (0);
8128 if (!bbs.is_empty ())
8129 r |= vect_slp_bbs (bbs, NULL);
8131 free (rpo);
8133 return r;
8136 /* Build a variable-length vector in which the elements in ELTS are repeated
8137    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8138 RESULTS and add any new instructions to SEQ.
8140 The approach we use is:
8142 (1) Find a vector mode VM with integer elements of mode IM.
8144 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8145 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8146 from small vectors to IM.
8148 (3) Duplicate each ELTS'[I] into a vector of mode VM.
8150 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8151 correct byte contents.
8153 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8155 We try to find the largest IM for which this sequence works, in order
8156 to cut down on the number of interleaves. */
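/* As an illustration (a sketch, ignoring endianness details and not tied
   to a particular target): with ELTS = { a, b, c, d } of a 32-bit element
   type and IM = DImode we get NELTS' == 2, so ELTS' = { ab, cd } where ab
   and cd are DImode views of { a, b } and { c, d }.  Step (3) splats each
   of them into a VM vector and a single interleave in step (4) produces
   the repeating pattern a b c d a b c d ... which step (5) converts back
   to VECTOR_TYPE.  */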
8158 void
8159 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8160 const vec<tree> &elts, unsigned int nresults,
8161 vec<tree> &results)
8163 unsigned int nelts = elts.length ();
8164 tree element_type = TREE_TYPE (vector_type);
8166 /* (1) Find a vector mode VM with integer elements of mode IM. */
8167 unsigned int nvectors = 1;
8168 tree new_vector_type;
8169 tree permutes[2];
8170 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8171 &nvectors, &new_vector_type,
8172 permutes))
8173 gcc_unreachable ();
8175 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8176 unsigned int partial_nelts = nelts / nvectors;
8177 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8179 tree_vector_builder partial_elts;
8180 auto_vec<tree, 32> pieces (nvectors * 2);
8181 pieces.quick_grow_cleared (nvectors * 2);
8182 for (unsigned int i = 0; i < nvectors; ++i)
8184 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8185 ELTS' has mode IM. */
8186 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8187 for (unsigned int j = 0; j < partial_nelts; ++j)
8188 partial_elts.quick_push (elts[i * partial_nelts + j]);
8189 tree t = gimple_build_vector (seq, &partial_elts);
8190 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8191 TREE_TYPE (new_vector_type), t);
8193 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8194 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8197 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8198 correct byte contents.
8200 Conceptually, we need to repeat the following operation log2(nvectors)
8201 times, where hi_start = nvectors / 2:
8203 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8204 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8206 However, if each input repeats every N elements and the VF is
8207 a multiple of N * 2, the HI result is the same as the LO result.
8208 This will be true for the first N1 iterations of the outer loop,
8209 followed by N2 iterations for which both the LO and HI results
8210 are needed. I.e.:
8212 N1 + N2 = log2(nvectors)
8214 Each "N1 iteration" doubles the number of redundant vectors and the
8215 effect of the process as a whole is to have a sequence of nvectors/2**N1
8216 vectors that repeats 2**N1 times. Rather than generate these redundant
8217 vectors, we halve the number of vectors for each N1 iteration. */
8218 unsigned int in_start = 0;
8219 unsigned int out_start = nvectors;
8220 unsigned int new_nvectors = nvectors;
8221 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8223 unsigned int hi_start = new_nvectors / 2;
8224 unsigned int out_i = 0;
8225 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8227 if ((in_i & 1) != 0
8228 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8229 2 * in_repeat))
8230 continue;
8232 tree output = make_ssa_name (new_vector_type);
8233 tree input1 = pieces[in_start + (in_i / 2)];
8234 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8235 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8236 input1, input2,
8237 permutes[in_i & 1]);
8238 gimple_seq_add_stmt (seq, stmt);
8239 pieces[out_start + out_i] = output;
8240 out_i += 1;
8242 std::swap (in_start, out_start);
8243 new_nvectors = out_i;
8246 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8247 results.reserve (nresults);
8248 for (unsigned int i = 0; i < nresults; ++i)
8249 if (i < new_nvectors)
8250 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8251 pieces[in_start + i]));
8252 else
8253 results.quick_push (results[i - new_nvectors]);
8257 /* For constant and loop invariant defs in OP_NODE this function creates
8258 vector defs that will be used in the vectorized stmts and stores them
8259 to SLP_TREE_VEC_DEFS of OP_NODE. */
8261 static void
8262 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8264 unsigned HOST_WIDE_INT nunits;
8265 tree vec_cst;
8266 unsigned j, number_of_places_left_in_vector;
8267 tree vector_type;
8268 tree vop;
8269 int group_size = op_node->ops.length ();
8270 unsigned int vec_num, i;
8271 unsigned number_of_copies = 1;
8272 bool constant_p;
8273 gimple_seq ctor_seq = NULL;
8274 auto_vec<tree, 16> permute_results;
8276 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8277 vector_type = SLP_TREE_VECTYPE (op_node);
8279 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8280 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8281 auto_vec<tree> voprnds (number_of_vectors);
8283 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8284 created vectors. It is greater than 1 if unrolling is performed.
8286 For example, we have two scalar operands, s1 and s2 (e.g., group of
8287 strided accesses of size two), while NUNITS is four (i.e., four scalars
8288 of this type can be packed in a vector). The output vector will contain
8289 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8290 will be 2).
8292 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8293 containing the operands.
8295 For example, NUNITS is four as before, and the group size is 8
8296 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8297 {s5, s6, s7, s8}. */
8299 /* When using duplicate_and_interleave, we just need one element for
8300 each scalar statement. */
8301 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8302 nunits = group_size;
8304 number_of_copies = nunits * number_of_vectors / group_size;
8306 number_of_places_left_in_vector = nunits;
8307 constant_p = true;
8308 tree uniform_elt = NULL_TREE;
8309 tree_vector_builder elts (vector_type, nunits, 1);
8310 elts.quick_grow (nunits);
8311 stmt_vec_info insert_after = NULL;
8312 for (j = 0; j < number_of_copies; j++)
8314 tree op;
8315 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8317 /* Create 'vect_ = {op0,op1,...,opn}'. */
8318 tree orig_op = op;
8319 if (number_of_places_left_in_vector == nunits)
8320 uniform_elt = op;
8321 else if (uniform_elt && operand_equal_p (uniform_elt, op))
8322 op = elts[number_of_places_left_in_vector];
8323 else
8324 uniform_elt = NULL_TREE;
8325 number_of_places_left_in_vector--;
8326 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8328 if (CONSTANT_CLASS_P (op))
8330 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8332 /* Can't use VIEW_CONVERT_EXPR for booleans because
8333 of possibly different sizes of scalar value and
8334 vector element. */
8335 if (integer_zerop (op))
8336 op = build_int_cst (TREE_TYPE (vector_type), 0);
8337 else if (integer_onep (op))
8338 op = build_all_ones_cst (TREE_TYPE (vector_type));
8339 else
8340 gcc_unreachable ();
8342 else
8343 op = fold_unary (VIEW_CONVERT_EXPR,
8344 TREE_TYPE (vector_type), op);
8345 gcc_assert (op && CONSTANT_CLASS_P (op));
8347 else
8349 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8350 gimple *init_stmt;
8351 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8353 tree true_val
8354 = build_all_ones_cst (TREE_TYPE (vector_type));
8355 tree false_val
8356 = build_zero_cst (TREE_TYPE (vector_type));
8357 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8358 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8359 op, true_val,
8360 false_val);
8362 else
8364 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8365 op);
8366 init_stmt
8367 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8368 op);
8370 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8371 op = new_temp;
8374 elts[number_of_places_left_in_vector] = op;
8375 if (!CONSTANT_CLASS_P (op))
8376 constant_p = false;
8377 /* For BB vectorization we have to compute an insert location
8378 when a def is inside the analyzed region since we cannot
8379 simply insert at the BB start in this case. */
8380 stmt_vec_info opdef;
8381 if (TREE_CODE (orig_op) == SSA_NAME
8382 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8383 && is_a <bb_vec_info> (vinfo)
8384 && (opdef = vinfo->lookup_def (orig_op)))
8386 if (!insert_after)
8387 insert_after = opdef;
8388 else
8389 insert_after = get_later_stmt (insert_after, opdef);
8392 if (number_of_places_left_in_vector == 0)
8394 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8395 if (uniform_elt)
8396 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8397 elts[0]);
8398 else if (constant_p
8399 ? multiple_p (type_nunits, nunits)
8400 : known_eq (type_nunits, nunits))
8401 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8402 else
8404 if (permute_results.is_empty ())
8405 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8406 elts, number_of_vectors,
8407 permute_results);
8408 vec_cst = permute_results[number_of_vectors - j - 1];
8410 if (!gimple_seq_empty_p (ctor_seq))
8412 if (insert_after)
8414 gimple_stmt_iterator gsi;
8415 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8417 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8418 gsi_insert_seq_before (&gsi, ctor_seq,
8419 GSI_CONTINUE_LINKING);
8421 else if (!stmt_ends_bb_p (insert_after->stmt))
8423 gsi = gsi_for_stmt (insert_after->stmt);
8424 gsi_insert_seq_after (&gsi, ctor_seq,
8425 GSI_CONTINUE_LINKING);
8427 else
8429 /* When we want to insert after a def where the
8430 defining stmt throws then insert on the fallthru
8431 edge. */
8432 edge e = find_fallthru_edge
8433 (gimple_bb (insert_after->stmt)->succs);
8434 basic_block new_bb
8435 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8436 gcc_assert (!new_bb);
8439 else
8440 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8441 ctor_seq = NULL;
8443 voprnds.quick_push (vec_cst);
8444 insert_after = NULL;
8445 number_of_places_left_in_vector = nunits;
8446 constant_p = true;
8447 elts.new_vector (vector_type, nunits, 1);
8448 elts.quick_grow (nunits);
8453   /* The vectors were created in reverse order, so push them in reverse
8454      to restore the required order.  */
8455 vec_num = voprnds.length ();
8456 for (j = vec_num; j != 0; j--)
8458 vop = voprnds[j - 1];
8459 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8462   /* If VF is greater than the unrolling factor needed for the SLP
8463 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8464 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8465 to replicate the vectors. */
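  /* For example (a sketch): with the two distinct defs V1 = {s1,s2,s3,s4}
     and V2 = {s5,s6,s7,s8} but NUMBER_OF_VECTORS == 4 the final defs
     become { V1, V2, V1, V2 }.  */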
8466 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8467 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8468 i++)
8469 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8472 /* Get the Ith vectorized definition from SLP_NODE. */
8474 tree
8475 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8477 return SLP_TREE_VEC_DEFS (slp_node)[i];
8480 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8482 void
8483 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8485 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8486 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8489 /* Get N vectorized definitions for SLP_NODE. */
8491 void
8492 vect_get_slp_defs (vec_info *,
8493 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8495 if (n == -1U)
8496 n = SLP_TREE_CHILDREN (slp_node).length ();
8498 for (unsigned i = 0; i < n; ++i)
8500 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8501 vec<tree> vec_defs = vNULL;
8502 vect_get_slp_defs (child, &vec_defs);
8503 vec_oprnds->quick_push (vec_defs);
8507 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8508 - PERM gives the permutation that the caller wants to use for NODE,
8509 which might be different from SLP_LOAD_PERMUTATION.
8510 - DUMP_P controls whether the function dumps information. */
8512 static bool
8513 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8514 load_permutation_t &perm,
8515 const vec<tree> &dr_chain,
8516 gimple_stmt_iterator *gsi, poly_uint64 vf,
8517 bool analyze_only, bool dump_p,
8518 unsigned *n_perms, unsigned int *n_loads,
8519 bool dce_chain)
8521 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8522 int vec_index = 0;
8523 tree vectype = SLP_TREE_VECTYPE (node);
8524 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8525 unsigned int mask_element;
8526 unsigned dr_group_size;
8527 machine_mode mode;
8529 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8530 dr_group_size = 1;
8531 else
8533 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8534 dr_group_size = DR_GROUP_SIZE (stmt_info);
8537 mode = TYPE_MODE (vectype);
8538 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8539 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8541 /* Initialize the vect stmts of NODE to properly insert the generated
8542 stmts later. */
8543 if (! analyze_only)
8544 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8545 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8547 /* Generate permutation masks for every NODE. Number of masks for each NODE
8548 is equal to GROUP_SIZE.
8549 E.g., we have a group of three nodes with three loads from the same
8550      location in each node, and the vector size is 4.  I.e., we have an
8551 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8552 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8553 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8556 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8557 The last mask is illegal since we assume two operands for permute
8558 operation, and the mask element values can't be outside that range.
8559 Hence, the last mask must be converted into {2,5,5,5}.
8560 For the first two permutations we need the first and the second input
8561 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8562 we need the second and the third vectors: {b1,c1,a2,b2} and
8563 {c2,a3,b3,c3}. */
8565 int vect_stmts_counter = 0;
8566 unsigned int index = 0;
8567 int first_vec_index = -1;
8568 int second_vec_index = -1;
8569 bool noop_p = true;
8570 *n_perms = 0;
8572 vec_perm_builder mask;
8573 unsigned int nelts_to_build;
8574 unsigned int nvectors_per_build;
8575 unsigned int in_nlanes;
8576 bool repeating_p = (group_size == dr_group_size
8577 && multiple_p (nunits, group_size));
8578 if (repeating_p)
8580 /* A single vector contains a whole number of copies of the node, so:
8581 (a) all permutes can use the same mask; and
8582 (b) the permutes only need a single vector input. */
8583 mask.new_vector (nunits, group_size, 3);
8584 nelts_to_build = mask.encoded_nelts ();
8585 /* It's possible to obtain zero nstmts during analyze_only, so make
8586 it at least one to ensure the later computation for n_perms
8587      proceeds.  */
8588 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8589 in_nlanes = dr_group_size * 3;
8591 else
8593 /* We need to construct a separate mask for each vector statement. */
8594 unsigned HOST_WIDE_INT const_nunits, const_vf;
8595 if (!nunits.is_constant (&const_nunits)
8596 || !vf.is_constant (&const_vf))
8597 return false;
8598 mask.new_vector (const_nunits, const_nunits, 1);
8599 nelts_to_build = const_vf * group_size;
8600 nvectors_per_build = 1;
8601 in_nlanes = const_vf * dr_group_size;
8603 auto_sbitmap used_in_lanes (in_nlanes);
8604 bitmap_clear (used_in_lanes);
8605 auto_bitmap used_defs;
8607 unsigned int count = mask.encoded_nelts ();
8608 mask.quick_grow (count);
8609 vec_perm_indices indices;
8611 for (unsigned int j = 0; j < nelts_to_build; j++)
8613 unsigned int iter_num = j / group_size;
8614 unsigned int stmt_num = j % group_size;
8615 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8616 bitmap_set_bit (used_in_lanes, i);
8617 if (repeating_p)
8619 first_vec_index = 0;
8620 mask_element = i;
8622 else
8624 /* Enforced before the loop when !repeating_p. */
8625 unsigned int const_nunits = nunits.to_constant ();
8626 vec_index = i / const_nunits;
8627 mask_element = i % const_nunits;
8628 if (vec_index == first_vec_index
8629 || first_vec_index == -1)
8631 first_vec_index = vec_index;
8633 else if (vec_index == second_vec_index
8634 || second_vec_index == -1)
8636 second_vec_index = vec_index;
8637 mask_element += const_nunits;
8639 else
8641 if (dump_p)
8642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8643 "permutation requires at "
8644 "least three vectors %G",
8645 stmt_info->stmt);
8646 gcc_assert (analyze_only);
8647 return false;
8650 gcc_assert (mask_element < 2 * const_nunits);
8653 if (mask_element != index)
8654 noop_p = false;
8655 mask[index++] = mask_element;
8657 if (index == count)
8659 if (!noop_p)
8661 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8662 if (!can_vec_perm_const_p (mode, mode, indices))
8664 if (dump_p)
8666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8667 "unsupported vect permute { ");
8668 for (i = 0; i < count; ++i)
8670 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8671 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8673 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8675 gcc_assert (analyze_only);
8676 return false;
8679 tree mask_vec = NULL_TREE;
8680 if (!analyze_only)
8681 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8683 if (second_vec_index == -1)
8684 second_vec_index = first_vec_index;
8686 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8688 ++*n_perms;
8689 if (analyze_only)
8690 continue;
8691 /* Generate the permute statement if necessary. */
8692 tree first_vec = dr_chain[first_vec_index + ri];
8693 tree second_vec = dr_chain[second_vec_index + ri];
8694 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8695 tree perm_dest
8696 = vect_create_destination_var (gimple_assign_lhs (stmt),
8697 vectype);
8698 perm_dest = make_ssa_name (perm_dest);
8699 gimple *perm_stmt
8700 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8701 second_vec, mask_vec);
8702 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8703 gsi);
8704 if (dce_chain)
8706 bitmap_set_bit (used_defs, first_vec_index + ri);
8707 bitmap_set_bit (used_defs, second_vec_index + ri);
8710 /* Store the vector statement in NODE. */
8711 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8714 else if (!analyze_only)
8716 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8718 tree first_vec = dr_chain[first_vec_index + ri];
8719 /* If mask was NULL_TREE generate the requested
8720 identity transform. */
8721 if (dce_chain)
8722 bitmap_set_bit (used_defs, first_vec_index + ri);
8724 /* Store the vector statement in NODE. */
8725 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8729 index = 0;
8730 first_vec_index = -1;
8731 second_vec_index = -1;
8732 noop_p = true;
8736 if (n_loads)
8738 if (repeating_p)
8739 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8740 else
8742 /* Enforced above when !repeating_p. */
8743 unsigned int const_nunits = nunits.to_constant ();
8744 *n_loads = 0;
8745 bool load_seen = false;
8746 for (unsigned i = 0; i < in_nlanes; ++i)
8748 if (i % const_nunits == 0)
8750 if (load_seen)
8751 *n_loads += 1;
8752 load_seen = false;
8754 if (bitmap_bit_p (used_in_lanes, i))
8755 load_seen = true;
8757 if (load_seen)
8758 *n_loads += 1;
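/* Remove defs in DR_CHAIN that no generated permute uses, walking up
   through single-use VIEW_CONVERT_EXPR / CONSTRUCTOR feeding statements
   so those are elided as well.  */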
8762 if (dce_chain)
8763 for (unsigned i = 0; i < dr_chain.length (); ++i)
8764 if (!bitmap_bit_p (used_defs, i))
8766 tree def = dr_chain[i];
8769 gimple *stmt = SSA_NAME_DEF_STMT (def);
8770 if (is_gimple_assign (stmt)
8771 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8772 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8773 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8774 else
8775 def = NULL;
8776 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8777 gsi_remove (&rgsi, true);
8778 release_defs (stmt);
8780 while (def);
8783 return true;
8786 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8787 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8788 permute statements for the SLP node NODE. Store the number of vector
8789 permute instructions in *N_PERMS and the number of vector load
8790 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8791 that were not needed. */
8793 bool
8794 vect_transform_slp_perm_load (vec_info *vinfo,
8795 slp_tree node, const vec<tree> &dr_chain,
8796 gimple_stmt_iterator *gsi, poly_uint64 vf,
8797 bool analyze_only, unsigned *n_perms,
8798 unsigned int *n_loads, bool dce_chain)
8800 return vect_transform_slp_perm_load_1 (vinfo, node,
8801 SLP_TREE_LOAD_PERMUTATION (node),
8802 dr_chain, gsi, vf, analyze_only,
8803 dump_enabled_p (), n_perms, n_loads,
8804 dce_chain);
8807 /* Produce the next vector result for SLP permutation NODE by adding a vector
8808 statement at GSI. If MASK_VEC is nonnull, add:
8810 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8812 otherwise add:
8814 <new SSA name> = FIRST_DEF. */
8816 static void
8817 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8818 slp_tree node, tree first_def, tree second_def,
8819 tree mask_vec, poly_uint64 identity_offset)
8821 tree vectype = SLP_TREE_VECTYPE (node);
8823 /* ??? We SLP match existing vector element extracts but
8824 allow punning, which we need to re-instantiate at uses
8825 since we have no good way of representing it explicitly. */
8826 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8827 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8829 gassign *conv_stmt
8830 = gimple_build_assign (make_ssa_name (vectype),
8831 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8832 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8833 first_def = gimple_assign_lhs (conv_stmt);
8835 gassign *perm_stmt;
8836 tree perm_dest = make_ssa_name (vectype);
8837 if (mask_vec)
8839 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8840 TYPE_SIZE (vectype))
8841 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8843 gassign *conv_stmt
8844 = gimple_build_assign (make_ssa_name (vectype),
8845 build1 (VIEW_CONVERT_EXPR,
8846 vectype, second_def));
8847 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8848 second_def = gimple_assign_lhs (conv_stmt);
8850 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8851 first_def, second_def,
8852 mask_vec);
8854 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8856 /* For identity permutes we still need to handle the case
8857 of offsetted extracts or concats. */
8858 unsigned HOST_WIDE_INT c;
8859 auto first_def_nunits
8860 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8861 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8863 unsigned HOST_WIDE_INT elsz
8864 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8865 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8866 TYPE_SIZE (vectype),
8867 bitsize_int (identity_offset * elsz));
8868 perm_stmt = gimple_build_assign (perm_dest, lowpart);
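/* Otherwise, when the result has exactly twice as many elements as the
   input, concatenate the two input vectors with a CONSTRUCTOR.  */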
8870 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8871 first_def_nunits, &c) && c == 2)
8873 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8874 NULL_TREE, second_def);
8875 perm_stmt = gimple_build_assign (perm_dest, ctor);
8877 else
8878 gcc_unreachable ();
8880 else
8882 /* We need a copy here in case the def was external. */
8883 perm_stmt = gimple_build_assign (perm_dest, first_def);
8885 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8886 /* Store the vector statement in NODE. */
8887 node->push_vec_def (perm_stmt);
8890 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8891 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8892 If GSI is nonnull, emit the permutation there.
8894 When GSI is null, the only purpose of NODE is to give properties
8895 of the result, such as the vector type and number of SLP lanes.
8896 The node does not need to be a VEC_PERM_EXPR.
8898 If the target supports the operation, return the number of individual
8899 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8900 dump file if DUMP_P is true. */
8902 static int
8903 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8904 slp_tree node, lane_permutation_t &perm,
8905 vec<slp_tree> &children, bool dump_p)
8907 tree vectype = SLP_TREE_VECTYPE (node);
8909 /* ??? We currently only support all inputs having the same vector type
8910 while the SLP IL should really do a concat + select and thus accept
8911 arbitrary mismatches. */
8912 slp_tree child;
8913 unsigned i;
8914 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8915 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8916 tree op_vectype = NULL_TREE;
8917 FOR_EACH_VEC_ELT (children, i, child)
8918 if (SLP_TREE_VECTYPE (child))
8920 op_vectype = SLP_TREE_VECTYPE (child);
8921 break;
8923 if (!op_vectype)
8924 op_vectype = vectype;
8925 FOR_EACH_VEC_ELT (children, i, child)
8927 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8928 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8929 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8930 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8932 if (dump_p)
8933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8934 "Unsupported vector types in lane permutation\n");
8935 return -1;
8937 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8938 repeating_p = false;
8941 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8942 if (dump_p)
8944 dump_printf_loc (MSG_NOTE, vect_location,
8945 "vectorizing permutation");
8946 for (unsigned i = 0; i < perm.length (); ++i)
8947 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8948 if (repeating_p)
8949 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8950 dump_printf (MSG_NOTE, "\n");
8953 /* REPEATING_P is true if every output vector is guaranteed to use the
8954 same permute vector. We can handle that case for both variable-length
8955 and constant-length vectors, but we only handle other cases for
8956 constant-length vectors.
8958 Set:
8960 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8961 mask vector that we want to build.
8963 - NCOPIES to the number of copies of PERM that we need in order
8964 to build the necessary permute mask vectors.
8966 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8967 for each permute mask vector. This is only relevant when GSI is
8968 nonnull. */
8969 uint64_t npatterns;
8970 unsigned nelts_per_pattern;
8971 uint64_t ncopies;
8972 unsigned noutputs_per_mask;
8973 if (repeating_p)
8975 /* We need a single permute mask vector that has the form:
8977 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8979 In other words, the original n-element permute in PERM is
8980 "unrolled" to fill a full vector. The stepped vector encoding
8981 that we use for permutes requires 3n elements. */
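/* For example, a single-input two-lane swap op0[1] op0[0] is encoded
   as the mask { 1, 0, 3, 2, 5, 4 }.  */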
8982 npatterns = SLP_TREE_LANES (node);
8983 nelts_per_pattern = ncopies = 3;
8984 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8986 else
8988 /* Calculate every element of every permute mask vector explicitly,
8989 instead of relying on the pattern described above. */
8990 if (!nunits.is_constant (&npatterns)
8991 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
8992 return -1;
8993 nelts_per_pattern = ncopies = 1;
8994 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8995 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8996 return -1;
8997 noutputs_per_mask = 1;
8999 unsigned olanes = ncopies * SLP_TREE_LANES (node);
9000 gcc_assert (repeating_p || multiple_p (olanes, nunits));
9002 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
9003 from the { SLP operand, scalar lane } permutation as recorded in the
9004 SLP node as an intermediate step. This part should already work
9005 with SLP children with an arbitrary number of lanes. */
9006 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9007 auto_vec<unsigned> active_lane;
9008 vperm.create (olanes);
9009 active_lane.safe_grow_cleared (children.length (), true);
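/* ACTIVE_LANE[j] tracks the first scalar lane of child J consumed by the
   current copy of PERM.  */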
9010 for (unsigned i = 0; i < ncopies; ++i)
9012 for (unsigned pi = 0; pi < perm.length (); ++pi)
9014 std::pair<unsigned, unsigned> p = perm[pi];
9015 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9016 if (repeating_p)
9017 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9018 else
9020 /* We checked above that the vectors are constant-length. */
9021 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9022 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9023 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9024 vperm.quick_push ({{p.first, vi}, vl});
9027 /* Advance to the next group. */
9028 for (unsigned j = 0; j < children.length (); ++j)
9029 active_lane[j] += SLP_TREE_LANES (children[j]);
9032 if (dump_p)
9034 dump_printf_loc (MSG_NOTE, vect_location,
9035 "vectorizing permutation");
9036 for (unsigned i = 0; i < perm.length (); ++i)
9037 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9038 if (repeating_p)
9039 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9040 dump_printf (MSG_NOTE, "\n");
9041 dump_printf_loc (MSG_NOTE, vect_location, "as");
9042 for (unsigned i = 0; i < vperm.length (); ++i)
9044 if (i != 0
9045 && (repeating_p
9046 ? multiple_p (i, npatterns)
9047 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9048 dump_printf (MSG_NOTE, ",");
9049 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9050 vperm[i].first.first, vperm[i].first.second,
9051 vperm[i].second);
9053 dump_printf (MSG_NOTE, "\n");
9056 /* We can only handle two-vector permutes; everything else should
9057 be lowered on the SLP level. The following is closely inspired
9058 by vect_transform_slp_perm_load and is supposed to eventually
9059 replace it.
9060 ??? As an intermediate step, do code-gen in the SLP tree representation
9061 somehow? */
9062 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9063 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9064 unsigned int index = 0;
9065 poly_uint64 mask_element;
9066 vec_perm_builder mask;
9067 mask.new_vector (nunits, npatterns, nelts_per_pattern);
9068 unsigned int count = mask.encoded_nelts ();
9069 mask.quick_grow (count);
9070 vec_perm_indices indices;
9071 unsigned nperms = 0;
9072 for (unsigned i = 0; i < vperm.length (); ++i)
9074 mask_element = vperm[i].second;
9075 if (first_vec.first == -1U
9076 || first_vec == vperm[i].first)
9077 first_vec = vperm[i].first;
9078 else if (second_vec.first == -1U
9079 || second_vec == vperm[i].first)
9081 second_vec = vperm[i].first;
9082 mask_element += nunits;
9084 else
9086 if (dump_p)
9087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9088 "permutation requires at "
9089 "least three vectors\n");
9090 gcc_assert (!gsi);
9091 return -1;
9094 mask[index++] = mask_element;
9096 if (index == count)
9098 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9099 TYPE_VECTOR_SUBPARTS (op_vectype));
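/* The permute is a (possibly offsetted) identity when it selects
   consecutive lanes starting at a multiple of the vector length.  */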
9100 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9101 && constant_multiple_p (mask[0], nunits));
9102 machine_mode vmode = TYPE_MODE (vectype);
9103 machine_mode op_vmode = TYPE_MODE (op_vectype);
9104 unsigned HOST_WIDE_INT c;
9105 if ((!identity_p
9106 && !can_vec_perm_const_p (vmode, op_vmode, indices))
9107 || (identity_p
9108 && !known_le (nunits,
9109 TYPE_VECTOR_SUBPARTS (op_vectype))
9110 && (!constant_multiple_p (nunits,
9111 TYPE_VECTOR_SUBPARTS (op_vectype),
9112 &c) || c != 2)))
9114 if (dump_p)
9116 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9117 vect_location,
9118 "unsupported vect permute { ");
9119 for (i = 0; i < count; ++i)
9121 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9122 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9124 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9126 gcc_assert (!gsi);
9127 return -1;
9130 if (!identity_p)
9131 nperms++;
9132 if (gsi)
9134 if (second_vec.first == -1U)
9135 second_vec = first_vec;
9137 slp_tree
9138 first_node = children[first_vec.first],
9139 second_node = children[second_vec.first];
9141 tree mask_vec = NULL_TREE;
9142 if (!identity_p)
9143 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9145 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9147 tree first_def
9148 = vect_get_slp_vect_def (first_node,
9149 first_vec.second + vi);
9150 tree second_def
9151 = vect_get_slp_vect_def (second_node,
9152 second_vec.second + vi);
9153 vect_add_slp_permutation (vinfo, gsi, node, first_def,
9154 second_def, mask_vec, mask[0]);
9158 index = 0;
9159 first_vec = std::make_pair (-1U, -1U);
9160 second_vec = std::make_pair (-1U, -1U);
9164 return nperms;
9167 /* Vectorize the SLP permutations in NODE as specified
9168 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9169 child number and lane number.
9170 Interleaving of two two-lane two-child SLP subtrees (not supported):
9171 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9172 A blend of two four-lane two-child SLP subtrees:
9173 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9174 Highpart of a four-lane one-child SLP subtree (not supported):
9175 [ { 0, 2 }, { 0, 3 } ]
9176 Currently only a subset of these is supported by the code generation below. */
9178 static bool
9179 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9180 slp_tree node, stmt_vector_for_cost *cost_vec)
9182 tree vectype = SLP_TREE_VECTYPE (node);
9183 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9184 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9185 SLP_TREE_CHILDREN (node),
9186 dump_enabled_p ());
9187 if (nperms < 0)
9188 return false;
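/* When only analyzing (no GSI), account one vec_perm cost for each
   VEC_PERM_EXPR that would be generated.  */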
9190 if (!gsi)
9191 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9193 return true;
9196 /* Vectorize SLP NODE. */
9198 static void
9199 vect_schedule_slp_node (vec_info *vinfo,
9200 slp_tree node, slp_instance instance)
9202 gimple_stmt_iterator si;
9203 int i;
9204 slp_tree child;
9206 /* Vectorize externals and constants. */
9207 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9208 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9210 /* ??? vectorizable_shift can end up using a scalar operand which is
9211 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9212 node in this case. */
9213 if (!SLP_TREE_VECTYPE (node))
9214 return;
9216 /* There are two reasons vector defs might already exist. The first
9217 is that we are vectorizing an existing vector def. The second is
9218 that when performing BB vectorization, shared constant/external nodes
9219 are not split apart during partitioning, so during the code-gen
9220 DFS walk we can end up visiting them twice. */
9221 if (! SLP_TREE_VEC_DEFS (node).exists ())
9222 vect_create_constant_vectors (vinfo, node);
9223 return;
9226 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9228 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9230 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9231 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9233 if (dump_enabled_p ())
9234 dump_printf_loc (MSG_NOTE, vect_location,
9235 "------>vectorizing SLP node starting from: %G",
9236 stmt_info->stmt);
9238 if (STMT_VINFO_DATA_REF (stmt_info)
9239 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9241 /* Vectorized loads go before the first scalar load to make it
9242 ready early; vectorized stores go before the last scalar
9243 stmt, which is where all uses are ready. */
9244 stmt_vec_info last_stmt_info = NULL;
9245 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9246 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9247 else /* DR_IS_WRITE */
9248 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9249 si = gsi_for_stmt (last_stmt_info->stmt);
9251 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9252 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9253 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9254 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9256 /* For PHI node vectorization we do not use the insertion iterator. */
9257 si = gsi_none ();
9259 else
9261 /* Emit other stmts after the children's vectorized defs, which is the
9262 earliest possible place. */
9263 gimple *last_stmt = NULL;
9264 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9265 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9266 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9268 /* But avoid scheduling internal defs outside of the loop when
9269 we might have only implicitly tracked loop mask/len defs. */
9270 gimple_stmt_iterator si
9271 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9272 last_stmt = *si;
9274 bool seen_vector_def = false;
9275 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9276 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9278 /* For fold-left reductions we are retaining the scalar
9279 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9280 set so the representation isn't perfect. Resort to the
9281 last scalar def here. */
9282 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9284 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9285 == cycle_phi_info_type);
9286 gphi *phi = as_a <gphi *>
9287 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9288 if (!last_stmt
9289 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9290 last_stmt = phi;
9292 /* We are emitting all vectorized stmts in the same place, so
9293 the last vector def is generated by the last emitted stmt.
9294 ??? Unless we have a load permutation applied and that
9295 figures to re-use an earlier generated load. */
9296 unsigned j;
9297 tree vdef;
9298 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9300 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9301 if (!last_stmt
9302 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9303 last_stmt = vstmt;
9306 else if (!SLP_TREE_VECTYPE (child))
9308 /* For externals that stay unvectorized we look at all their scalar defs. */
9309 unsigned j;
9310 tree def;
9311 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9312 if (TREE_CODE (def) == SSA_NAME
9313 && !SSA_NAME_IS_DEFAULT_DEF (def))
9315 gimple *stmt = SSA_NAME_DEF_STMT (def);
9316 if (!last_stmt
9317 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9318 last_stmt = stmt;
9321 else
9323 /* For externals we have to look at all defs since their
9324 insertion place is decided per vector. But beware
9325 of pre-existing vectors where we need to make sure
9326 we do not insert before the region boundary. */
9327 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9328 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9329 seen_vector_def = true;
9330 else
9332 unsigned j;
9333 tree vdef;
9334 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9335 if (TREE_CODE (vdef) == SSA_NAME
9336 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9338 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9339 if (!last_stmt
9340 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9341 last_stmt = vstmt;
9345 /* This can happen when all children are pre-existing vectors or
9346 constants. */
9347 if (!last_stmt)
9348 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9349 if (!last_stmt)
9351 gcc_assert (seen_vector_def);
9352 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9354 else if (is_ctrl_altering_stmt (last_stmt))
9356 /* We split regions to vectorize at control-altering stmts
9357 with a definition, so this must be an external which
9358 we can insert at the start of the region. */
9359 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9361 else if (is_a <bb_vec_info> (vinfo)
9362 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9363 && gimple_could_trap_p (stmt_info->stmt))
9365 /* We've constrained possibly trapping operations to all come
9366 from the same basic-block; if vectorized defs would allow earlier
9367 scheduling, still force the vectorized stmts to the original block.
9368 This is only necessary for BB vectorization since for loop vect
9369 all operations are in a single BB and scalar stmt based
9370 placement doesn't play well with epilogue vectorization. */
9371 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9372 gimple_bb (stmt_info->stmt),
9373 gimple_bb (last_stmt)));
9374 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9376 else if (is_a <gphi *> (last_stmt))
9377 si = gsi_after_labels (gimple_bb (last_stmt));
9378 else
9380 si = gsi_for_stmt (last_stmt);
9381 gsi_next (&si);
9385 /* Handle purely internal nodes. */
9386 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9388 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9389 be shared with different SLP nodes (but usually it's the same
9390 operation apart from the case where the stmt is only there to denote
9391 the actual scalar lane defs ...). So do not call vect_transform_stmt
9392 but open-code it here (partly). */
9393 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9394 gcc_assert (done);
9395 stmt_vec_info slp_stmt_info;
9396 unsigned int i;
9397 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9398 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9400 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9401 instance, i, true, NULL);
9402 gcc_assert (done);
9405 else
9406 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9409 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9410 For loop vectorization this is done in vectorizable_call, but for SLP
9411 it needs to be deferred until the end of vect_schedule_slp, because multiple
9412 SLP instances may refer to the same scalar stmt. */
9414 static void
9415 vect_remove_slp_scalar_calls (vec_info *vinfo,
9416 slp_tree node, hash_set<slp_tree> &visited)
9418 gimple *new_stmt;
9419 gimple_stmt_iterator gsi;
9420 int i;
9421 slp_tree child;
9422 tree lhs;
9423 stmt_vec_info stmt_info;
9425 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9426 return;
9428 if (visited.add (node))
9429 return;
9431 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9432 vect_remove_slp_scalar_calls (vinfo, child, visited);
9434 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9436 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9437 if (!stmt || gimple_bb (stmt) == NULL)
9438 continue;
9439 if (is_pattern_stmt_p (stmt_info)
9440 || !PURE_SLP_STMT (stmt_info))
9441 continue;
9442 lhs = gimple_call_lhs (stmt);
9443 if (lhs)
9444 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9445 else
9447 new_stmt = gimple_build_nop ();
9448 unlink_stmt_vdef (stmt_info->stmt);
9450 gsi = gsi_for_stmt (stmt);
9451 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9452 if (lhs)
9453 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9457 static void
9458 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9460 hash_set<slp_tree> visited;
9461 vect_remove_slp_scalar_calls (vinfo, node, visited);
9464 /* Vectorize the instance root. */
9466 void
9467 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9469 gassign *rstmt = NULL;
9471 if (instance->kind == slp_inst_kind_ctor)
9473 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9475 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9476 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9477 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9478 TREE_TYPE (vect_lhs)))
9479 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9480 vect_lhs);
9481 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9483 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9485 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9486 tree child_def;
9487 int j;
9488 vec<constructor_elt, va_gc> *v;
9489 vec_alloc (v, nelts);
9491 /* A CTOR can handle V16HI composition from VNx8HI so we
9492 do not need to convert vector elements if the types
9493 do not match. */
9494 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9495 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9496 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9497 tree rtype
9498 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9499 tree r_constructor = build_constructor (rtype, v);
9500 rstmt = gimple_build_assign (lhs, r_constructor);
9503 else if (instance->kind == slp_inst_kind_bb_reduc)
9505 /* Largely inspired by reduction chain epilogue handling in
9506 vect_create_epilog_for_reduction. */
9507 vec<tree> vec_defs = vNULL;
9508 vect_get_slp_defs (node, &vec_defs);
9509 enum tree_code reduc_code
9510 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9511 /* ??? We actually have to reflect signs somewhere. */
9512 if (reduc_code == MINUS_EXPR)
9513 reduc_code = PLUS_EXPR;
9514 gimple_seq epilogue = NULL;
9515 /* We may end up with more than one vector result; reduce them
9516 to a single vector. */
9517 tree vec_def = vec_defs[0];
9518 tree vectype = TREE_TYPE (vec_def);
9519 tree compute_vectype = vectype;
9520 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9521 && TYPE_OVERFLOW_UNDEFINED (vectype)
9522 && operation_can_overflow (reduc_code));
9523 if (pun_for_overflow_p)
9525 compute_vectype = unsigned_type_for (vectype);
9526 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9527 compute_vectype, vec_def);
9529 for (unsigned i = 1; i < vec_defs.length (); ++i)
9531 tree def = vec_defs[i];
9532 if (pun_for_overflow_p)
9533 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9534 compute_vectype, def);
9535 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9536 vec_def, def);
9538 vec_defs.release ();
9539 /* ??? Support other schemes than direct internal fn. */
9540 internal_fn reduc_fn;
9541 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9542 || reduc_fn == IFN_LAST)
9543 gcc_unreachable ();
9544 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9545 TREE_TYPE (compute_vectype), vec_def);
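/* Fold any scalar defs that remained outside the vectorized part into
   the reduction result.  */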
9546 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9548 tree rem_def = NULL_TREE;
9549 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9551 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9552 if (!rem_def)
9553 rem_def = def;
9554 else
9555 rem_def = gimple_build (&epilogue, reduc_code,
9556 TREE_TYPE (scalar_def),
9557 rem_def, def);
9559 scalar_def = gimple_build (&epilogue, reduc_code,
9560 TREE_TYPE (scalar_def),
9561 scalar_def, rem_def);
9563 scalar_def = gimple_convert (&epilogue,
9564 TREE_TYPE (vectype), scalar_def);
9565 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9566 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9567 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9568 update_stmt (gsi_stmt (rgsi));
9569 return;
9571 else
9572 gcc_unreachable ();
9574 gcc_assert (rstmt);
9576 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9577 gsi_replace (&rgsi, rstmt, true);
9580 struct slp_scc_info
9582 bool on_stack;
9583 int dfs;
9584 int lowlink;
9587 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
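/* This follows Tarjan's SCC algorithm: DFS and lowlink numbers identify
   SCC roots and the members of an SCC stay on STACK until the root is
   reached.  */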
9589 static void
9590 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9591 hash_map<slp_tree, slp_scc_info> &scc_info,
9592 int &maxdfs, vec<slp_tree> &stack)
9594 bool existed_p;
9595 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9596 gcc_assert (!existed_p);
9597 info->dfs = maxdfs;
9598 info->lowlink = maxdfs;
9599 maxdfs++;
9601 /* Leaf. */
9602 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9604 info->on_stack = false;
9605 vect_schedule_slp_node (vinfo, node, instance);
9606 return;
9609 info->on_stack = true;
9610 stack.safe_push (node);
9612 unsigned i;
9613 slp_tree child;
9614 /* DFS recurse. */
9615 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9617 if (!child)
9618 continue;
9619 slp_scc_info *child_info = scc_info.get (child);
9620 if (!child_info)
9622 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9623 /* Recursion might have re-allocated the SCC info entries. */
9624 info = scc_info.get (node);
9625 child_info = scc_info.get (child);
9626 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9628 else if (child_info->on_stack)
9629 info->lowlink = MIN (info->lowlink, child_info->dfs);
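/* If this node is not the root of an SCC, defer scheduling to the
   SCC root.  */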
9631 if (info->lowlink != info->dfs)
9632 return;
9634 auto_vec<slp_tree, 4> phis_to_fixup;
9636 /* Singleton. */
9637 if (stack.last () == node)
9639 stack.pop ();
9640 info->on_stack = false;
9641 vect_schedule_slp_node (vinfo, node, instance);
9642 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9643 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9644 phis_to_fixup.quick_push (node);
9646 else
9648 /* SCC. */
9649 int last_idx = stack.length () - 1;
9650 while (stack[last_idx] != node)
9651 last_idx--;
9652 /* We can break the cycle at PHIs that have at least one child
9653 code generated. Then we could re-start the DFS walk until
9654 all nodes in the SCC are covered (we might have new entries
9655 for only back-reachable nodes). But it's simpler to just
9656 iterate and schedule those that are ready. */
9657 unsigned todo = stack.length () - last_idx;
9660 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9662 slp_tree entry = stack[idx];
9663 if (!entry)
9664 continue;
9665 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9666 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9667 bool ready = !phi;
9668 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9669 if (!child)
9671 gcc_assert (phi);
9672 ready = true;
9673 break;
9675 else if (scc_info.get (child)->on_stack)
9677 if (!phi)
9679 ready = false;
9680 break;
9683 else
9685 if (phi)
9687 ready = true;
9688 break;
9691 if (ready)
9693 vect_schedule_slp_node (vinfo, entry, instance);
9694 scc_info.get (entry)->on_stack = false;
9695 stack[idx] = NULL;
9696 todo--;
9697 if (phi)
9698 phis_to_fixup.safe_push (entry);
9702 while (todo != 0);
9704 /* Pop the SCC. */
9705 stack.truncate (last_idx);
9708 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9709 slp_tree phi_node;
9710 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9712 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9713 edge_iterator ei;
9714 edge e;
9715 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9717 unsigned dest_idx = e->dest_idx;
9718 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9719 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9720 continue;
9721 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9722 /* Simply fill all args. */
9723 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9724 != vect_first_order_recurrence)
9725 for (unsigned i = 0; i < n; ++i)
9727 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9728 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9729 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9730 e, gimple_phi_arg_location (phi, dest_idx));
9732 else
9734 /* Unless it is a first order recurrence which needs
9735 args filled in for both the PHI node and the permutes. */
9736 gimple *perm
9737 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9738 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9739 add_phi_arg (as_a <gphi *> (rphi),
9740 vect_get_slp_vect_def (child, n - 1),
9741 e, gimple_phi_arg_location (phi, dest_idx));
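/* Each permute blends the previous vector of the recurrence (rhs1)
   with the current one (rhs2).  */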
9742 for (unsigned i = 0; i < n; ++i)
9744 gimple *perm
9745 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9746 if (i > 0)
9747 gimple_assign_set_rhs1 (perm,
9748 vect_get_slp_vect_def (child, i - 1));
9749 gimple_assign_set_rhs2 (perm,
9750 vect_get_slp_vect_def (child, i));
9751 update_stmt (perm);
9758 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9760 void
9761 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9763 slp_instance instance;
9764 unsigned int i;
9766 hash_map<slp_tree, slp_scc_info> scc_info;
9767 int maxdfs = 0;
9768 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9770 slp_tree node = SLP_INSTANCE_TREE (instance);
9771 if (dump_enabled_p ())
9773 dump_printf_loc (MSG_NOTE, vect_location,
9774 "Vectorizing SLP tree:\n");
9775 /* ??? Dump all? */
9776 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9777 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9778 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9779 vect_print_slp_graph (MSG_NOTE, vect_location,
9780 SLP_INSTANCE_TREE (instance));
9782 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9783 have a PHI be the node breaking the cycle. */
9784 auto_vec<slp_tree> stack;
9785 if (!scc_info.get (node))
9786 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9788 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9789 vectorize_slp_instance_root_stmt (node, instance);
9791 if (dump_enabled_p ())
9792 dump_printf_loc (MSG_NOTE, vect_location,
9793 "vectorizing stmts using SLP.\n");
9796 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9798 slp_tree root = SLP_INSTANCE_TREE (instance);
9799 stmt_vec_info store_info;
9800 unsigned int j;
9802 /* Remove scalar call stmts. Do not do this for basic-block
9803 vectorization as not all uses may be vectorized.
9804 ??? Why should this be necessary? DCE should be able to
9805 remove the stmts itself.
9806 ??? For BB vectorization we can as well remove scalar
9807 stmts starting from the SLP tree root if they have no
9808 uses. */
9809 if (is_a <loop_vec_info> (vinfo))
9810 vect_remove_slp_scalar_calls (vinfo, root);
9812 /* Remove vectorized stores' original scalar stmts. */
9813 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9815 if (!STMT_VINFO_DATA_REF (store_info)
9816 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9817 break;
9819 store_info = vect_orig_stmt (store_info);
9820 /* Free the attached stmt_vec_info and remove the stmt. */
9821 vinfo->remove_stmt (store_info);
9823 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9824 so that we do not crash in vect_free_slp_tree later. */
9825 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9826 SLP_TREE_REPRESENTATIVE (root) = NULL;