gcc/tree-vect-slp.cc

   1 /* SLP - Basic Block Vectorization
   2    Copyright (C) 2007-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #define INCLUDE_ALGORITHM
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "insn-config.h"
  35 #include "recog.h"              /* FIXME: for insn_data */
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "gimple-iterator.h"
  39 #include "cfgloop.h"
  40 #include "tree-vectorizer.h"
  41 #include "langhooks.h"
  42 #include "gimple-walk.h"
  43 #include "dbgcnt.h"
  44 #include "tree-vector-builder.h"
  45 #include "vec-perm-indices.h"
  46 #include "gimple-fold.h"
  47 #include "internal-fn.h"
  48 #include "dump-context.h"
  49 #include "cfganal.h"
  50 #include "tree-eh.h"
  51 #include "tree-cfg.h"
  52 #include "alloc-pool.h"
  53 #include "sreal.h"
  54 #include "predict.h"
  55
  56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
  57                                             load_permutation_t &,
  58                                             const vec<tree> &,
  59                                             gimple_stmt_iterator *,
  60                                             poly_uint64, bool, bool,
  61                                             unsigned *,
  62                                             unsigned * = nullptr,
  63                                             bool = false);
  64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
  65                                            slp_tree, lane_permutation_t &,
  66                                            vec<slp_tree> &, bool);
  67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
  68                                           slp_tree, stmt_vector_for_cost *);
  69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
  70
  71 static object_allocator<_slp_tree> *slp_tree_pool;
  72 static slp_tree slp_first_node;
  73
  74 void
  75 vect_slp_init (void)
  76 {
  77   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
  78 }
  79
  80 void
  81 vect_slp_fini (void)
  82 {
  83   while (slp_first_node)
  84     delete slp_first_node;
  85   delete slp_tree_pool;
  86   slp_tree_pool = NULL;
  87 }
  88
  89 void *
  90 _slp_tree::operator new (size_t n)
  91 {
  92   gcc_assert (n == sizeof (_slp_tree));
  93   return slp_tree_pool->allocate_raw ();
  94 }
  95
  96 void
  97 _slp_tree::operator delete (void *node, size_t n)
  98 {
  99   gcc_assert (n == sizeof (_slp_tree));
 100   slp_tree_pool->remove_raw (node);
 101 }
 102
 103
 104 /* Initialize a SLP node.  */
 105
 106 _slp_tree::_slp_tree ()
 107 {
 108   this->prev_node = NULL;
 109   if (slp_first_node)
 110     slp_first_node->prev_node = this;
 111   this->next_node = slp_first_node;
 112   slp_first_node = this;
 113   SLP_TREE_SCALAR_STMTS (this) = vNULL;
 114   SLP_TREE_SCALAR_OPS (this) = vNULL;
 115   SLP_TREE_VEC_DEFS (this) = vNULL;
 116   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
 117   SLP_TREE_CHILDREN (this) = vNULL;
 118   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
 119   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
 120   SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
 121   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
 122   SLP_TREE_CODE (this) = ERROR_MARK;
 123   SLP_TREE_VECTYPE (this) = NULL_TREE;
 124   SLP_TREE_REPRESENTATIVE (this) = NULL;
 125   SLP_TREE_REF_COUNT (this) = 1;
 126   this->failed = NULL;
 127   this->max_nunits = 1;
 128   this->lanes = 0;
 129 }
 130
 131 /* Tear down a SLP node.  */
 132
 133 _slp_tree::~_slp_tree ()
 134 {
 135   if (this->prev_node)
 136     this->prev_node->next_node = this->next_node;
 137   else
 138     slp_first_node = this->next_node;
 139   if (this->next_node)
 140     this->next_node->prev_node = this->prev_node;
 141   SLP_TREE_CHILDREN (this).release ();
 142   SLP_TREE_SCALAR_STMTS (this).release ();
 143   SLP_TREE_SCALAR_OPS (this).release ();
 144   SLP_TREE_VEC_DEFS (this).release ();
 145   SLP_TREE_LOAD_PERMUTATION (this).release ();
 146   SLP_TREE_LANE_PERMUTATION (this).release ();
 147   SLP_TREE_SIMD_CLONE_INFO (this).release ();
 148   if (this->failed)
 149     free (failed);
 150 }
 151
 152 /* Push the single SSA definition in DEF to the vector of vector defs.  */
 153
 154 void
 155 _slp_tree::push_vec_def (gimple *def)
 156 {
 157   if (gphi *phi = dyn_cast <gphi *> (def))
 158     vec_defs.quick_push (gimple_phi_result (phi));
 159   else
 160     {
 161       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
 162       vec_defs.quick_push (get_def_from_ptr (defop));
 163     }
 164 }
 165
 166 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
 167
 168 void
 169 vect_free_slp_tree (slp_tree node)
 170 {
 171   int i;
 172   slp_tree child;
 173
 174   if (--SLP_TREE_REF_COUNT (node) != 0)
 175     return;
 176
 177   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 178     if (child)
 179       vect_free_slp_tree (child);
 180
 181   /* If the node defines any SLP only patterns then those patterns are no
 182      longer valid and should be removed.  */
 183   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
 184   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
 185     {
 186       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
 187       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
 188       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
 189     }
 190
 191   delete node;
 192 }
 193
 194 /* Return a location suitable for dumpings related to the SLP instance.  */
 195
 196 dump_user_location_t
 197 _slp_instance::location () const
 198 {
 199   if (!root_stmts.is_empty ())
 200     return root_stmts[0]->stmt;
 201   else
 202     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
 203 }
 204
 205
 206 /* Free the memory allocated for the SLP instance.  */
 207
 208 void
 209 vect_free_slp_instance (slp_instance instance)
 210 {
 211   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
 212   SLP_INSTANCE_LOADS (instance).release ();
 213   SLP_INSTANCE_ROOT_STMTS (instance).release ();
 214   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
 215   instance->subgraph_entries.release ();
 216   instance->cost_vec.release ();
 217   free (instance);
 218 }
 219
 220
 221 /* Create an SLP node for SCALAR_STMTS.  */
 222
 223 slp_tree
 224 vect_create_new_slp_node (unsigned nops, tree_code code)
 225 {
 226   slp_tree node = new _slp_tree;
 227   SLP_TREE_SCALAR_STMTS (node) = vNULL;
 228   SLP_TREE_CHILDREN (node).create (nops);
 229   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 230   SLP_TREE_CODE (node) = code;
 231   return node;
 232 }
 233 /* Create an SLP node for SCALAR_STMTS.  */
 234
 235 static slp_tree
 236 vect_create_new_slp_node (slp_tree node,
 237                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
 238 {
 239   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
 240   SLP_TREE_CHILDREN (node).create (nops);
 241   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
 242   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 243   SLP_TREE_LANES (node) = scalar_stmts.length ();
 244   return node;
 245 }
 246
 247 /* Create an SLP node for SCALAR_STMTS.  */
 248
 249 static slp_tree
 250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 251 {
 252   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
 253 }
 254
 255 /* Create an SLP node for OPS.  */
 256
 257 static slp_tree
 258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
 259 {
 260   SLP_TREE_SCALAR_OPS (node) = ops;
 261   SLP_TREE_DEF_TYPE (node) = vect_external_def;
 262   SLP_TREE_LANES (node) = ops.length ();
 263   return node;
 264 }
 265
 266 /* Create an SLP node for OPS.  */
 267
 268 static slp_tree
 269 vect_create_new_slp_node (vec<tree> ops)
 270 {
 271   return vect_create_new_slp_node (new _slp_tree, ops);
 272 }
 273
 274
 275 /* This structure is used in creation of an SLP tree.  Each instance
 276    corresponds to the same operand in a group of scalar stmts in an SLP
 277    node.  */
 278 typedef struct _slp_oprnd_info
 279 {
 280   /* Def-stmts for the operands.  */
 281   vec<stmt_vec_info> def_stmts;
 282   /* Operands.  */
 283   vec<tree> ops;
 284   /* Information about the first statement, its vector def-type, type, the
 285      operand itself in case it's constant, and an indication if it's a pattern
 286      stmt and gather/scatter info.  */
 287   tree first_op_type;
 288   enum vect_def_type first_dt;
 289   bool any_pattern;
 290   bool first_gs_p;
 291   gather_scatter_info first_gs_info;
 292 } *slp_oprnd_info;
 293
 294
 295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
 296    operand.  */
 297 static vec<slp_oprnd_info>
 298 vect_create_oprnd_info (int nops, int group_size)
 299 {
 300   int i;
 301   slp_oprnd_info oprnd_info;
 302   vec<slp_oprnd_info> oprnds_info;
 303
 304   oprnds_info.create (nops);
 305   for (i = 0; i < nops; i++)
 306     {
 307       oprnd_info = XNEW (struct _slp_oprnd_info);
 308       oprnd_info->def_stmts.create (group_size);
 309       oprnd_info->ops.create (group_size);
 310       oprnd_info->first_dt = vect_uninitialized_def;
 311       oprnd_info->first_op_type = NULL_TREE;
 312       oprnd_info->any_pattern = false;
 313       oprnd_info->first_gs_p = false;
 314       oprnds_info.quick_push (oprnd_info);
 315     }
 316
 317   return oprnds_info;
 318 }
 319
 320
 321 /* Free operands info.  */
 322
 323 static void
 324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 325 {
 326   int i;
 327   slp_oprnd_info oprnd_info;
 328
 329   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
 330     {
 331       oprnd_info->def_stmts.release ();
 332       oprnd_info->ops.release ();
 333       XDELETE (oprnd_info);
 334     }
 335
 336   oprnds_info.release ();
 337 }
 338
 339 /* Return the execution frequency of NODE (so that a higher value indicates
 340    a "more important" node when optimizing for speed).  */
 341
 342 static sreal
 343 vect_slp_node_weight (slp_tree node)
 344 {
 345   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
 346   basic_block bb = gimple_bb (stmt_info->stmt);
 347   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
 348 }
 349
 350 /* Return true if STMTS contains a pattern statement.  */
 351
 352 static bool
 353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
 354 {
 355   stmt_vec_info stmt_info;
 356   unsigned int i;
 357   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
 358     if (is_pattern_stmt_p (stmt_info))
 359       return true;
 360   return false;
 361 }
 362
 363 /* Return true when all lanes in the external or constant NODE have
 364    the same value.  */
 365
 366 static bool
 367 vect_slp_tree_uniform_p (slp_tree node)
 368 {
 369   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 370               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 371
 372   /* Pre-exsting vectors.  */
 373   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
 374     return false;
 375
 376   unsigned i;
 377   tree op, first = NULL_TREE;
 378   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
 379     if (!first)
 380       first = op;
 381     else if (!operand_equal_p (first, op, 0))
 382       return false;
 383
 384   return true;
 385 }
 386
 387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
 388    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
 389    of the chain.  */
 390
 391 int
 392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 393                                       stmt_vec_info first_stmt_info)
 394 {
 395   stmt_vec_info next_stmt_info = first_stmt_info;
 396   int result = 0;
 397
 398   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
 399     return -1;
 400
 401   do
 402     {
 403       if (next_stmt_info == stmt_info)
 404         return result;
 405       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
 406       if (next_stmt_info)
 407         result += DR_GROUP_GAP (next_stmt_info);
 408     }
 409   while (next_stmt_info);
 410
 411   return -1;
 412 }
 413
 414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
 415    using the method implemented by duplicate_and_interleave.  Return true
 416    if so, returning the number of intermediate vectors in *NVECTORS_OUT
 417    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
 418    (if nonnull).  */
 419
 420 bool
 421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
 422                                 tree elt_type, unsigned int *nvectors_out,
 423                                 tree *vector_type_out,
 424                                 tree *permutes)
 425 {
 426   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
 427   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
 428     return false;
 429
 430   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
 431   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
 432   unsigned int nvectors = 1;
 433   for (;;)
 434     {
 435       scalar_int_mode int_mode;
 436       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
 437       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
 438         {
 439           /* Get the natural vector type for this SLP group size.  */
 440           tree int_type = build_nonstandard_integer_type
 441             (GET_MODE_BITSIZE (int_mode), 1);
 442           tree vector_type
 443             = get_vectype_for_scalar_type (vinfo, int_type, count);
 444           poly_int64 half_nelts;
 445           if (vector_type
 446               && VECTOR_MODE_P (TYPE_MODE (vector_type))
 447               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
 448                            GET_MODE_SIZE (base_vector_mode))
 449               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
 450                              2, &half_nelts))
 451             {
 452               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
 453                  together into elements of type INT_TYPE and using the result
 454                  to build NVECTORS vectors.  */
 455               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
 456               vec_perm_builder sel1 (nelts, 2, 3);
 457               vec_perm_builder sel2 (nelts, 2, 3);
 458
 459               for (unsigned int i = 0; i < 3; ++i)
 460                 {
 461                   sel1.quick_push (i);
 462                   sel1.quick_push (i + nelts);
 463                   sel2.quick_push (half_nelts + i);
 464                   sel2.quick_push (half_nelts + i + nelts);
 465                 }
 466               vec_perm_indices indices1 (sel1, 2, nelts);
 467               vec_perm_indices indices2 (sel2, 2, nelts);
 468               machine_mode vmode = TYPE_MODE (vector_type);
 469               if (can_vec_perm_const_p (vmode, vmode, indices1)
 470                   && can_vec_perm_const_p (vmode, vmode, indices2))
 471                 {
 472                   if (nvectors_out)
 473                     *nvectors_out = nvectors;
 474                   if (vector_type_out)
 475                     *vector_type_out = vector_type;
 476                   if (permutes)
 477                     {
 478                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
 479                                                                 indices1);
 480                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
 481                                                                 indices2);
 482                     }
 483                   return true;
 484                 }
 485             }
 486         }
 487       if (!multiple_p (elt_bytes, 2, &elt_bytes))
 488         return false;
 489       nvectors *= 2;
 490     }
 491 }
 492
 493 /* Return true if DTA and DTB match.  */
 494
 495 static bool
 496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
 497 {
 498   return (dta == dtb
 499           || ((dta == vect_external_def || dta == vect_constant_def)
 500               && (dtb == vect_external_def || dtb == vect_constant_def)));
 501 }
 502
 503 static const int cond_expr_maps[3][5] = {
 504   { 4, -1, -2, 1, 2 },
 505   { 4, -2, -1, 1, 2 },
 506   { 4, -1, -2, 2, 1 }
 507 };
 508 static const int arg0_map[] = { 1, 0 };
 509 static const int arg1_map[] = { 1, 1 };
 510 static const int arg2_map[] = { 1, 2 };
 511 static const int arg1_arg4_map[] = { 2, 1, 4 };
 512 static const int arg3_arg2_map[] = { 2, 3, 2 };
 513 static const int op1_op0_map[] = { 2, 1, 0 };
 514 static const int off_map[] = { 1, -3 };
 515 static const int off_op0_map[] = { 2, -3, 0 };
 516 static const int off_arg2_map[] = { 2, -3, 2 };
 517 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
 518 static const int mask_call_maps[6][7] = {
 519   { 1, 1, },
 520   { 2, 1, 2, },
 521   { 3, 1, 2, 3, },
 522   { 4, 1, 2, 3, 4, },
 523   { 5, 1, 2, 3, 4, 5, },
 524   { 6, 1, 2, 3, 4, 5, 6 },
 525 };
 526
 527 /* For most SLP statements, there is a one-to-one mapping between
 528    gimple arguments and child nodes.  If that is not true for STMT,
 529    return an array that contains:
 530
 531    - the number of child nodes, followed by
 532    - for each child node, the index of the argument associated with that node.
 533      The special index -1 is the first operand of an embedded comparison and
 534      the special index -2 is the second operand of an embedded comparison.
 535      The special indes -3 is the offset of a gather as analyzed by
 536      vect_check_gather_scatter.
 537
 538    SWAP is as for vect_get_and_check_slp_defs.  */
 539
 540 static const int *
 541 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
 542                       unsigned char swap = 0)
 543 {
 544   if (auto assign = dyn_cast<const gassign *> (stmt))
 545     {
 546       if (gimple_assign_rhs_code (assign) == COND_EXPR
 547           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
 548         return cond_expr_maps[swap];
 549       if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
 550           && swap)
 551         return op1_op0_map;
 552       if (gather_scatter_p)
 553         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
 554                 ? off_op0_map : off_map);
 555     }
 556   gcc_assert (!swap);
 557   if (auto call = dyn_cast<const gcall *> (stmt))
 558     {
 559       if (gimple_call_internal_p (call))
 560         switch (gimple_call_internal_fn (call))
 561           {
 562           case IFN_MASK_LOAD:
 563             return gather_scatter_p ? off_arg2_map : arg2_map;
 564
 565           case IFN_GATHER_LOAD:
 566             return arg1_map;
 567
 568           case IFN_MASK_GATHER_LOAD:
 569           case IFN_MASK_LEN_GATHER_LOAD:
 570             return arg1_arg4_map;
 571
 572           case IFN_MASK_STORE:
 573             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
 574
 575           case IFN_MASK_CALL:
 576             {
 577               unsigned nargs = gimple_call_num_args (call);
 578               if (nargs >= 2 && nargs <= 7)
 579                 return mask_call_maps[nargs-2];
 580               else
 581                 return nullptr;
 582             }
 583
 584           case IFN_CLZ:
 585           case IFN_CTZ:
 586             return arg0_map;
 587
 588           default:
 589             break;
 590           }
 591     }
 592   return nullptr;
 593 }
 594
 595 /* Return the SLP node child index for operand OP of STMT.  */
 596
 597 int
 598 vect_slp_child_index_for_operand (const gimple *stmt, int op,
 599                                   bool gather_scatter_p)
 600 {
 601   const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
 602   if (!opmap)
 603     return op;
 604   for (int i = 1; i < 1 + opmap[0]; ++i)
 605     if (opmap[i] == op)
 606       return i - 1;
 607   gcc_unreachable ();
 608 }
 609
 610 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
 611    they are of a valid type and that they match the defs of the first stmt of
 612    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
 613    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
 614    indicates swap is required for cond_expr stmts.  Specifically, SWAP
 615    is 1 if STMT is cond and operands of comparison need to be swapped;
 616    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
 617
 618    If there was a fatal error return -1; if the error could be corrected by
 619    swapping operands of father node of this one, return 1; if everything is
 620    ok return 0.  */
 621 static int
 622 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
 623                              bool *skip_args,
 624                              vec<stmt_vec_info> stmts, unsigned stmt_num,
 625                              vec<slp_oprnd_info> *oprnds_info)
 626 {
 627   stmt_vec_info stmt_info = stmts[stmt_num];
 628   tree oprnd;
 629   unsigned int i, number_of_oprnds;
 630   enum vect_def_type dt = vect_uninitialized_def;
 631   slp_oprnd_info oprnd_info;
 632   gather_scatter_info gs_info;
 633   unsigned int gs_op = -1u;
 634   unsigned int commutative_op = -1U;
 635   bool first = stmt_num == 0;
 636
 637   if (!is_a<gcall *> (stmt_info->stmt)
 638       && !is_a<gassign *> (stmt_info->stmt)
 639       && !is_a<gphi *> (stmt_info->stmt))
 640     return -1;
 641
 642   number_of_oprnds = gimple_num_args (stmt_info->stmt);
 643   const int *map
 644     = vect_get_operand_map (stmt_info->stmt,
 645                             STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
 646   if (map)
 647     number_of_oprnds = *map++;
 648   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 649     {
 650       if (gimple_call_internal_p (stmt))
 651         {
 652           internal_fn ifn = gimple_call_internal_fn (stmt);
 653           commutative_op = first_commutative_argument (ifn);
 654         }
 655     }
 656   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
 657     {
 658       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
 659         commutative_op = 0;
 660     }
 661
 662   bool swapped = (swap != 0);
 663   bool backedge = false;
 664   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
 665   for (i = 0; i < number_of_oprnds; i++)
 666     {
 667       oprnd_info = (*oprnds_info)[i];
 668       int opno = map ? map[i] : int (i);
 669       if (opno == -3)
 670         {
 671           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 672           if (!is_a <loop_vec_info> (vinfo)
 673               || !vect_check_gather_scatter (stmt_info,
 674                                              as_a <loop_vec_info> (vinfo),
 675                                              first ? &oprnd_info->first_gs_info
 676                                              : &gs_info))
 677             return -1;
 678
 679           if (first)
 680             {
 681               oprnd_info->first_gs_p = true;
 682               oprnd = oprnd_info->first_gs_info.offset;
 683             }
 684           else
 685             {
 686               gs_op = i;
 687               oprnd = gs_info.offset;
 688             }
 689         }
 690       else if (opno < 0)
 691         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
 692       else
 693         {
 694           oprnd = gimple_arg (stmt_info->stmt, opno);
 695           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
 696             {
 697               edge e = gimple_phi_arg_edge (stmt, opno);
 698               backedge = (is_a <bb_vec_info> (vinfo)
 699                           ? e->flags & EDGE_DFS_BACK
 700                           : dominated_by_p (CDI_DOMINATORS, e->src,
 701                                             gimple_bb (stmt_info->stmt)));
 702             }
 703         }
 704       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
 705         oprnd = TREE_OPERAND (oprnd, 0);
 706
 707       stmt_vec_info def_stmt_info;
 708       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
 709         {
 710           if (dump_enabled_p ())
 711             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 712                              "Build SLP failed: can't analyze def for %T\n",
 713                              oprnd);
 714
 715           return -1;
 716         }
 717
 718       if (skip_args[i])
 719         {
 720           oprnd_info->def_stmts.quick_push (NULL);
 721           oprnd_info->ops.quick_push (NULL_TREE);
 722           oprnd_info->first_dt = vect_uninitialized_def;
 723           continue;
 724         }
 725
 726       oprnd_info->def_stmts.quick_push (def_stmt_info);
 727       oprnd_info->ops.quick_push (oprnd);
 728
 729       if (def_stmt_info
 730           && is_pattern_stmt_p (def_stmt_info))
 731         {
 732           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
 733               != def_stmt_info)
 734             oprnd_info->any_pattern = true;
 735           else
 736             /* If we promote this to external use the original stmt def.  */
 737             oprnd_info->ops.last ()
 738               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 739         }
 740
 741       /* If there's a extern def on a backedge make sure we can
 742          code-generate at the region start.
 743          ???  This is another case that could be fixed by adjusting
 744          how we split the function but at the moment we'd have conflicting
 745          goals there.  */
 746       if (backedge
 747           && dts[i] == vect_external_def
 748           && is_a <bb_vec_info> (vinfo)
 749           && TREE_CODE (oprnd) == SSA_NAME
 750           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
 751           && !dominated_by_p (CDI_DOMINATORS,
 752                               as_a <bb_vec_info> (vinfo)->bbs[0],
 753                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
 754         {
 755           if (dump_enabled_p ())
 756             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                              "Build SLP failed: extern def %T only defined "
 758                              "on backedge\n", oprnd);
 759           return -1;
 760         }
 761
 762       if (first)
 763         {
 764           tree type = TREE_TYPE (oprnd);
 765           dt = dts[i];
 766
 767           /* For the swapping logic below force vect_reduction_def
 768              for the reduction op in a SLP reduction group.  */
 769           if (!STMT_VINFO_DATA_REF (stmt_info)
 770               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 771               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
 772               && def_stmt_info)
 773             dts[i] = dt = vect_reduction_def;
 774
 775           /* Check the types of the definition.  */
 776           switch (dt)
 777             {
 778             case vect_external_def:
 779             case vect_constant_def:
 780             case vect_internal_def:
 781             case vect_reduction_def:
 782             case vect_induction_def:
 783             case vect_nested_cycle:
 784             case vect_first_order_recurrence:
 785               break;
 786
 787             default:
 788               /* FORNOW: Not supported.  */
 789               if (dump_enabled_p ())
 790                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 791                                  "Build SLP failed: illegal type of def %T\n",
 792                                  oprnd);
 793               return -1;
 794             }
 795
 796           oprnd_info->first_dt = dt;
 797           oprnd_info->first_op_type = type;
 798         }
 799     }
 800   if (first)
 801     return 0;
 802
 803   /* Now match the operand definition types to that of the first stmt.  */
 804   for (i = 0; i < number_of_oprnds;)
 805     {
 806       if (skip_args[i])
 807         {
 808           ++i;
 809           continue;
 810         }
 811
 812       oprnd_info = (*oprnds_info)[i];
 813       dt = dts[i];
 814       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
 815       oprnd = oprnd_info->ops[stmt_num];
 816       tree type = TREE_TYPE (oprnd);
 817
 818       if (!types_compatible_p (oprnd_info->first_op_type, type))
 819         {
 820           if (dump_enabled_p ())
 821             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 822                              "Build SLP failed: different operand types\n");
 823           return 1;
 824         }
 825
 826       if ((gs_op == i) != oprnd_info->first_gs_p)
 827         {
 828           if (dump_enabled_p ())
 829             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 830                              "Build SLP failed: mixed gather and non-gather\n");
 831           return 1;
 832         }
 833       else if (gs_op == i)
 834         {
 835           if (!operand_equal_p (oprnd_info->first_gs_info.base,
 836                                 gs_info.base))
 837             {
 838               if (dump_enabled_p ())
 839                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 840                                  "Build SLP failed: different gather base\n");
 841               return 1;
 842             }
 843           if (oprnd_info->first_gs_info.scale != gs_info.scale)
 844             {
 845               if (dump_enabled_p ())
 846                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 847                                  "Build SLP failed: different gather scale\n");
 848               return 1;
 849             }
 850         }
 851
 852       /* Not first stmt of the group, check that the def-stmt/s match
 853          the def-stmt/s of the first stmt.  Allow different definition
 854          types for reduction chains: the first stmt must be a
 855          vect_reduction_def (a phi node), and the rest
 856          end in the reduction chain.  */
 857       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
 858            && !(oprnd_info->first_dt == vect_reduction_def
 859                 && !STMT_VINFO_DATA_REF (stmt_info)
 860                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 861                 && def_stmt_info
 862                 && !STMT_VINFO_DATA_REF (def_stmt_info)
 863                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 864                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
 865           || (!STMT_VINFO_DATA_REF (stmt_info)
 866               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 867               && ((!def_stmt_info
 868                    || STMT_VINFO_DATA_REF (def_stmt_info)
 869                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 870                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 871                   != (oprnd_info->first_dt != vect_reduction_def))))
 872         {
 873           /* Try swapping operands if we got a mismatch.  For BB
 874              vectorization only in case it will clearly improve things.  */
 875           if (i == commutative_op && !swapped
 876               && (!is_a <bb_vec_info> (vinfo)
 877                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
 878                                              dts[i+1])
 879                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
 880                           || vect_def_types_match
 881                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
 882             {
 883               if (dump_enabled_p ())
 884                 dump_printf_loc (MSG_NOTE, vect_location,
 885                                  "trying swapped operands\n");
 886               std::swap (dts[i], dts[i+1]);
 887               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
 888                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
 889               std::swap ((*oprnds_info)[i]->ops[stmt_num],
 890                          (*oprnds_info)[i+1]->ops[stmt_num]);
 891               swapped = true;
 892               continue;
 893             }
 894
 895           if (is_a <bb_vec_info> (vinfo)
 896               && !oprnd_info->any_pattern)
 897             {
 898               /* Now for commutative ops we should see whether we can
 899                  make the other operand matching.  */
 900               if (dump_enabled_p ())
 901                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 902                                  "treating operand as external\n");
 903               oprnd_info->first_dt = dt = vect_external_def;
 904             }
 905           else
 906             {
 907               if (dump_enabled_p ())
 908                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 909                                  "Build SLP failed: different types\n");
 910               return 1;
 911             }
 912         }
 913
 914       /* Make sure to demote the overall operand to external.  */
 915       if (dt == vect_external_def)
 916         oprnd_info->first_dt = vect_external_def;
 917       /* For a SLP reduction chain we want to duplicate the reduction to
 918          each of the chain members.  That gets us a sane SLP graph (still
 919          the stmts are not 100% correct wrt the initial values).  */
 920       else if ((dt == vect_internal_def
 921                 || dt == vect_reduction_def)
 922                && oprnd_info->first_dt == vect_reduction_def
 923                && !STMT_VINFO_DATA_REF (stmt_info)
 924                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
 925                && !STMT_VINFO_DATA_REF (def_stmt_info)
 926                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
 927                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
 928         {
 929           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
 930           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
 931         }
 932
 933       ++i;
 934     }
 935
 936   /* Swap operands.  */
 937   if (swapped)
 938     {
 939       if (dump_enabled_p ())
 940         dump_printf_loc (MSG_NOTE, vect_location,
 941                          "swapped operands to match def types in %G",
 942                          stmt_info->stmt);
 943     }
 944
 945   return 0;
 946 }
 947
 948 /* Return true if call statements CALL1 and CALL2 are similar enough
 949    to be combined into the same SLP group.
        The checks performed, in order, are:
          - both calls have the same number of arguments;
          - both calls have the same combined function;
          - for internal-function calls, the lhs types and the types of
            every argument are compatible; for all other calls, the called
            function operands are operand_equal_p and the call fntypes are
            identical;
          - if vect_get_operand_map returns a map for CALL1, every argument
            whose index is not listed in that map is operand_equal_p
            between the two calls.  */
 950
 951 bool
 952 compatible_calls_p (gcall *call1, gcall *call2)
 953 {
 954   unsigned int nargs = gimple_call_num_args (call1);
 955   if (nargs != gimple_call_num_args (call2))
 956     return false;
 957
 958   if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
 959     return false;
 960
 961   if (gimple_call_internal_p (call1))
 962     {
            /* NOTE(review): the lhs of both calls is used without a null
               check here; presumably callers only pass internal calls that
               have an lhs -- confirm.  */
 963       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
 964                                TREE_TYPE (gimple_call_lhs (call2))))
 965         return false;
 966       for (unsigned int i = 0; i < nargs; ++i)
 967         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
 968                                  TREE_TYPE (gimple_call_arg (call2, i))))
 969           return false;
 970     }
 971   else
 972     {
 973       if (!operand_equal_p (gimple_call_fn (call1),
 974                             gimple_call_fn (call2), 0))
 975         return false;
 976
 977       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
 978         return false;
 979     }
 980
 981   /* Check that any unvectorized arguments are equal.  The first map
        element is read as the number NKEPT of entries that follow; an
        argument whose index equals the current entry is skipped and the
        walk advances to the next entry.  This skips every listed argument
        only if the entries are in increasing order -- presumably that is
        what vect_get_operand_map provides; confirm.  */
 982   if (const int *map = vect_get_operand_map (call1))
 983     {
 984       unsigned int nkept = *map++;
 985       unsigned int mapi = 0;
 986       for (unsigned int i = 0; i < nargs; ++i)
 987         if (mapi < nkept && map[mapi] == int (i))
 988           mapi += 1;
 989         else if (!operand_equal_p (gimple_call_arg (call1, i),
 990                                    gimple_call_arg (call2, i)))
 991           return false;
 992     }
 993
 994   return true;
 995 }
 996
 997 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
 998    caller's attempt to find the vector type in STMT_INFO with the narrowest
 999    element type.  Return true if VECTYPE is nonnull and if it is valid
1000    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
1001    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
1002    vect_build_slp_tree.
        VINFO is only inspected to tell whether this is basic-block
        vectorization; STMT_INFO is only used for the dump message.
        False is returned, with *MAX_NUNITS left unchanged, when VECTYPE is
        null or when, for basic-block vectorization, GROUP_SIZE is not a
        multiple of the number of subparts of VECTYPE.  */
1003
1004 static bool
1005 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006                         unsigned int group_size,
1007                         tree vectype, poly_uint64 *max_nunits)
1008 {
1009   if (!vectype)
1010     {
1011       if (dump_enabled_p ())
1012         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013                          "Build SLP failed: unsupported data-type in %G\n",
1014                          stmt_info->stmt);
1015       /* Fatal mismatch.  */
1016       return false;
1017     }
1018
1019   /* If populating the vector type requires unrolling then fail
1020      before adjusting *max_nunits for basic-block vectorization.  */
1021   if (is_a <bb_vec_info> (vinfo)
1022       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1023     {
1024       if (dump_enabled_p ())
1025         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026                          "Build SLP failed: unrolling required "
1027                          "in basic block SLP\n");
1028       /* Fatal mismatch.  */
1029       return false;
1030     }
1031
1032   /* In case of multiple types we need to detect the smallest type.  */
1033   vect_update_max_nunits (max_nunits, vectype);
1034   return true;
1035 }
1036
1037 /* Verify if the scalar stmts STMTS are isomorphic, require data
1038    permutation or are of unsupported types of operation.  Return
1039    true if they are, otherwise return false and indicate in *MATCHES
1040    which stmts are not isomorphic to the first one.  If MATCHES[0]
1041    is false then this indicates the comparison could not be
1042    carried out or the stmts will never be vectorized by SLP.
1043
1044    Note COND_EXPR is possibly isomorphic to another one after swapping its
1045    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1046    the first stmt by swapping the two operands of comparison; set SWAP[i]
1047    to 2 if stmt I is isomorphic to the first stmt by inverting the code
1048    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1049    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.

        SWAP and MATCHES are arrays indexed like STMTS; the entry for each
        stmt is reset (to 0 resp. false) when that stmt is visited, and
        MATCHES[i] is set to true only if stmt I passed every check.
        GROUP_SIZE is the number of MATCHES entries inspected at the end.
        *MAX_NUNITS is updated through vect_record_max_nunits.
        *NODE_VECTYPE is set to the vector type computed for STMTS[0].
        *TWO_OPERATORS is set to true (it is never cleared here) when a
        second operation code was accepted and that code is not a tree code
        of class tcc_reference or tcc_comparison.

        If vect_record_max_nunits rejects the nunits vector type of some
        stmt, the analysis still continues, but the function returns false
        at the end with MATCHES adjusted so that the caller can split the
        group.  */
1050
1051 static bool
1052 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1053                        vec<stmt_vec_info> stmts, unsigned int group_size,
1054                        poly_uint64 *max_nunits, bool *matches,
1055                        bool *two_operators, tree *node_vectype)
1056 {
1057   unsigned int i;
1058   stmt_vec_info first_stmt_info = stmts[0];
1059   code_helper first_stmt_code = ERROR_MARK;
1060   code_helper alt_stmt_code = ERROR_MARK;
1061   code_helper rhs_code = ERROR_MARK;
1062   code_helper first_cond_code = ERROR_MARK;
1063   tree lhs;
1064   bool need_same_oprnds = false;
1065   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1066   stmt_vec_info first_load = NULL, prev_first_load = NULL;
1067   bool first_stmt_ldst_p = false, ldst_p = false;
1068   bool first_stmt_phi_p = false, phi_p = false;
1069   bool maybe_soft_fail = false;
1070   tree soft_fail_nunits_vectype = NULL_TREE;
1071
1072   /* Check every stmt in STMTS; stmt 0 records the reference properties
        that all later stmts are compared against.  */
1073   stmt_vec_info stmt_info;
1074   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1075     {
1076       gimple *stmt = stmt_info->stmt;
1077       swap[i] = 0;
1078       matches[i] = false;
1079
1080       if (dump_enabled_p ())
1081         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1082
1083       /* Fail to vectorize statements marked as unvectorizable, throw
1084          or are volatile.  */
1085       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1086           || stmt_can_throw_internal (cfun, stmt)
1087           || gimple_has_volatile_ops (stmt))
1088         {
1089           if (dump_enabled_p ())
1090             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091                              "Build SLP failed: unvectorizable statement %G",
1092                              stmt);
1093           /* ???  For BB vectorization we want to commutate operands in a way
1094              to shuffle all unvectorizable defs into one operand and have
1095              the other still vectorized.  The following doesn't reliably
1096              work for this though but it's the easiest we can do here.  */
1097           if (is_a <bb_vec_info> (vinfo) && i != 0)
1098             continue;
1099           /* Fatal mismatch.  */
1100           matches[0] = false;
1101           return false;
1102         }
1103
1104       gcall *call_stmt = dyn_cast <gcall *> (stmt);
1105       lhs = gimple_get_lhs (stmt);
1106       if (lhs == NULL_TREE
1107           && (!call_stmt
1108               || !gimple_call_internal_p (stmt)
1109               || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1110         {
1111           if (dump_enabled_p ())
1112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113                              "Build SLP failed: not GIMPLE_ASSIGN nor "
1114                              "GIMPLE_CALL %G", stmt);
1115           if (is_a <bb_vec_info> (vinfo) && i != 0)
1116             continue;
1117           /* Fatal mismatch.  */
1118           matches[0] = false;
1119           return false;
1120         }
1121
1122       tree nunits_vectype;
1123       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1124                                            &nunits_vectype, group_size))
1125         {
1126           if (is_a <bb_vec_info> (vinfo) && i != 0)
1127             continue;
1128           /* Fatal mismatch.  */
1129           matches[0] = false;
1130           return false;
1131         }
1132       /* Record nunits required but continue analysis, producing matches[]
1133          as if nunits was not an issue.  This allows splitting of groups
1134          to happen.  */
1135       if (nunits_vectype
1136           && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1137                                       nunits_vectype, max_nunits))
1138         {
1139           gcc_assert (is_a <bb_vec_info> (vinfo));
1140           maybe_soft_fail = true;
1141           soft_fail_nunits_vectype = nunits_vectype;
1142         }
1143
1144       gcc_assert (vectype);
1145
           /* Classify the stmt: compute RHS_CODE and whether it is a
              load/store (LDST_P) or a PHI (PHI_P).  */
1146       if (call_stmt)
1147         {
1148           combined_fn cfn = gimple_call_combined_fn (call_stmt);
1149           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1150             rhs_code = cfn;
1151           else
1152             rhs_code = CALL_EXPR;
1153
1154           if (cfn == CFN_MASK_LOAD
1155               || cfn == CFN_GATHER_LOAD
1156               || cfn == CFN_MASK_GATHER_LOAD
1157               || cfn == CFN_MASK_LEN_GATHER_LOAD)
1158             ldst_p = true;
1159           else if (cfn == CFN_MASK_STORE)
1160             {
1161               ldst_p = true;
1162               rhs_code = CFN_MASK_STORE;
1163             }
1164           else if ((cfn != CFN_LAST
1165                     && cfn != CFN_MASK_CALL
1166                     && internal_fn_p (cfn)
1167                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1168                    || gimple_call_tail_p (call_stmt)
1169                    || gimple_call_noreturn_p (call_stmt)
1170                    || gimple_call_chain (call_stmt))
1171             {
1172               if (dump_enabled_p ())
1173                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174                                  "Build SLP failed: unsupported call type %G",
1175                                  (gimple *) call_stmt);
1176               if (is_a <bb_vec_info> (vinfo) && i != 0)
1177                 continue;
1178               /* Fatal mismatch.  */
1179               matches[0] = false;
1180               return false;
1181             }
1182         }
1183       else if (gimple_code (stmt) == GIMPLE_PHI)
1184         {
1185           rhs_code = ERROR_MARK;
1186           phi_p = true;
1187         }
1188       else
1189         {
1190           rhs_code = gimple_assign_rhs_code (stmt);
1191           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1192         }
1193
1194       /* Check the operation.  */
1195       if (i == 0)
1196         {
1197           *node_vectype = vectype;
1198           first_stmt_code = rhs_code;
1199           first_stmt_ldst_p = ldst_p;
1200           first_stmt_phi_p = phi_p;
1201
1202           /* Shift arguments should be equal in all the packed stmts for a
1203              vector shift with scalar shift operand.  */
1204           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1205               || rhs_code == LROTATE_EXPR
1206               || rhs_code == RROTATE_EXPR)
1207             {
1208               /* First see if we have a vector/vector shift.  */
1209               if (!directly_supported_p (rhs_code, vectype, optab_vector))
1210                 {
1211                   /* No vector/vector shift, try for a vector/scalar shift.  */
1212                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1213                     {
1214                       if (dump_enabled_p ())
1215                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216                                          "Build SLP failed: "
1217                                          "op not supported by target.\n");
                           /* This branch is only entered for i == 0, so the
                              continue below is never taken.  */
1218                       if (is_a <bb_vec_info> (vinfo) && i != 0)
1219                         continue;
1220                       /* Fatal mismatch.  */
1221                       matches[0] = false;
1222                       return false;
1223                     }
1224                   need_same_oprnds = true;
1225                   first_op1 = gimple_assign_rhs2 (stmt);
1226                 }
1227             }
1228           else if (rhs_code == WIDEN_LSHIFT_EXPR)
1229             {
1230               need_same_oprnds = true;
1231               first_op1 = gimple_assign_rhs2 (stmt);
1232             }
1233           else if (!ldst_p
1234                    && rhs_code == BIT_FIELD_REF)
1235             {
1236               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1237               if (!is_a <bb_vec_info> (vinfo)
1238                   || TREE_CODE (vec) != SSA_NAME
1239                   /* When the element types are not compatible we pun the
1240                      source to the target vectype which requires equal size.  */
1241                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1242                        || !types_compatible_p (TREE_TYPE (vectype),
1243                                                TREE_TYPE (TREE_TYPE (vec))))
1244                       && !operand_equal_p (TYPE_SIZE (vectype),
1245                                            TYPE_SIZE (TREE_TYPE (vec)))))
1246                 {
1247                   if (dump_enabled_p ())
1248                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                                      "Build SLP failed: "
1250                                      "BIT_FIELD_REF not supported\n");
1251                   /* Fatal mismatch.  */
1252                   matches[0] = false;
1253                   return false;
1254                 }
1255             }
1256           else if (rhs_code == CFN_DIV_POW2)
1257             {
1258               need_same_oprnds = true;
1259               first_op1 = gimple_call_arg (call_stmt, 1);
1260             }
1261         }
1262       else
1263         {
               /* Remember the first operation code that differs from the
                  one of stmt 0.  */
1264           if (first_stmt_code != rhs_code
1265               && alt_stmt_code == ERROR_MARK)
1266             alt_stmt_code = rhs_code;
1267           if ((first_stmt_code != rhs_code
1268                && (first_stmt_code != IMAGPART_EXPR
1269                    || rhs_code != REALPART_EXPR)
1270                && (first_stmt_code != REALPART_EXPR
1271                    || rhs_code != IMAGPART_EXPR)
1272                /* Handle mismatches in plus/minus by computing both
1273                   and merging the results.  */
1274                && !((first_stmt_code == PLUS_EXPR
1275                      || first_stmt_code == MINUS_EXPR)
1276                     && (alt_stmt_code == PLUS_EXPR
1277                         || alt_stmt_code == MINUS_EXPR)
1278                     && rhs_code == alt_stmt_code)
1279                && !(first_stmt_code.is_tree_code ()
1280                     && rhs_code.is_tree_code ()
1281                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1282                         == tcc_comparison)
1283                     && (swap_tree_comparison (tree_code (first_stmt_code))
1284                         == tree_code (rhs_code)))
1285                && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1286                     && (first_stmt_code == ARRAY_REF
1287                         || first_stmt_code == BIT_FIELD_REF
1288                         || first_stmt_code == INDIRECT_REF
1289                         || first_stmt_code == COMPONENT_REF
1290                         || first_stmt_code == MEM_REF)
1291                     && (rhs_code == ARRAY_REF
1292                         || rhs_code == BIT_FIELD_REF
1293                         || rhs_code == INDIRECT_REF
1294                         || rhs_code == COMPONENT_REF
1295                         || rhs_code == MEM_REF)))
1296               || (ldst_p
1297                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1298                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1299               || (ldst_p
1300                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1301                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1302               || first_stmt_ldst_p != ldst_p
1303               || first_stmt_phi_p != phi_p)
1304             {
1305               if (dump_enabled_p ())
1306                 {
1307                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308                                    "Build SLP failed: different operation "
1309                                    "in stmt %G", stmt);
1310                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311                                    "original stmt %G", first_stmt_info->stmt);
1312                 }
1313               /* Mismatch.  */
1314               continue;
1315             }
1316
1317           if (!ldst_p
1318               && first_stmt_code == BIT_FIELD_REF
1319               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1320                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1321             {
1322               if (dump_enabled_p ())
1323                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324                                  "Build SLP failed: different BIT_FIELD_REF "
1325                                  "arguments in %G", stmt);
1326               /* Mismatch.  */
1327               continue;
1328             }
1329
1330           if (call_stmt
1331               && first_stmt_code != CFN_MASK_LOAD
1332               && first_stmt_code != CFN_MASK_STORE)
1333             {
1334               if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1335                                        call_stmt))
1336                 {
1337                   if (dump_enabled_p ())
1338                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339                                      "Build SLP failed: different calls in %G",
1340                                      stmt);
1341                   /* Mismatch.  */
1342                   continue;
1343                 }
1344             }
1345
1346           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1347               && (gimple_bb (first_stmt_info->stmt)
1348                   != gimple_bb (stmt_info->stmt)))
1349             {
1350               if (dump_enabled_p ())
1351                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352                                  "Build SLP failed: different BB for PHI "
1353                                  "or possibly trapping operation in %G", stmt);
1354               /* Mismatch.  */
1355               continue;
1356             }
1357
1358           if (need_same_oprnds)
1359             {
1360               tree other_op1 = gimple_arg (stmt, 1);
1361               if (!operand_equal_p (first_op1, other_op1, 0))
1362                 {
1363                   if (dump_enabled_p ())
1364                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365                                      "Build SLP failed: different shift "
1366                                      "arguments in %G", stmt);
1367                   /* Mismatch.  */
1368                   continue;
1369                 }
1370             }
1371
1372           if (!types_compatible_p (vectype, *node_vectype))
1373             {
1374               if (dump_enabled_p ())
1375                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376                                  "Build SLP failed: different vector type "
1377                                  "in %G", stmt);
1378               /* Mismatch.  */
1379               continue;
1380             }
1381         }
1382
1383       /* Grouped store or load.  */
1384       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1385         {
1386           gcc_assert (ldst_p);
1387           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1388             {
1389               /* Store.  */
1390               gcc_assert (rhs_code == CFN_MASK_STORE
1391                           || REFERENCE_CLASS_P (lhs)
1392                           || DECL_P (lhs));
1393             }
1394           else
1395             {
1396               /* Load.  */
1397               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1398               if (prev_first_load)
1399                 {
1400                   /* Check that there are no loads from different interleaving
1401                      chains in the same node.  */
1402                   if (prev_first_load != first_load)
1403                     {
1404                       if (dump_enabled_p ())
1405                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1406                                          vect_location,
1407                                          "Build SLP failed: different "
1408                                          "interleaving chains in one node %G",
1409                                          stmt);
1410                       /* Mismatch.  */
1411                       continue;
1412                     }
1413                 }
1414               else
1415                 prev_first_load = first_load;
1416            }
1417         }
1418       /* Non-grouped store or load.  */
1419       else if (ldst_p)
1420         {
1421           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1422               && rhs_code != CFN_GATHER_LOAD
1423               && rhs_code != CFN_MASK_GATHER_LOAD
1424               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1425               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426               /* Not grouped loads are handled as externals for BB
1427                  vectorization.  For loop vectorization we can handle
1428                  splats the same we handle single element interleaving.  */
1429               && (is_a <bb_vec_info> (vinfo)
1430                   || stmt_info != first_stmt_info))
1431             {
1432               /* Not grouped load.  */
1433               if (dump_enabled_p ())
1434                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435                                  "Build SLP failed: not grouped load %G", stmt);
1436
1437               if (i != 0)
1438                 continue;
1439               /* Fatal mismatch.  */
1440               matches[0] = false;
1441               return false;
1442             }
1443         }
1444       /* Not memory operation.  */
1445       else
1446         {
1447           if (!phi_p
1448               && rhs_code.is_tree_code ()
1449               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453               && rhs_code != VIEW_CONVERT_EXPR
1454               && rhs_code != CALL_EXPR
1455               && rhs_code != BIT_FIELD_REF)
1456             {
1457               if (dump_enabled_p ())
1458                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                                  "Build SLP failed: operation unsupported %G",
1460                                  stmt);
1461               if (is_a <bb_vec_info> (vinfo) && i != 0)
1462                 continue;
1463               /* Fatal mismatch.  */
1464               matches[0] = false;
1465               return false;
1466             }
1467
1468           if (rhs_code == COND_EXPR)
1469             {
1470               tree cond_expr = gimple_assign_rhs1 (stmt);
1471               enum tree_code cond_code = TREE_CODE (cond_expr);
1472               enum tree_code swap_code = ERROR_MARK;
1473               enum tree_code invert_code = ERROR_MARK;
1474
1475               if (i == 0)
1476                 first_cond_code = TREE_CODE (cond_expr);
1477               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1478                 {
1479                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480                   swap_code = swap_tree_comparison (cond_code);
1481                   invert_code = invert_tree_comparison (cond_code, honor_nans);
1482                 }
1483
1484               if (first_cond_code == cond_code)
1485                 ;
1486               /* Isomorphic can be achieved by swapping.  */
1487               else if (first_cond_code == swap_code)
1488                 swap[i] = 1;
1489               /* Isomorphic can be achieved by inverting.  */
1490               else if (first_cond_code == invert_code)
1491                 swap[i] = 2;
1492               else
1493                 {
1494                   if (dump_enabled_p ())
1495                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496                                      "Build SLP failed: different"
1497                                      " operation %G", stmt);
1498                   /* Mismatch.  */
1499                   continue;
1500                 }
1501             }
1502
               /* A comparison whose code is the swapped form of the first
                  stmt's code is also recorded as needing an operand swap.  */
1503           if (rhs_code.is_tree_code ()
1504               && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505               && (swap_tree_comparison ((tree_code)first_stmt_code)
1506                   == (tree_code)rhs_code))
1507             swap[i] = 1;
1508         }
1509
1510       matches[i] = true;
1511     }
1512
       /* Any stmt that did not match makes the whole build fail; MATCHES
          tells the caller which ones.  */
1513   for (i = 0; i < group_size; ++i)
1514     if (!matches[i])
1515       return false;
1516
1517   /* If we allowed a two-operation SLP node verify the target can cope
1518      with the permute we are going to use.  */
1519   if (alt_stmt_code != ERROR_MARK
1520       && (!alt_stmt_code.is_tree_code ()
1521           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1523     {
1524       *two_operators = true;
1525     }
1526
       /* All stmts matched, but some vector type was rejected by
          vect_record_max_nunits: report failure and shape MATCHES for the
          caller.  */
1527   if (maybe_soft_fail)
1528     {
1529       unsigned HOST_WIDE_INT const_nunits;
1530       if (!TYPE_VECTOR_SUBPARTS
1531             (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532           || const_nunits > group_size)
1533         matches[0] = false;
1534       else
1535         {
1536           /* With constant vector elements simulate a mismatch at the
1537              point we need to split.  The mask below equals
                  group_size % const_nunits only when const_nunits is a
                  power of two -- presumably always the case here; confirm.  */
1538           unsigned tail = group_size & (const_nunits - 1);
1539           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1540         }
1541       return false;
1542     }
1543
1544   return true;
1545 }
1546
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548    Note we never remove apart from at destruction time so we do not
1549    need a special value for deleted that differs from empty.
        Accordingly is_empty and is_deleted both test for a vec that does
        not exist, and mark_empty, mark_deleted and remove all release the
        vec.  */
1550 struct bst_traits
1551 {
1552   typedef vec <stmt_vec_info> value_type;
1553   typedef vec <stmt_vec_info> compare_type;
1554   static inline hashval_t hash (value_type);
1555   static inline bool equal (value_type existing, value_type candidate);
1556   static inline bool is_empty (value_type x) { return !x.exists (); }
1557   static inline bool is_deleted (value_type x) { return !x.exists (); }
1558   static const bool empty_zero_p = true;
1559   static inline void mark_empty (value_type &x) { x.release (); }
1560   static inline void mark_deleted (value_type &x) { x.release (); }
1561   static inline void remove (value_type &x) { x.release (); }
1562 };
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1565 {
       /* Hash the stmt vector X by feeding the gimple_uid of each stmt, in
          vector order, into one running hash.  */
1566   inchash::hash h;
1567   for (unsigned i = 0; i < x.length (); ++i)
1568     h.add_int (gimple_uid (x[i]->stmt));
1569   return h.end ();
1570 }
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1573 {
       /* Two stmt vectors are equal when they have the same length and
          hold equal stmt_vec_info entries at every position.  */
1574   if (existing.length () != candidate.length ())
1575     return false;
1576   for (unsigned i = 0; i < existing.length (); ++i)
1577     if (existing[i] != candidate[i])
1578       return false;
1579   return true;
1580 }
1581
1582 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583    but then vec::insert does memmove and that's not compatible with
1584    std::pair.
        Element type of the chain vectors built by vect_slp_linearize_chain
        and ordered by dt_sort_cmp.  It bundles a tree code, a def type and
        a tree operand; presumably CODE is the operation applied to OP and
        DT the def type of OP -- confirm against the users.  */
1585 struct chain_op_t
1586 {
1587   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588       : code (code_), dt (dt_), op (op_) {}
1589   tree_code code;
1590   vect_def_type dt;
1591   tree op;
1592 };
1593
1594 /* Comparison callback for sorting the operands of associatable chains:
        order by def type first and by operation code second.  */
1595
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1598 {
1599   const chain_op_t *lhs = static_cast<const chain_op_t *> (op1_);
1600   const chain_op_t *rhs = static_cast<const chain_op_t *> (op2_);
1601   if (lhs->dt == rhs->dt)
1602     return (int)lhs->code - (int)rhs->code;
1603   return (int)lhs->dt - (int)rhs->dt;
1604 }
1605
1606 /* Linearize the associatable expression chain at START with the
1607    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608    filling CHAIN with the result and using WORKLIST as intermediate storage.
1609    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
1611    stmts, starting with START.  CODE_STMT and ALT_CODE_STMT are only
        written when they are still NULL on entry.  Each CHAIN entry carries
        the operand, its def type and the operation it effectively enters
        the chain with, that is with PLUS_EXPR and MINUS_EXPR exchanged for
        operands reached through a subtracted sub-expression.  */
1612
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615                           vec<std::pair<tree_code, gimple *> > &worklist,
1616                           vec<chain_op_t> &chain,
1617                           enum tree_code code, gimple *start,
1618                           gimple *&code_stmt, gimple *&alt_code_stmt,
1619                           vec<gimple *> *chain_stmts)
1620 {
1621   /* Linearize the addition/subtraction (or other uniform associatable
1622      operation) expression tree rooted at START.  */
1623   worklist.safe_push (std::make_pair (code, start));
1624   while (!worklist.is_empty ())
1625     {
1626       auto entry = worklist.pop ();
1627       gassign *stmt = as_a <gassign *> (entry.second);
           /* IN_CODE is the operation the result of STMT enters the chain
              with, THIS_CODE the operation STMT itself performs.  */
1628       enum tree_code in_code = entry.first;
1629       enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1631       if (!code_stmt
1632           && gimple_assign_rhs_code (stmt) == code)
1633         code_stmt = stmt;
1634       else if (!alt_code_stmt
1635                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636         alt_code_stmt = stmt;
1637       if (chain_stmts)
1638         chain_stmts->safe_push (stmt);
           /* Look at operands one and two of the assignment.  */
1639       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1640         {
1641           tree op = gimple_op (stmt, opnum);
1642           vect_def_type dt;
1643           stmt_vec_info def_stmt_info;
1644           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645           gcc_assert (res);
               /* For a pattern stmt continue with the lhs of the pattern
                  stmt rather than the original operand.  */
1646           if (dt == vect_internal_def
1647               && is_pattern_stmt_p (def_stmt_info))
1648             op = gimple_get_lhs (def_stmt_info->stmt);
1649           gimple *use_stmt;
1650           use_operand_p use_p;
               /* An internal def accepted by single_imm_use and defined by
                  an assignment with CODE (or MINUS_EXPR when CODE is
                  PLUS_EXPR) is part of the chain, queue its definition.
                  Everything else becomes a leaf recorded in CHAIN.  */
1651           if (dt == vect_internal_def
1652               && single_imm_use (op, &use_p, &use_stmt)
1653               && is_gimple_assign (def_stmt_info->stmt)
1654               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655                   || (code == PLUS_EXPR
1656                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657                           == MINUS_EXPR))))
1658             {
                   /* Operand one of a MINUS_EXPR is added, only operand two
                      is subtracted.  The sense flips when the result of
                      STMT itself is subtracted.  */
1659               tree_code op_def_code = this_code;
1660               if (op_def_code == MINUS_EXPR && opnum == 1)
1661                 op_def_code = PLUS_EXPR;
1662               if (in_code == MINUS_EXPR)
1663                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664               worklist.safe_push (std::make_pair (op_def_code,
1665                                                   def_stmt_info->stmt));
1666             }
1667           else
1668             {
                   /* Same effective operation computation as for queued
                      stmts, but OP terminates the chain here.  */
1669               tree_code op_def_code = this_code;
1670               if (op_def_code == MINUS_EXPR && opnum == 1)
1671                 op_def_code = PLUS_EXPR;
1672               if (in_code == MINUS_EXPR)
1673                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674               chain.safe_push (chain_op_t (op_def_code, dt, op));
1675             }
1676         }
1677     }
1678 }
1679
     /* Map from a vector of scalar stmts to the SLP node recorded for it,
        hashed and compared by means of bst_traits.  */
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681                   simple_hashmap_traits <bst_traits, slp_tree> >
1682   scalar_stmts_to_slp_tree_map_t;
1683
     /* Forward declaration, vect_build_slp_tree and vect_build_slp_tree_2
        call each other.  */
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686                        vec<stmt_vec_info> stmts, unsigned int group_size,
1687                        poly_uint64 *max_nunits,
1688                        bool *matches, unsigned *limit, unsigned *tree_size,
1689                        scalar_stmts_to_slp_tree_map_t *bst_map);
1690
     /* Build the SLP node for the scalar stmts STMTS with GROUP_SIZE lanes,
        re-using a node already recorded for the same stmts in BST_MAP.
        On success return the node and update *MAX_NUNITS; when a recorded
        node is re-used its reference count is incremented and STMTS is
        released.  On failure return NULL with MATCHES describing the
        mismatching lanes; the failed node stays in BST_MAP together with
        that mismatch information so a later request for the same stmts
        fails without repeating discovery.  *LIMIT is the remaining
        discovery budget, it is decremented for each node discovery is
        started for and discovery fails once it reached zero.  */
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693                      vec<stmt_vec_info> stmts, unsigned int group_size,
1694                      poly_uint64 *max_nunits,
1695                      bool *matches, unsigned *limit, unsigned *tree_size,
1696                      scalar_stmts_to_slp_tree_map_t *bst_map)
1697 {
1698   if (slp_tree *leader = bst_map->get (stmts))
1699     {
1700       if (dump_enabled_p ())
1701         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702                          !(*leader)->failed ? "" : "failed ",
1703                          (void *) *leader);
1704       if (!(*leader)->failed)
1705         {
1706           SLP_TREE_REF_COUNT (*leader)++;
1707           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708           stmts.release ();
1709           return *leader;
1710         }
           /* An earlier attempt on the same stmts failed, hand out the
              mismatch information recorded back then.  */
1711       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712       return NULL;
1713     }
1714
1715   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716      so we can pick up backedge destinations during discovery.  */
1717   slp_tree res = new _slp_tree;
1718   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719   SLP_TREE_SCALAR_STMTS (res) = stmts;
       /* The map is keyed on its own copy of STMTS.  */
1720   bst_map->put (stmts.copy (), res);
1721
1722   if (*limit == 0)
1723     {
1724       if (dump_enabled_p ())
1725         dump_printf_loc (MSG_NOTE, vect_location,
1726                          "SLP discovery limit exceeded\n");
1727       /* Mark the node invalid so we can detect those when still in use
1728          as backedge destinations.  */
1729       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
           /* Record and report every lane as mismatching.  */
1731       res->failed = XNEWVEC (bool, group_size);
1732       memset (res->failed, 0, sizeof (bool) * group_size);
1733       memset (matches, 0, sizeof (bool) * group_size);
1734       return NULL;
1735     }
1736   --*limit;
1737
1738   if (dump_enabled_p ())
1739     dump_printf_loc (MSG_NOTE, vect_location,
1740                      "starting SLP discovery for node %p\n", (void *) res);
1741
1742   poly_uint64 this_max_nunits = 1;
1743   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744                                         &this_max_nunits,
1745                                         matches, limit, tree_size, bst_map);
1746   if (!res_)
1747     {
1748       if (dump_enabled_p ())
1749         dump_printf_loc (MSG_NOTE, vect_location,
1750                          "SLP discovery for node %p failed\n", (void *) res);
1751       /* Mark the node invalid so we can detect those when still in use
1752          as backedge destinations.  */
1753       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755       res->failed = XNEWVEC (bool, group_size);
           /* With checking enabled verify that a failed build flagged at
              least one lane as mismatching.  */
1756       if (flag_checking)
1757         {
1758           unsigned i;
1759           for (i = 0; i < group_size; ++i)
1760             if (!matches[i])
1761               break;
1762           gcc_assert (i < group_size);
1763         }
1764       memcpy (res->failed, matches, sizeof (bool) * group_size);
1765     }
1766   else
1767     {
1768       if (dump_enabled_p ())
1769         dump_printf_loc (MSG_NOTE, vect_location,
1770                          "SLP discovery for node %p succeeded\n",
1771                          (void *) res);
           /* vect_build_slp_tree_2 is expected to fill in the stub it was
              handed rather than to create a different node.  */
1772       gcc_assert (res_ == res);
1773       res->max_nunits = this_max_nunits;
1774       vect_update_max_nunits (max_nunits, this_max_nunits);
1775       /* Keep a reference for the bst_map use.  */
1776       SLP_TREE_REF_COUNT (res)++;
1777     }
1778   return res_;
1779 }
1780
1781 /* Helper for building a two-operator SLP node: create two nodes with
        children OP0 and OP1, vector type VECTYPE and the lane count of OP1,
        one represented by OPER1 and one by OPER2, and turn PERM into a
        VEC_PERM_EXPR node selecting from those two according to the lane
        permutation LPERM.  */
1782
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785                                    slp_tree op0, slp_tree op1,
1786                                    stmt_vec_info oper1, stmt_vec_info oper2,
1787                                    vec<std::pair<unsigned, unsigned> > lperm)
1788 {
1789   unsigned group_size = SLP_TREE_LANES (op1);
1790
       /* The node for the first operation.  NOTE(review): the reference
          counts of OP0 and OP1 are not incremented here, presumably this
          node takes over the references held by the caller -- confirm.  */
1791   slp_tree child1 = new _slp_tree;
1792   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793   SLP_TREE_VECTYPE (child1) = vectype;
1794   SLP_TREE_LANES (child1) = group_size;
1795   SLP_TREE_CHILDREN (child1).create (2);
1796   SLP_TREE_CHILDREN (child1).quick_push (op0);
1797   SLP_TREE_CHILDREN (child1).quick_push (op1);
1798   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1799
       /* The node for the second operation shares OP0 and OP1 and thus
          adds a reference to each of them.  */
1800   slp_tree child2 = new _slp_tree;
1801   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802   SLP_TREE_VECTYPE (child2) = vectype;
1803   SLP_TREE_LANES (child2) = group_size;
1804   SLP_TREE_CHILDREN (child2).create (2);
1805   SLP_TREE_CHILDREN (child2).quick_push (op0);
1806   SLP_TREE_REF_COUNT (op0)++;
1807   SLP_TREE_CHILDREN (child2).quick_push (op1);
1808   SLP_TREE_REF_COUNT (op1)++;
1809   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1810
       /* Finally the permute node blending the two results.
          NOTE(review): the children are added with quick_push, which
          presumes PERM already has room for two children -- confirm
          against the callers.  */
1811   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813   SLP_TREE_VECTYPE (perm) = vectype;
1814   SLP_TREE_LANES (perm) = group_size;
1815   /* ???  We should set this NULL but that's not expected.  */
1816   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818   SLP_TREE_CHILDREN (perm).quick_push (child1);
1819   SLP_TREE_CHILDREN (perm).quick_push (child2);
1820 }
1821
1822 /* Recursively build an SLP tree starting from NODE.
1823    Fail (and return NULL) if def-stmts are not isomorphic, require
1824    data permutation or are of unsupported types of operation.
1825    Otherwise, return the built SLP node.
1826    On failure MATCHES indicates the lanes where a mismatch
1827    was found.  */
1828
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831                        vec<stmt_vec_info> stmts, unsigned int group_size,
1832                        poly_uint64 *max_nunits,
1833                        bool *matches, unsigned *limit, unsigned *tree_size,
1834                        scalar_stmts_to_slp_tree_map_t *bst_map)
1835 {
1836   unsigned nops, i, this_tree_size = 0;
1837   poly_uint64 this_max_nunits = *max_nunits;
1838
1839   matches[0] = false;
1840
1841   stmt_vec_info stmt_info = stmts[0];
1842   if (!is_a<gcall *> (stmt_info->stmt)
1843       && !is_a<gassign *> (stmt_info->stmt)
1844       && !is_a<gphi *> (stmt_info->stmt))
1845     return NULL;
1846
1847   nops = gimple_num_args (stmt_info->stmt);
1848   if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849                                              STMT_VINFO_GATHER_SCATTER_P
1850                                                (stmt_info)))
1851     nops = map[0];
1852
1853   /* If the SLP node is a PHI (induction or reduction), terminate
1854      the recursion.  */
1855   bool *skip_args = XALLOCAVEC (bool, nops);
1856   memset (skip_args, 0, sizeof (bool) * nops);
1857   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1859       {
1860         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862                                                     group_size);
1863         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864                                      max_nunits))
1865           return NULL;
1866
1867         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868         if (def_type == vect_induction_def)
1869           {
1870             /* Induction PHIs are not cycles but walk the initial
1871                value.  Only for inner loops though, for outer loops
1872                we need to pick up the value from the actual PHIs
1873                to more easily support peeling and epilogue vectorization.  */
1874             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875             if (!nested_in_vect_loop_p (loop, stmt_info))
1876               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877             else
1878               loop = loop->inner;
1879             skip_args[loop_latch_edge (loop)->dest_idx] = true;
1880           }
1881         else if (def_type == vect_reduction_def
1882                  || def_type == vect_double_reduction_def
1883                  || def_type == vect_nested_cycle
1884                  || def_type == vect_first_order_recurrence)
1885           {
1886             /* Else def types have to match.  */
1887             stmt_vec_info other_info;
1888             bool all_same = true;
1889             FOR_EACH_VEC_ELT (stmts, i, other_info)
1890               {
1891                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892                   return NULL;
1893                 if (other_info != stmt_info)
1894                   all_same = false;
1895               }
1896             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897             /* Reduction initial values are not explicitly represented.  */
1898             if (def_type != vect_first_order_recurrence
1899                 && !nested_in_vect_loop_p (loop, stmt_info))
1900               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901             /* Reduction chain backedge defs are filled manually.
1902                ???  Need a better way to identify a SLP reduction chain PHI.
1903                Or a better overall way to SLP match those.  */
1904             if (all_same && def_type == vect_reduction_def)
1905               skip_args[loop_latch_edge (loop)->dest_idx] = true;
1906           }
1907         else if (def_type != vect_internal_def)
1908           return NULL;
1909       }
1910
1911
1912   bool two_operators = false;
1913   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914   tree vectype = NULL_TREE;
1915   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916                               &this_max_nunits, matches, &two_operators,
1917                               &vectype))
1918     return NULL;
1919
1920   /* If the SLP node is a load, terminate the recursion unless masked.  */
1921   if (STMT_VINFO_DATA_REF (stmt_info)
1922       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1923     {
1924       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1925         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1926                     || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1927                     || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1928                     || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1929       else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1930         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1931       else
1932         {
1933           *max_nunits = this_max_nunits;
1934           (*tree_size)++;
1935           node = vect_create_new_slp_node (node, stmts, 0);
1936           SLP_TREE_VECTYPE (node) = vectype;
1937           /* And compute the load permutation.  Whether it is actually
1938              a permutation depends on the unrolling factor which is
1939              decided later.  */
1940           vec<unsigned> load_permutation;
1941           int j;
1942           stmt_vec_info load_info;
1943           load_permutation.create (group_size);
1944           stmt_vec_info first_stmt_info
1945             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1946           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1947             {
1948               int load_place;
1949               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1950                 load_place = vect_get_place_in_interleaving_chain
1951                                 (load_info, first_stmt_info);
1952               else
1953                 load_place = 0;
1954               gcc_assert (load_place != -1);
1955               load_permutation.safe_push (load_place);
1956             }
1957           SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1958           return node;
1959         }
1960     }
1961   else if (gimple_assign_single_p (stmt_info->stmt)
1962            && !gimple_vuse (stmt_info->stmt)
1963            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1964     {
1965       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1966          the same SSA name vector of a compatible type to vectype.  */
1967       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1968       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1969       stmt_vec_info estmt_info;
1970       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1971         {
1972           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1973           tree bfref = gimple_assign_rhs1 (estmt);
1974           HOST_WIDE_INT lane;
1975           if (!known_eq (bit_field_size (bfref),
1976                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1977               || !constant_multiple_p (bit_field_offset (bfref),
1978                                        bit_field_size (bfref), &lane))
1979             {
1980               lperm.release ();
1981               matches[0] = false;
1982               return NULL;
1983             }
1984           lperm.safe_push (std::make_pair (0, (unsigned)lane));
1985         }
1986       slp_tree vnode = vect_create_new_slp_node (vNULL);
1987       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1988         /* ???  We record vectype here but we hide eventually necessary
1989            punning and instead rely on code generation to materialize
1990            VIEW_CONVERT_EXPRs as necessary.  We instead should make
1991            this explicit somehow.  */
1992         SLP_TREE_VECTYPE (vnode) = vectype;
1993       else
1994         {
1995           /* For different size but compatible elements we can still
1996              use VEC_PERM_EXPR without punning.  */
1997           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1998                       && types_compatible_p (TREE_TYPE (vectype),
1999                                              TREE_TYPE (TREE_TYPE (vec))));
2000           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2001         }
2002       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2003       unsigned HOST_WIDE_INT const_nunits;
2004       if (nunits.is_constant (&const_nunits))
2005         SLP_TREE_LANES (vnode) = const_nunits;
2006       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2007       /* We are always building a permutation node even if it is an identity
2008          permute to shield the rest of the vectorizer from the odd node
2009          representing an actual vector without any scalar ops.
2010          ???  We could hide it completely with making the permute node
2011          external?  */
2012       node = vect_create_new_slp_node (node, stmts, 1);
2013       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2014       SLP_TREE_LANE_PERMUTATION (node) = lperm;
2015       SLP_TREE_VECTYPE (node) = vectype;
2016       SLP_TREE_CHILDREN (node).quick_push (vnode);
2017       return node;
2018     }
2019   /* When discovery reaches an associatable operation see whether we can
2020      improve that to match up lanes in a way superior to the operand
2021      swapping code which at most looks at two defs.
2022      ???  For BB vectorization we cannot do the brute-force search
2023      for matching as we can succeed by means of builds from scalars
2024      and have no good way to "cost" one build against another.  */
2025   else if (is_a <loop_vec_info> (vinfo)
2026            /* ???  We don't handle !vect_internal_def defs below.  */
2027            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2028            && is_gimple_assign (stmt_info->stmt)
2029            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2030                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2031            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2032                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2033                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2034     {
2035       /* See if we have a chain of (mixed) adds or subtracts or other
2036          associatable ops.  */
2037       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2038       if (code == MINUS_EXPR)
2039         code = PLUS_EXPR;
2040       stmt_vec_info other_op_stmt_info = NULL;
2041       stmt_vec_info op_stmt_info = NULL;
2042       unsigned chain_len = 0;
2043       auto_vec<chain_op_t> chain;
2044       auto_vec<std::pair<tree_code, gimple *> > worklist;
2045       auto_vec<vec<chain_op_t> > chains (group_size);
2046       auto_vec<slp_tree, 4> children;
2047       bool hard_fail = true;
2048       for (unsigned lane = 0; lane < group_size; ++lane)
2049         {
2050           /* For each lane linearize the addition/subtraction (or other
2051              uniform associatable operation) expression tree.  */
2052           gimple *op_stmt = NULL, *other_op_stmt = NULL;
2053           vect_slp_linearize_chain (vinfo, worklist, chain, code,
2054                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
2055                                     NULL);
2056           if (!op_stmt_info && op_stmt)
2057             op_stmt_info = vinfo->lookup_stmt (op_stmt);
2058           if (!other_op_stmt_info && other_op_stmt)
2059             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2060           if (chain.length () == 2)
2061             {
2062               /* In a chain of just two elements resort to the regular
2063                  operand swapping scheme.  If we run into a length
2064                  mismatch still hard-FAIL.  */
2065               if (chain_len == 0)
2066                 hard_fail = false;
2067               else
2068                 {
2069                   matches[lane] = false;
2070                   /* ???  We might want to process the other lanes, but
2071                      make sure to not give false matching hints to the
2072                      caller for lanes we did not process.  */
2073                   if (lane != group_size - 1)
2074                     matches[0] = false;
2075                 }
2076               break;
2077             }
2078           else if (chain_len == 0)
2079             chain_len = chain.length ();
2080           else if (chain.length () != chain_len)
2081             {
2082               /* ???  Here we could slip in magic to compensate with
2083                  neutral operands.  */
2084               matches[lane] = false;
2085               if (lane != group_size - 1)
2086                 matches[0] = false;
2087               break;
2088             }
2089           chains.quick_push (chain.copy ());
2090           chain.truncate (0);
2091         }
2092       if (chains.length () == group_size)
2093         {
2094           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
2095           if (!op_stmt_info)
2096             {
2097               hard_fail = false;
2098               goto out;
2099             }
2100           /* Now we have a set of chains with the same length.  */
2101           /* 1. pre-sort according to def_type and operation.  */
2102           for (unsigned lane = 0; lane < group_size; ++lane)
2103             chains[lane].stablesort (dt_sort_cmp, vinfo);
2104           if (dump_enabled_p ())
2105             {
2106               dump_printf_loc (MSG_NOTE, vect_location,
2107                                "pre-sorted chains of %s\n",
2108                                get_tree_code_name (code));
2109               for (unsigned lane = 0; lane < group_size; ++lane)
2110                 {
2111                   for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2112                     dump_printf (MSG_NOTE, "%s %T ",
2113                                  get_tree_code_name (chains[lane][opnum].code),
2114                                  chains[lane][opnum].op);
2115                   dump_printf (MSG_NOTE, "\n");
2116                 }
2117             }
2118           /* 2. try to build children nodes, associating as necessary.  */
2119           for (unsigned n = 0; n < chain_len; ++n)
2120             {
2121               vect_def_type dt = chains[0][n].dt;
2122               unsigned lane;
2123               for (lane = 0; lane < group_size; ++lane)
2124                 if (chains[lane][n].dt != dt)
2125                   {
2126                     if (dt == vect_constant_def
2127                         && chains[lane][n].dt == vect_external_def)
2128                       dt = vect_external_def;
2129                     else if (dt == vect_external_def
2130                              && chains[lane][n].dt == vect_constant_def)
2131                       ;
2132                     else
2133                       break;
2134                   }
2135               if (lane != group_size)
2136                 {
2137                   if (dump_enabled_p ())
2138                     dump_printf_loc (MSG_NOTE, vect_location,
2139                                      "giving up on chain due to mismatched "
2140                                      "def types\n");
2141                   matches[lane] = false;
2142                   if (lane != group_size - 1)
2143                     matches[0] = false;
2144                   goto out;
2145                 }
2146               if (dt == vect_constant_def
2147                   || dt == vect_external_def)
2148                 {
2149                   /* Check whether we can build the invariant.  If we can't
2150                      we never will be able to.  */
2151                   tree type = TREE_TYPE (chains[0][n].op);
2152                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2153                       && (TREE_CODE (type) == BOOLEAN_TYPE
2154                           || !can_duplicate_and_interleave_p (vinfo, group_size,
2155                                                               type)))
2156                     {
2157                       matches[0] = false;
2158                       goto out;
2159                     }
2160                   vec<tree> ops;
2161                   ops.create (group_size);
2162                   for (lane = 0; lane < group_size; ++lane)
2163                     ops.quick_push (chains[lane][n].op);
2164                   slp_tree child = vect_create_new_slp_node (ops);
2165                   SLP_TREE_DEF_TYPE (child) = dt;
2166                   children.safe_push (child);
2167                 }
2168               else if (dt != vect_internal_def)
2169                 {
2170                   /* Not sure, we might need sth special.
2171                      gcc.dg/vect/pr96854.c,
2172                      gfortran.dg/vect/fast-math-pr37021.f90
2173                      and gfortran.dg/vect/pr61171.f trigger.  */
2174                   /* Soft-fail for now.  */
2175                   hard_fail = false;
2176                   goto out;
2177                 }
2178               else
2179                 {
2180                   vec<stmt_vec_info> op_stmts;
2181                   op_stmts.create (group_size);
2182                   slp_tree child = NULL;
2183                   /* Brute-force our way.  We have to consider a lane
2184                      failing after fixing an earlier fail up in the
2185                      SLP discovery recursion.  So track the current
2186                      permute per lane.  */
2187                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
2188                   memset (perms, 0, sizeof (unsigned) * group_size);
2189                   do
2190                     {
2191                       op_stmts.truncate (0);
2192                       for (lane = 0; lane < group_size; ++lane)
2193                         op_stmts.quick_push
2194                           (vinfo->lookup_def (chains[lane][n].op));
2195                       child = vect_build_slp_tree (vinfo, op_stmts,
2196                                                    group_size, &this_max_nunits,
2197                                                    matches, limit,
2198                                                    &this_tree_size, bst_map);
2199                       /* ???  We're likely getting too many fatal mismatches
2200                          here so maybe we want to ignore them (but then we
2201                          have no idea which lanes fatally mismatched).  */
2202                       if (child || !matches[0])
2203                         break;
2204                       /* Swap another lane we have not yet matched up into
2205                          lanes that did not match.  If we run out of
2206                          permute possibilities for a lane terminate the
2207                          search.  */
2208                       bool term = false;
2209                       for (lane = 1; lane < group_size; ++lane)
2210                         if (!matches[lane])
2211                           {
2212                             if (n + perms[lane] + 1 == chain_len)
2213                               {
2214                                 term = true;
2215                                 break;
2216                               }
2217                             std::swap (chains[lane][n],
2218                                        chains[lane][n + perms[lane] + 1]);
2219                             perms[lane]++;
2220                           }
2221                       if (term)
2222                         break;
2223                     }
2224                   while (1);
2225                   if (!child)
2226                     {
2227                       if (dump_enabled_p ())
2228                         dump_printf_loc (MSG_NOTE, vect_location,
2229                                          "failed to match up op %d\n", n);
2230                       op_stmts.release ();
2231                       if (lane != group_size - 1)
2232                         matches[0] = false;
2233                       else
2234                         matches[lane] = false;
2235                       goto out;
2236                     }
2237                   if (dump_enabled_p ())
2238                     {
2239                       dump_printf_loc (MSG_NOTE, vect_location,
2240                                        "matched up op %d to\n", n);
2241                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
2242                     }
2243                   children.safe_push (child);
2244                 }
2245             }
2246           /* 3. build SLP nodes to combine the chain.  */
2247           for (unsigned lane = 0; lane < group_size; ++lane)
2248             if (chains[lane][0].code != code)
2249               {
2250                 /* See if there's any alternate all-PLUS entry.  */
2251                 unsigned n;
2252                 for (n = 1; n < chain_len; ++n)
2253                   {
2254                     for (lane = 0; lane < group_size; ++lane)
2255                       if (chains[lane][n].code != code)
2256                         break;
2257                     if (lane == group_size)
2258                       break;
2259                   }
2260                 if (n != chain_len)
2261                   {
2262                     /* Swap that in at first position.  */
2263                     std::swap (children[0], children[n]);
2264                     for (lane = 0; lane < group_size; ++lane)
2265                       std::swap (chains[lane][0], chains[lane][n]);
2266                   }
2267                 else
2268                   {
2269                     /* ???  When this triggers and we end up with two
2270                        vect_constant/external_def up-front things break (ICE)
2271                        spectacularly finding an insertion place for the
2272                        all-constant op.  We should have a fully
2273                        vect_internal_def operand though(?) so we can swap
2274                        that into first place and then prepend the all-zero
2275                        constant.  */
2276                     if (dump_enabled_p ())
2277                       dump_printf_loc (MSG_NOTE, vect_location,
2278                                        "inserting constant zero to compensate "
2279                                        "for (partially) negated first "
2280                                        "operand\n");
2281                     chain_len++;
2282                     for (lane = 0; lane < group_size; ++lane)
2283                       chains[lane].safe_insert
2284                         (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2285                     vec<tree> zero_ops;
2286                     zero_ops.create (group_size);
2287                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2288                     for (lane = 1; lane < group_size; ++lane)
2289                       zero_ops.quick_push (zero_ops[0]);
2290                     slp_tree zero = vect_create_new_slp_node (zero_ops);
2291                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2292                     children.safe_insert (0, zero);
2293                   }
2294                 break;
2295               }
2296           for (unsigned i = 1; i < children.length (); ++i)
2297             {
2298               slp_tree op0 = children[i - 1];
2299               slp_tree op1 = children[i];
2300               bool this_two_op = false;
2301               for (unsigned lane = 0; lane < group_size; ++lane)
2302                 if (chains[lane][i].code != chains[0][i].code)
2303                   {
2304                     this_two_op = true;
2305                     break;
2306                   }
2307               slp_tree child;
2308               if (i == children.length () - 1)
2309                 child = vect_create_new_slp_node (node, stmts, 2);
2310               else
2311                 child = vect_create_new_slp_node (2, ERROR_MARK);
2312               if (this_two_op)
2313                 {
2314                   vec<std::pair<unsigned, unsigned> > lperm;
2315                   lperm.create (group_size);
2316                   for (unsigned lane = 0; lane < group_size; ++lane)
2317                     lperm.quick_push (std::make_pair
2318                       (chains[lane][i].code != chains[0][i].code, lane));
2319                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2320                                                      (chains[0][i].code == code
2321                                                       ? op_stmt_info
2322                                                       : other_op_stmt_info),
2323                                                      (chains[0][i].code == code
2324                                                       ? other_op_stmt_info
2325                                                       : op_stmt_info),
2326                                                      lperm);
2327                 }
2328               else
2329                 {
2330                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2331                   SLP_TREE_VECTYPE (child) = vectype;
2332                   SLP_TREE_LANES (child) = group_size;
2333                   SLP_TREE_CHILDREN (child).quick_push (op0);
2334                   SLP_TREE_CHILDREN (child).quick_push (op1);
2335                   SLP_TREE_REPRESENTATIVE (child)
2336                     = (chains[0][i].code == code
2337                        ? op_stmt_info : other_op_stmt_info);
2338                 }
2339               children[i] = child;
2340             }
2341           *tree_size += this_tree_size + 1;
2342           *max_nunits = this_max_nunits;
2343           while (!chains.is_empty ())
2344             chains.pop ().release ();
2345           return node;
2346         }
2347 out:
2348       while (!children.is_empty ())
2349         vect_free_slp_tree (children.pop ());
2350       while (!chains.is_empty ())
2351         chains.pop ().release ();
2352       /* Hard-fail, otherwise we might run into quadratic processing of the
2353          chains starting one stmt into the chain again.  */
2354       if (hard_fail)
2355         return NULL;
2356       /* Fall thru to normal processing.  */
2357     }
2358
2359   /* Get at the operands, verifying they are compatible.  */
2360   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2361   slp_oprnd_info oprnd_info;
2362   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2363     {
2364       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2365                                              stmts, i, &oprnds_info);
2366       if (res != 0)
2367         matches[(res == -1) ? 0 : i] = false;
2368       if (!matches[0])
2369         break;
2370     }
2371   for (i = 0; i < group_size; ++i)
2372     if (!matches[i])
2373       {
2374         vect_free_oprnd_info (oprnds_info);
2375         return NULL;
2376       }
2377   swap = NULL;
2378
2379   auto_vec<slp_tree, 4> children;
2380
2381   stmt_info = stmts[0];
2382
2383   /* Create SLP_TREE nodes for the definition node/s.  */
2384   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2385     {
2386       slp_tree child = nullptr;
2387       unsigned int j;
2388
2389       /* We're skipping certain operands from processing, for example
2390          outer loop reduction initial defs.  */
2391       if (skip_args[i])
2392         {
2393           children.safe_push (NULL);
2394           continue;
2395         }
2396
2397       if (oprnd_info->first_dt == vect_uninitialized_def)
2398         {
2399           /* COND_EXPR have one too many eventually if the condition
2400              is a SSA name.  */
2401           gcc_assert (i == 3 && nops == 4);
2402           continue;
2403         }
2404
2405       if (is_a <bb_vec_info> (vinfo)
2406           && oprnd_info->first_dt == vect_internal_def
2407           && !oprnd_info->any_pattern)
2408         {
2409           /* For BB vectorization, if all defs are the same do not
2410              bother to continue the build along the single-lane
2411              graph but use a splat of the scalar value.  */
2412           stmt_vec_info first_def = oprnd_info->def_stmts[0];
2413           for (j = 1; j < group_size; ++j)
2414             if (oprnd_info->def_stmts[j] != first_def)
2415               break;
2416           if (j == group_size
2417               /* But avoid doing this for loads where we may be
2418                  able to CSE things, unless the stmt is not
2419                  vectorizable.  */
2420               && (!STMT_VINFO_VECTORIZABLE (first_def)
2421                   || !gimple_vuse (first_def->stmt)))
2422             {
2423               if (dump_enabled_p ())
2424                 dump_printf_loc (MSG_NOTE, vect_location,
2425                                  "Using a splat of the uniform operand %G",
2426                                  first_def->stmt);
2427               oprnd_info->first_dt = vect_external_def;
2428             }
2429         }
2430
2431       if (oprnd_info->first_dt == vect_external_def
2432           || oprnd_info->first_dt == vect_constant_def)
2433         {
2434           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2435             {
2436               tree op0;
2437               tree uniform_val = op0 = oprnd_info->ops[0];
2438               for (j = 1; j < oprnd_info->ops.length (); ++j)
2439                 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2440                   {
2441                     uniform_val = NULL_TREE;
2442                     break;
2443                   }
2444               if (!uniform_val
2445                   && !can_duplicate_and_interleave_p (vinfo,
2446                                                       oprnd_info->ops.length (),
2447                                                       TREE_TYPE (op0)))
2448                 {
2449                   matches[j] = false;
2450                   if (dump_enabled_p ())
2451                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452                                      "Build SLP failed: invalid type of def "
2453                                      "for variable-length SLP %T\n", op0);
2454                   goto fail;
2455                 }
2456             }
2457           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2458           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2459           oprnd_info->ops = vNULL;
2460           children.safe_push (invnode);
2461           continue;
2462         }
2463
2464       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2465                                         group_size, &this_max_nunits,
2466                                         matches, limit,
2467                                         &this_tree_size, bst_map)) != NULL)
2468         {
2469           oprnd_info->def_stmts = vNULL;
2470           children.safe_push (child);
2471           continue;
2472         }
2473
2474       /* If the SLP build for operand zero failed and operand zero
2475          and one can be commutated try that for the scalar stmts
2476          that failed the match.  */
2477       if (i == 0
2478           /* A first scalar stmt mismatch signals a fatal mismatch.  */
2479           && matches[0]
2480           /* ???  For COND_EXPRs we can swap the comparison operands
2481              as well as the arms under some constraints.  */
2482           && nops == 2
2483           && oprnds_info[1]->first_dt == vect_internal_def
2484           && is_gimple_assign (stmt_info->stmt)
2485           /* Swapping operands for reductions breaks assumptions later on.  */
2486           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2487           && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2488         {
2489           /* See whether we can swap the matching or the non-matching
2490              stmt operands.  */
2491           bool swap_not_matching = true;
2492           do
2493             {
2494               for (j = 0; j < group_size; ++j)
2495                 {
2496                   if (matches[j] != !swap_not_matching)
2497                     continue;
2498                   stmt_vec_info stmt_info = stmts[j];
2499                   /* Verify if we can swap operands of this stmt.  */
2500                   gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2501                   if (!stmt
2502                       || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2503                     {
2504                       if (!swap_not_matching)
2505                         goto fail;
2506                       swap_not_matching = false;
2507                       break;
2508                     }
2509                 }
2510             }
2511           while (j != group_size);
2512
2513           /* Swap mismatched definition stmts.  */
2514           if (dump_enabled_p ())
2515             dump_printf_loc (MSG_NOTE, vect_location,
2516                              "Re-trying with swapped operands of stmts ");
2517           for (j = 0; j < group_size; ++j)
2518             if (matches[j] == !swap_not_matching)
2519               {
2520                 std::swap (oprnds_info[0]->def_stmts[j],
2521                            oprnds_info[1]->def_stmts[j]);
2522                 std::swap (oprnds_info[0]->ops[j],
2523                            oprnds_info[1]->ops[j]);
2524                 if (dump_enabled_p ())
2525                   dump_printf (MSG_NOTE, "%d ", j);
2526               }
2527           if (dump_enabled_p ())
2528             dump_printf (MSG_NOTE, "\n");
2529           /* After swapping some operands we lost track whether an
2530              operand has any pattern defs so be conservative here.  */
2531           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2532             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2533           /* And try again with scratch 'matches' ... */
2534           bool *tem = XALLOCAVEC (bool, group_size);
2535           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2536                                             group_size, &this_max_nunits,
2537                                             tem, limit,
2538                                             &this_tree_size, bst_map)) != NULL)
2539             {
2540               oprnd_info->def_stmts = vNULL;
2541               children.safe_push (child);
2542               continue;
2543             }
2544         }
2545 fail:
2546
2547       /* If the SLP build failed and we analyze a basic-block
2548          simply treat nodes we fail to build as externally defined
2549          (and thus build vectors from the scalar defs).
2550          The cost model will reject outright expensive cases.
2551          ???  This doesn't treat cases where permutation ultimatively
2552          fails (or we don't try permutation below).  Ideally we'd
2553          even compute a permutation that will end up with the maximum
2554          SLP tree size...  */
2555       if (is_a <bb_vec_info> (vinfo)
2556           /* ???  Rejecting patterns this way doesn't work.  We'd have to
2557              do extra work to cancel the pattern so the uses see the
2558              scalar version.  */
2559           && !is_pattern_stmt_p (stmt_info)
2560           && !oprnd_info->any_pattern)
2561         {
2562           /* But if there's a leading vector sized set of matching stmts
2563              fail here so we can split the group.  This matches the condition
2564              vect_analyze_slp_instance uses.  */
2565           /* ???  We might want to split here and combine the results to support
2566              multiple vector sizes better.  */
2567           for (j = 0; j < group_size; ++j)
2568             if (!matches[j])
2569               break;
2570           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2571             {
2572               if (dump_enabled_p ())
2573                 dump_printf_loc (MSG_NOTE, vect_location,
2574                                  "Building vector operands from scalars\n");
2575               this_tree_size++;
2576               child = vect_create_new_slp_node (oprnd_info->ops);
2577               children.safe_push (child);
2578               oprnd_info->ops = vNULL;
2579               continue;
2580             }
2581         }
2582
2583       gcc_assert (child == NULL);
2584       FOR_EACH_VEC_ELT (children, j, child)
2585         if (child)
2586           vect_free_slp_tree (child);
2587       vect_free_oprnd_info (oprnds_info);
2588       return NULL;
2589     }
2590
2591   vect_free_oprnd_info (oprnds_info);
2592
2593   /* If we have all children of a child built up from uniform scalars
2594      or does more than one possibly expensive vector construction then
2595      just throw that away, causing it built up from scalars.
2596      The exception is the SLP node for the vector store.  */
2597   if (is_a <bb_vec_info> (vinfo)
2598       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2599       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2600          do extra work to cancel the pattern so the uses see the
2601          scalar version.  */
2602       && !is_pattern_stmt_p (stmt_info))
2603     {
2604       slp_tree child;
2605       unsigned j;
2606       bool all_uniform_p = true;
2607       unsigned n_vector_builds = 0;
2608       FOR_EACH_VEC_ELT (children, j, child)
2609         {
2610           if (!child)
2611             ;
2612           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2613             all_uniform_p = false;
2614           else if (!vect_slp_tree_uniform_p (child))
2615             {
2616               all_uniform_p = false;
2617               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2618                 n_vector_builds++;
2619             }
2620         }
2621       if (all_uniform_p
2622           || n_vector_builds > 1
2623           || (n_vector_builds == children.length ()
2624               && is_a <gphi *> (stmt_info->stmt)))
2625         {
2626           /* Roll back.  */
2627           matches[0] = false;
2628           FOR_EACH_VEC_ELT (children, j, child)
2629             if (child)
2630               vect_free_slp_tree (child);
2631
2632           if (dump_enabled_p ())
2633             dump_printf_loc (MSG_NOTE, vect_location,
2634                              "Building parent vector operands from "
2635                              "scalars instead\n");
2636           return NULL;
2637         }
2638     }
2639
2640   *tree_size += this_tree_size + 1;
2641   *max_nunits = this_max_nunits;
2642
2643   if (two_operators)
2644     {
2645       /* ???  We'd likely want to either cache in bst_map sth like
2646          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2647          the true { a+b, a+b, a+b, a+b } ... but there we don't have
2648          explicit stmts to put in so the keying on 'stmts' doesn't
2649          work (but we have the same issue with nodes that use 'ops').  */
2650       slp_tree one = new _slp_tree;
2651       slp_tree two = new _slp_tree;
2652       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2653       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2654       SLP_TREE_VECTYPE (one) = vectype;
2655       SLP_TREE_VECTYPE (two) = vectype;
2656       SLP_TREE_CHILDREN (one).safe_splice (children);
2657       SLP_TREE_CHILDREN (two).safe_splice (children);
2658       slp_tree child;
2659       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2660         SLP_TREE_REF_COUNT (child)++;
2661
2662       /* Here we record the original defs since this
2663          node represents the final lane configuration.  */
2664       node = vect_create_new_slp_node (node, stmts, 2);
2665       SLP_TREE_VECTYPE (node) = vectype;
2666       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2667       SLP_TREE_CHILDREN (node).quick_push (one);
2668       SLP_TREE_CHILDREN (node).quick_push (two);
2669       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2670       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2671       enum tree_code ocode = ERROR_MARK;
2672       stmt_vec_info ostmt_info;
2673       unsigned j = 0;
2674       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2675         {
2676           gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2677           if (gimple_assign_rhs_code (ostmt) != code0)
2678             {
2679               SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2680               ocode = gimple_assign_rhs_code (ostmt);
2681               j = i;
2682             }
2683           else
2684             SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2685         }
2686       SLP_TREE_CODE (one) = code0;
2687       SLP_TREE_CODE (two) = ocode;
2688       SLP_TREE_LANES (one) = stmts.length ();
2689       SLP_TREE_LANES (two) = stmts.length ();
2690       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2691       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2692       return node;
2693     }
2694
2695   node = vect_create_new_slp_node (node, stmts, nops);
2696   SLP_TREE_VECTYPE (node) = vectype;
2697   SLP_TREE_CHILDREN (node).splice (children);
2698   return node;
2699 }
2700
2701 /* Dump a single SLP tree NODE.  */
2702
2703 static void
2704 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2705                      slp_tree node)
2706 {
2707   unsigned i, j;
2708   slp_tree child;
2709   stmt_vec_info stmt_info;
2710   tree op;
2711
2712   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2713   dump_user_location_t user_loc = loc.get_user_location ();
2714   dump_printf_loc (metadata, user_loc,
2715                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2716                    ", refcnt=%u)",
2717                    SLP_TREE_DEF_TYPE (node) == vect_external_def
2718                    ? " (external)"
2719                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2720                       ? " (constant)"
2721                       : ""), (void *) node,
2722                    estimated_poly_value (node->max_nunits),
2723                                          SLP_TREE_REF_COUNT (node));
2724   if (SLP_TREE_VECTYPE (node))
2725     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2726   dump_printf (metadata, "\n");
2727   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2728     {
2729       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2730         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2731       else
2732         dump_printf_loc (metadata, user_loc, "op template: %G",
2733                          SLP_TREE_REPRESENTATIVE (node)->stmt);
2734     }
2735   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2736     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2737       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2738   else
2739     {
2740       dump_printf_loc (metadata, user_loc, "\t{ ");
2741       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2742         dump_printf (metadata, "%T%s ", op,
2743                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2744       dump_printf (metadata, "}\n");
2745     }
2746   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2747     {
2748       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2749       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2750         dump_printf (dump_kind, " %u", j);
2751       dump_printf (dump_kind, " }\n");
2752     }
2753   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2754     {
2755       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2756       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2757         dump_printf (dump_kind, " %u[%u]",
2758                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
2759                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
2760       dump_printf (dump_kind, " }\n");
2761     }
2762   if (SLP_TREE_CHILDREN (node).is_empty ())
2763     return;
2764   dump_printf_loc (metadata, user_loc, "\tchildren");
2765   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2766     dump_printf (dump_kind, " %p", (void *)child);
2767   dump_printf (dump_kind, "\n");
2768 }
2769
2770 DEBUG_FUNCTION void
2771 debug (slp_tree node)
2772 {
2773   debug_dump_context ctx;
2774   vect_print_slp_tree (MSG_NOTE,
2775                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2776                        node);
2777 }
2778
2779 /* Recursive helper for the dot producer below.  */
2780
2781 static void
2782 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2783 {
2784   if (visited.add (node))
2785     return;
2786
2787   fprintf (f, "\"%p\" [label=\"", (void *)node);
2788   vect_print_slp_tree (MSG_NOTE,
2789                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
2790                        node);
2791   fprintf (f, "\"];\n");
2792
2793
2794   for (slp_tree child : SLP_TREE_CHILDREN (node))
2795     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2796
2797   for (slp_tree child : SLP_TREE_CHILDREN (node))
2798     if (child)
2799       dot_slp_tree (f, child, visited);
2800 }
2801
2802 DEBUG_FUNCTION void
2803 dot_slp_tree (const char *fname, slp_tree node)
2804 {
2805   FILE *f = fopen (fname, "w");
2806   fprintf (f, "digraph {\n");
2807   fflush (f);
2808     {
2809       debug_dump_context ctx (f);
2810       hash_set<slp_tree> visited;
2811       dot_slp_tree (f, node, visited);
2812     }
2813   fflush (f);
2814   fprintf (f, "}\n");
2815   fclose (f);
2816 }
2817
2818 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2819
2820 static void
2821 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2822                       slp_tree node, hash_set<slp_tree> &visited)
2823 {
2824   unsigned i;
2825   slp_tree child;
2826
2827   if (visited.add (node))
2828     return;
2829
2830   vect_print_slp_tree (dump_kind, loc, node);
2831
2832   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2833     if (child)
2834       vect_print_slp_graph (dump_kind, loc, child, visited);
2835 }
2836
2837 static void
2838 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2839                       slp_tree entry)
2840 {
2841   hash_set<slp_tree> visited;
2842   vect_print_slp_graph (dump_kind, loc, entry, visited);
2843 }
2844
2845 /* Mark the tree rooted at NODE with PURE_SLP.  */
2846
2847 static void
2848 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2849 {
2850   int i;
2851   stmt_vec_info stmt_info;
2852   slp_tree child;
2853
2854   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2855     return;
2856
2857   if (visited.add (node))
2858     return;
2859
2860   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2861     STMT_SLP_TYPE (stmt_info) = pure_slp;
2862
2863   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2864     if (child)
2865       vect_mark_slp_stmts (child, visited);
2866 }
2867
2868 static void
2869 vect_mark_slp_stmts (slp_tree node)
2870 {
2871   hash_set<slp_tree> visited;
2872   vect_mark_slp_stmts (node, visited);
2873 }
2874
2875 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2876
2877 static void
2878 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2879 {
2880   int i;
2881   stmt_vec_info stmt_info;
2882   slp_tree child;
2883
2884   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2885     return;
2886
2887   if (visited.add (node))
2888     return;
2889
2890   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2891     {
2892       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2893                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2894       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2895     }
2896
2897   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2898     if (child)
2899       vect_mark_slp_stmts_relevant (child, visited);
2900 }
2901
2902 static void
2903 vect_mark_slp_stmts_relevant (slp_tree node)
2904 {
2905   hash_set<slp_tree> visited;
2906   vect_mark_slp_stmts_relevant (node, visited);
2907 }
2908
2909
2910 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
2911
2912 static void
2913 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2914                        hash_set<slp_tree> &visited)
2915 {
2916   if (!node || visited.add (node))
2917     return;
2918
2919   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2920     return;
2921
2922   if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2923     {
2924       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2925       if (STMT_VINFO_DATA_REF (stmt_info)
2926           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2927         loads.safe_push (node);
2928     }
2929
2930   unsigned i;
2931   slp_tree child;
2932   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2933     vect_gather_slp_loads (loads, child, visited);
2934 }
2935
2936
2937 /* Find the last store in SLP INSTANCE.  */
2938
2939 stmt_vec_info
2940 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2941 {
2942   stmt_vec_info last = NULL;
2943   stmt_vec_info stmt_vinfo;
2944
2945   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2946     {
2947       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2948       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2949     }
2950
2951   return last;
2952 }
2953
2954 /* Find the first stmt in NODE.  */
2955
2956 stmt_vec_info
2957 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2958 {
2959   stmt_vec_info first = NULL;
2960   stmt_vec_info stmt_vinfo;
2961
2962   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2963     {
2964       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2965       if (!first
2966           || get_later_stmt (stmt_vinfo, first) == first)
2967         first = stmt_vinfo;
2968     }
2969
2970   return first;
2971 }
2972
2973 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2974    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2975    (also containing the first GROUP1_SIZE stmts, since stores are
2976    consecutive), the second containing the remainder.
2977    Return the first stmt in the second group.  */
2978
2979 static stmt_vec_info
2980 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2981 {
2982   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2983   gcc_assert (group1_size > 0);
2984   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2985   gcc_assert (group2_size > 0);
2986   DR_GROUP_SIZE (first_vinfo) = group1_size;
2987
2988   stmt_vec_info stmt_info = first_vinfo;
2989   for (unsigned i = group1_size; i > 1; i--)
2990     {
2991       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2992       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2993     }
2994   /* STMT is now the last element of the first group.  */
2995   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2996   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2997
2998   DR_GROUP_SIZE (group2) = group2_size;
2999   for (stmt_info = group2; stmt_info;
3000        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3001     {
3002       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3003       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3004     }
3005
3006   /* For the second group, the DR_GROUP_GAP is that before the original group,
3007      plus skipping over the first vector.  */
3008   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3009
3010   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
3011   DR_GROUP_GAP (first_vinfo) += group2_size;
3012
3013   if (dump_enabled_p ())
3014     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3015                      group1_size, group2_size);
3016
3017   return group2;
3018 }
3019
3020 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3021    statements and a vector of NUNITS elements.  */
3022
3023 static poly_uint64
3024 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3025 {
3026   return exact_div (common_multiple (nunits, group_size), group_size);
3027 }
3028
3029 /* Helper that checks to see if a node is a load node.  */
3030
3031 static inline bool
3032 vect_is_slp_load_node  (slp_tree root)
3033 {
3034   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3035          && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3036          && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3037 }
3038
3039
3040 /* Helper function of optimize_load_redistribution that performs the operation
3041    recursively.  */
3042
3043 static slp_tree
3044 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3045                                 vec_info *vinfo, unsigned int group_size,
3046                                 hash_map<slp_tree, slp_tree> *load_map,
3047                                 slp_tree root)
3048 {
3049   if (slp_tree *leader = load_map->get (root))
3050     return *leader;
3051
3052   slp_tree node;
3053   unsigned i;
3054
3055   /* For now, we don't know anything about externals so do not do anything.  */
3056   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3057     return NULL;
3058   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3059     {
3060       /* First convert this node into a load node and add it to the leaves
3061          list and flatten the permute from a lane to a load one.  If it's
3062          unneeded it will be elided later.  */
3063       vec<stmt_vec_info> stmts;
3064       stmts.create (SLP_TREE_LANES (root));
3065       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3066       for (unsigned j = 0; j < lane_perm.length (); j++)
3067         {
3068           std::pair<unsigned, unsigned> perm = lane_perm[j];
3069           node = SLP_TREE_CHILDREN (root)[perm.first];
3070
3071           if (!vect_is_slp_load_node (node)
3072               || SLP_TREE_CHILDREN (node).exists ())
3073             {
3074               stmts.release ();
3075               goto next;
3076             }
3077
3078           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3079         }
3080
3081       if (dump_enabled_p ())
3082         dump_printf_loc (MSG_NOTE, vect_location,
3083                          "converting stmts on permute node %p\n",
3084                          (void *) root);
3085
3086       bool *matches = XALLOCAVEC (bool, group_size);
3087       poly_uint64 max_nunits = 1;
3088       unsigned tree_size = 0, limit = 1;
3089       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3090                                   matches, &limit, &tree_size, bst_map);
3091       if (!node)
3092         stmts.release ();
3093
3094       load_map->put (root, node);
3095       return node;
3096     }
3097
3098 next:
3099   load_map->put (root, NULL);
3100
3101   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3102     {
3103       slp_tree value
3104         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3105                                           node);
3106       if (value)
3107         {
3108           SLP_TREE_REF_COUNT (value)++;
3109           SLP_TREE_CHILDREN (root)[i] = value;
3110           /* ???  We know the original leafs of the replaced nodes will
3111              be referenced by bst_map, only the permutes created by
3112              pattern matching are not.  */
3113           if (SLP_TREE_REF_COUNT (node) == 1)
3114             load_map->remove (node);
3115           vect_free_slp_tree (node);
3116         }
3117     }
3118
3119   return NULL;
3120 }
3121
3122 /* Temporary workaround for loads not being CSEd during SLP build.  This
3123    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3124    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3125    same DR such that the final operation is equal to a permuted load.  Such
3126    NODES are then directly converted into LOADS themselves.  The nodes are
3127    CSEd using BST_MAP.  */
3128
3129 static void
3130 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3131                               vec_info *vinfo, unsigned int group_size,
3132                               hash_map<slp_tree, slp_tree> *load_map,
3133                               slp_tree root)
3134 {
3135   slp_tree node;
3136   unsigned i;
3137
3138   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3139     {
3140       slp_tree value
3141         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3142                                           node);
3143       if (value)
3144         {
3145           SLP_TREE_REF_COUNT (value)++;
3146           SLP_TREE_CHILDREN (root)[i] = value;
3147           /* ???  We know the original leafs of the replaced nodes will
3148              be referenced by bst_map, only the permutes created by
3149              pattern matching are not.  */
3150           if (SLP_TREE_REF_COUNT (node) == 1)
3151             load_map->remove (node);
3152           vect_free_slp_tree (node);
3153         }
3154     }
3155 }
3156
3157 /* Helper function of vect_match_slp_patterns.
3158
3159    Attempts to match patterns against the slp tree rooted in REF_NODE using
3160    VINFO.  Patterns are matched in post-order traversal.
3161
3162    If matching is successful the value in REF_NODE is updated and returned, if
3163    not then it is returned unchanged.  */
3164
3165 static bool
3166 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3167                            slp_tree_to_load_perm_map_t *perm_cache,
3168                            slp_compat_nodes_map_t *compat_cache,
3169                            hash_set<slp_tree> *visited)
3170 {
3171   unsigned i;
3172   slp_tree node = *ref_node;
3173   bool found_p = false;
3174   if (!node || visited->add (node))
3175     return false;
3176
3177   slp_tree child;
3178   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3179     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3180                                           vinfo, perm_cache, compat_cache,
3181                                           visited);
3182
3183   for (unsigned x = 0; x < num__slp_patterns; x++)
3184     {
3185       vect_pattern *pattern
3186         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3187       if (pattern)
3188         {
3189           pattern->build (vinfo);
3190           delete pattern;
3191           found_p = true;
3192         }
3193     }
3194
3195   return found_p;
3196 }
3197
3198 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3199    vec_info VINFO.
3200
3201    The modified tree is returned.  Patterns are tried in order and multiple
3202    patterns may match.  */
3203
3204 static bool
3205 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3206                          hash_set<slp_tree> *visited,
3207                          slp_tree_to_load_perm_map_t *perm_cache,
3208                          slp_compat_nodes_map_t *compat_cache)
3209 {
3210   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3211   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3212
3213   if (dump_enabled_p ())
3214     dump_printf_loc (MSG_NOTE, vect_location,
3215                      "Analyzing SLP tree %p for patterns\n",
3216                      (void *) SLP_INSTANCE_TREE (instance));
3217
3218   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3219                                     visited);
3220 }
3221
3222 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3223    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3224    Return true if we could use IFN_STORE_LANES instead and if that appears
3225    to be the better approach.  */
3226
3227 static bool
3228 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3229                                unsigned int group_size,
3230                                unsigned int new_group_size)
3231 {
3232   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3233   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3234   if (!vectype)
3235     return false;
3236   /* Allow the split if one of the two new groups would operate on full
3237      vectors *within* rather than across one scalar loop iteration.
3238      This is purely a heuristic, but it should work well for group
3239      sizes of 3 and 4, where the possible splits are:
3240
3241        3->2+1:  OK if the vector has exactly two elements
3242        4->2+2:  Likewise
3243        4->3+1:  Less clear-cut.  */
3244   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3245       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3246     return false;
3247   return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3248 }
3249
3250 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3251    vect_build_slp_tree to build a tree of packed stmts if possible.
3252    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3253
3254 static bool
3255 vect_analyze_slp_instance (vec_info *vinfo,
3256                            scalar_stmts_to_slp_tree_map_t *bst_map,
3257                            stmt_vec_info stmt_info, slp_instance_kind kind,
3258                            unsigned max_tree_size, unsigned *limit);
3259
3260 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3261    of KIND.  Return true if successful.  */
3262
3263 static bool
3264 vect_build_slp_instance (vec_info *vinfo,
3265                          slp_instance_kind kind,
3266                          vec<stmt_vec_info> &scalar_stmts,
3267                          vec<stmt_vec_info> &root_stmt_infos,
3268                          vec<tree> &remain,
3269                          unsigned max_tree_size, unsigned *limit,
3270                          scalar_stmts_to_slp_tree_map_t *bst_map,
3271                          /* ???  We need stmt_info for group splitting.  */
3272                          stmt_vec_info stmt_info_)
3273 {
3274   if (kind == slp_inst_kind_ctor)
3275     {
3276       if (dump_enabled_p ())
3277         dump_printf_loc (MSG_NOTE, vect_location,
3278                          "Analyzing vectorizable constructor: %G\n",
3279                          root_stmt_infos[0]->stmt);
3280     }
3281
3282   if (dump_enabled_p ())
3283     {
3284       dump_printf_loc (MSG_NOTE, vect_location,
3285                        "Starting SLP discovery for\n");
3286       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3287         dump_printf_loc (MSG_NOTE, vect_location,
3288                          "  %G", scalar_stmts[i]->stmt);
3289     }
3290
3291   /* When a BB reduction doesn't have an even number of lanes
3292      strip it down, treating the remaining lane as scalar.
3293      ???  Selecting the optimal set of lanes to vectorize would be nice
3294      but SLP build for all lanes will fail quickly because we think
3295      we're going to need unrolling.  */
3296   if (kind == slp_inst_kind_bb_reduc
3297       && (scalar_stmts.length () & 1))
3298     remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3299
3300   /* Build the tree for the SLP instance.  */
3301   unsigned int group_size = scalar_stmts.length ();
3302   bool *matches = XALLOCAVEC (bool, group_size);
3303   poly_uint64 max_nunits = 1;
3304   unsigned tree_size = 0;
3305   unsigned i;
3306   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3307                                        &max_nunits, matches, limit,
3308                                        &tree_size, bst_map);
3309   if (node != NULL)
3310     {
3311       /* Calculate the unrolling factor based on the smallest type.  */
3312       poly_uint64 unrolling_factor
3313         = calculate_unrolling_factor (max_nunits, group_size);
3314
3315       if (maybe_ne (unrolling_factor, 1U)
3316           && is_a <bb_vec_info> (vinfo))
3317         {
3318           unsigned HOST_WIDE_INT const_max_nunits;
3319           if (!max_nunits.is_constant (&const_max_nunits)
3320               || const_max_nunits > group_size)
3321             {
3322               if (dump_enabled_p ())
3323                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3324                                  "Build SLP failed: store group "
3325                                  "size not a multiple of the vector size "
3326                                  "in basic block SLP\n");
3327               vect_free_slp_tree (node);
3328               return false;
3329             }
3330           /* Fatal mismatch.  */
3331           if (dump_enabled_p ())
3332             dump_printf_loc (MSG_NOTE, vect_location,
3333                              "SLP discovery succeeded but node needs "
3334                              "splitting\n");
3335           memset (matches, true, group_size);
3336           matches[group_size / const_max_nunits * const_max_nunits] = false;
3337           vect_free_slp_tree (node);
3338         }
3339       else
3340         {
3341           /* Create a new SLP instance.  */
3342           slp_instance new_instance = XNEW (class _slp_instance);
3343           SLP_INSTANCE_TREE (new_instance) = node;
3344           SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3345           SLP_INSTANCE_LOADS (new_instance) = vNULL;
3346           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3347           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3348           SLP_INSTANCE_KIND (new_instance) = kind;
3349           new_instance->reduc_phis = NULL;
3350           new_instance->cost_vec = vNULL;
3351           new_instance->subgraph_entries = vNULL;
3352
3353           if (dump_enabled_p ())
3354             dump_printf_loc (MSG_NOTE, vect_location,
3355                              "SLP size %u vs. limit %u.\n",
3356                              tree_size, max_tree_size);
3357
3358           /* Fixup SLP reduction chains.  */
3359           if (kind == slp_inst_kind_reduc_chain)
3360             {
3361               /* If this is a reduction chain with a conversion in front
3362                  amend the SLP tree with a node for that.  */
3363               gimple *scalar_def
3364                 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3365               if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3366                 {
3367                   /* Get at the conversion stmt - we know it's the single use
3368                      of the last stmt of the reduction chain.  */
3369                   use_operand_p use_p;
3370                   bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3371                                            &use_p, &scalar_def);
3372                   gcc_assert (r);
3373                   stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3374                   next_info = vect_stmt_to_vectorize (next_info);
3375                   scalar_stmts = vNULL;
3376                   scalar_stmts.create (group_size);
3377                   for (unsigned i = 0; i < group_size; ++i)
3378                     scalar_stmts.quick_push (next_info);
3379                   slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3380                   SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3381                   SLP_TREE_CHILDREN (conv).quick_push (node);
3382                   SLP_INSTANCE_TREE (new_instance) = conv;
3383                   /* We also have to fake this conversion stmt as SLP reduction
3384                      group so we don't have to mess with too much code
3385                      elsewhere.  */
3386                   REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3387                   REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3388                 }
3389               /* Fill the backedge child of the PHI SLP node.  The
3390                  general matching code cannot find it because the
3391                  scalar code does not reflect how we vectorize the
3392                  reduction.  */
3393               use_operand_p use_p;
3394               imm_use_iterator imm_iter;
3395               class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3396               FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3397                                      gimple_get_lhs (scalar_def))
3398                 /* There are exactly two non-debug uses, the reduction
3399                    PHI and the loop-closed PHI node.  */
3400                 if (!is_gimple_debug (USE_STMT (use_p))
3401                     && gimple_bb (USE_STMT (use_p)) == loop->header)
3402                   {
3403                     auto_vec<stmt_vec_info, 64> phis (group_size);
3404                     stmt_vec_info phi_info
3405                       = vinfo->lookup_stmt (USE_STMT (use_p));
3406                     for (unsigned i = 0; i < group_size; ++i)
3407                       phis.quick_push (phi_info);
3408                     slp_tree *phi_node = bst_map->get (phis);
3409                     unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3410                     SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3411                       = SLP_INSTANCE_TREE (new_instance);
3412                     SLP_INSTANCE_TREE (new_instance)->refcnt++;
3413                   }
3414             }
3415
3416           vinfo->slp_instances.safe_push (new_instance);
3417
3418           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3419              the number of scalar stmts in the root in a few places.
3420              Verify that assumption holds.  */
3421           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3422                         .length () == group_size);
3423
3424           if (dump_enabled_p ())
3425             {
3426               dump_printf_loc (MSG_NOTE, vect_location,
3427                                "Final SLP tree for instance %p:\n",
3428                                (void *) new_instance);
3429               vect_print_slp_graph (MSG_NOTE, vect_location,
3430                                     SLP_INSTANCE_TREE (new_instance));
3431             }
3432
3433           return true;
3434         }
3435     }
3436   else
3437     {
3438       /* Failed to SLP.  */
3439       /* Free the allocated memory.  */
3440       scalar_stmts.release ();
3441     }
3442
3443   stmt_vec_info stmt_info = stmt_info_;
3444   /* Try to break the group up into pieces.  */
3445   if (kind == slp_inst_kind_store)
3446     {
3447       /* ???  We could delay all the actual splitting of store-groups
3448          until after SLP discovery of the original group completed.
3449          Then we can recurse to vect_build_slp_instance directly.  */
3450       for (i = 0; i < group_size; i++)
3451         if (!matches[i])
3452           break;
3453
3454       /* For basic block SLP, try to break the group up into multiples of
3455          a vector size.  */
3456       if (is_a <bb_vec_info> (vinfo)
3457           && (i > 1 && i < group_size))
3458         {
3459           tree scalar_type
3460             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3461           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3462                                                       1 << floor_log2 (i));
3463           unsigned HOST_WIDE_INT const_nunits;
3464           if (vectype
3465               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3466             {
3467               /* Split into two groups at the first vector boundary.  */
3468               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3469               unsigned group1_size = i & ~(const_nunits - 1);
3470
3471               if (dump_enabled_p ())
3472                 dump_printf_loc (MSG_NOTE, vect_location,
3473                                  "Splitting SLP group at stmt %u\n", i);
3474               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3475                                                                group1_size);
3476               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3477                                                     kind, max_tree_size,
3478                                                     limit);
3479               /* Split the rest at the failure point and possibly
3480                  re-analyze the remaining matching part if it has
3481                  at least two lanes.  */
3482               if (group1_size < i
3483                   && (i + 1 < group_size
3484                       || i - group1_size > 1))
3485                 {
3486                   stmt_vec_info rest2 = rest;
3487                   rest = vect_split_slp_store_group (rest, i - group1_size);
3488                   if (i - group1_size > 1)
3489                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3490                                                       kind, max_tree_size,
3491                                                       limit);
3492                 }
3493               /* Re-analyze the non-matching tail if it has at least
3494                  two lanes.  */
3495               if (i + 1 < group_size)
3496                 res |= vect_analyze_slp_instance (vinfo, bst_map,
3497                                                   rest, kind, max_tree_size,
3498                                                   limit);
3499               return res;
3500             }
3501         }
3502
3503       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3504       if (is_a <loop_vec_info> (vinfo)
3505           && (i > 1 && i < group_size)
3506           && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3507         {
3508           unsigned group1_size = i;
3509
3510           if (dump_enabled_p ())
3511             dump_printf_loc (MSG_NOTE, vect_location,
3512                              "Splitting SLP group at stmt %u\n", i);
3513
3514           stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3515                                                            group1_size);
3516           /* Loop vectorization cannot handle gaps in stores, make sure
3517              the split group appears as strided.  */
3518           STMT_VINFO_STRIDED_P (rest) = 1;
3519           DR_GROUP_GAP (rest) = 0;
3520           STMT_VINFO_STRIDED_P (stmt_info) = 1;
3521           DR_GROUP_GAP (stmt_info) = 0;
3522
3523           bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3524                                                 kind, max_tree_size, limit);
3525           if (i + 1 < group_size)
3526             res |= vect_analyze_slp_instance (vinfo, bst_map,
3527                                               rest, kind, max_tree_size, limit);
3528
3529           return res;
3530         }
3531
3532       /* Even though the first vector did not all match, we might be able to SLP
3533          (some) of the remainder.  FORNOW ignore this possibility.  */
3534     }
3535
3536   /* Failed to SLP.  */
3537   if (dump_enabled_p ())
3538     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3539   return false;
3540 }
3541
3542
3543 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3544    vect_build_slp_tree to build a tree of packed stmts if possible.
3545    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3546
3547 static bool
3548 vect_analyze_slp_instance (vec_info *vinfo,
3549                            scalar_stmts_to_slp_tree_map_t *bst_map,
3550                            stmt_vec_info stmt_info,
3551                            slp_instance_kind kind,
3552                            unsigned max_tree_size, unsigned *limit)
3553 {
3554   unsigned int i;
3555   vec<stmt_vec_info> scalar_stmts;
3556
3557   if (is_a <bb_vec_info> (vinfo))
3558     vect_location = stmt_info->stmt;
3559
3560   stmt_vec_info next_info = stmt_info;
3561   if (kind == slp_inst_kind_store)
3562     {
3563       /* Collect the stores and store them in scalar_stmts.  */
3564       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3565       while (next_info)
3566         {
3567           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3568           next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3569         }
3570     }
3571   else if (kind == slp_inst_kind_reduc_chain)
3572     {
3573       /* Collect the reduction stmts and store them in scalar_stmts.  */
3574       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3575       while (next_info)
3576         {
3577           scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578           next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3579         }
3580       /* Mark the first element of the reduction chain as reduction to properly
3581          transform the node.  In the reduction analysis phase only the last
3582          element of the chain is marked as reduction.  */
3583       STMT_VINFO_DEF_TYPE (stmt_info)
3584         = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3585       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3586         = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3587     }
3588   else if (kind == slp_inst_kind_reduc_group)
3589     {
3590       /* Collect reduction statements.  */
3591       const vec<stmt_vec_info> &reductions
3592         = as_a <loop_vec_info> (vinfo)->reductions;
3593       scalar_stmts.create (reductions.length ());
3594       for (i = 0; reductions.iterate (i, &next_info); i++)
3595         if ((STMT_VINFO_RELEVANT_P (next_info)
3596              || STMT_VINFO_LIVE_P (next_info))
3597             /* ???  Make sure we didn't skip a conversion around a reduction
3598                path.  In that case we'd have to reverse engineer that conversion
3599                stmt following the chain using reduc_idx and from the PHI
3600                using reduc_def.  */
3601             && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3602           scalar_stmts.quick_push (next_info);
3603       /* If less than two were relevant/live there's nothing to SLP.  */
3604       if (scalar_stmts.length () < 2)
3605         return false;
3606     }
3607   else
3608     gcc_unreachable ();
3609
3610   vec<stmt_vec_info> roots = vNULL;
3611   vec<tree> remain = vNULL;
3612   /* Build the tree for the SLP instance.  */
3613   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3614                                       roots, remain,
3615                                       max_tree_size, limit, bst_map,
3616                                       kind == slp_inst_kind_store
3617                                       ? stmt_info : NULL);
3618
3619   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3620      where we should do store group splitting.  */
3621
3622   return res;
3623 }
3624
3625 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
3626    trees of packed scalar stmts if SLP is possible.  */
3627
3628 opt_result
3629 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3630 {
3631   unsigned int i;
3632   stmt_vec_info first_element;
3633   slp_instance instance;
3634
3635   DUMP_VECT_SCOPE ("vect_analyze_slp");
3636
3637   unsigned limit = max_tree_size;
3638
3639   scalar_stmts_to_slp_tree_map_t *bst_map
3640     = new scalar_stmts_to_slp_tree_map_t ();
3641
3642   /* Find SLP sequences starting from groups of grouped stores.  */
3643   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3644     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3645                                slp_inst_kind_store, max_tree_size, &limit);
3646
3647   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3648     {
3649       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3650         {
3651           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3652           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3653                                        bb_vinfo->roots[i].stmts,
3654                                        bb_vinfo->roots[i].roots,
3655                                        bb_vinfo->roots[i].remain,
3656                                        max_tree_size, &limit, bst_map, NULL))
3657             {
3658               bb_vinfo->roots[i].stmts = vNULL;
3659               bb_vinfo->roots[i].roots = vNULL;
3660               bb_vinfo->roots[i].remain = vNULL;
3661             }
3662         }
3663     }
3664
3665   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3666     {
3667       /* Find SLP sequences starting from reduction chains.  */
3668       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3669         if (! STMT_VINFO_RELEVANT_P (first_element)
3670             && ! STMT_VINFO_LIVE_P (first_element))
3671           ;
3672         else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3673                                               slp_inst_kind_reduc_chain,
3674                                               max_tree_size, &limit))
3675           {
3676             /* Dissolve reduction chain group.  */
3677             stmt_vec_info vinfo = first_element;
3678             stmt_vec_info last = NULL;
3679             while (vinfo)
3680               {
3681                 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3682                 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3683                 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3684                 last = vinfo;
3685                 vinfo = next;
3686               }
3687             STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3688             /* It can be still vectorized as part of an SLP reduction.  */
3689             loop_vinfo->reductions.safe_push (last);
3690           }
3691
3692       /* Find SLP sequences starting from groups of reductions.  */
3693       if (loop_vinfo->reductions.length () > 1)
3694         vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3695                                    slp_inst_kind_reduc_group, max_tree_size,
3696                                    &limit);
3697     }
3698
3699   hash_set<slp_tree> visited_patterns;
3700   slp_tree_to_load_perm_map_t perm_cache;
3701   slp_compat_nodes_map_t compat_cache;
3702
3703   /* See if any patterns can be found in the SLP tree.  */
3704   bool pattern_found = false;
3705   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3706     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3707                                               &visited_patterns, &perm_cache,
3708                                               &compat_cache);
3709
3710   /* If any were found optimize permutations of loads.  */
3711   if (pattern_found)
3712     {
3713       hash_map<slp_tree, slp_tree> load_map;
3714       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3715         {
3716           slp_tree root = SLP_INSTANCE_TREE (instance);
3717           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3718                                         &load_map, root);
3719         }
3720     }
3721
3722
3723
3724   /* The map keeps a reference on SLP nodes built, release that.  */
3725   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3726        it != bst_map->end (); ++it)
3727     if ((*it).second)
3728       vect_free_slp_tree ((*it).second);
3729   delete bst_map;
3730
3731   if (pattern_found && dump_enabled_p ())
3732     {
3733       dump_printf_loc (MSG_NOTE, vect_location,
3734                        "Pattern matched SLP tree\n");
3735       hash_set<slp_tree> visited;
3736       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3737         vect_print_slp_graph (MSG_NOTE, vect_location,
3738                               SLP_INSTANCE_TREE (instance), visited);
3739     }
3740
3741   return opt_result::success ();
3742 }
3743
3744 /* Estimates the cost of inserting layout changes into the SLP graph.
3745    It can also say that the insertion is impossible.  */
3746
3747 struct slpg_layout_cost
3748 {
3749   slpg_layout_cost () = default;
3750   slpg_layout_cost (sreal, bool);
3751   /* An impossible cost is one whose DEPTH is sreal::max ().  */
3752   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3753   bool is_possible () const { return depth != sreal::max (); }
3754
3755   bool operator== (const slpg_layout_cost &) const;
3756   bool operator!= (const slpg_layout_cost &) const;
3757
3758   bool is_better_than (const slpg_layout_cost &, bool) const;
3759
3760   void add_parallel_cost (const slpg_layout_cost &);
3761   void add_serial_cost (const slpg_layout_cost &);
3762   void split (unsigned int);
3763
3764   /* The longest sequence of layout changes needed during any traversal
3765      of the partition dag, weighted by execution frequency.
3766
3767      This is the most important metric when optimizing for speed, since
3768      it helps to ensure that we keep the number of operations on
3769      critical paths to a minimum.  */
3770   sreal depth = 0;
3771
3772   /* An estimate of the total number of operations needed.  It is weighted by
3773      execution frequency when optimizing for speed but not when optimizing for
3774      size.  In order to avoid double-counting, a node with a fanout of N will
3775      distribute 1/N of its total cost to each successor.
3776
3777      This is the most important metric when optimizing for size, since
3778      it helps to keep the total number of operations to a minimum.  */
3779   sreal total = 0;
3780 };
3781
3782 /* Construct the costs of a single node that executes with frequency
3783    WEIGHT.  DEPTH is always WEIGHT.  TOTAL is WEIGHT as well, except that
3784    if IS_FOR_SIZE every node with a positive weight counts as 1.  */
3785
3786 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3787   : depth (weight), total ((!is_for_size || !(weight > 0)) ? weight : 1)
3788 {
3789 }
3790
3791 bool
3792 slpg_layout_cost::operator== (const slpg_layout_cost &rhs) const
3793 {
3794   return !(depth != rhs.depth || total != rhs.total);  /* Both must match.  */
3795 }
3796
3797 bool
3798 slpg_layout_cost::operator!= (const slpg_layout_cost &rhs) const
3799 {
3800   return !(*this == rhs);  /* Defined as the negation of operator==.  */
3801 }
3802
3803 /* Return true if these costs are strictly better (that is, smaller) than
3804    OTHER.  The comparison is lexicographic.  If IS_FOR_SIZE, which means
3805    that we are optimizing for size rather than speed, TOTAL is compared
3806    first and DEPTH breaks ties; otherwise it is the other way around.  */
3807
3808 bool
3809 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3810                                   bool is_for_size) const
3811 {
3812   /* Compare the metric that matters most for the current goal...  */
3813   const sreal &primary = is_for_size ? total : depth;
3814   const sreal &other_primary = is_for_size ? other.total : other.depth;
3815   if (primary != other_primary)
3816     return primary < other_primary;
3817
3818   /* ...and use the remaining metric to break ties.  */
3819   const sreal &secondary = is_for_size ? depth : total;
3820   const sreal &other_secondary = is_for_size ? other.depth : other.total;
3821   return secondary < other_secondary;
3822 }
3823
3824 /* Combine with INPUT_COST, which is incurred in parallel with these costs.  */
3825
3826 void
3827 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3828 {
3829   total += input_cost.total;  /* All of the work still has to be done...  */
3830   if (depth < input_cost.depth)  /* ...but only the longer chain matters.  */
3831     depth = input_cost.depth;
3832 }
3833
3834 /* Increase the costs to account for something with cost INPUT_COST
3835    happening in series with the current costs: both DEPTH and TOTAL grow.  */
3836
3837 void
3838 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &input_cost)
3839 {
3840   depth += input_cost.depth;
3841   total += input_cost.total;
3842 }
3843
3844 /* Share TOTAL equally between TIMES successors or predecessors.  */
3845
3846 void
3847 slpg_layout_cost::split (unsigned int times)
3848 {
3849   if (times >= 2)  /* A TIMES of 0 or 1 leaves TOTAL as it is.  */
3850     total = total / times;
3851 }
3852
3853 /* Information about one node in the SLP graph, for use during
3854    vect_optimize_slp_pass.  */
3855
3856 struct slpg_vertex
3857 {
3858   slpg_vertex (slp_tree node_) : node (node_) {}
3859
3860   /* The SLP node that this vertex describes.  */
3861   slp_tree node;
3862
3863   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
3864      partitions are flexible; they can have whichever layout consumers
3865      want them to have.  */
3866   int partition = -1;
3867
3868   /* The number of nodes that directly use the result of this one
3869      (i.e. the number of nodes that count this one as a child).  */
3870   unsigned int out_degree = 0;
3871
3872   /* The execution frequency of the node.  */
3873   sreal weight = 0;
3874
3875   /* The total execution frequency of all nodes that directly use the
3876      result of this one.  */
3877   sreal out_weight = 0;
3878 };
3879
3880 /* Information about one partition of the SLP graph, for use during
3881    vect_optimize_slp_pass.  */
3882
3883 struct slpg_partition_info
3884 {
3885   /* The partition's nodes are m_partitioned_nodes[NODE_BEGIN, NODE_END);
3886      create_partitions also uses NODE_END as a counter while building.  */
3887   unsigned int node_begin = 0;
3888   unsigned int node_end = 0;
3889
3890   /* Which layout we've chosen to use for this partition, or -1 if
3891      we haven't picked one yet.  */
3892   int layout = -1;
3893
3894   /* The number of predecessors and successors in the partition dag.
3895      The predecessors always have lower partition numbers and the
3896      successors always have higher partition numbers.
3897
3898      Note that the directions of these edges are not necessarily the
3899      same as in the data flow graph.  For example, if an SCC has separate
3900      partitions for an inner loop and an outer loop, the inner loop's
3901      partition will have at least two incoming edges from the outer loop's
3902      partition: one for a live-in value and one for a live-out value.
3903      In data flow terms, one of these edges would also be from the outer loop
3904      to the inner loop, but the other would be in the opposite direction.  */
3905   unsigned int in_degree = 0;
3906   unsigned int out_degree = 0;
3907 };
3908
3909 /* Information about the costs of using a particular layout for a
3910    particular partition.  It can also say that the combination is
3911    impossible, which is recorded solely through INTERNAL_COST.  */
3912
3913 struct slpg_partition_layout_costs
3914 {
3915   bool is_possible () const { return internal_cost.is_possible (); }
3916   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3917
3918   /* The costs inherited from predecessor partitions.  */
3919   slpg_layout_cost in_cost;
3920
3921   /* The inherent cost of the layout within the node itself.  For example,
3922      this is nonzero for a load if choosing a particular layout would require
3923      the load to permute the loaded elements.  It is nonzero for a
3924      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3925      to full-vector moves.  */
3926   slpg_layout_cost internal_cost;
3927
3928   /* The costs inherited from successor partitions.  */
3929   slpg_layout_cost out_cost;
3930 };
3931
3932 /* This class tries to optimize the layout of vectors in order to avoid
3933    unnecessary shuffling.  At the moment, the set of possible layouts are
3934    restricted to bijective permutations.
3935
3936    The goal of the pass depends on whether we're optimizing for size or
3937    for speed.  When optimizing for size, the goal is to reduce the overall
3938    number of layout changes (including layout changes implied by things
3939    like load permutations).  When optimizing for speed, the goal is to
3940    reduce the maximum latency attributable to layout changes on any
3941    non-cyclical path through the data flow graph.
3942
3943    For example, when optimizing a loop nest for speed, we will prefer
3944    to make layout changes outside of a loop rather than inside of a loop,
3945    and will prefer to make layout changes in parallel rather than serially,
3946    even if that increases the overall number of layout changes.
3947
3948    The high-level procedure is:
3949
3950    (1) Build a graph in which edges go from uses (parents) to definitions
3951        (children).
3952
3953    (2) Divide the graph into a dag of strongly-connected components (SCCs).
3954
3955    (3) When optimizing for speed, partition the nodes in each SCC based
3956        on their containing cfg loop.  When optimizing for size, treat
3957        each SCC as a single partition.
3958
3959        This gives us a dag of partitions.  The goal is now to assign a
3960        layout to each partition.
3961
3962    (4) Construct a set of vector layouts that are worth considering.
3963        Record which nodes must keep their current layout.
3964
3965    (5) Perform a forward walk over the partition dag (from loads to stores)
3966        accumulating the "forward" cost of using each layout.  When visiting
3967        each partition, assign a tentative choice of layout to the partition
3968        and use that choice when calculating the cost of using a different
3969        layout in successor partitions.
3970
3971    (6) Perform a backward walk over the partition dag (from stores to loads),
3972        accumulating the "backward" cost of using each layout.  When visiting
3973        each partition, make a final choice of layout for that partition based
3974        on the accumulated forward costs (from (5)) and backward costs
3975        (from (6)).
3976
3977    (7) Apply the chosen layouts to the SLP graph.
3978
3979    For example, consider the SLP statements:
3980
3981    S1:      a_1 = load
3982        loop:
3983    S2:      a_2 = PHI<a_1, a_3>
3984    S3:      b_1 = load
3985    S4:      a_3 = a_2 + b_1
3986        exit:
3987    S5:      a_4 = PHI<a_3>
3988    S6:      store a_4
3989
3990    S2 and S4 form an SCC and are part of the same loop.  Every other
3991    statement is in a singleton SCC.  In this example there is a one-to-one
3992    mapping between SCCs and partitions and the partition dag looks like this:
3993
3994         S1     S3
3995          \     /
3996           S2+S4
3997             |
3998            S5
3999             |
4000            S6
4001
4002    S2, S3 and S4 will have a higher execution frequency than the other
4003    statements, so when optimizing for speed, the goal is to avoid any
4004    layout changes:
4005
4006    - within S3
4007    - within S2+S4
4008    - on the S3->S2+S4 edge
4009
4010    For example, if S3 was originally a reversing load, the goal of the
4011    pass is to make it an unreversed load and change the layout on the
4012    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
4013    on S1->S2+S4 and S5->S6 would also be acceptable.)
4014
4015    The difference between SCCs and partitions becomes important if we
4016    add an outer loop:
4017
4018    S1:      a_1 = ...
4019        loop1:
4020    S2:      a_2 = PHI<a_1, a_6>
4021    S3:      b_1 = load
4022    S4:      a_3 = a_2 + b_1
4023        loop2:
4024    S5:      a_4 = PHI<a_3, a_5>
4025    S6:      c_1 = load
4026    S7:      a_5 = a_4 + c_1
4027        exit2:
4028    S8:      a_6 = PHI<a_5>
4029    S9:      store a_6
4030        exit1:
4031
4032    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
4033    for speed, we usually do not want restrictions in the outer loop to "infect"
4034    the decision for the inner loop.  For example, if an outer-loop node
4035    in the SCC contains a statement with a fixed layout, that should not
4036    prevent the inner loop from using a different layout.  Conversely,
4037    the inner loop should not dictate a layout to the outer loop: if the
4038    outer loop does a lot of computation, then it may not be efficient to
4039    do all of that computation in the inner loop's preferred layout.
4040
4041    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4042    and S5+S7 (inner).  We also try to arrange partitions so that:
4043
4044    - the partition for an outer loop comes before the partition for
4045      an inner loop
4046
4047    - if a sibling loop A dominates a sibling loop B, A's partition
4048      comes before B's
4049
4050    This gives the following partition dag for the example above:
4051
4052         S1        S3
4053          \        /
4054           S2+S4+S8   S6
4055            |   \\    /
4056            |    S5+S7
4057            |
4058           S9
4059
4060    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4061    one for a reversal of the edge S7->S8.
4062
4063    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
4064    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4065    preferred layout against the cost of changing the layout on entry to the
4066    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4067
4068    Although this works well when optimizing for speed, it has the downside
4069    when optimizing for size that the choice of layout for S5+S7 is completely
4070    independent of S9, which lessens the chance of reducing the overall number
4071    of permutations.  We therefore do not partition SCCs when optimizing
4072    for size.
4073
4074    To give a concrete example of the difference between optimizing
4075    for size and speed, consider:
4076
4077    a[0] = (b[1] << c[3]) - d[1];
4078    a[1] = (b[0] << c[2]) - d[0];
4079    a[2] = (b[3] << c[1]) - d[3];
4080    a[3] = (b[2] << c[0]) - d[2];
4081
4082    There are three different layouts here: one for a, one for b and d,
4083    and one for c.  When optimizing for speed it is better to permute each
4084    of b, c and d into the order required by a, since those permutations
4085    happen in parallel.  But when optimizing for size, it is better to:
4086
4087    - permute c into the same order as b
4088    - do the arithmetic
4089    - permute the result into the order required by a
4090
4091    This gives 2 permutations rather than 3.  */
4092
4093 class vect_optimize_slp_pass
4094 {
4095 public:
4096   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4097   void run ();
4098
4099 private:
4100   /* Graph building.  */
4101   struct loop *containing_loop (slp_tree);
4102   bool is_cfg_latch_edge (graph_edge *);
4103   void build_vertices (hash_set<slp_tree> &, slp_tree);
4104   void build_vertices ();
4105   void build_graph ();
4106
4107   /* Partitioning.  */
4108   void create_partitions ();
4109   template<typename T> void for_each_partition_edge (unsigned int, T);
4110
4111   /* Layout selection.  */
4112   bool is_compatible_layout (slp_tree, unsigned int);
4113   int change_layout_cost (slp_tree, unsigned int, unsigned int);
4114   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4115                                                        unsigned int);
4116   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4117                                int, unsigned int);
4118   int internal_node_cost (slp_tree, int, unsigned int);
4119   void start_choosing_layouts ();
4120
4121   /* Cost propagation.  */
4122   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4123                                      unsigned int, unsigned int);
4124   slpg_layout_cost total_in_cost (unsigned int);
4125   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4126   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4127   void forward_pass ();
4128   void backward_pass ();
4129
4130   /* Rematerialization.  */
4131   slp_tree get_result_with_layout (slp_tree, unsigned int);
4132   void materialize ();
4133
4134   /* Clean-up.  */
4135   void remove_redundant_permutations ();
4136
4137   void dump ();
4138
4139   vec_info *m_vinfo;
4140
4141   /* True if we should optimize the graph for size, false if we should
4142      optimize it for speed.  (It wouldn't be easy to make this decision
4143      more locally.)  Not initialized here; build_graph sets it.  */
4144   bool m_optimize_size;
4145
4146   /* A graph of all SLP nodes, with edges leading from uses to definitions.
4147      In other words, a node's predecessors are its slp_tree parents and
4148      a node's successors are its slp_tree children.  */
4149   graph *m_slpg = nullptr;
4150
4151   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
4152   auto_vec<slpg_vertex> m_vertices;
4153
4154   /* The list of all leaves of M_SLPG, such as external definitions, constants,
4155      and loads.  */
4156   auto_vec<int> m_leafs;
4157
4158   /* This array has one entry for every vector layout that we're considering.
4159      Element 0 is null and indicates "no change".  Other entries describe
4160      permutations that are inherent in the current graph and that we would
4161      like to reverse if possible.
4162
4163      For example, a permutation { 1, 2, 3, 0 } means that something has
4164      effectively been permuted in that way, such as a load group
4165      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4166      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4167      in order to put things "back" in order.  */
4168   auto_vec<vec<unsigned> > m_perms;
4169
4170   /* A partitioning of the nodes for which a layout must be chosen.
4171      Each partition represents an <SCC, cfg loop> pair; that is,
4172      nodes in different SCCs belong to different partitions, and nodes
4173      within an SCC can be further partitioned according to a containing
4174      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
4175
4176      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4177        from leaves (such as loads) to roots (such as stores).
4178
4179      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
4180   auto_vec<slpg_partition_info> m_partitions;
4181
4182   /* The list of all nodes for which a layout must be chosen.  Nodes for
4183      partition P come before the nodes for partition P+1.  Nodes within a
4184      partition are in reverse postorder.  */
4185   auto_vec<unsigned int> m_partitioned_nodes;
4186
4187   /* Index P * num-layouts + L contains the cost of using layout L
4188      for partition P.  */
4189   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4190
4191   /* Index N * num-layouts + L, if nonnull, is a node that provides the
4192      original output of node N adjusted to have layout L.  */
4193   auto_vec<slp_tree> m_node_layouts;
4194 };
4195
4196 /* Create a vertex for NODE and, recursively, for every node reachable
4197    from it that VISITED does not already contain.  Record in m_leafs the
4198    vertices that have no children or that have a null child, and clear
4199    m_optimize_size if any visited node should be optimized for speed.  */
4200
4201 void
4202 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4203                                         slp_tree node)
4204 {
4205   /* Each node gets exactly one vertex, however many parents it has.  */
4206   if (visited.add (node))
4207     return;
4208
4209   /* One statement in a block that is optimized for speed is enough to
4210      make the whole pass optimize for speed.  */
4211   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4212   if (rep && optimize_bb_for_speed_p (gimple_bb (vect_orig_stmt (rep)->stmt)))
4213     m_optimize_size = false;
4214
4215   node->vertex = m_vertices.length ();
4216   m_vertices.safe_push (slpg_vertex (node));
4217
4218   /* Recurse into the real children and count the null ones.  */
4219   unsigned int num_real = 0;
4220   unsigned int num_null = 0;
4221   for (slp_tree kid : SLP_TREE_CHILDREN (node))
4222     if (!kid)
4223       num_null += 1;
4224     else
4225       {
4226         num_real += 1;
4227         build_vertices (visited, kid);
4228       }
4229
4230   /* SLP discovery follows use-def edges, so every cycle normally has an
4231      entry.  The exception is a cycle whose entry is not represented
4232      explicitly but by a null SLP child, as for some reductions and
4233      inductions.  Treat such PHIs as leaves as well, so that they stay
4234      backwards reachable.  */
4235   if (num_real == 0 || num_null != 0)
4236     m_leafs.safe_push (node->vertex);
4237 }
4238
4239 /* Populate m_vertices and m_leafs from the trees of all SLP instances.  */
4240
4241 void
4242 vect_optimize_slp_pass::build_vertices ()
4243 {
4244   /* One set for all walks, so that shared nodes are only added once.  */
4245   hash_set<slp_tree> seen;
4246
4247   for (slp_instance inst : m_vinfo->slp_instances)
4248     build_vertices (seen, SLP_INSTANCE_TREE (inst));
4249 }
4250
4251 /* Permute VEC in place according to the bijective permutation PERM.
4252    If REVERSE, the element at index I moves to index PERM[I]; otherwise
4253    index I receives the element that was at index PERM[I].  */
4254
4255 template <class T>
4256 static void
4257 vect_slp_permute (vec<unsigned> perm,
4258                   vec<T> &vec, bool reverse)
4259 {
4260   const unsigned len = vec.length ();
4261   auto_vec<T, 64> saved;
4262   saved.create (len);
4263   for (unsigned i = 0; i < len; ++i)
4264     saved.quick_push (vec[i]);
4265
4266   /* Scatter (REVERSE) or gather (!REVERSE) the saved elements.  */
4267   for (unsigned i = 0; i < len; ++i)
4268     if (reverse)
4269       vec[perm[i]] = saved[i];
4270     else
4271       vec[i] = saved[perm[i]];
4272
4273   /* Sanity-check the result against the saved copy.  */
4274   for (unsigned i = 0; i < len; ++i)
4275     gcc_assert (reverse
4276                 ? vec[perm[i]] == saved[i] : vec[i] == saved[perm[i]]);
4277 }
4278
4279 /* Find the cfg loop to which NODE belongs.  */
4280
4281 struct loop *
4282 vect_optimize_slp_pass::containing_loop (slp_tree node)
4283 {
4284   basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);  /* If no representative.  */
4285   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4286     bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4287   return bb->loop_father;
4288 }
4289
4290 /* Return true if UD (an edge from a use to a definition) is associated with
4291    a cfg loop latch edge: it feeds a loop header PHI from the PHI's own loop.  */
4292
4293 bool
4294 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4295 {
4296   slp_tree user = m_vertices[ud->src].node;
4297   slp_tree defn = m_vertices[ud->dest].node;
4298   const bool both_internal = (SLP_TREE_DEF_TYPE (user) == vect_internal_def
4299                               && SLP_TREE_DEF_TYPE (defn) == vect_internal_def);
4300   if (!both_internal)
4301     return false;
4302   gimple *phi = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (user))->stmt;
4303   if (!is_a<gphi *> (phi) || !bb_loop_header_p (gimple_bb (phi)))
4304     return false;
4305   return containing_loop (defn) == containing_loop (user);
4306 }
4307
4308 /* Build m_slpg from the SLP nodes and their (non-null) children.  Mark edges
4309    that correspond to cfg loop latch edges with a nonnull data field.  */
4310
4311 void
4312 vect_optimize_slp_pass::build_graph ()
4313 {
4314   m_optimize_size = true;  /* build_vertices clears this if appropriate.  */
4315   build_vertices ();
4316   m_slpg = new_graph (m_vertices.length ());
4317   for (slpg_vertex &use : m_vertices)
4318     for (slp_tree def : SLP_TREE_CHILDREN (use.node))
4319       {
4320         if (!def)
4321           continue;
4322         graph_edge *ud = add_edge (m_slpg, use.node->vertex, def->vertex);
4323         if (is_cfg_latch_edge (ud))
4324           ud->data = this;  /* Any nonnull pointer serves as the marker.  */
4325       }
4326 }
4327
4328 /* Edge predicate: true if build_graph marked E as a cfg loop latch edge.  */
4329
4330 static bool
4331 skip_cfg_latch_edges (graph_edge *e)
4332 {
4333   return e->data != nullptr;
4334 }
4335
4336 /* Create the partitions, filling m_partitions and m_partitioned_nodes.  */
4337
4338 void
4339 vect_optimize_slp_pass::create_partitions ()
4340 {
4341   /* Calculate a postorder of the graph from the leaves, ignoring edges
4342      that correspond to natural latch edges in the cfg.  Reading the vector
4343      from the end to the beginning gives the reverse postorder.  */
4344   auto_vec<int> initial_rpo;
4345   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4346                false, NULL, skip_cfg_latch_edges);
4347   gcc_assert (initial_rpo.length () == m_vertices.length ());
4348
4349   /* Calculate the strongly connected components of the graph.  */
4350   auto_vec<int> scc_grouping;
4351   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4352
4353   /* Create a new index order in which all nodes from the same SCC are
4354      consecutive.  Use scc_pos to record the index of the first node in
4355      each SCC.  */
4356   auto_vec<unsigned int> scc_pos (num_sccs);
4357   int last_component = -1;
4358   unsigned int node_count = 0;
4359   for (unsigned int node_i : scc_grouping)
4360     {
4361       if (last_component != m_slpg->vertices[node_i].component)
4362         {
4363           last_component = m_slpg->vertices[node_i].component;
4364           gcc_assert (last_component == int (scc_pos.length ()));
4365           scc_pos.quick_push (node_count);
4366         }
4367       node_count += 1;
4368     }
4369   gcc_assert (node_count == initial_rpo.length ()
4370               && last_component + 1 == int (num_sccs));
4371
4372   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4373      inside each SCC following the RPO we calculated above.  The fact that
4374      we ignored natural latch edges when calculating the RPO should ensure
4375      that, for natural loop nests:
4376
4377      - the first node that we encounter in a cfg loop is the loop header phi
4378      - the loop header phis are in dominance order
4379
4380      Arranging for this is an optimization (see below) rather than a
4381      correctness issue.  Unnatural loops with a tangled mess of backedges
4382      will still work correctly, but might give poorer results.
4383
4384      Also update scc_pos so that it gives 1 + the index of the last node
4385      in the SCC.  */
4386   m_partitioned_nodes.safe_grow (node_count);
4387   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4388     {
4389       unsigned int node_i = initial_rpo[old_i];
4390       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4391       m_partitioned_nodes[new_i] = node_i;
4392     }
4393
4394   /* When optimizing for speed, partition each SCC based on the containing
4395      cfg loop.  The order we constructed above should ensure that, for natural
4396      cfg loops, we'll create sub-SCC partitions for outer loops before
4397      the corresponding sub-SCC partitions for inner loops.  Similarly,
4398      when one sibling loop A dominates another sibling loop B, we should
4399      create a sub-SCC partition for A before a sub-SCC partition for B.
4400
4401      As above, nothing depends for correctness on whether this achieves
4402      a natural nesting, but we should get better results when it does.  */
4403   m_partitions.reserve (m_vertices.length ());
4404   unsigned int next_partition_i = 0;
4405   hash_map<struct loop *, int> loop_partitions;
4406   unsigned int rpo_begin = 0;
4407   unsigned int num_partitioned_nodes = 0;
4408   for (unsigned int rpo_end : scc_pos)
4409     {
4410       loop_partitions.empty ();
4411       unsigned int partition_i = next_partition_i;
4412       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4413         {
4414           /* Handle externals and constants optimistically throughout.
4415              But treat existing vectors as fixed since we do not handle
4416              permuting them.  */
4417           unsigned int node_i = m_partitioned_nodes[rpo_i];
4418           auto &vertex = m_vertices[node_i];
4419           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4420                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4421               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4422             vertex.partition = -1;
4423           else
4424             {
4425               bool existed;
4426               if (m_optimize_size)
4427                 existed = next_partition_i > partition_i;  /* One per SCC.  */
4428               else
4429                 {
4430                   struct loop *loop = containing_loop (vertex.node);
4431                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
4432                   if (!existed)
4433                     entry = next_partition_i;
4434                   partition_i = entry;
4435                 }
4436               if (!existed)
4437                 {
4438                   m_partitions.quick_push (slpg_partition_info ());
4439                   next_partition_i += 1;
4440                 }
4441               vertex.partition = partition_i;
4442               num_partitioned_nodes += 1;
4443               m_partitions[partition_i].node_end += 1;
4444             }
4445         }
4446       rpo_begin = rpo_end;
4447     }
4448
4449   /* Assign ranges of consecutive node indices to each partition,
4450      in partition order.  Start with node_end being the same as
4451      node_begin so that the next loop can use it as a counter.  */
4452   unsigned int node_begin = 0;
4453   for (auto &partition : m_partitions)
4454     {
4455       partition.node_begin = node_begin;
4456       node_begin += partition.node_end;
4457       partition.node_end = partition.node_begin;
4458     }
4459   gcc_assert (node_begin == num_partitioned_nodes);
4460
4461   /* Finally build the list of nodes in partition order.  */
4462   m_partitioned_nodes.truncate (num_partitioned_nodes);
4463   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4464     {
4465       int partition_i = m_vertices[node_i].partition;
4466       if (partition_i >= 0)
4467         {
4468           unsigned int order_i = m_partitions[partition_i].node_end++;
4469           m_partitioned_nodes[order_i] = node_i;
4470         }
4471     }
4472 }
4473
4474 /* Call FN (UD, OTHER_NODE_I) for every use-to-def edge UD that links node
4475    NODE_I to a node OTHER_NODE_I in a different partition.  Edges to nodes
4476    that have no partition are skipped.  Edges that use NODE_I are reported
4477    before edges that NODE_I uses.  */
4478
4479 template<typename T>
4480 void
4481 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4482 {
4483   const int own_partition = m_vertices[node_i].partition;
4484
4485   /* True if node OTHER_I lives in a partition other than NODE_I's.  */
4486   auto crosses_partitions = [&] (int other_i)
4487     {
4488       int other_partition = m_vertices[other_i].partition;
4489       return other_partition >= 0 && other_partition != own_partition;
4490     };
4491
4492   /* Edges on which NODE_I is the definition...  */
4493   for (graph_edge *ud = m_slpg->vertices[node_i].pred; ud; ud = ud->pred_next)
4494     if (crosses_partitions (ud->src))
4495       fn (ud, ud->src);
4496
4497   /* ...and edges on which NODE_I is the use.  */
4498   for (graph_edge *ud = m_slpg->vertices[node_i].succ; ud; ud = ud->succ_next)
4499     if (crosses_partitions (ud->dest))
4500       fn (ud, ud->dest);
4501 }
4502
4503 /* Return true if NODE could use layout LAYOUT_I as far as its number of
4504    SLP lanes is concerned.  NODE's actual operation plays no part in the
4505    test.  */
4506
4507 bool
4508 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4509                                               unsigned int layout_i)
4510 {
4511   /* Layout 0 means "no change" and so suits every node.  */
4512   const bool keeps_layout = layout_i == 0;
4513
4514   /* Any other layout is a permutation of a specific number of lanes,
4515      which must be the number that NODE has.  */
4516   return (keeps_layout
4517           || SLP_TREE_LANES (node) == m_perms[layout_i].length ());
4518 }
4519
4520 /* Return the cost (in arbitrary units) of changing the layout of a node
4521    like NODE from FROM_LAYOUT_I to TO_LAYOUT_I.  The result is 0 if the
4522    layouts are the same, at least 1 for a real change, and -1 if either
4523    layout is incompatible with NODE or the change is otherwise impossible.
4524
4525    Only properties such as NODE's number of lanes and vector type are
4526    relevant; its actual operation doesn't matter.  */
4527
4528 int
4529 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4530                                             unsigned int from_layout_i,
4531                                             unsigned int to_layout_i)
4532 {
4533   const bool compatible = (is_compatible_layout (node, from_layout_i)
4534                            && is_compatible_layout (node, to_layout_i));
4535   if (!compatible)
4536     return -1;
4537   if (from_layout_i == to_layout_i)
4538     return 0;
4539
4540   /* Express the change as a permutation of NODE's own lanes: start with
4541      the lane order of FROM_LAYOUT_I...  */
4542   auto_vec<slp_tree, 1> children (1);
4543   children.quick_push (node);
4544   const unsigned int nlanes = SLP_TREE_LANES (node);
4545   auto_lane_permutation_t perm (nlanes);
4546   for (unsigned int lane = 0; lane < nlanes; ++lane)
4547     perm.quick_push ({ 0, from_layout_i > 0
4548                           ? m_perms[from_layout_i][lane] : lane });
4549   /* ...and then apply TO_LAYOUT_I in reverse.  */
4550   if (to_layout_i > 0)
4551     vect_slp_permute (m_perms[to_layout_i], perm, true);
4552
4553   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4554                                                children, false);
4555   /* ??? A failed change could in principle be retried via layout 0 (two
4556      changes instead of one), given support in get_result_with_layout.  */
4557   if (count < 0)
4558     return -1;
4559   return MAX (count, 1);
4560 }
4561
4562 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
4563
4564 inline slpg_partition_layout_costs &
4565 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4566                                                 unsigned int layout_i)
4567 {
4568   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4569 }
4570
4571 /* Change PERM in one of two ways:
4572
4573    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4574      chosen for child I of NODE.
4575
4576    - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
4577
4578    In both cases, arrange for the output to have layout OUT_LAYOUT_I  */
4579
4580 void
4581 vect_optimize_slp_pass::
4582 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4583                         int in_layout_i, unsigned int out_layout_i)
4584 {
4585   for (auto &entry : perm)
4586     {
4587       int this_in_layout_i = in_layout_i;
4588       if (this_in_layout_i < 0)
4589         {
4590           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4591           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4592           this_in_layout_i = m_partitions[in_partition_i].layout;
4593         }
4594       if (this_in_layout_i > 0)
4595         entry.second = m_perms[this_in_layout_i][entry.second];
4596     }
4597   if (out_layout_i > 0)
4598     vect_slp_permute (m_perms[out_layout_i], perm, true);
4599 }
4600
4601 /* Check whether the target allows NODE to be rearranged so that the node's
4602    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
4603    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
4604
4605    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4606    NODE can adapt to the layout changes that have (perhaps provisionally)
4607    been chosen for NODE's children, so that no extra permutations are
4608    needed on either the input or the output of NODE.
4609
4610    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4611    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4612
4613    IN_LAYOUT_I has no meaning for other types of node.
4614
4615    Keeping the node as-is is always valid.  If the target doesn't appear
4616    to support the node as-is, but might realistically support other layouts,
4617    then layout 0 instead has the cost of a worst-case permutation.  On the
4618    one hand, this ensures that every node has at least one valid layout,
4619    avoiding what would otherwise be an awkward special case.  On the other,
4620    it still encourages the pass to change an invalid pre-existing layout
4621    choice into a valid one.  */
4622
4623 int
4624 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4625                                             unsigned int out_layout_i)
4626 {
4627   const int fallback_cost = 1;
4628
4629   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4630     {
4631       auto_lane_permutation_t tmp_perm;
4632       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4633
4634       /* Check that the child nodes support the chosen layout.  Checking
4635          the first child is enough, since any second child would have the
4636          same shape.  */
4637       auto first_child = SLP_TREE_CHILDREN (node)[0];
4638       if (in_layout_i > 0
4639           && !is_compatible_layout (first_child, in_layout_i))
4640         return -1;
4641
4642       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4643       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4644                                                   node, tmp_perm,
4645                                                   SLP_TREE_CHILDREN (node),
4646                                                   false);
4647       if (count < 0)
4648         {
4649           if (in_layout_i == 0 && out_layout_i == 0)
4650             {
4651               /* Use the fallback cost if the node could in principle support
4652                  some nonzero layout for both the inputs and the outputs.
4653                  Otherwise assume that the node will be rejected later
4654                  and rebuilt from scalars.  */
4655               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4656                 return fallback_cost;
4657               return 0;
4658             }
4659           return -1;
4660         }
4661
4662       /* We currently have no way of telling whether the new layout is cheaper
4663          or more expensive than the old one.  But at least in principle,
4664          it should be worth making zero permutations (whole-vector shuffles)
4665          cheaper than real permutations, in case the pass is able to remove
4666          the latter.  */
4667       return count == 0 ? 0 : 1;
4668     }
4669
4670   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4671   if (rep
4672       && STMT_VINFO_DATA_REF (rep)
4673       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4674       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4675     {
4676       auto_load_permutation_t tmp_perm;
4677       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4678       if (out_layout_i > 0)
4679         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4680
4681       poly_uint64 vf = 1;
4682       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4683         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4684       unsigned int n_perms;
4685       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4686                                            nullptr, vf, true, false, &n_perms))
4687         {
4688           auto rep = SLP_TREE_REPRESENTATIVE (node);
4689           if (out_layout_i == 0)
4690             {
4691               /* Use the fallback cost if the load is an N-to-N permutation.
4692                  Otherwise assume that the node will be rejected later
4693                  and rebuilt from scalars.  */
4694               if (STMT_VINFO_GROUPED_ACCESS (rep)
4695                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4696                       == SLP_TREE_LANES (node)))
4697                 return fallback_cost;
4698               return 0;
4699             }
4700           return -1;
4701         }
4702
4703       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
4704       return n_perms == 0 ? 0 : 1;
4705     }
4706
4707   return 0;
4708 }
4709
4710 /* Decide which element layouts we should consider using.  Calculate the
4711    weights associated with inserting layout changes on partition edges.
4712    Also mark partitions that cannot change layout, by setting their
4713    layout to zero.  */
4714
4715 void
4716 vect_optimize_slp_pass::start_choosing_layouts ()
4717 {
4718   /* Used to assign unique permutation indices.  */
4719   using perm_hash = unbounded_hashmap_traits<
4720     vec_free_hash_base<int_hash_base<unsigned>>,
4721     int_hash<int, -1, -2>
4722   >;
4723   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4724
4725   /* Layout 0 is "no change".  */
4726   m_perms.safe_push (vNULL);
4727
4728   /* Create layouts from existing permutations.  */
4729   auto_load_permutation_t tmp_perm;
4730   for (unsigned int node_i : m_partitioned_nodes)
4731     {
4732       /* Leafs also double as entries to the reverse graph.  Allow the
4733          layout of those to be changed.  */
4734       auto &vertex = m_vertices[node_i];
4735       auto &partition = m_partitions[vertex.partition];
4736       if (!m_slpg->vertices[node_i].succ)
4737         partition.layout = 0;
4738
4739       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
4740       slp_tree node = vertex.node;
4741       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4742       slp_tree child;
4743       unsigned HOST_WIDE_INT imin, imax = 0;
4744       bool any_permute = false;
4745       tmp_perm.truncate (0);
4746       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4747         {
4748           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4749              unpermuted, record a layout that reverses this permutation.
4750
4751              We would need more work to cope with loads that are internally
4752              permuted and also have inputs (such as masks for
4753              IFN_MASK_LOADs).  */
4754           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4755           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4756             {
4757               partition.layout = -1;
4758               continue;
4759             }
4760           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4761           imin = DR_GROUP_SIZE (dr_stmt) + 1;
4762           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4763         }
4764       else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4765                && SLP_TREE_CHILDREN (node).length () == 1
4766                && (child = SLP_TREE_CHILDREN (node)[0])
4767                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4768                    .is_constant (&imin)))
4769         {
4770           /* If the child has the same vector size as this node,
4771              reversing the permutation can make the permutation a no-op.
4772              In other cases it can change a true permutation into a
4773              full-vector extract.  */
4774           tmp_perm.reserve (SLP_TREE_LANES (node));
4775           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4776             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4777         }
4778       else
4779         continue;
4780
4781       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4782         {
4783           unsigned idx = tmp_perm[j];
4784           imin = MIN (imin, idx);
4785           imax = MAX (imax, idx);
4786           if (idx - tmp_perm[0] != j)
4787             any_permute = true;
4788         }
4789       /* If the span doesn't match we'd disrupt VF computation, avoid
4790          that for now.  */
4791       if (imax - imin + 1 != SLP_TREE_LANES (node))
4792         continue;
4793       /* If there's no permute no need to split one out.  In this case
4794          we can consider turning a load into a permuted load, if that
4795          turns out to be cheaper than alternatives.  */
4796       if (!any_permute)
4797         {
4798           partition.layout = -1;
4799           continue;
4800         }
4801
4802       /* For now only handle true permutes, like
4803          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
4804          when permuting constants and invariants keeping the permute
4805          bijective.  */
4806       auto_sbitmap load_index (SLP_TREE_LANES (node));
4807       bitmap_clear (load_index);
4808       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4809         bitmap_set_bit (load_index, tmp_perm[j] - imin);
4810       unsigned j;
4811       for (j = 0; j < SLP_TREE_LANES (node); ++j)
4812         if (!bitmap_bit_p (load_index, j))
4813           break;
4814       if (j != SLP_TREE_LANES (node))
4815         continue;
4816
4817       vec<unsigned> perm = vNULL;
4818       perm.safe_grow (SLP_TREE_LANES (node), true);
4819       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4820         perm[j] = tmp_perm[j] - imin;
4821
4822       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4823         {
4824           /* Continue to use existing layouts, but don't add any more.  */
4825           int *entry = layout_ids.get (perm);
4826           partition.layout = entry ? *entry : 0;
4827           perm.release ();
4828         }
4829       else
4830         {
4831           bool existed;
4832           int &layout_i = layout_ids.get_or_insert (perm, &existed);
4833           if (existed)
4834             perm.release ();
4835           else
4836             {
4837               layout_i = m_perms.length ();
4838               m_perms.safe_push (perm);
4839             }
4840           partition.layout = layout_i;
4841         }
4842     }
4843
4844   /* Initially assume that every layout is possible and has zero cost
4845      in every partition.  */
4846   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4847                                               * m_perms.length ());
4848
4849   /* We have to mark outgoing permutations facing non-associating-reduction
4850      graph entries that are not represented as to be materialized.
4851      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
4852   for (slp_instance instance : m_vinfo->slp_instances)
4853     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4854       {
4855         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4856         m_partitions[m_vertices[node_i].partition].layout = 0;
4857       }
4858     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4859       {
4860         stmt_vec_info stmt_info
4861           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4862         stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4863         if (needs_fold_left_reduction_p (TREE_TYPE
4864                                            (gimple_get_lhs (stmt_info->stmt)),
4865                                          STMT_VINFO_REDUC_CODE (reduc_info)))
4866           {
4867             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4868             m_partitions[m_vertices[node_i].partition].layout = 0;
4869           }
4870       }
4871
4872   /* Check which layouts each node and partition can handle.  Calculate the
4873      weights associated with inserting layout changes on edges.  */
4874   for (unsigned int node_i : m_partitioned_nodes)
4875     {
4876       auto &vertex = m_vertices[node_i];
4877       auto &partition = m_partitions[vertex.partition];
4878       slp_tree node = vertex.node;
4879
4880       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4881         {
4882           vertex.weight = vect_slp_node_weight (node);
4883
4884           /* We do not handle stores with a permutation, so all
4885              incoming permutations must have been materialized.
4886
4887              We also don't handle masked grouped loads, which lack a
4888              permutation vector.  In this case the memory locations
4889              form an implicit second input to the loads, on top of the
4890              explicit mask input, and the memory input's layout cannot
4891              be changed.
4892
4893              On the other hand, we do support permuting gather loads and
4894              masked gather loads, where each scalar load is independent
4895              of the others.  This can be useful if the address/index input
4896              benefits from permutation.  */
4897           if (STMT_VINFO_DATA_REF (rep)
4898               && STMT_VINFO_GROUPED_ACCESS (rep)
4899               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4900             partition.layout = 0;
4901
4902           /* We cannot change the layout of an operation that is
4903              not independent on lanes.  Note this is an explicit
4904              negative list since that's much shorter than the respective
4905              positive one but it's critical to keep maintaining it.  */
4906           if (is_gimple_call (STMT_VINFO_STMT (rep)))
4907             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4908               {
4909               case CFN_COMPLEX_ADD_ROT90:
4910               case CFN_COMPLEX_ADD_ROT270:
4911               case CFN_COMPLEX_MUL:
4912               case CFN_COMPLEX_MUL_CONJ:
4913               case CFN_VEC_ADDSUB:
4914               case CFN_VEC_FMADDSUB:
4915               case CFN_VEC_FMSUBADD:
4916                 partition.layout = 0;
4917               default:;
4918               }
4919         }
4920
4921       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4922         {
4923           auto &other_vertex = m_vertices[other_node_i];
4924
4925           /* Count the number of edges from earlier partitions and the number
4926              of edges to later partitions.  */
4927           if (other_vertex.partition < vertex.partition)
4928             partition.in_degree += 1;
4929           else
4930             partition.out_degree += 1;
4931
4932           /* If the current node uses the result of OTHER_NODE_I, accumulate
4933              the effects of that.  */
4934           if (ud->src == int (node_i))
4935             {
4936               other_vertex.out_weight += vertex.weight;
4937               other_vertex.out_degree += 1;
4938             }
4939         };
4940       for_each_partition_edge (node_i, process_edge);
4941     }
4942 }
4943
4944 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4945    its current (provisional) choice of layout.  The inputs do not necessarily
4946    have the same layout as each other.  */
4947
4948 slpg_layout_cost
4949 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4950 {
4951   auto &vertex = m_vertices[node_i];
4952   slpg_layout_cost cost;
4953   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4954     {
4955       auto &other_vertex = m_vertices[other_node_i];
4956       if (other_vertex.partition < vertex.partition)
4957         {
4958           auto &other_partition = m_partitions[other_vertex.partition];
4959           auto &other_costs = partition_layout_costs (other_vertex.partition,
4960                                                       other_partition.layout);
4961           slpg_layout_cost this_cost = other_costs.in_cost;
4962           this_cost.add_serial_cost (other_costs.internal_cost);
4963           this_cost.split (other_partition.out_degree);
4964           cost.add_parallel_cost (this_cost);
4965         }
4966     };
4967   for_each_partition_edge (node_i, add_cost);
4968   return cost;
4969 }
4970
4971 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4972    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
4973    slpg_layout_cost::impossible () if the change isn't possible.  */
4974
4975 slpg_layout_cost
4976 vect_optimize_slp_pass::
4977 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4978                   unsigned int layout2_i)
4979 {
4980   auto &def_vertex = m_vertices[ud->dest];
4981   auto &use_vertex = m_vertices[ud->src];
4982   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4983   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4984   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4985                                     use_layout_i);
4986   if (factor < 0)
4987     return slpg_layout_cost::impossible ();
4988
4989   /* We have a choice of putting the layout change at the site of the
4990      definition or at the site of the use.  Prefer the former when
4991      optimizing for size or when the execution frequency of the
4992      definition is no greater than the combined execution frequencies of
4993      the uses.  When putting the layout change at the site of the definition,
4994      divvy up the cost among all consumers.  */
4995   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4996     {
4997       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4998       cost.split (def_vertex.out_degree);
4999       return cost;
5000     }
5001   return { use_vertex.weight * factor, m_optimize_size };
5002 }
5003
5004 /* UD represents a use-def link between FROM_NODE_I and a node in a later
5005    partition; FROM_NODE_I could be the definition node or the use node.
5006    The node at the other end of the link wants to use layout TO_LAYOUT_I.
5007    Return the cost of any necessary fix-ups on edge UD, or return
5008    slpg_layout_cost::impossible () if the change isn't possible.
5009
5010    At this point, FROM_NODE_I's partition has chosen the cheapest
5011    layout based on the information available so far, but this choice
5012    is only provisional.  */
5013
5014 slpg_layout_cost
5015 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5016                                       unsigned int to_layout_i)
5017 {
5018   auto &from_vertex = m_vertices[from_node_i];
5019   unsigned int from_partition_i = from_vertex.partition;
5020   slpg_partition_info &from_partition = m_partitions[from_partition_i];
5021   gcc_assert (from_partition.layout >= 0);
5022
5023   /* First calculate the cost on the assumption that FROM_PARTITION sticks
5024      with its current layout preference.  */
5025   slpg_layout_cost cost = slpg_layout_cost::impossible ();
5026   auto edge_cost = edge_layout_cost (ud, from_node_i,
5027                                      from_partition.layout, to_layout_i);
5028   if (edge_cost.is_possible ())
5029     {
5030       auto &from_costs = partition_layout_costs (from_partition_i,
5031                                                  from_partition.layout);
5032       cost = from_costs.in_cost;
5033       cost.add_serial_cost (from_costs.internal_cost);
5034       cost.split (from_partition.out_degree);
5035       cost.add_serial_cost (edge_cost);
5036     }
5037
5038   /* Take the minimum of that cost and the cost that applies if
5039      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
5040   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5041                                                       to_layout_i);
5042   if (direct_layout_costs.is_possible ())
5043     {
5044       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5045       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5046       direct_cost.split (from_partition.out_degree);
5047       if (!cost.is_possible ()
5048           || direct_cost.is_better_than (cost, m_optimize_size))
5049         cost = direct_cost;
5050     }
5051
5052   return cost;
5053 }
5054
5055 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5056    partition; TO_NODE_I could be the definition node or the use node.
5057    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5058    return the cost of any necessary fix-ups on edge UD, or
5059    slpg_layout_cost::impossible () if the choice cannot be made.
5060
5061    At this point, TO_NODE_I's partition has a fixed choice of layout.  */
5062
5063 slpg_layout_cost
5064 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5065                                        unsigned int from_layout_i)
5066 {
5067   auto &to_vertex = m_vertices[to_node_i];
5068   unsigned int to_partition_i = to_vertex.partition;
5069   slpg_partition_info &to_partition = m_partitions[to_partition_i];
5070   gcc_assert (to_partition.layout >= 0);
5071
5072   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5073      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
5074      any other inputs keep their current choice of layout.  */
5075   auto &to_costs = partition_layout_costs (to_partition_i,
5076                                            to_partition.layout);
5077   if (ud->src == int (to_node_i)
5078       && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5079     {
5080       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5081       auto old_layout = from_partition.layout;
5082       from_partition.layout = from_layout_i;
5083       int factor = internal_node_cost (to_vertex.node, -1,
5084                                        to_partition.layout);
5085       from_partition.layout = old_layout;
5086       if (factor >= 0)
5087         {
5088           slpg_layout_cost cost = to_costs.out_cost;
5089           cost.add_serial_cost ({ to_vertex.weight * factor,
5090                                   m_optimize_size });
5091           cost.split (to_partition.in_degree);
5092           return cost;
5093         }
5094     }
5095
5096   /* Compute the cost if we insert any necessary layout change on edge UD.  */
5097   auto edge_cost = edge_layout_cost (ud, to_node_i,
5098                                      to_partition.layout, from_layout_i);
5099   if (edge_cost.is_possible ())
5100     {
5101       slpg_layout_cost cost = to_costs.out_cost;
5102       cost.add_serial_cost (to_costs.internal_cost);
5103       cost.split (to_partition.in_degree);
5104       cost.add_serial_cost (edge_cost);
5105       return cost;
5106     }
5107
5108   return slpg_layout_cost::impossible ();
5109 }
5110
5111 /* Make a forward pass through the partitions, accumulating input costs.
5112    Make a tentative (provisional) choice of layout for each partition,
5113    ensuring that this choice still allows later partitions to keep
5114    their original layout.  */
5115
5116 void
5117 vect_optimize_slp_pass::forward_pass ()
5118 {
5119   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5120        ++partition_i)
5121     {
5122       auto &partition = m_partitions[partition_i];
5123
5124       /* If the partition consists of a single VEC_PERM_EXPR, precompute
5125          the incoming cost that would apply if every predecessor partition
5126          keeps its current layout.  This is used within the loop below.  */
5127       slpg_layout_cost in_cost;
5128       slp_tree single_node = nullptr;
5129       if (partition.node_end == partition.node_begin + 1)
5130         {
5131           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5132           single_node = m_vertices[node_i].node;
5133           if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5134             in_cost = total_in_cost (node_i);
5135         }
5136
5137       /* Go through the possible layouts.  Decide which ones are valid
5138          for this partition and record which of the valid layouts has
5139          the lowest cost.  */
5140       unsigned int min_layout_i = 0;
5141       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5142       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5143         {
5144           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5145           if (!layout_costs.is_possible ())
5146             continue;
5147
5148           /* If the recorded layout is already 0 then the layout cannot
5149              change.  */
5150           if (partition.layout == 0 && layout_i != 0)
5151             {
5152               layout_costs.mark_impossible ();
5153               continue;
5154             }
5155
5156           bool is_possible = true;
5157           for (unsigned int order_i = partition.node_begin;
5158                order_i < partition.node_end; ++order_i)
5159             {
5160               unsigned int node_i = m_partitioned_nodes[order_i];
5161               auto &vertex = m_vertices[node_i];
5162
5163               /* Reject the layout if it is individually incompatible
5164                  with any node in the partition.  */
5165               if (!is_compatible_layout (vertex.node, layout_i))
5166                 {
5167                   is_possible = false;
5168                   break;
5169                 }
5170
5171               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5172                 {
5173                   auto &other_vertex = m_vertices[other_node_i];
5174                   if (other_vertex.partition < vertex.partition)
5175                     {
5176                       /* Accumulate the incoming costs from earlier
5177                          partitions, plus the cost of any layout changes
5178                          on UD itself.  */
5179                       auto cost = forward_cost (ud, other_node_i, layout_i);
5180                       if (!cost.is_possible ())
5181                         is_possible = false;
5182                       else
5183                         layout_costs.in_cost.add_parallel_cost (cost);
5184                     }
5185                   else
5186                     /* Reject the layout if it would make layout 0 impossible
5187                        for later partitions.  This amounts to testing that the
5188                        target supports reversing the layout change on edges
5189                        to later partitions.
5190
5191                        In principle, it might be possible to push a layout
5192                        change all the way down a graph, so that it never
5193                        needs to be reversed and so that the target doesn't
5194                        need to support the reverse operation.  But it would
5195                        be awkward to bail out if we hit a partition that
5196                        does not support the new layout, especially since
5197                        we are not dealing with a lattice.  */
5198                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
5199                                                      layout_i).is_possible ();
5200                 };
5201               for_each_partition_edge (node_i, add_cost);
5202
5203               /* Accumulate the cost of using LAYOUT_I within NODE,
5204                  both for the inputs and the outputs.  */
5205               int factor = internal_node_cost (vertex.node, layout_i,
5206                                                layout_i);
5207               if (factor < 0)
5208                 {
5209                   is_possible = false;
5210                   break;
5211                 }
5212               else if (factor)
5213                 layout_costs.internal_cost.add_serial_cost
5214                   ({ vertex.weight * factor, m_optimize_size });
5215             }
5216           if (!is_possible)
5217             {
5218               layout_costs.mark_impossible ();
5219               continue;
5220             }
5221
5222           /* Combine the incoming and partition-internal costs.  */
5223           slpg_layout_cost combined_cost = layout_costs.in_cost;
5224           combined_cost.add_serial_cost (layout_costs.internal_cost);
5225
5226           /* If this partition consists of a single VEC_PERM_EXPR, see
5227              if the VEC_PERM_EXPR can be changed to support output layout
5228              LAYOUT_I while keeping all the provisional choices of input
5229              layout.  */
5230           if (single_node
5231               && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5232             {
5233               int factor = internal_node_cost (single_node, -1, layout_i);
5234               if (factor >= 0)
5235                 {
5236                   auto weight = m_vertices[single_node->vertex].weight;
5237                   slpg_layout_cost internal_cost
5238                     = { weight * factor, m_optimize_size };
5239
5240                   slpg_layout_cost alt_cost = in_cost;
5241                   alt_cost.add_serial_cost (internal_cost);
5242                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5243                     {
5244                       combined_cost = alt_cost;
5245                       layout_costs.in_cost = in_cost;
5246                       layout_costs.internal_cost = internal_cost;
5247                     }
5248                 }
5249             }
5250
5251           /* Record the layout with the lowest cost.  Prefer layout 0 in
5252              the event of a tie between it and another layout.  */
5253           if (!min_layout_cost.is_possible ()
5254               || combined_cost.is_better_than (min_layout_cost,
5255                                                m_optimize_size))
5256             {
5257               min_layout_i = layout_i;
5258               min_layout_cost = combined_cost;
5259             }
5260         }
5261
5262       /* This loop's handling of earlier partitions should ensure that
5263          choosing the original layout for the current partition is no
5264          less valid than it was in the original graph, even with the
5265          provisional layout choices for those earlier partitions.  */
5266       gcc_assert (min_layout_cost.is_possible ());
5267       partition.layout = min_layout_i;
5268     }
5269 }
5270
5271 /* Make a backward pass through the partitions, accumulating output costs.
5272    Make a final choice of layout for each partition.

        Partitions are visited from the last one to the first.  For every
        layout that is still marked possible, the costs returned by
        backward_cost for edges to higher-numbered partitions are added in
        parallel to the layout's OUT_COST.  Every other edge only has to be
        compatible with the layout currently recorded for the partition at
        its other end; if it is not, the layout is marked impossible.  The
        layout with the best in + internal + out cost is then stored in
        PARTITION.layout.  */
5273
5274 void
5275 vect_optimize_slp_pass::backward_pass ()
5276 {
       /* The index counts down, so the last partition is handled first.  */
5277   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5278     {
5279       auto &partition = m_partitions[partition_i];
5280
           /* Best candidate seen so far; starts out as "no candidate".  */
5281       unsigned int min_layout_i = 0;
5282       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5283       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5284         {
5285           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
               /* Skip layouts that have already been ruled out.  */
5286           if (!layout_costs.is_possible ())
5287             continue;
5288
5289           /* Accumulate the costs from successor partitions.  */
5290           bool is_possible = true;
5291           for (unsigned int order_i = partition.node_begin;
5292                order_i < partition.node_end; ++order_i)
5293             {
5294               unsigned int node_i = m_partitioned_nodes[order_i];
5295               auto &vertex = m_vertices[node_i];
                   /* Callback for for_each_partition_edge: UD is the edge
                      and OTHER_NODE_I the node at its other end.  */
5296               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5297                 {
5298                   auto &other_vertex = m_vertices[other_node_i];
5299                   auto &other_partition = m_partitions[other_vertex.partition];
5300                   if (other_vertex.partition > vertex.partition)
5301                     {
5302                       /* Accumulate the incoming costs from later
5303                          partitions, plus the cost of any layout changes
5304                          on UD itself.  */
5305                       auto cost = backward_cost (ud, other_node_i, layout_i);
5306                       if (!cost.is_possible ())
5307                         is_possible = false;
5308                       else
5309                         layout_costs.out_cost.add_parallel_cost (cost);
5310                     }
5311                   else
5312                     /* Make sure that earlier partitions can (if necessary
5313                        or beneficial) keep the layout that they chose in
5314                        the forward pass.  This ensures that there is at
5315                        least one valid choice of layout.  */
5316                     is_possible &= edge_layout_cost (ud, other_node_i,
5317                                                      other_partition.layout,
5318                                                      layout_i).is_possible ();
5319                 };
5320               for_each_partition_edge (node_i, add_cost);
5321             }
               /* One incompatible edge is enough to reject LAYOUT_I.  */
5322           if (!is_possible)
5323             {
5324               layout_costs.mark_impossible ();
5325               continue;
5326             }
5327
5328           /* Locally combine the costs from the forward and backward passes.
5329              (This combined cost is not passed on, since that would lead
5330              to double counting.)  */
5331           slpg_layout_cost combined_cost = layout_costs.in_cost;
5332           combined_cost.add_serial_cost (layout_costs.internal_cost);
5333           combined_cost.add_serial_cost (layout_costs.out_cost);
5334
5335           /* Record the layout with the lowest cost.  Prefer layout 0 in
5336              the event of a tie between it and another layout.  */
5337           if (!min_layout_cost.is_possible ()
5338               || combined_cost.is_better_than (min_layout_cost,
5339                                                m_optimize_size))
5340             {
5341               min_layout_i = layout_i;
5342               min_layout_cost = combined_cost;
5343             }
5344         }
5345
           /* At least one layout must have survived for this partition.  */
5346       gcc_assert (min_layout_cost.is_possible ());
5347       partition.layout = min_layout_i;
5348     }
5349 }
5350
5351 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5352    NODE already has the layout that was selected for its partition.

        Results are cached in m_node_layouts, indexed by
        NODE->vertex * m_perms.length () + TO_LAYOUT_I, and a cached entry is
        returned directly.  Constant nodes, and external nodes without vector
        defs, are handled by permuting a copy of their scalar operands.  For
        any other node a new VEC_PERM_EXPR node is created, unless the
        partition's layout already equals TO_LAYOUT_I, in which case NODE
        itself is returned (without being entered in the cache).  A newly
        created VEC_PERM_EXPR node has its vertex set to -1 and takes a
        reference on each of its children.  */
5353
5354 slp_tree
5355 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5356                                                 unsigned int to_layout_i)
5357 {
       /* Look for a previously built result first.  */
5358   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5359   slp_tree result = m_node_layouts[result_i];
5360   if (result)
5361     return result;
5362
5363   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5364       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5365           /* We can't permute vector defs in place.  */
5366           && SLP_TREE_VEC_DEFS (node).is_empty ()))
5367     {
5368       /* If the vector is uniform or unchanged, there's nothing to do.  */
5369       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5370         result = node;
5371       else
5372         {
               /* NOTE(review): SCALAR_OPS is permuted after it has been passed
                  to vect_create_new_slp_node; presumably the new node shares
                  its storage, so the permutation is visible in RESULT --
                  confirm against the vec<> copy semantics.  */
5373           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5374           result = vect_create_new_slp_node (scalar_ops);
5375           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5376         }
5377     }
5378   else
5379     {
5380       unsigned int partition_i = m_vertices[node->vertex].partition;
5381       unsigned int from_layout_i = m_partitions[partition_i].layout;
           /* NODE already has the requested layout.  This early return does
              not record anything in m_node_layouts.  */
5382       if (from_layout_i == to_layout_i)
5383         return node;
5384
5385       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5386          permutation instead of a serial one.  Leave the new permutation
5387          in TMP_PERM on success.  */
5388       auto_lane_permutation_t tmp_perm;
5389       unsigned int num_inputs = 1;
5390       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5391         {
5392           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5393           if (from_layout_i != 0)
5394             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5395           if (to_layout_i != 0)
5396             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5397           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5398                                               tmp_perm,
5399                                               SLP_TREE_CHILDREN (node),
5400                                               false) >= 0)
5401             num_inputs = SLP_TREE_CHILDREN (node).length ();
5402           else
                 /* An empty TMP_PERM selects the serial fallback below.  */
5403             tmp_perm.truncate (0);
5404         }
5405
5406       if (dump_enabled_p ())
5407         {
5408           if (tmp_perm.length () > 0)
5409             dump_printf_loc (MSG_NOTE, vect_location,
5410                              "duplicating permutation node %p with"
5411                              " layout %d\n",
5412                              (void *) node, to_layout_i);
5413           else
5414             dump_printf_loc (MSG_NOTE, vect_location,
5415                              "inserting permutation node in place of %p\n",
5416                              (void *) node);
5417         }
5418
           /* Build the new VEC_PERM_EXPR node, copying the representative,
              lane count and vector type from NODE.  */
5419       unsigned int num_lanes = SLP_TREE_LANES (node);
5420       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5421       if (SLP_TREE_SCALAR_STMTS (node).length ())
5422         {
               /* Give RESULT the scalar stmts of NODE, reordered from
                  FROM_LAYOUT_I to TO_LAYOUT_I.  */
5423           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5424           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5425           if (from_layout_i != 0)
5426             vect_slp_permute (m_perms[from_layout_i], stmts, false);
5427           if (to_layout_i != 0)
5428             vect_slp_permute (m_perms[to_layout_i], stmts, true);
5429         }
5430       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5431       SLP_TREE_LANES (result) = num_lanes;
5432       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5433       result->vertex = -1;
5434
5435       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5436       if (tmp_perm.length ())
5437         {
               /* Parallel form: reuse the combined permutation and take over
                  the children of NODE.  */
5438           lane_perm.safe_splice (tmp_perm);
5439           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5440         }
5441       else
5442         {
               /* Serial form: start from the identity selection of input 0,
                  apply the layout change to it and use NODE as the only
                  child.  */
5443           lane_perm.create (num_lanes);
5444           for (unsigned j = 0; j < num_lanes; ++j)
5445             lane_perm.quick_push ({ 0, j });
5446           if (from_layout_i != 0)
5447             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5448           if (to_layout_i != 0)
5449             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5450           SLP_TREE_CHILDREN (result).safe_push (node);
5451         }
           /* RESULT now references each of its children.  */
5452       for (slp_tree child : SLP_TREE_CHILDREN (result))
5453         child->refcnt++;
5454     }
       /* Remember the result for later requests.  */
5455   m_node_layouts[result_i] = result;
5456   return result;
5457 }
5458
5459 /* Apply the chosen vector layouts to the SLP graph.

        This works in two steps with a call to remove_redundant_permutations
        in between.  The first step walks m_partitioned_nodes and rewrites
        each node in place: scalar stmts are permuted to the partition's
        layout, and lane or load permutations are updated.  VEC_PERM_EXPR
        nodes that could absorb the layouts of their inputs are recorded in
        FULLY_FOLDED.  The second step visits every node not in FULLY_FOLDED
        and replaces each child with the node returned by
        get_result_with_layout for the parent's layout.  */
5460
5461 void
5462 vect_optimize_slp_pass::materialize ()
5463 {
5464   /* We no longer need the costs, so avoid having two O(N * P) arrays
5465      live at the same time.  */
5466   m_partition_layout_costs.release ();
5467   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5468
       /* One bit per vertex, set for nodes whose children need no fixing.  */
5469   auto_sbitmap fully_folded (m_vertices.length ());
5470   bitmap_clear (fully_folded);
5471   for (unsigned int node_i : m_partitioned_nodes)
5472     {
5473       auto &vertex = m_vertices[node_i];
5474       slp_tree node = vertex.node;
5475       int layout_i = m_partitions[vertex.partition].layout;
5476       gcc_assert (layout_i >= 0);
5477
5478       /* Rearrange the scalar statements to match the chosen layout.  */
5479       if (layout_i > 0)
5480         vect_slp_permute (m_perms[layout_i],
5481                           SLP_TREE_SCALAR_STMTS (node), true);
5482
5483       /* Update load and lane permutations.  */
5484       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5485         {
5486           /* First try to absorb the input vector layouts.  If that fails,
5487              force the inputs to have layout LAYOUT_I too.  We checked that
5488              that was possible before deciding to use nonzero output layouts.
5489              (Note that at this stage we don't really have any guarantee that
5490              the target supports the original VEC_PERM_EXPR.)  */
5491           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5492           auto_lane_permutation_t tmp_perm;
5493           tmp_perm.safe_splice (perm);
5494           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5495           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5496                                               tmp_perm,
5497                                               SLP_TREE_CHILDREN (node),
5498                                               false) >= 0)
5499             {
5500               if (dump_enabled_p ()
5501                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5502                                   perm.begin ()))
5503                 dump_printf_loc (MSG_NOTE, vect_location,
5504                                  "absorbing input layouts into %p\n",
5505                                  (void *) node);
                   /* Overwrite the node's permutation with the absorbed one
                      and exempt the node from the child-replacement step.  */
5506               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5507               bitmap_set_bit (fully_folded, node_i);
5508             }
5509           else
5510             {
5511               /* Not MSG_MISSED because it would make no sense to users.  */
5512               if (dump_enabled_p ())
5513                 dump_printf_loc (MSG_NOTE, vect_location,
5514                                  "failed to absorb input layouts into %p\n",
5515                                  (void *) node);
5516               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5517             }
5518         }
5519       else
5520         {
5521           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5522           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5523           if (layout_i > 0)
5524             /* ???  When we handle non-bijective permutes the idea
5525                is that we can force the load-permutation to be
5526                { min, min + 1, min + 2, ... max }.  But then the
5527                scalar defs might no longer match the lane content
5528                which means wrong-code with live lane vectorization.
5529                So we possibly have to have NULL entries for those.  */
5530             vect_slp_permute (m_perms[layout_i], load_perm, true);
5531         }
5532     }
5533
5534   /* Do this before any nodes disappear, since it involves a walk
5535      over the leaves.  */
5536   remove_redundant_permutations ();
5537
5538   /* Replace each child with a correctly laid-out version.  */
5539   for (unsigned int node_i : m_partitioned_nodes)
5540     {
5541       /* Skip nodes that have already been handled above.  */
5542       if (bitmap_bit_p (fully_folded, node_i))
5543         continue;
5544
5545       auto &vertex = m_vertices[node_i];
5546       int in_layout_i = m_partitions[vertex.partition].layout;
5547       gcc_assert (in_layout_i >= 0);
5548
5549       unsigned j;
5550       slp_tree child;
5551       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5552         {
               /* Null child slots are left untouched.  */
5553           if (!child)
5554             continue;
5555
5556           slp_tree new_child = get_result_with_layout (child, in_layout_i);
5557           if (new_child != child)
5558             {
                   /* Drop the reference held on the old child, install the
                      replacement and account for the new reference.  */
5559               vect_free_slp_tree (child);
5560               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5561               new_child->refcnt += 1;
5562             }
5563         }
5564     }
5565 }
5566
5567 /* Elide load permutations that are not necessary.  Such permutations might
5568    be pre-existing, rather than created by the layout optimizations.

        Only the nodes listed in m_leafs that have a load permutation are
        examined.  For basic-block vectorization the permutation is released
        when the scalar stmts form a subchain of an interleaving chain.  For
        loop vectorization it is released when the permutation is the
        identity and, in addition, either the vectorization factor is known
        to be 1 or the access satisfies the condition tested in the
        respective branch below.  */
5569
5570 void
5571 vect_optimize_slp_pass::remove_redundant_permutations ()
5572 {
5573   for (unsigned int node_i : m_leafs)
5574     {
5575       slp_tree node = m_vertices[node_i].node;
           /* Nothing to elide if the node has no load permutation.  */
5576       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5577         continue;
5578
5579       /* In basic block vectorization we allow any subchain of an interleaving
5580          chain.
5581          FORNOW: not in loop SLP because of realignment complications.  */
5582       if (is_a <bb_vec_info> (m_vinfo))
5583         {
5584           bool subchain_p = true;
5585           stmt_vec_info next_load_info = NULL;
5586           stmt_vec_info load_info;
5587           unsigned j;
               /* Each stmt after the first must be the DR_GROUP_NEXT_ELEMENT
                  of its predecessor and must have a DR_GROUP_GAP of 1.  */
5588           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5589             {
5590               if (j != 0
5591                   && (next_load_info != load_info
5592                       || DR_GROUP_GAP (load_info) != 1))
5593                 {
5594                   subchain_p = false;
5595                   break;
5596                 }
5597               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5598             }
5599           if (subchain_p)
5600             {
5601               SLP_TREE_LOAD_PERMUTATION (node).release ();
5602               continue;
5603             }
5604         }
5605       else
5606         {
5607           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5608           stmt_vec_info load_info;
5609           bool this_load_permuted = false;
5610           unsigned j;
               /* The load counts as permuted if any entry of the load
                  permutation differs from its own index.  */
5611           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5612             if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5613               {
5614                 this_load_permuted = true;
5615                 break;
5616               }
5617           /* When this isn't a grouped access we know it's single element
5618              and contiguous.  */
5619           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5620             {
5621               if (!this_load_permuted
5622                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5623                       || SLP_TREE_LANES (node) == 1))
5624                 SLP_TREE_LOAD_PERMUTATION (node).release ();
                   /* Non-grouped accesses are never examined further.  */
5625               continue;
5626             }
5627           stmt_vec_info first_stmt_info
5628             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5629           if (!this_load_permuted
5630               /* The load requires permutation when unrolling exposes
5631                  a gap either because the group is larger than the SLP
5632                  group-size or because there is a gap between the groups.  */
5633               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5634                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5635                       && DR_GROUP_GAP (first_stmt_info) == 0)))
5636             {
5637               SLP_TREE_LOAD_PERMUTATION (node).release ();
5638               continue;
5639             }
5640         }
5641     }
5642 }
5643
5644 /* Print the partition graph and layout information to the dump file.

        The output lists the candidate permutations (starting at index 1;
        layout 0 is not printed), then for each partition its nodes, its
        edges to other partitions and, per layout, either the in, internal,
        out and combined costs or the word "rejected".  The layout currently
        stored in the partition is flagged with "(*)".  All output goes
        through dump_printf_loc / dump_printf with MSG_NOTE.  */
5645
5646 void
5647 vect_optimize_slp_pass::dump ()
5648 {
5649   dump_printf_loc (MSG_NOTE, vect_location,
5650                    "SLP optimize permutations:\n");
5651   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5652     {
5653       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
           /* SEP is empty for the first index and ", " afterwards.  */
5654       const char *sep = "";
5655       for (unsigned int idx : m_perms[layout_i])
5656         {
5657           dump_printf (MSG_NOTE, "%s%d", sep, idx);
5658           sep = ", ";
5659         }
5660       dump_printf (MSG_NOTE, " }\n");
5661     }
5662   dump_printf_loc (MSG_NOTE, vect_location,
5663                    "SLP optimize partitions:\n");
5664   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5665        ++partition_i)
5666     {
5667       auto &partition = m_partitions[partition_i];
5668       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
5669       dump_printf_loc (MSG_NOTE, vect_location,
5670                        "  partition %d (layout %d):\n",
5671                        partition_i, partition.layout);
5672       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
           /* List each node of the partition with its weights and operation.  */
5673       for (unsigned int order_i = partition.node_begin;
5674            order_i < partition.node_end; ++order_i)
5675         {
5676           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5677           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
5678                            (void *) vertex.node);
5679           dump_printf_loc (MSG_NOTE, vect_location,
5680                            "          weight: %f\n",
5681                            vertex.weight.to_double ());
5682           if (vertex.out_degree)
5683             dump_printf_loc (MSG_NOTE, vect_location,
5684                              "          out weight: %f (degree %d)\n",
5685                              vertex.out_weight.to_double (),
5686                              vertex.out_degree);
5687           if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5688             dump_printf_loc (MSG_NOTE, vect_location,
5689                              "          op: VEC_PERM_EXPR\n");
5690           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5691             dump_printf_loc (MSG_NOTE, vect_location,
5692                              "          op template: %G", rep->stmt);
5693         }
5694       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
5695       for (unsigned int order_i = partition.node_begin;
5696            order_i < partition.node_end; ++order_i)
5697         {
5698           unsigned int node_i = m_partitioned_nodes[order_i];
5699           auto &vertex = m_vertices[node_i];
               /* Callback for for_each_partition_edge.  The partition number
                  in brackets is always that of the other node; the arrow
                  points at this node when the other node is in a
                  lower-numbered partition, and away from it otherwise.  */
5700           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5701             {
5702               auto &other_vertex = m_vertices[other_node_i];
5703               if (other_vertex.partition < vertex.partition)
5704                 dump_printf_loc (MSG_NOTE, vect_location,
5705                                  "      - %p [%d] --> %p\n",
5706                                  (void *) other_vertex.node,
5707                                  other_vertex.partition,
5708                                  (void *) vertex.node);
5709               else
5710                 dump_printf_loc (MSG_NOTE, vect_location,
5711                                  "      - %p --> [%d] %p\n",
5712                                  (void *) vertex.node,
5713                                  other_vertex.partition,
5714                                  (void *) other_vertex.node);
5715             };
5716           for_each_partition_edge (node_i, print_edge);
5717         }
5718
           /* Print the cost breakdown of every layout for this partition.  */
5719       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5720         {
5721           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5722           if (layout_costs.is_possible ())
5723             {
5724               dump_printf_loc (MSG_NOTE, vect_location,
5725                                "    layout %d:%s\n", layout_i,
5726                                partition.layout == int (layout_i)
5727                                ? " (*)" : "");
                   /* Recompute the combined cost for display only.  */
5728               slpg_layout_cost combined_cost = layout_costs.in_cost;
5729               combined_cost.add_serial_cost (layout_costs.internal_cost);
5730               combined_cost.add_serial_cost (layout_costs.out_cost);
5731 #define TEMPLATE "{depth: %f, total: %f}"
5732               dump_printf_loc (MSG_NOTE, vect_location,
5733                                "        " TEMPLATE "\n",
5734                                layout_costs.in_cost.depth.to_double (),
5735                                layout_costs.in_cost.total.to_double ());
5736               dump_printf_loc (MSG_NOTE, vect_location,
5737                                "      + " TEMPLATE "\n",
5738                                layout_costs.internal_cost.depth.to_double (),
5739                                layout_costs.internal_cost.total.to_double ());
5740               dump_printf_loc (MSG_NOTE, vect_location,
5741                                "      + " TEMPLATE "\n",
5742                                layout_costs.out_cost.depth.to_double (),
5743                                layout_costs.out_cost.total.to_double ());
5744               dump_printf_loc (MSG_NOTE, vect_location,
5745                                "      = " TEMPLATE "\n",
5746                                combined_cost.depth.to_double (),
5747                                combined_cost.total.to_double ());
5748 #undef TEMPLATE
5749             }
5750           else
5751             dump_printf_loc (MSG_NOTE, vect_location,
5752                              "    layout %d: rejected\n", layout_i);
5753         }
5754     }
5755 }
5756
5757 /* Main entry point for the SLP graph optimization pass.

        Builds the graph and the partitions and starts choosing layouts.  If
        more than one entry ends up in m_perms, the forward and backward
        passes are run, the result is optionally dumped, the layouts are
        materialized and every entry of m_perms is released.  Otherwise only
        remove_redundant_permutations is called.  The graph is freed in both
        cases.  */
5758
5759 void
5760 vect_optimize_slp_pass::run ()
5761 {
5762   build_graph ();
5763   create_partitions ();
5764   start_choosing_layouts ();
       /* With at most one entry in m_perms no layout choice is made.  */
5765   if (m_perms.length () > 1)
5766     {
5767       forward_pass ();
5768       backward_pass ();
           /* Dump before materialize (), which releases the cost array.  */
5769       if (dump_enabled_p ())
5770         dump ();
5771       materialize ();
5772       while (!m_perms.is_empty ())
5773         m_perms.pop ().release ();
5774     }
5775   else
5776     remove_redundant_permutations ();
5777   free_graph (m_slpg);
5778 }
5779
5780 /* Optimize the SLP graph of VINFO.  Does nothing when VINFO has no SLP
        instances; otherwise constructs a temporary vect_optimize_slp_pass
        for VINFO and runs it.  */
5781
5782 void
5783 vect_optimize_slp (vec_info *vinfo)
5784 {
5785   if (vinfo->slp_instances.is_empty ())
5786     return;
5787   vect_optimize_slp_pass (vinfo).run ();
5788 }
5789
5790 /* Gather loads reachable from the individual SLP graph entries.

        For every SLP instance of VINFO this calls the three-argument
        overload of vect_gather_slp_loads with the instance's load vector and
        its root node.  The VISITED set is created afresh for each
        instance.  */
5791
5792 void
5793 vect_gather_slp_loads (vec_info *vinfo)
5794 {
5795   unsigned i;
5796   slp_instance instance;
5797   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5798     {
           /* Declared inside the loop, so it is not shared across instances.  */
5799       hash_set<slp_tree> visited;
5800       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5801                              SLP_INSTANCE_TREE (instance), visited);
5802     }
5803 }
5804
5805
5806 /* For each possible SLP instance decide whether to SLP it and calculate overall
5807    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
5808    least one instance.

        Currently every instance in LOOP_VINFO is accepted, so the return
        value is true exactly when there is at least one instance.  The
        common multiple of the instances' unrolling factors (1 if there are
        none) is stored in LOOP_VINFO_SLP_UNROLLING_FACTOR.  */
5809
5810 bool
5811 vect_make_slp_decision (loop_vec_info loop_vinfo)
5812 {
5813   unsigned int i;
5814   poly_uint64 unrolling_factor = 1;
5815   const vec<slp_instance> &slp_instances
5816     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5817   slp_instance instance;
       /* Number of instances accepted for SLP.  */
5818   int decided_to_slp = 0;
5819
5820   DUMP_VECT_SCOPE ("vect_make_slp_decision");
5821
5822   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5823     {
5824       /* FORNOW: SLP if you can.  */
5825       /* All unroll factors have the form:
5826
5827            GET_MODE_SIZE (vinfo->vector_mode) * X
5828
5829          for some rational X, so they must have a common multiple.  */
5830       unrolling_factor
5831         = force_common_multiple (unrolling_factor,
5832                                  SLP_INSTANCE_UNROLLING_FACTOR (instance));
5833
5834       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
5835          call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5836          loop-based vectorization.  Such stmts will be marked as HYBRID.  */
5837       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5838       decided_to_slp++;
5839     }
5840
       /* Recorded unconditionally, even if no instance was seen.  */
5841   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5842
5843   if (decided_to_slp && dump_enabled_p ())
5844     {
5845       dump_printf_loc (MSG_NOTE, vect_location,
5846                        "Decided to SLP %d instances. Unrolling factor ",
5847                        decided_to_slp);
5848       dump_dec (MSG_NOTE, unrolling_factor);
5849       dump_printf (MSG_NOTE, "\n");
5850     }
5851
5852   return (decided_to_slp > 0);
5853 }
5854
5855 /* Private data for vect_detect_hybrid_slp.  It is reached by the walker
        through the info field of walk_stmt_info.  */
5856 struct vdhs_data
5857 {
       /* Used by the walker to look up the definition of each operand.  */
5858   loop_vec_info loop_vinfo;
       /* Receives the stmts that the walker re-marks as hybrid.  */
5859   vec<stmt_vec_info> *worklist;
5860 };
5861
5862 /* Walker for walk_gimple_op.

        TP points to the operand being visited and DATA is the
        walk_stmt_info, whose info field holds a vdhs_data.  Operands on the
        left-hand side are ignored.  If the operand has a definition known to
        the loop_vec_info and the stmt to vectorize for that definition is
        PURE_SLP, that stmt is re-marked as hybrid and pushed onto the
        worklist.  The function always returns NULL_TREE.  */
5863
5864 static tree
5865 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5866 {
5867   walk_stmt_info *wi = (walk_stmt_info *)data;
5868   vdhs_data *dat = (vdhs_data *)wi->info;
5869
       /* Only uses are of interest.  */
5870   if (wi->is_lhs)
5871     return NULL_TREE;
5872
5873   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5874   if (!def_stmt_info)
5875     return NULL_TREE;
5876   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5877   if (PURE_SLP_STMT (def_stmt_info))
5878     {
5879       if (dump_enabled_p ())
5880         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5881                          def_stmt_info->stmt);
5882       STMT_SLP_TYPE (def_stmt_info) = hybrid;
           /* Queue the stmt so that its own operands get visited as well.  */
5883       dat->worklist->safe_push (def_stmt_info);
5884     }
5885
5886   return NULL_TREE;
5887 }
5888
5889 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5890    if so, otherwise pushing it to WORKLIST.

        The defs of the original stmt (as returned by vect_orig_stmt) are
        examined; debug uses are ignored.  STMT_INFO is pushed and the
        function returns as soon as a use is found that either has no
        stmt_vec_info in VINFO or whose stmt to vectorize has no SLP type
        set.  STMT_INFO is also pushed when the stmt has no defs at all.
        Only when none of these cases applies is STMT_INFO marked
        pure_slp.  */
5891
5892 static void
5893 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5894                                vec<stmt_vec_info> &worklist,
5895                                stmt_vec_info stmt_info)
5896 {
5897   if (dump_enabled_p ())
5898     dump_printf_loc (MSG_NOTE, vect_location,
5899                      "Processing hybrid candidate : %G", stmt_info->stmt);
5900   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5901   imm_use_iterator iter2;
5902   ssa_op_iter iter1;
5903   use_operand_p use_p;
5904   def_operand_p def_p;
       /* Set once the stmt turns out to have at least one def.  */
5905   bool any_def = false;
5906   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5907     {
5908       any_def = true;
5909       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5910         {
5911           if (is_gimple_debug (USE_STMT (use_p)))
5912             continue;
5913           stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5914           /* An out-of loop use means this is a loop_vect sink.  */
5915           if (!use_info)
5916             {
5917               if (dump_enabled_p ())
5918                 dump_printf_loc (MSG_NOTE, vect_location,
5919                                  "Found loop_vect sink: %G", stmt_info->stmt);
5920               worklist.safe_push (stmt_info);
5921               return;
5922             }
               /* A use by a stmt without an SLP type also makes STMT_INFO a
                  worklist entry.  */
5923           else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5924             {
5925               if (dump_enabled_p ())
5926                 dump_printf_loc (MSG_NOTE, vect_location,
5927                                  "Found loop_vect use: %G", use_info->stmt);
5928               worklist.safe_push (stmt_info);
5929               return;
5930             }
5931         }
5932     }
5933   /* No def means this is a loop_vect sink.  */
5934   if (!any_def)
5935     {
5936       if (dump_enabled_p ())
5937         dump_printf_loc (MSG_NOTE, vect_location,
5938                          "Found loop_vect sink: %G", stmt_info->stmt);
5939       worklist.safe_push (stmt_info);
5940       return;
5941     }
       /* Every non-debug use was by a stmt that has an SLP type.  */
5942   if (dump_enabled_p ())
5943     dump_printf_loc (MSG_NOTE, vect_location,
5944                      "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5945   STMT_SLP_TYPE (stmt_info) = pure_slp;
5946 }
5947
5948 /* Find stmts that must be both vectorized and SLPed.  */
5949
5950 void
5951 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5952 {
5953   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5954
5955   /* All stmts participating in SLP are marked pure_slp, all other
5956      stmts are loop_vect.
5957      First collect all loop_vect stmts into a worklist.
5958      SLP patterns cause not all original scalar stmts to appear in
5959      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5960      Rectify this here and do a backward walk over the IL only considering
5961      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5962      mark them as pure_slp.  */
5963   auto_vec<stmt_vec_info> worklist;
5964   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5965     {
5966       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5967       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5968            gsi_next (&gsi))
5969         {
5970           gphi *phi = gsi.phi ();
5971           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5972           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5973             maybe_push_to_hybrid_worklist (loop_vinfo,
5974                                            worklist, stmt_info);
5975         }
5976       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5977            gsi_prev (&gsi))
5978         {
5979           gimple *stmt = gsi_stmt (gsi);
5980           if (is_gimple_debug (stmt))
5981             continue;
5982           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5983           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5984             {
5985               for (gimple_stmt_iterator gsi2
5986                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5987                    !gsi_end_p (gsi2); gsi_next (&gsi2))
5988                 {
5989                   stmt_vec_info patt_info
5990                     = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5991                   if (!STMT_SLP_TYPE (patt_info)
5992                       && STMT_VINFO_RELEVANT (patt_info))
5993                     maybe_push_to_hybrid_worklist (loop_vinfo,
5994                                                    worklist, patt_info);
5995                 }
5996               stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5997             }
5998           if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5999             maybe_push_to_hybrid_worklist (loop_vinfo,
6000                                            worklist, stmt_info);
6001         }
6002     }
6003
6004   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6005      mark any SLP vectorized stmt as hybrid.
6006      ???  We're visiting def stmts N times (once for each non-SLP and
6007      once for each hybrid-SLP use).  */
6008   walk_stmt_info wi;
6009   vdhs_data dat;
6010   dat.worklist = &worklist;
6011   dat.loop_vinfo = loop_vinfo;
6012   memset (&wi, 0, sizeof (wi));
6013   wi.info = (void *)&dat;
6014   while (!worklist.is_empty ())
6015     {
6016       stmt_vec_info stmt_info = worklist.pop ();
6017       /* Since SSA operands are not set up for pattern stmts we need
6018          to use walk_gimple_op.  */
6019       wi.is_lhs = 0;
6020       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6021       /* For gather/scatter make sure to walk the offset operand, that
6022          can be a scaling and conversion away.  */
6023       gather_scatter_info gs_info;
6024       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6025           && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6026         {
6027           int dummy;
6028           vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6029         }
6030     }
6031 }
6032
6033
6034 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
6035
6036 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6037   : vec_info (vec_info::bb, shared),
6038     bbs (_bbs),
6039     roots (vNULL)
6040 {
6041   for (unsigned i = 0; i < bbs.length (); ++i)
6042     {
6043       if (i != 0)
6044         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6045              gsi_next (&si))
6046           {
6047             gphi *phi = si.phi ();
6048             gimple_set_uid (phi, 0);
6049             add_stmt (phi);
6050           }
6051       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6052            !gsi_end_p (gsi); gsi_next (&gsi))
6053         {
6054           gimple *stmt = gsi_stmt (gsi);
6055           gimple_set_uid (stmt, 0);
6056           if (is_gimple_debug (stmt))
6057             continue;
6058           add_stmt (stmt);
6059         }
6060     }
6061 }
6062
6063
6064 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6065    stmts in the basic block.  */
6066
6067 _bb_vec_info::~_bb_vec_info ()
6068 {
6069   /* Reset region marker.  */
6070   for (unsigned i = 0; i < bbs.length (); ++i)
6071     {
6072       if (i != 0)
6073         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6074              gsi_next (&si))
6075           {
6076             gphi *phi = si.phi ();
6077             gimple_set_uid (phi, -1);
6078           }
6079       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6080            !gsi_end_p (gsi); gsi_next (&gsi))
6081         {
6082           gimple *stmt = gsi_stmt (gsi);
6083           gimple_set_uid (stmt, -1);
6084         }
6085     }
6086
6087   for (unsigned i = 0; i < roots.length (); ++i)
6088     {
6089       roots[i].stmts.release ();
6090       roots[i].roots.release ();
6091       roots[i].remain.release ();
6092     }
6093   roots.release ();
6094 }
6095
6096 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
6097    given then that child nodes have already been processed, and that
6098    their def types currently match their SLP node's def type.  */
6099
6100 static bool
6101 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6102                                     slp_instance node_instance,
6103                                     stmt_vector_for_cost *cost_vec)
6104 {
6105   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6106
6107   /* Calculate the number of vector statements to be created for the
6108      scalar stmts in this node.  For SLP reductions it is equal to the
6109      number of vector statements in the children (which has already been
6110      calculated by the recursive call).  Otherwise it is the number of
6111      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6112      VF divided by the number of elements in a vector.  */
6113   if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6114       && !STMT_VINFO_DATA_REF (stmt_info)
6115       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6116     {
6117       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6118         if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6119           {
6120             SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6121               = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6122             break;
6123           }
6124     }
6125   else
6126     {
6127       poly_uint64 vf;
6128       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6129         vf = loop_vinfo->vectorization_factor;
6130       else
6131         vf = 1;
6132       unsigned int group_size = SLP_TREE_LANES (node);
6133       tree vectype = SLP_TREE_VECTYPE (node);
6134       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6135         = vect_get_num_vectors (vf * group_size, vectype);
6136     }
6137
6138   /* Handle purely internal nodes.  */
6139   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6140     {
6141       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6142         return false;
6143
6144       stmt_vec_info slp_stmt_info;
6145       unsigned int i;
6146       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6147         {
6148           if (STMT_VINFO_LIVE_P (slp_stmt_info)
6149               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6150                                                node_instance, i,
6151                                                false, cost_vec))
6152             return false;
6153         }
6154       return true;
6155     }
6156
6157   bool dummy;
6158   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6159                             node, node_instance, cost_vec);
6160 }
6161
6162 /* Try to build NODE from scalars, returning true on success.
6163    NODE_INSTANCE is the SLP instance that contains NODE.  */
6164
6165 static bool
6166 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6167                               slp_instance node_instance)
6168 {
6169   stmt_vec_info stmt_info;
6170   unsigned int i;
6171
6172   if (!is_a <bb_vec_info> (vinfo)
6173       || node == SLP_INSTANCE_TREE (node_instance)
6174       || !SLP_TREE_SCALAR_STMTS (node).exists ()
6175       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6176       /* Force the mask use to be built from scalars instead.  */
6177       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6178     return false;
6179
6180   if (dump_enabled_p ())
6181     dump_printf_loc (MSG_NOTE, vect_location,
6182                      "Building vector operands of %p from scalars instead\n",
6183                      (void *) node);
6184
6185   /* Don't remove and free the child nodes here, since they could be
6186      referenced by other structures.  The analysis and scheduling phases
6187      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
6188   unsigned int group_size = SLP_TREE_LANES (node);
6189   SLP_TREE_DEF_TYPE (node) = vect_external_def;
6190   /* Invariants get their vector type from the uses.  */
6191   SLP_TREE_VECTYPE (node) = NULL_TREE;
6192   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6193   SLP_TREE_LOAD_PERMUTATION (node).release ();
6194   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6195     {
6196       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6197       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6198     }
6199   return true;
6200 }
6201
6202 /* Return true if all elements of the slice are the same.  */
6203 bool
6204 vect_scalar_ops_slice::all_same_p () const
6205 {
6206   for (unsigned int i = 1; i < length; ++i)
6207     if (!operand_equal_p (op (0), op (i)))
6208       return false;
6209   return true;
6210 }
6211
6212 hashval_t
6213 vect_scalar_ops_slice_hash::hash (const value_type &s)
6214 {
6215   hashval_t hash = 0;
6216   for (unsigned i = 0; i < s.length; ++i)
6217     hash = iterative_hash_expr (s.op (i), hash);
6218   return hash;
6219 }
6220
6221 bool
6222 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6223                                    const compare_type &s2)
6224 {
6225   if (s1.length != s2.length)
6226     return false;
6227   for (unsigned i = 0; i < s1.length; ++i)
6228     if (!operand_equal_p (s1.op (i), s2.op (i)))
6229       return false;
6230   return true;
6231 }
6232
6233 /* Compute the prologue cost for invariant or constant operands represented
6234    by NODE.  */
6235
6236 static void
6237 vect_prologue_cost_for_slp (slp_tree node,
6238                             stmt_vector_for_cost *cost_vec)
6239 {
6240   /* There's a special case of an existing vector, that costs nothing.  */
6241   if (SLP_TREE_SCALAR_OPS (node).length () == 0
6242       && !SLP_TREE_VEC_DEFS (node).is_empty ())
6243     return;
6244   /* Without looking at the actual initializer a vector of
6245      constants can be implemented as load from the constant pool.
6246      When all elements are the same we can use a splat.  */
6247   tree vectype = SLP_TREE_VECTYPE (node);
6248   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6249   unsigned HOST_WIDE_INT const_nunits;
6250   unsigned nelt_limit;
6251   auto ops = &SLP_TREE_SCALAR_OPS (node);
6252   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6253   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6254       && ! multiple_p (const_nunits, group_size))
6255     {
6256       nelt_limit = const_nunits;
6257       hash_set<vect_scalar_ops_slice_hash> vector_ops;
6258       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6259         if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6260           starts.quick_push (i * const_nunits);
6261     }
6262   else
6263     {
6264       /* If either the vector has variable length or the vectors
6265          are composed of repeated whole groups we only need to
6266          cost construction once.  All vectors will be the same.  */
6267       nelt_limit = group_size;
6268       starts.quick_push (0);
6269     }
6270   /* ???  We're just tracking whether vectors in a single node are the same.
6271      Ideally we'd do something more global.  */
6272   bool passed = false;
6273   for (unsigned int start : starts)
6274     {
6275       vect_cost_for_stmt kind;
6276       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6277         kind = vector_load;
6278       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6279         kind = scalar_to_vec;
6280       else
6281         kind = vec_construct;
6282       /* The target cost hook has no idea which part of the SLP node
6283          we are costing so avoid passing it down more than once.  Pass
6284          it to the first vec_construct or scalar_to_vec part since for those
6285          the x86 backend tries to account for GPR to XMM register moves.  */
6286       record_stmt_cost (cost_vec, 1, kind,
6287                         (kind != vector_load && !passed) ? node : nullptr,
6288                         vectype, 0, vect_prologue);
6289       if (kind != vector_load)
6290         passed = true;
6291     }
6292 }
6293
6294 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6295    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6296
6297    Return true if the operations are supported.  */
6298
6299 static bool
6300 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6301                                   slp_instance node_instance,
6302                                   hash_set<slp_tree> &visited_set,
6303                                   vec<slp_tree> &visited_vec,
6304                                   stmt_vector_for_cost *cost_vec)
6305 {
6306   int i, j;
6307   slp_tree child;
6308
6309   /* Assume we can code-generate all invariants.  */
6310   if (!node
6311       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6312       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6313     return true;
6314
6315   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6316     {
6317       if (dump_enabled_p ())
6318         dump_printf_loc (MSG_NOTE, vect_location,
6319                          "Failed cyclic SLP reference in %p\n", (void *) node);
6320       return false;
6321     }
6322   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6323
6324   /* If we already analyzed the exact same set of scalar stmts we're done.
6325      We share the generated vector stmts for those.  */
6326   if (visited_set.add (node))
6327     return true;
6328   visited_vec.safe_push (node);
6329
6330   bool res = true;
6331   unsigned visited_rec_start = visited_vec.length ();
6332   unsigned cost_vec_rec_start = cost_vec->length ();
6333   bool seen_non_constant_child = false;
6334   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6335     {
6336       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6337                                               visited_set, visited_vec,
6338                                               cost_vec);
6339       if (!res)
6340         break;
6341       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6342         seen_non_constant_child = true;
6343     }
6344   /* We're having difficulties scheduling nodes with just constant
6345      operands and no scalar stmts since we then cannot compute a stmt
6346      insertion place.  */
6347   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6348     {
6349       if (dump_enabled_p ())
6350         dump_printf_loc (MSG_NOTE, vect_location,
6351                          "Cannot vectorize all-constant op node %p\n",
6352                          (void *) node);
6353       res = false;
6354     }
6355
6356   if (res)
6357     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6358                                               cost_vec);
6359   /* If analysis failed we have to pop all recursive visited nodes
6360      plus ourselves.  */
6361   if (!res)
6362     {
6363       while (visited_vec.length () >= visited_rec_start)
6364         visited_set.remove (visited_vec.pop ());
6365       cost_vec->truncate (cost_vec_rec_start);
6366     }
6367
6368   /* When the node can be vectorized cost invariant nodes it references.
6369      This is not done in DFS order to allow the refering node
6370      vectorizable_* calls to nail down the invariant nodes vector type
6371      and possibly unshare it if it needs a different vector type than
6372      other referrers.  */
6373   if (res)
6374     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6375       if (child
6376           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6377               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6378           /* Perform usual caching, note code-generation still
6379              code-gens these nodes multiple times but we expect
6380              to CSE them later.  */
6381           && !visited_set.add (child))
6382         {
6383           visited_vec.safe_push (child);
6384           /* ???  After auditing more code paths make a "default"
6385              and push the vector type from NODE to all children
6386              if it is not already set.  */
6387           /* Compute the number of vectors to be generated.  */
6388           tree vector_type = SLP_TREE_VECTYPE (child);
6389           if (!vector_type)
6390             {
6391               /* For shifts with a scalar argument we don't need
6392                  to cost or code-generate anything.
6393                  ???  Represent this more explicitely.  */
6394               gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6395                            == shift_vec_info_type)
6396                           && j == 1);
6397               continue;
6398             }
6399           unsigned group_size = SLP_TREE_LANES (child);
6400           poly_uint64 vf = 1;
6401           if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6402             vf = loop_vinfo->vectorization_factor;
6403           SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6404             = vect_get_num_vectors (vf * group_size, vector_type);
6405           /* And cost them.  */
6406           vect_prologue_cost_for_slp (child, cost_vec);
6407         }
6408
6409   /* If this node or any of its children can't be vectorized, try pruning
6410      the tree here rather than felling the whole thing.  */
6411   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6412     {
6413       /* We'll need to revisit this for invariant costing and number
6414          of vectorized stmt setting.   */
6415       res = true;
6416     }
6417
6418   return res;
6419 }
6420
6421 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6422    region and that can be vectorized using vectorizable_live_operation
6423    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
6424    scalar code computing it to be retained.  */
6425
6426 static void
6427 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6428                              slp_instance instance,
6429                              stmt_vector_for_cost *cost_vec,
6430                              hash_set<stmt_vec_info> &svisited,
6431                              hash_set<slp_tree> &visited)
6432 {
6433   if (visited.add (node))
6434     return;
6435
6436   unsigned i;
6437   stmt_vec_info stmt_info;
6438   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6439   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6440     {
6441       if (svisited.contains (stmt_info))
6442         continue;
6443       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6444       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6445           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6446         /* Only the pattern root stmt computes the original scalar value.  */
6447         continue;
6448       bool mark_visited = true;
6449       gimple *orig_stmt = orig_stmt_info->stmt;
6450       ssa_op_iter op_iter;
6451       def_operand_p def_p;
6452       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6453         {
6454           imm_use_iterator use_iter;
6455           gimple *use_stmt;
6456           stmt_vec_info use_stmt_info;
6457           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6458             if (!is_gimple_debug (use_stmt))
6459               {
6460                 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6461                 if (!use_stmt_info
6462                     || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6463                   {
6464                     STMT_VINFO_LIVE_P (stmt_info) = true;
6465                     if (vectorizable_live_operation (bb_vinfo, stmt_info,
6466                                                      node, instance, i,
6467                                                      false, cost_vec))
6468                       /* ???  So we know we can vectorize the live stmt
6469                          from one SLP node.  If we cannot do so from all
6470                          or none consistently we'd have to record which
6471                          SLP node (and lane) we want to use for the live
6472                          operation.  So make sure we can code-generate
6473                          from all nodes.  */
6474                       mark_visited = false;
6475                     else
6476                       STMT_VINFO_LIVE_P (stmt_info) = false;
6477                     break;
6478                   }
6479               }
6480           /* We have to verify whether we can insert the lane extract
6481              before all uses.  The following is a conservative approximation.
6482              We cannot put this into vectorizable_live_operation because
6483              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6484              doesn't work.
6485              Note that while the fact that we emit code for loads at the
6486              first load should make this a non-problem leafs we construct
6487              from scalars are vectorized after the last scalar def.
6488              ???  If we'd actually compute the insert location during
6489              analysis we could use sth less conservative than the last
6490              scalar stmt in the node for the dominance check.  */
6491           /* ???  What remains is "live" uses in vector CTORs in the same
6492              SLP graph which is where those uses can end up code-generated
6493              right after their definition instead of close to their original
6494              use.  But that would restrict us to code-generate lane-extracts
6495              from the latest stmt in a node.  So we compensate for this
6496              during code-generation, simply not replacing uses for those
6497              hopefully rare cases.  */
6498           if (STMT_VINFO_LIVE_P (stmt_info))
6499             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6500               if (!is_gimple_debug (use_stmt)
6501                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6502                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6503                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6504                 {
6505                   if (dump_enabled_p ())
6506                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6507                                      "Cannot determine insertion place for "
6508                                      "lane extract\n");
6509                   STMT_VINFO_LIVE_P (stmt_info) = false;
6510                   mark_visited = true;
6511                 }
6512         }
6513       if (mark_visited)
6514         svisited.add (stmt_info);
6515     }
6516
6517   slp_tree child;
6518   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6519     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6520       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6521                                    cost_vec, svisited, visited);
6522 }
6523
6524 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
6525
6526 static bool
6527 vectorizable_bb_reduc_epilogue (slp_instance instance,
6528                                 stmt_vector_for_cost *cost_vec)
6529 {
6530   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6531   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6532   if (reduc_code == MINUS_EXPR)
6533     reduc_code = PLUS_EXPR;
6534   internal_fn reduc_fn;
6535   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6536   if (!vectype
6537       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6538       || reduc_fn == IFN_LAST
6539       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6540       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6541                                      TREE_TYPE (vectype)))
6542     {
6543       if (dump_enabled_p ())
6544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6545                          "not vectorized: basic block reduction epilogue "
6546                          "operation unsupported.\n");
6547       return false;
6548     }
6549
6550   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6551      cost log2 vector operations plus shuffles and one extraction.  */
6552   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6553   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6554                     vectype, 0, vect_body);
6555   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6556                     vectype, 0, vect_body);
6557   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6558                     vectype, 0, vect_body);
6559
6560   /* Since we replace all stmts of a possibly longer scalar reduction
6561      chain account for the extra scalar stmts for that.  */
6562   record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6563                     instance->root_stmts[0], 0, vect_body);
6564   return true;
6565 }
6566
6567 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6568    and recurse to children.  */
6569
6570 static void
6571 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6572                               hash_set<slp_tree> &visited)
6573 {
6574   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6575       || visited.add (node))
6576     return;
6577
6578   stmt_vec_info stmt;
6579   unsigned i;
6580   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6581     roots.remove (vect_orig_stmt (stmt));
6582
6583   slp_tree child;
6584   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6585     if (child)
6586       vect_slp_prune_covered_roots (child, roots, visited);
6587 }
6588
6589 /* Analyze statements in SLP instances of VINFO.  Return true if the
6590    operations are supported. */
6591
6592 bool
6593 vect_slp_analyze_operations (vec_info *vinfo)
6594 {
6595   slp_instance instance;
6596   int i;
6597
6598   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6599
6600   hash_set<slp_tree> visited;
6601   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6602     {
6603       auto_vec<slp_tree> visited_vec;
6604       stmt_vector_for_cost cost_vec;
6605       cost_vec.create (2);
6606       if (is_a <bb_vec_info> (vinfo))
6607         vect_location = instance->location ();
6608       if (!vect_slp_analyze_node_operations (vinfo,
6609                                              SLP_INSTANCE_TREE (instance),
6610                                              instance, visited, visited_vec,
6611                                              &cost_vec)
6612           /* CTOR instances require vectorized defs for the SLP tree root.  */
6613           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6614               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6615                   != vect_internal_def
6616                   /* Make sure we vectorized with the expected type.  */
6617                   || !useless_type_conversion_p
6618                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6619                                               (instance->root_stmts[0]->stmt))),
6620                          TREE_TYPE (SLP_TREE_VECTYPE
6621                                             (SLP_INSTANCE_TREE (instance))))))
6622           /* Check we can vectorize the reduction.  */
6623           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6624               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6625         {
6626           slp_tree node = SLP_INSTANCE_TREE (instance);
6627           stmt_vec_info stmt_info;
6628           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6629             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6630           else
6631             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6632           if (dump_enabled_p ())
6633             dump_printf_loc (MSG_NOTE, vect_location,
6634                              "removing SLP instance operations starting from: %G",
6635                              stmt_info->stmt);
6636           vect_free_slp_instance (instance);
6637           vinfo->slp_instances.ordered_remove (i);
6638           cost_vec.release ();
6639           while (!visited_vec.is_empty ())
6640             visited.remove (visited_vec.pop ());
6641         }
6642       else
6643         {
6644           i++;
6645           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6646             {
6647               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6648               cost_vec.release ();
6649             }
6650           else
6651             /* For BB vectorization remember the SLP graph entry
6652                cost for later.  */
6653             instance->cost_vec = cost_vec;
6654         }
6655     }
6656
6657   /* Now look for SLP instances with a root that are covered by other
6658      instances and remove them.  */
6659   hash_set<stmt_vec_info> roots;
6660   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6661     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6662       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6663   if (!roots.is_empty ())
6664     {
6665       visited.empty ();
6666       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6667         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6668                                       visited);
6669       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6670         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6671             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6672           {
6673             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6674             if (dump_enabled_p ())
6675               dump_printf_loc (MSG_NOTE, vect_location,
6676                                "removing SLP instance operations starting "
6677                                "from: %G", root->stmt);
6678             vect_free_slp_instance (instance);
6679             vinfo->slp_instances.ordered_remove (i);
6680           }
6681         else
6682           ++i;
6683     }
6684
6685   /* Compute vectorizable live stmts.  */
6686   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6687     {
6688       hash_set<stmt_vec_info> svisited;
6689       hash_set<slp_tree> visited;
6690       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6691         {
6692           vect_location = instance->location ();
6693           vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6694                                        instance, &instance->cost_vec, svisited,
6695                                        visited);
6696         }
6697     }
6698
6699   return !vinfo->slp_instances.is_empty ();
6700 }
6701
6702 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6703    closing the eventual chain.  */
6704
6705 static slp_instance
6706 get_ultimate_leader (slp_instance instance,
6707                      hash_map<slp_instance, slp_instance> &instance_leader)
6708 {
6709   auto_vec<slp_instance *, 8> chain;
6710   slp_instance *tem;
6711   while (*(tem = instance_leader.get (instance)) != instance)
6712     {
6713       chain.safe_push (tem);
6714       instance = *tem;
6715     }
6716   while (!chain.is_empty ())
6717     *chain.pop () = instance;
6718   return instance;
6719 }
6720
6721 namespace {
6722 /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
6723    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6724    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
6725
6726    INSTANCE_LEADER is as for get_ultimate_leader.  */
6727
6728 template<typename T>
6729 bool
6730 vect_map_to_instance (slp_instance instance, T key,
6731                       hash_map<T, slp_instance> &key_to_instance,
6732                       hash_map<slp_instance, slp_instance> &instance_leader)
6733 {
6734   bool existed_p;
6735   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6736   if (!existed_p)
6737     ;
6738   else if (key_instance != instance)
6739     {
6740       /* If we're running into a previously marked key make us the
6741          leader of the current ultimate leader.  This keeps the
6742          leader chain acyclic and works even when the current instance
6743          connects two previously independent graph parts.  */
6744       slp_instance key_leader
6745         = get_ultimate_leader (key_instance, instance_leader);
6746       if (key_leader != instance)
6747         instance_leader.put (key_leader, instance);
6748     }
6749   key_instance = instance;
6750   return existed_p;
6751 }
6752 }
6753
6754 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
6755
6756 static void
6757 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6758                            slp_instance instance, slp_tree node,
6759                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6760                            hash_map<slp_tree, slp_instance> &node_to_instance,
6761                            hash_map<slp_instance, slp_instance> &instance_leader)
6762 {
6763   stmt_vec_info stmt_info;
6764   unsigned i;
6765
6766   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6767     vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6768                           instance_leader);
6769
6770   if (vect_map_to_instance (instance, node, node_to_instance,
6771                             instance_leader))
6772     return;
6773
6774   slp_tree child;
6775   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6776     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6777       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6778                                  node_to_instance, instance_leader);
6779 }
6780
6781 /* Partition the SLP graph into pieces that can be costed independently.  */
6782
6783 static void
6784 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6785 {
6786   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6787
6788   /* First walk the SLP graph assigning each involved scalar stmt a
6789      corresponding SLP graph entry and upon visiting a previously
6790      marked stmt, make the stmts leader the current SLP graph entry.  */
6791   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6792   hash_map<slp_tree, slp_instance> node_to_instance;
6793   hash_map<slp_instance, slp_instance> instance_leader;
6794   slp_instance instance;
6795   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6796     {
6797       instance_leader.put (instance, instance);
6798       vect_bb_partition_graph_r (bb_vinfo,
6799                                  instance, SLP_INSTANCE_TREE (instance),
6800                                  stmt_to_instance, node_to_instance,
6801                                  instance_leader);
6802     }
6803
6804   /* Then collect entries to each independent subgraph.  */
6805   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6806     {
6807       slp_instance leader = get_ultimate_leader (instance, instance_leader);
6808       leader->subgraph_entries.safe_push (instance);
6809       if (dump_enabled_p ()
6810           && leader != instance)
6811         dump_printf_loc (MSG_NOTE, vect_location,
6812                          "instance %p is leader of %p\n",
6813                          (void *) leader, (void *) instance);
6814     }
6815 }
6816
6817 /* Compute the set of scalar stmts participating in internal and external
6818    nodes.  */
6819
6820 static void
6821 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6822                                          hash_set<slp_tree> &visited,
6823                                          hash_set<stmt_vec_info> &vstmts,
6824                                          hash_set<stmt_vec_info> &estmts)
6825 {
6826   int i;
6827   stmt_vec_info stmt_info;
6828   slp_tree child;
6829
6830   if (visited.add (node))
6831     return;
6832
6833   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6834     {
6835       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6836         vstmts.add (stmt_info);
6837
6838       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6839         if (child)
6840           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6841                                                    vstmts, estmts);
6842     }
6843   else
6844     for (tree def : SLP_TREE_SCALAR_OPS (node))
6845       {
6846         stmt_vec_info def_stmt = vinfo->lookup_def (def);
6847         if (def_stmt)
6848           estmts.add (def_stmt);
6849       }
6850 }
6851
6852
6853 /* Compute the scalar cost of the SLP node NODE and its children
6854    and return it.  Do not account defs that are marked in LIFE and
6855    update LIFE according to uses of NODE.  */
6856
6857 static void
6858 vect_bb_slp_scalar_cost (vec_info *vinfo,
6859                          slp_tree node, vec<bool, va_heap> *life,
6860                          stmt_vector_for_cost *cost_vec,
6861                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6862                          hash_set<slp_tree> &visited)
6863 {
6864   unsigned i;
6865   stmt_vec_info stmt_info;
6866   slp_tree child;
6867
6868   if (visited.add (node))
6869     return;
6870
6871   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6872     {
6873       ssa_op_iter op_iter;
6874       def_operand_p def_p;
6875
6876       if ((*life)[i])
6877         continue;
6878
6879       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6880       gimple *orig_stmt = orig_stmt_info->stmt;
6881
6882       /* If there is a non-vectorized use of the defs then the scalar
6883          stmt is kept live in which case we do not account it or any
6884          required defs in the SLP children in the scalar cost.  This
6885          way we make the vectorization more costly when compared to
6886          the scalar cost.  */
6887       if (!STMT_VINFO_LIVE_P (stmt_info))
6888         {
6889           auto_vec<gimple *, 8> worklist;
6890           hash_set<gimple *> *worklist_visited = NULL;
6891           worklist.quick_push (orig_stmt);
6892           do
6893             {
6894               gimple *work_stmt = worklist.pop ();
6895               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6896                 {
6897                   imm_use_iterator use_iter;
6898                   gimple *use_stmt;
6899                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6900                                          DEF_FROM_PTR (def_p))
6901                     if (!is_gimple_debug (use_stmt))
6902                       {
6903                         stmt_vec_info use_stmt_info
6904                           = vinfo->lookup_stmt (use_stmt);
6905                         if (!use_stmt_info
6906                             || !vectorized_scalar_stmts.contains (use_stmt_info))
6907                           {
6908                             if (use_stmt_info
6909                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6910                               {
6911                                 /* For stmts participating in patterns we have
6912                                    to check its uses recursively.  */
6913                                 if (!worklist_visited)
6914                                   worklist_visited = new hash_set<gimple *> ();
6915                                 if (!worklist_visited->add (use_stmt))
6916                                   worklist.safe_push (use_stmt);
6917                                 continue;
6918                               }
6919                             (*life)[i] = true;
6920                             goto next_lane;
6921                           }
6922                       }
6923                 }
6924             }
6925           while (!worklist.is_empty ());
6926 next_lane:
6927           if (worklist_visited)
6928             delete worklist_visited;
6929           if ((*life)[i])
6930             continue;
6931         }
6932
6933       /* Count scalar stmts only once.  */
6934       if (gimple_visited_p (orig_stmt))
6935         continue;
6936       gimple_set_visited (orig_stmt, true);
6937
6938       vect_cost_for_stmt kind;
6939       if (STMT_VINFO_DATA_REF (orig_stmt_info))
6940         {
6941           if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6942             kind = scalar_load;
6943           else
6944             kind = scalar_store;
6945         }
6946       else if (vect_nop_conversion_p (orig_stmt_info))
6947         continue;
6948       /* For single-argument PHIs assume coalescing which means zero cost
6949          for the scalar and the vector PHIs.  This avoids artificially
6950          favoring the vector path (but may pessimize it in some cases).  */
6951       else if (is_a <gphi *> (orig_stmt_info->stmt)
6952                && gimple_phi_num_args
6953                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6954         continue;
6955       else
6956         kind = scalar_stmt;
6957       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6958                         SLP_TREE_VECTYPE (node), 0, vect_body);
6959     }
6960
6961   auto_vec<bool, 20> subtree_life;
6962   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6963     {
6964       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6965         {
6966           /* Do not directly pass LIFE to the recursive call, copy it to
6967              confine changes in the callee to the current child/subtree.  */
6968           if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6969             {
6970               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6971               for (unsigned j = 0;
6972                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6973                 {
6974                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6975                   if (perm.first == i)
6976                     subtree_life[perm.second] = (*life)[j];
6977                 }
6978             }
6979           else
6980             {
6981               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6982               subtree_life.safe_splice (*life);
6983             }
6984           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6985                                    vectorized_scalar_stmts, visited);
6986           subtree_life.truncate (0);
6987         }
6988     }
6989 }
6990
6991 /* Comparator for the loop-index sorted cost vectors.  */
6992
6993 static int
6994 li_cost_vec_cmp (const void *a_, const void *b_)
6995 {
6996   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6997   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6998   if (a->first < b->first)
6999     return -1;
7000   else if (a->first == b->first)
7001     return 0;
7002   return 1;
7003 }
7004
7005 /* Check if vectorization of the basic block is profitable for the
7006    subgraph denoted by SLP_INSTANCES.  */
7007
7008 static bool
7009 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7010                                     vec<slp_instance> slp_instances,
7011                                     loop_p orig_loop)
7012 {
7013   slp_instance instance;
7014   int i;
7015   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7016   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7017
7018   if (dump_enabled_p ())
7019     {
7020       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7021       hash_set<slp_tree> visited;
7022       FOR_EACH_VEC_ELT (slp_instances, i, instance)
7023         vect_print_slp_graph (MSG_NOTE, vect_location,
7024                               SLP_INSTANCE_TREE (instance), visited);
7025     }
7026
7027   /* Compute the set of scalar stmts we know will go away 'locally' when
7028      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
7029      not accurate for nodes promoted extern late or for scalar stmts that
7030      are used both in extern defs and in vectorized defs.  */
7031   hash_set<stmt_vec_info> vectorized_scalar_stmts;
7032   hash_set<stmt_vec_info> scalar_stmts_in_externs;
7033   hash_set<slp_tree> visited;
7034   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7035     {
7036       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7037                                                SLP_INSTANCE_TREE (instance),
7038                                                visited,
7039                                                vectorized_scalar_stmts,
7040                                                scalar_stmts_in_externs);
7041       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7042         vectorized_scalar_stmts.add (rstmt);
7043     }
7044   /* Scalar stmts used as defs in external nodes need to be preseved, so
7045      remove them from vectorized_scalar_stmts.  */
7046   for (stmt_vec_info stmt : scalar_stmts_in_externs)
7047     vectorized_scalar_stmts.remove (stmt);
7048
7049   /* Calculate scalar cost and sum the cost for the vector stmts
7050      previously collected.  */
7051   stmt_vector_for_cost scalar_costs = vNULL;
7052   stmt_vector_for_cost vector_costs = vNULL;
7053   visited.empty ();
7054   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7055     {
7056       auto_vec<bool, 20> life;
7057       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7058                               true);
7059       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7060         record_stmt_cost (&scalar_costs,
7061                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
7062                           scalar_stmt,
7063                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7064       vect_bb_slp_scalar_cost (bb_vinfo,
7065                                SLP_INSTANCE_TREE (instance),
7066                                &life, &scalar_costs, vectorized_scalar_stmts,
7067                                visited);
7068       vector_costs.safe_splice (instance->cost_vec);
7069       instance->cost_vec.release ();
7070     }
7071
7072   if (dump_enabled_p ())
7073     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7074
7075   /* When costing non-loop vectorization we need to consider each covered
7076      loop independently and make sure vectorization is profitable.  For
7077      now we assume a loop may be not entered or executed an arbitrary
7078      number of iterations (???  static information can provide more
7079      precise info here) which means we can simply cost each containing
7080      loops stmts separately.  */
7081
7082   /* First produce cost vectors sorted by loop index.  */
7083   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7084     li_scalar_costs (scalar_costs.length ());
7085   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7086     li_vector_costs (vector_costs.length ());
7087   stmt_info_for_cost *cost;
7088   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7089     {
7090       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7091       li_scalar_costs.quick_push (std::make_pair (l, cost));
7092     }
7093   /* Use a random used loop as fallback in case the first vector_costs
7094      entry does not have a stmt_info associated with it.  */
7095   unsigned l = li_scalar_costs[0].first;
7096   FOR_EACH_VEC_ELT (vector_costs, i, cost)
7097     {
7098       /* We inherit from the previous COST, invariants, externals and
7099          extracts immediately follow the cost for the related stmt.  */
7100       if (cost->stmt_info)
7101         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7102       li_vector_costs.quick_push (std::make_pair (l, cost));
7103     }
7104   li_scalar_costs.qsort (li_cost_vec_cmp);
7105   li_vector_costs.qsort (li_cost_vec_cmp);
7106
7107   /* Now cost the portions individually.  */
7108   unsigned vi = 0;
7109   unsigned si = 0;
7110   bool profitable = true;
7111   while (si < li_scalar_costs.length ()
7112          && vi < li_vector_costs.length ())
7113     {
7114       unsigned sl = li_scalar_costs[si].first;
7115       unsigned vl = li_vector_costs[vi].first;
7116       if (sl != vl)
7117         {
7118           if (dump_enabled_p ())
7119             dump_printf_loc (MSG_NOTE, vect_location,
7120                              "Scalar %d and vector %d loop part do not "
7121                              "match up, skipping scalar part\n", sl, vl);
7122           /* Skip the scalar part, assuming zero cost on the vector side.  */
7123           do
7124             {
7125               si++;
7126             }
7127           while (si < li_scalar_costs.length ()
7128                  && li_scalar_costs[si].first == sl);
7129           continue;
7130         }
7131
7132       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7133       do
7134         {
7135           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7136           si++;
7137         }
7138       while (si < li_scalar_costs.length ()
7139              && li_scalar_costs[si].first == sl);
7140       unsigned dummy;
7141       finish_cost (scalar_target_cost_data, nullptr,
7142                    &dummy, &scalar_cost, &dummy);
7143
7144       /* Complete the target-specific vector cost calculation.  */
7145       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7146       do
7147         {
7148           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7149           vi++;
7150         }
7151       while (vi < li_vector_costs.length ()
7152              && li_vector_costs[vi].first == vl);
7153       finish_cost (vect_target_cost_data, scalar_target_cost_data,
7154                    &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7155       delete scalar_target_cost_data;
7156       delete vect_target_cost_data;
7157
7158       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7159
7160       if (dump_enabled_p ())
7161         {
7162           dump_printf_loc (MSG_NOTE, vect_location,
7163                            "Cost model analysis for part in loop %d:\n", sl);
7164           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
7165                        vec_inside_cost + vec_outside_cost);
7166           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
7167         }
7168
7169       /* Vectorization is profitable if its cost is more than the cost of scalar
7170          version.  Note that we err on the vector side for equal cost because
7171          the cost estimate is otherwise quite pessimistic (constant uses are
7172          free on the scalar side but cost a load on the vector side for
7173          example).  */
7174       if (vec_outside_cost + vec_inside_cost > scalar_cost)
7175         {
7176           profitable = false;
7177           break;
7178         }
7179     }
7180   if (profitable && vi < li_vector_costs.length ())
7181     {
7182       if (dump_enabled_p ())
7183         dump_printf_loc (MSG_NOTE, vect_location,
7184                          "Excess vector cost for part in loop %d:\n",
7185                          li_vector_costs[vi].first);
7186       profitable = false;
7187     }
7188
7189   /* Unset visited flag.  This is delayed when the subgraph is profitable
7190      and we process the loop for remaining unvectorized if-converted code.  */
7191   if (!orig_loop || !profitable)
7192     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7193       gimple_set_visited  (cost->stmt_info->stmt, false);
7194
7195   scalar_costs.release ();
7196   vector_costs.release ();
7197
7198   return profitable;
7199 }
7200
7201 /* qsort comparator for lane defs.  */
7202
7203 static int
7204 vld_cmp (const void *a_, const void *b_)
7205 {
7206   auto *a = (const std::pair<unsigned, tree> *)a_;
7207   auto *b = (const std::pair<unsigned, tree> *)b_;
7208   return a->first - b->first;
7209 }
7210
7211 /* Return true if USE_STMT is a vector lane insert into VEC and set
7212    *THIS_LANE to the lane number that is set.  */
7213
7214 static bool
7215 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7216 {
7217   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7218   if (!use_ass
7219       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7220       || (vec
7221           ? gimple_assign_rhs1 (use_ass) != vec
7222           : ((vec = gimple_assign_rhs1 (use_ass)), false))
7223       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7224                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7225       || !constant_multiple_p
7226             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7227              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7228              this_lane))
7229     return false;
7230   return true;
7231 }
7232
7233 /* Find any vectorizable constructors and add them to the grouped_store
7234    array.  */
7235
7236 static void
7237 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7238 {
7239   for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7240     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7241          !gsi_end_p (gsi); gsi_next (&gsi))
7242     {
7243       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7244       if (!assign)
7245         continue;
7246
7247       tree rhs = gimple_assign_rhs1 (assign);
7248       enum tree_code code = gimple_assign_rhs_code (assign);
7249       use_operand_p use_p;
7250       gimple *use_stmt;
7251       if (code == CONSTRUCTOR)
7252         {
7253           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7254               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7255                            CONSTRUCTOR_NELTS (rhs))
7256               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7257               || uniform_vector_p (rhs))
7258             continue;
7259
7260           unsigned j;
7261           tree val;
7262           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7263             if (TREE_CODE (val) != SSA_NAME
7264                 || !bb_vinfo->lookup_def (val))
7265               break;
7266           if (j != CONSTRUCTOR_NELTS (rhs))
7267             continue;
7268
7269           vec<stmt_vec_info> roots = vNULL;
7270           roots.safe_push (bb_vinfo->lookup_stmt (assign));
7271           vec<stmt_vec_info> stmts;
7272           stmts.create (CONSTRUCTOR_NELTS (rhs));
7273           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7274             stmts.quick_push
7275               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7276           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7277                                                stmts, roots));
7278         }
7279       else if (code == BIT_INSERT_EXPR
7280                && VECTOR_TYPE_P (TREE_TYPE (rhs))
7281                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7282                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7283                && integer_zerop (gimple_assign_rhs3 (assign))
7284                && useless_type_conversion_p
7285                     (TREE_TYPE (TREE_TYPE (rhs)),
7286                      TREE_TYPE (gimple_assign_rhs2 (assign)))
7287                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7288         {
7289           /* We start to match on insert to lane zero but since the
7290              inserts need not be ordered we'd have to search both
7291              the def and the use chains.  */
7292           tree vectype = TREE_TYPE (rhs);
7293           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7294           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7295           auto_sbitmap lanes (nlanes);
7296           bitmap_clear (lanes);
7297           bitmap_set_bit (lanes, 0);
7298           tree def = gimple_assign_lhs (assign);
7299           lane_defs.quick_push
7300                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
7301           unsigned lanes_found = 1;
7302           /* Start with the use chains, the last stmt will be the root.  */
7303           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7304           vec<stmt_vec_info> roots = vNULL;
7305           roots.safe_push (last);
7306           do
7307             {
7308               use_operand_p use_p;
7309               gimple *use_stmt;
7310               if (!single_imm_use (def, &use_p, &use_stmt))
7311                 break;
7312               unsigned this_lane;
7313               if (!bb_vinfo->lookup_stmt (use_stmt)
7314                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7315                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7316                 break;
7317               if (bitmap_bit_p (lanes, this_lane))
7318                 break;
7319               lanes_found++;
7320               bitmap_set_bit (lanes, this_lane);
7321               gassign *use_ass = as_a <gassign *> (use_stmt);
7322               lane_defs.quick_push (std::make_pair
7323                                      (this_lane, gimple_assign_rhs2 (use_ass)));
7324               last = bb_vinfo->lookup_stmt (use_ass);
7325               roots.safe_push (last);
7326               def = gimple_assign_lhs (use_ass);
7327             }
7328           while (lanes_found < nlanes);
7329           if (roots.length () > 1)
7330             std::swap(roots[0], roots[roots.length () - 1]);
7331           if (lanes_found < nlanes)
7332             {
7333               /* Now search the def chain.  */
7334               def = gimple_assign_rhs1 (assign);
7335               do
7336                 {
7337                   if (TREE_CODE (def) != SSA_NAME
7338                       || !has_single_use (def))
7339                     break;
7340                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7341                   unsigned this_lane;
7342                   if (!bb_vinfo->lookup_stmt (def_stmt)
7343                       || !vect_slp_is_lane_insert (def_stmt,
7344                                                    NULL_TREE, &this_lane)
7345                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7346                     break;
7347                   if (bitmap_bit_p (lanes, this_lane))
7348                     break;
7349                   lanes_found++;
7350                   bitmap_set_bit (lanes, this_lane);
7351                   lane_defs.quick_push (std::make_pair
7352                                           (this_lane,
7353                                            gimple_assign_rhs2 (def_stmt)));
7354                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7355                   def = gimple_assign_rhs1 (def_stmt);
7356                 }
7357               while (lanes_found < nlanes);
7358             }
7359           if (lanes_found == nlanes)
7360             {
7361               /* Sort lane_defs after the lane index and register the root.  */
7362               lane_defs.qsort (vld_cmp);
7363               vec<stmt_vec_info> stmts;
7364               stmts.create (nlanes);
7365               for (unsigned i = 0; i < nlanes; ++i)
7366                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7367               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7368                                                    stmts, roots));
7369             }
7370           else
7371             roots.release ();
7372         }
7373       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7374                && (associative_tree_code (code) || code == MINUS_EXPR)
7375                /* ???  This pessimizes a two-element reduction.  PR54400.
7376                   ???  In-order reduction could be handled if we only
7377                   traverse one operand chain in vect_slp_linearize_chain.  */
7378                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7379                /* Ops with constants at the tail can be stripped here.  */
7380                && TREE_CODE (rhs) == SSA_NAME
7381                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7382                /* Should be the chain end.  */
7383                && (!single_imm_use (gimple_assign_lhs (assign),
7384                                     &use_p, &use_stmt)
7385                    || !is_gimple_assign (use_stmt)
7386                    || (gimple_assign_rhs_code (use_stmt) != code
7387                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
7388                            || (gimple_assign_rhs_code (use_stmt)
7389                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7390         {
7391           /* We start the match at the end of a possible association
7392              chain.  */
7393           auto_vec<chain_op_t> chain;
7394           auto_vec<std::pair<tree_code, gimple *> > worklist;
7395           auto_vec<gimple *> chain_stmts;
7396           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7397           if (code == MINUS_EXPR)
7398             code = PLUS_EXPR;
7399           internal_fn reduc_fn;
7400           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7401               || reduc_fn == IFN_LAST)
7402             continue;
7403           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7404                                     /* ??? */
7405                                     code_stmt, alt_code_stmt, &chain_stmts);
7406           if (chain.length () > 1)
7407             {
7408               /* Sort the chain according to def_type and operation.  */
7409               chain.sort (dt_sort_cmp, bb_vinfo);
7410               /* ???  Now we'd want to strip externals and constants
7411                  but record those to be handled in the epilogue.  */
7412               /* ???  For now do not allow mixing ops or externs/constants.  */
7413               bool invalid = false;
7414               unsigned remain_cnt = 0;
7415               for (unsigned i = 0; i < chain.length (); ++i)
7416                 {
7417                   if (chain[i].code != code)
7418                     {
7419                       invalid = true;
7420                       break;
7421                     }
7422                   if (chain[i].dt != vect_internal_def
7423                       /* Avoid stmts where the def is not the LHS, like
7424                          ASMs.  */
7425                       || (gimple_get_lhs (bb_vinfo->lookup_def
7426                                                       (chain[i].op)->stmt)
7427                           != chain[i].op))
7428                     remain_cnt++;
7429                 }
7430               if (!invalid && chain.length () - remain_cnt > 1)
7431                 {
7432                   vec<stmt_vec_info> stmts;
7433                   vec<tree> remain = vNULL;
7434                   stmts.create (chain.length ());
7435                   if (remain_cnt > 0)
7436                     remain.create (remain_cnt);
7437                   for (unsigned i = 0; i < chain.length (); ++i)
7438                     {
7439                       stmt_vec_info stmt_info;
7440                       if (chain[i].dt == vect_internal_def
7441                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7442                               gimple_get_lhs (stmt_info->stmt) == chain[i].op))
7443                         stmts.quick_push (stmt_info);
7444                       else
7445                         remain.quick_push (chain[i].op);
7446                     }
7447                   vec<stmt_vec_info> roots;
7448                   roots.create (chain_stmts.length ());
7449                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
7450                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7451                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7452                                                        stmts, roots, remain));
7453                 }
7454             }
7455         }
7456     }
7457 }
7458
7459 /* Walk the grouped store chains and replace entries with their
7460    pattern variant if any.

        For each group recorded in VINFO->grouped_stores, a member flagged
        STMT_VINFO_IN_PATTERN_P is replaced by its STMT_VINFO_RELATED_STMT
        and the DR_GROUP_* linkage of the original member is copied onto
        the replacement.  VINFO->grouped_stores itself is updated in place
        when the group leader is the one being replaced.  */
7461
7462 static void
7463 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7464 {
7465   stmt_vec_info first_element;
7466   unsigned i;
7467
7468   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7469     {
7470       /* We also have CTORs in this array.  */
7471       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7472         continue;
7473       if (STMT_VINFO_IN_PATTERN_P (first_element))
7474         {
                /* The leader is part of a pattern: continue with the related
                   stmt as new leader and carry over size, gap and the link
                   to the next member from the original leader.  */
7475           stmt_vec_info orig = first_element;
7476           first_element = STMT_VINFO_RELATED_STMT (first_element);
7477           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7478           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7479           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7480           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7481           vinfo->grouped_stores[i] = first_element;
7482         }
            /* Walk the remaining members of the chain.  PREV is always the
               already fixed-up predecessor of the member being visited.  */
7483       stmt_vec_info prev = first_element;
7484       while (DR_GROUP_NEXT_ELEMENT (prev))
7485         {
7486           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7487           if (STMT_VINFO_IN_PATTERN_P (elt))
7488             {
                    /* Splice the related stmt into the chain in place of
                       the original member, keeping its gap and successor.  */
7489               stmt_vec_info orig = elt;
7490               elt = STMT_VINFO_RELATED_STMT (elt);
7491               DR_GROUP_NEXT_ELEMENT (prev) = elt;
7492               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7493               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7494             }
                /* Every member, replaced or not, is pointed at the (possibly
                   replaced) leader.  */
7495           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7496           prev = elt;
7497         }
7498     }
7499 }
7500
7501 /* Check if the region described by BB_VINFO can be vectorized, returning
7502    true if so.  When returning false, set FATAL to true if the same failure
7503    would prevent vectorization at other vector sizes, false if it is still
7504    worth trying other sizes.  N_STMTS is the number of statements in the
7505    region.  DATAREF_GROUPS is forwarded unchanged to
        vect_analyze_data_ref_accesses.  */
7506
7507 static bool
7508 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7509                        vec<int> *dataref_groups)
7510 {
7511   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7512
7513   slp_instance instance;
7514   int i;
7515   poly_uint64 min_vf = 2;
7516
7517   /* The first group of checks is independent of the vector size.  */
7518   fatal = true;
7519
7520   /* Analyze the data references.  */
7521
7522   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7523     {
7524       if (dump_enabled_p ())
7525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526                          "not vectorized: unhandled data-ref in basic "
7527                          "block.\n");
7528       return false;
7529     }
7530
7531   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7532     {
7533      if (dump_enabled_p ())
7534        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7535                         "not vectorized: unhandled data access in "
7536                         "basic block.\n");
7537       return false;
7538     }
7539
        /* Run the root discovery before deciding whether there is anything
           to SLP at all; the emptiness test right below consults
           bb_vinfo->roots in addition to the grouped stores.  */
7540   vect_slp_check_for_roots (bb_vinfo);
7541
7542   /* If there are no grouped stores and no constructors in the region
7543      there is no need to continue with pattern recog as vect_analyze_slp
7544      will fail anyway.  */
7545   if (bb_vinfo->grouped_stores.is_empty ()
7546       && bb_vinfo->roots.is_empty ())
7547     {
7548       if (dump_enabled_p ())
7549         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7550                          "not vectorized: no grouped stores in "
7551                          "basic block.\n");
7552       return false;
7553     }
7554
7555   /* While the rest of the analysis below depends on it in some way.  */
7556   fatal = false;
7557
7558   vect_pattern_recog (bb_vinfo);
7559
7560   /* Update store groups from pattern processing.  */
7561   vect_fixup_store_groups_with_patterns (bb_vinfo);
7562
7563   /* Check the SLP opportunities in the basic block, analyze and build SLP
7564      trees.  */
7565   if (!vect_analyze_slp (bb_vinfo, n_stmts))
7566     {
7567       if (dump_enabled_p ())
7568         {
7569           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570                            "Failed to SLP the basic block.\n");
7571           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7572                            "not vectorized: failed to find SLP opportunities "
7573                            "in basic block.\n");
7574         }
7575       return false;
7576     }
7577
7578   /* Optimize permutations.  */
7579   vect_optimize_slp (bb_vinfo);
7580
7581   /* Gather the loads reachable from the SLP graph entries.  */
7582   vect_gather_slp_loads (bb_vinfo);
7583
7584   vect_record_base_alignments (bb_vinfo);
7585
7586   /* Analyze and verify the alignment of data references and the
7587      dependence in the SLP instances.  */
        /* The loop header has no increment on purpose: I is advanced only at
           the bottom of the body.  When an instance is removed with
           ordered_remove the next candidate already sits at index I.  */
7588   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7589     {
7590       vect_location = instance->location ();
7591       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7592           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7593         {
7594           slp_tree node = SLP_INSTANCE_TREE (instance);
7595           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7596           if (dump_enabled_p ())
7597             dump_printf_loc (MSG_NOTE, vect_location,
7598                              "removing SLP instance operations starting from: %G",
7599                              stmt_info->stmt);
7600           vect_free_slp_instance (instance);
7601           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7602           continue;
7603         }
7604
7605       /* Mark all the statements that we want to vectorize as pure SLP and
7606          relevant.  */
7607       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7608       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7609       unsigned j;
7610       stmt_vec_info root;
7611       /* Likewise consider instance root stmts as vectorized.  */
7612       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7613         STMT_SLP_TYPE (root) = pure_slp;
7614
7615       i++;
7616     }
        /* Every instance was dropped by the loop above.  FATAL is still false
           here, as it is for the remaining failure return below.  */
7617   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7618     return false;
7619
7620   if (!vect_slp_analyze_operations (bb_vinfo))
7621     {
7622       if (dump_enabled_p ())
7623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7624                          "not vectorized: bad operation in basic block.\n");
7625       return false;
7626     }
7627
7628   vect_bb_partition_graph (bb_vinfo);
7629
7630   return true;
7631 }
7632
7633 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
7634    basic blocks in BBS, returning true on success.
7635    The region has N_STMTS statements and has the datarefs given by DATAREFS.
        DATAREF_GROUPS is handed on to vect_slp_analyze_bb_1.  ORIG_LOOP, if
        non-null, is passed to the profitability check and additionally
        enables the scan for leftover COND_EXPR assignments in the (single)
        block of the region.

        The analysis is repeated with the vector modes supplied by the target
        hook; see the exit test at the bottom of the loop for when the
        iteration stops.  */
7636
7637 static bool
7638 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7639                  vec<int> *dataref_groups, unsigned int n_stmts,
7640                  loop_p orig_loop)
7641 {
7642   bb_vec_info bb_vinfo;
7643   auto_vector_modes vector_modes;
7644
7645   /* Autodetect first vector size we try.  */
7646   machine_mode next_vector_mode = VOIDmode;
7647   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7648   unsigned int mode_i = 0;
7649
7650   vec_info_shared shared;
7651
7652   machine_mode autodetected_vector_mode = VOIDmode;
7653   while (1)
7654     {
7655       bool vectorized = false;
7656       bool fatal = false;
            /* Each attempt works on a fresh bb_vec_info that refers to SHARED;
               it is deleted again further down, before the exit test.  */
7657       bb_vinfo = new _bb_vec_info (bbs, &shared);
7658
            /* The datarefs are saved in SHARED on the first attempt only and
               checked against that copy on every later one.  */
7659       bool first_time_p = shared.datarefs.is_empty ();
7660       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7661       if (first_time_p)
7662         bb_vinfo->shared->save_datarefs ();
7663       else
7664         bb_vinfo->shared->check_datarefs ();
7665       bb_vinfo->vector_mode = next_vector_mode;
7666
7667       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7668         {
7669           if (dump_enabled_p ())
7670             {
7671               dump_printf_loc (MSG_NOTE, vect_location,
7672                                "***** Analysis succeeded with vector mode"
7673                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7674               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7675             }
7676
7677           bb_vinfo->shared->check_datarefs ();
7678
                /* FORCE_CLEAR is set when dbg_cnt vetoes an instance, so that
                   the visited-flag cleanup below still runs even if no
                   subgraph ends up in PROFITABLE_SUBGRAPHS.  */
7679           bool force_clear = false;
7680           auto_vec<slp_instance> profitable_subgraphs;
7681           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7682             {
7683               if (instance->subgraph_entries.is_empty ())
7684                 continue;
7685
                    /* Costing dumps are attributed to the instance; the
                       previous location is restored on every path out.  */
7686               dump_user_location_t saved_vect_location = vect_location;
7687               vect_location = instance->location ();
7688               if (!unlimited_cost_model (NULL)
7689                   && !vect_bb_vectorization_profitable_p
7690                         (bb_vinfo, instance->subgraph_entries, orig_loop))
7691                 {
7692                   if (dump_enabled_p ())
7693                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694                                      "not vectorized: vectorization is not "
7695                                      "profitable.\n");
7696                   vect_location = saved_vect_location;
7697                   continue;
7698                 }
7699
7700               vect_location = saved_vect_location;
7701               if (!dbg_cnt (vect_slp))
7702                 {
7703                   force_clear = true;
7704                   continue;
7705                 }
7706
7707               profitable_subgraphs.safe_push (instance);
7708             }
7709
7710           /* When we're vectorizing an if-converted loop body make sure
7711              we vectorized all if-converted code.  */
7712           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7713             {
7714               gcc_assert (bb_vinfo->bbs.length () == 1);
7715               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7716                    !gsi_end_p (gsi); gsi_next (&gsi))
7717                 {
7718                   /* The costing above left us with DCEable vectorized scalar
7719                      stmts having the visited flag set on profitable
7720                      subgraphs.  Do the delayed clearing of the flag here.  */
7721                   if (gimple_visited_p (gsi_stmt (gsi)))
7722                     {
7723                       gimple_set_visited (gsi_stmt (gsi), false);
7724                       continue;
7725                     }
7726                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7727                     continue;
7728
                        /* A COND_EXPR assignment without the visited flag
                           discards all subgraphs collected so far.  The walk
                           continues so the remaining flags get cleared.  */
7729                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7730                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
7731                       {
7732                         if (!profitable_subgraphs.is_empty ()
7733                             && dump_enabled_p ())
7734                           dump_printf_loc (MSG_NOTE, vect_location,
7735                                            "not profitable because of "
7736                                            "unprofitable if-converted scalar "
7737                                            "code\n");
7738                         profitable_subgraphs.truncate (0);
7739                       }
7740                 }
7741             }
7742
7743           /* Finally schedule the profitable subgraphs.  */
7744           for (slp_instance instance : profitable_subgraphs)
7745             {
7746               if (!vectorized && dump_enabled_p ())
7747                 dump_printf_loc (MSG_NOTE, vect_location,
7748                                  "Basic block will be vectorized "
7749                                  "using SLP\n");
7750               vectorized = true;
7751
7752               /* Dump before scheduling as store vectorization will remove
7753                  the original stores and mess with the instance tree
7754                  so querying its location will eventually ICE.  */
7755               if (flag_checking)
7756                 for (slp_instance sub : instance->subgraph_entries)
7757                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7758               unsigned HOST_WIDE_INT bytes;
7759               if (dump_enabled_p ())
7760                 for (slp_instance sub : instance->subgraph_entries)
7761                   {
7762                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7763                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7764                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7765                                        sub->location (),
7766                                        "basic block part vectorized using %wu "
7767                                        "byte vectors\n", bytes);
7768                     else
7769                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7770                                        sub->location (),
7771                                        "basic block part vectorized using "
7772                                        "variable length vectors\n");
7773                   }
7774
7775               dump_user_location_t saved_vect_location = vect_location;
7776               vect_location = instance->location ();
7777
7778               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7779
7780               vect_location = saved_vect_location;
7781             }
7782         }
7783       else
7784         {
7785           if (dump_enabled_p ())
7786             dump_printf_loc (MSG_NOTE, vect_location,
7787                              "***** Analysis failed with vector mode %s\n",
7788                              GET_MODE_NAME (bb_vinfo->vector_mode));
7789         }
7790
            /* MODE_I is still zero only during the first attempt; remember
               the mode BB_VINFO ended up with for that attempt.  */
7791       if (mode_i == 0)
7792         autodetected_vector_mode = bb_vinfo->vector_mode;
7793
            /* Skip over candidate modes for which vect_chooses_same_modes_p
               reports the same choice as the attempt just made.  */
7794       if (!fatal)
7795         while (mode_i < vector_modes.length ()
7796                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7797           {
7798             if (dump_enabled_p ())
7799               dump_printf_loc (MSG_NOTE, vect_location,
7800                                "***** The result for vector mode %s would"
7801                                " be the same\n",
7802                                GET_MODE_NAME (vector_modes[mode_i]));
7803             mode_i += 1;
7804           }
7805
7806       delete bb_vinfo;
7807
            /* Also skip one candidate that maps to the autodetected mode and
               back via related_vector_mode in both directions.  */
7808       if (mode_i < vector_modes.length ()
7809           && VECTOR_MODE_P (autodetected_vector_mode)
7810           && (related_vector_mode (vector_modes[mode_i],
7811                                    GET_MODE_INNER (autodetected_vector_mode))
7812               == autodetected_vector_mode)
7813           && (related_vector_mode (autodetected_vector_mode,
7814                                    GET_MODE_INNER (vector_modes[mode_i]))
7815               == vector_modes[mode_i]))
7816         {
7817           if (dump_enabled_p ())
7818             dump_printf_loc (MSG_NOTE, vect_location,
7819                              "***** Skipping vector mode %s, which would"
7820                              " repeat the analysis for %s\n",
7821                              GET_MODE_NAME (vector_modes[mode_i]),
7822                              GET_MODE_NAME (autodetected_vector_mode));
7823           mode_i += 1;
7824         }
7825
7826       if (vectorized
7827           || mode_i == vector_modes.length ()
7828           || autodetected_vector_mode == VOIDmode
7829           /* If vect_slp_analyze_bb_1 signaled that analysis for all
7830              vector sizes will fail do not bother iterating.  */
7831           || fatal)
7832         return vectorized;
7833
7834       /* Try the next biggest vector size.  */
7835       next_vector_mode = vector_modes[mode_i++];
7836       if (dump_enabled_p ())
7837         dump_printf_loc (MSG_NOTE, vect_location,
7838                          "***** Re-trying analysis with vector mode %s\n",
7839                          GET_MODE_NAME (next_vector_mode));
7840     }
7841 }
7842
7843
7844 /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
7845    true if anything in the basic-block was vectorized.

        Walks the non-debug statements after the labels of every block in
        BBS, counts them and collects their data references together with a
        group number per reference, then hands everything to
        vect_slp_region.  ORIG_LOOP is passed through unchanged.  */
7846
7847 static bool
7848 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7849 {
7850   vec<data_reference_p> datarefs = vNULL;
7851   auto_vec<int> dataref_groups;
7852   int insns = 0;
7853   int current_group = 0;
7854
7855   for (unsigned i = 0; i < bbs.length (); i++)
7856     {
7857       basic_block bb = bbs[i];
7858       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7859            gsi_next (&gsi))
7860         {
7861           gimple *stmt = gsi_stmt (gsi);
7862           if (is_gimple_debug (stmt))
7863             continue;
7864
7865           insns++;
7866
                /* Keep vect_location at the most recent stmt that carries a
                   known source location.  */
7867           if (gimple_location (stmt) != UNKNOWN_LOCATION)
7868             vect_location = stmt;
7869
                /* A stmt for which vect_find_stmt_data_reference returns
                   false makes the following stmts start a new DR group.  */
7870           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7871                                               &dataref_groups, current_group))
7872             ++current_group;
7873         }
7874       /* New BBs always start a new DR group.  */
7875       ++current_group;
7876     }
7877
7878   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7879 }
7880
7881 /* Special entry for the BB vectorizer.  Analyze and transform a single
7882    if-converted BB with ORIG_LOOPs body being the not if-converted
7883    representation.  Returns true if anything in the basic-block was
7884    vectorized.  */
7885
7886 bool
7887 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7888 {
        /* Wrap BB into a one-element region and reuse the common driver,
           forwarding ORIG_LOOP.  */
7889   auto_vec<basic_block> bbs;
7890   bbs.safe_push (bb);
7891   return vect_slp_bbs (bbs, orig_loop);
7892 }
7893
7894 /* Main entry for the BB vectorizer.  Analyze and transform the basic
7895    blocks of FUN, returns true if anything was vectorized.

        The blocks are visited in the order computed by
        rev_post_order_and_mark_dfs_back_seme and are accumulated into
        regions; each region is handed to vect_slp_bbs with a NULL
        ORIG_LOOP.  */
7896
7897 bool
7898 vect_slp_function (function *fun)
7899 {
7900   bool r = false;
        /* RPO is allocated here and released with free at the end of the
           function.  */
7901   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7902   auto_bitmap exit_bbs;
7903   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7904   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7905   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7906                                                       true, rpo, NULL);
7907
7908   /* For the moment split the function into pieces to avoid making
7909      the iteration on the vector mode moot.  Split at points we know
7910      to not handle well which is CFG merges (SLP discovery doesn't
7911      handle non-loop-header PHIs) and loop exits.  Since pattern
7912      recog requires reverse iteration to visit uses before defs
7913      simply chop RPO into pieces.  */
7914   auto_vec<basic_block> bbs;
7915   for (unsigned i = 0; i < n; i++)
7916     {
7917       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7918       bool split = false;
7919
7920       /* Split when a BB is not dominated by the first block.  */
7921       if (!bbs.is_empty ()
7922           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7923         {
7924           if (dump_enabled_p ())
7925             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926                              "splitting region at dominance boundary bb%d\n",
7927                              bb->index);
7928           split = true;
7929         }
7930       /* Split when the loop determined by the first block
7931          is exited.  This is because we eventually insert
7932          invariants at region begin.  */
7933       else if (!bbs.is_empty ()
7934                && bbs[0]->loop_father != bb->loop_father
7935                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7936         {
7937           if (dump_enabled_p ())
7938             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7939                              "splitting region at loop %d exit at bb%d\n",
7940                              bbs[0]->loop_father->num, bb->index);
7941           split = true;
7942         }
            /* Split when BB is the header of a loop marked dont_vectorize.  */
7943       else if (!bbs.is_empty ()
7944                && bb->loop_father->header == bb
7945                && bb->loop_father->dont_vectorize)
7946         {
7947           if (dump_enabled_p ())
7948             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7949                              "splitting region at dont-vectorize loop %d "
7950                              "entry at bb%d\n",
7951                              bb->loop_father->num, bb->index);
7952           split = true;
7953         }
7954
            /* Process the region collected so far before BB is considered as
               the start of a new one.  */
7955       if (split && !bbs.is_empty ())
7956         {
7957           r |= vect_slp_bbs (bbs, NULL);
7958           bbs.truncate (0);
7959         }
7960
            /* The checks below apply only to a block that would become the
               first block of a region; a rejected block is not added to any
               region.  */
7961       if (bbs.is_empty ())
7962         {
7963           /* We need to be able to insert at the head of the region which
7964              we cannot for region starting with a returns-twice call.  */
7965           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7966             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7967               {
7968                 if (dump_enabled_p ())
7969                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7970                                    "skipping bb%d as start of region as it "
7971                                    "starts with returns-twice call\n",
7972                                    bb->index);
7973                 continue;
7974               }
7975           /* If the loop this BB belongs to is marked as not to be vectorized
7976              honor that also for BB vectorization.  */
7977           if (bb->loop_father->dont_vectorize)
7978             continue;
7979         }
7980
7981       bbs.safe_push (bb);
7982
7983       /* When we have a stmt ending this block and defining a
7984          value we have to insert on edges when inserting after it for
7985          a vector containing its definition.  Avoid this for now.  */
7986       if (gimple *last = *gsi_last_bb (bb))
7987         if (gimple_get_lhs (last)
7988             && is_ctrl_altering_stmt (last))
7989           {
7990             if (dump_enabled_p ())
7991               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992                                "splitting region at control altering "
7993                                "definition %G", last);
7994             r |= vect_slp_bbs (bbs, NULL);
7995             bbs.truncate (0);
7996           }
7997     }
7998
        /* Process whatever region is still pending after the last block.  */
7999   if (!bbs.is_empty ())
8000     r |= vect_slp_bbs (bbs, NULL);
8001
8002   free (rpo);
8003
8004   return r;
8005 }
8006
8007 /* Build a variable-length vector in which the elements in ELTS are repeated
8008    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
8009    RESULTS and add any new instructions to SEQ.
8010
8011    The approach we use is:
8012
8013    (1) Find a vector mode VM with integer elements of mode IM.
8014
8015    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8016        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
8017        from small vectors to IM.
8018
8019    (3) Duplicate each ELTS'[I] into a vector of mode VM.
8020
8021    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8022        correct byte contents.
8023
8024    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8025
8026    We try to find the largest IM for which this sequence works, in order
8027    to cut down on the number of interleaves.

        VINFO is passed on to can_duplicate_and_interleave_p, which has to
        succeed for the length of ELTS and the element type of VECTOR_TYPE;
        a failure ends in gcc_unreachable.  */
8028
8029 void
8030 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8031                           const vec<tree> &elts, unsigned int nresults,
8032                           vec<tree> &results)
8033 {
8034   unsigned int nelts = elts.length ();
8035   tree element_type = TREE_TYPE (vector_type);
8036
8037   /* (1) Find a vector mode VM with integer elements of mode IM.  */
8038   unsigned int nvectors = 1;
8039   tree new_vector_type;
8040   tree permutes[2];
8041   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8042                                        &nvectors, &new_vector_type,
8043                                        permutes))
8044     gcc_unreachable ();
8045
8046   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
8047   unsigned int partial_nelts = nelts / nvectors;
8048   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8049
        /* PIECES provides two banks of NVECTORS entries each.  The first bank
           is filled by steps (2) and (3); step (4) then reads one bank and
           writes the other, swapping their roles after every round.  */
8050   tree_vector_builder partial_elts;
8051   auto_vec<tree, 32> pieces (nvectors * 2);
8052   pieces.quick_grow_cleared (nvectors * 2);
8053   for (unsigned int i = 0; i < nvectors; ++i)
8054     {
8055       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8056              ELTS' has mode IM.  */
8057       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8058       for (unsigned int j = 0; j < partial_nelts; ++j)
8059         partial_elts.quick_push (elts[i * partial_nelts + j]);
8060       tree t = gimple_build_vector (seq, &partial_elts);
8061       t = gimple_build (seq, VIEW_CONVERT_EXPR,
8062                         TREE_TYPE (new_vector_type), t);
8063
8064       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
8065       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8066     }
8067
8068   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8069          correct byte contents.
8070
8071      Conceptually, we need to repeat the following operation log2(nvectors)
8072      times, where hi_start = nvectors / 2:
8073
8074         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8075         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8076
8077      However, if each input repeats every N elements and the VF is
8078      a multiple of N * 2, the HI result is the same as the LO result.
8079      This will be true for the first N1 iterations of the outer loop,
8080      followed by N2 iterations for which both the LO and HI results
8081      are needed.  I.e.:
8082
8083         N1 + N2 = log2(nvectors)
8084
8085      Each "N1 iteration" doubles the number of redundant vectors and the
8086      effect of the process as a whole is to have a sequence of nvectors/2**N1
8087      vectors that repeats 2**N1 times.  Rather than generate these redundant
8088      vectors, we halve the number of vectors for each N1 iteration.  */
8089   unsigned int in_start = 0;
8090   unsigned int out_start = nvectors;
8091   unsigned int new_nvectors = nvectors;
8092   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8093     {
8094       unsigned int hi_start = new_nvectors / 2;
8095       unsigned int out_i = 0;
8096       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8097         {
                /* Odd IN_I would produce the HI result; skip it when the
                   multiple_p test below holds (the redundant case described
                   in the comment above).  */
8098           if ((in_i & 1) != 0
8099               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8100                              2 * in_repeat))
8101             continue;
8102
8103           tree output = make_ssa_name (new_vector_type);
8104           tree input1 = pieces[in_start + (in_i / 2)];
8105           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8106           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8107                                                input1, input2,
8108                                                permutes[in_i & 1]);
8109           gimple_seq_add_stmt (seq, stmt);
8110           pieces[out_start + out_i] = output;
8111           out_i += 1;
8112         }
            /* The bank just written becomes the input of the next round and
               OUT_I is the number of vectors it holds.  */
8113       std::swap (in_start, out_start);
8114       new_nvectors = out_i;
8115     }
8116
8117   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
        /* Only NEW_NVECTORS distinct vectors exist; any further result slots
           repeat the earlier entries of RESULTS cyclically.  */
8118   results.reserve (nresults);
8119   for (unsigned int i = 0; i < nresults; ++i)
8120     if (i < new_nvectors)
8121       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8122                                         pieces[in_start + i]));
8123     else
8124       results.quick_push (results[i - new_nvectors]);
8125 }
8126
8127
8128 /* For constant and loop invariant defs in OP_NODE this function creates
8129    vector defs that will be used in the vectorized stmts and stores them
8130    to SLP_TREE_VEC_DEFS of OP_NODE.

        The vector type is SLP_TREE_VECTYPE (OP_NODE) and the number of vectors
        to create is SLP_TREE_NUMBER_OF_VEC_STMTS (OP_NODE).  Statements needed
        to build a vector are emitted after the latest in-region definition of
        its scalar operands when VINFO is a bb_vec_info and such a definition
        exists, and through VINFO->insert_seq_on_entry otherwise.  */
8131
8132 static void
8133 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8134 {
8135   unsigned HOST_WIDE_INT nunits;
8136   tree vec_cst;
8137   unsigned j, number_of_places_left_in_vector;
8138   tree vector_type;
8139   tree vop;
       /* Number of scalar operands recorded in OP_NODE.  */
8140   int group_size = op_node->ops.length ();
8141   unsigned int vec_num, i;
8142   unsigned number_of_copies = 1;
8143   bool constant_p;
       /* Statements needed to build the vector currently being assembled.  */
8144   gimple_seq ctor_seq = NULL;
       /* Results of duplicate_and_interleave, computed at most once.  */
8145   auto_vec<tree, 16> permute_results;
8146
8147   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
8148   vector_type = SLP_TREE_VECTYPE (op_node);
8149
       /* Create the result vector with room for all defs so that the
          quick_push calls at the end of the function have space.  VOPRNDS
          collects the vectors in the order in which they are built.  */
8150   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8151   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8152   auto_vec<tree> voprnds (number_of_vectors);
8153
8154   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8155      created vectors. It is greater than 1 if unrolling is performed.
8156
8157      For example, we have two scalar operands, s1 and s2 (e.g., group of
8158      strided accesses of size two), while NUNITS is four (i.e., four scalars
8159      of this type can be packed in a vector).  The output vector will contain
8160      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
8161      will be 2).
8162
8163      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8164      containing the operands.
8165
8166      For example, NUNITS is four as before, and the group size is 8
8167      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
8168      {s5, s6, s7, s8}.  */
8169
8170   /* When using duplicate_and_interleave, we just need one element for
8171      each scalar statement.  */
8172   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8173     nunits = group_size;
8174
8175   number_of_copies = nunits * number_of_vectors / group_size;
8176
8177   number_of_places_left_in_vector = nunits;
8178   constant_p = true;
       /* UNIFORM_ELT is nonnull while all operands seen so far for the vector
          being built are equal.  */
8179   tree uniform_elt = NULL_TREE;
8180   tree_vector_builder elts (vector_type, nunits, 1);
8181   elts.quick_grow (nunits);
       /* Latest in-region definition among the operands of the vector being
          built, or NULL if there is none.  */
8182   stmt_vec_info insert_after = NULL;
8183   for (j = 0; j < number_of_copies; j++)
8184     {
8185       tree op;
           /* Walk the scalar operands from last to first.  I is unsigned, so
              decrementing it past zero wraps around; the loop relies on
              iterate () returning false for that out-of-range index.  */
8186       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8187         {
8188           /* Create 'vect_ = {op0,op1,...,opn}'.  */
8189           tree orig_op = op;
               /* The first operand of a vector starts a new uniform run.  A
                  repeated operand reuses the element stored in the previously
                  filled slot, which has already been converted to the vector
                  element type if that was necessary.  */
8190           if (number_of_places_left_in_vector == nunits)
8191             uniform_elt = op;
8192           else if (uniform_elt && operand_equal_p (uniform_elt, op))
8193             op = elts[number_of_places_left_in_vector];
8194           else
8195             uniform_elt = NULL_TREE;
8196           number_of_places_left_in_vector--;
               /* Convert OP to the vector element type if its type is not
                  compatible with it.  */
8197           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8198             {
8199               if (CONSTANT_CLASS_P (op))
8200                 {
8201                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8202                     {
8203                       /* Can't use VIEW_CONVERT_EXPR for booleans because
8204                          of possibly different sizes of scalar value and
8205                          vector element.  */
8206                       if (integer_zerop (op))
8207                         op = build_int_cst (TREE_TYPE (vector_type), 0);
8208                       else if (integer_onep (op))
8209                         op = build_all_ones_cst (TREE_TYPE (vector_type));
8210                       else
8211                         gcc_unreachable ();
8212                     }
8213                   else
8214                     op = fold_unary (VIEW_CONVERT_EXPR,
8215                                      TREE_TYPE (vector_type), op);
8216                   gcc_assert (op && CONSTANT_CLASS_P (op));
8217                 }
8218               else
8219                 {
                       /* Non-constant operand: compute the converted value
                          into a new SSA name with a statement appended to
                          CTOR_SEQ.  Boolean vector elements are selected
                          between all-ones and zero by a COND_EXPR on OP.  */
8220                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8221                   gimple *init_stmt;
8222                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8223                     {
8224                       tree true_val
8225                         = build_all_ones_cst (TREE_TYPE (vector_type));
8226                       tree false_val
8227                         = build_zero_cst (TREE_TYPE (vector_type));
8228                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8229                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8230                                                        op, true_val,
8231                                                        false_val);
8232                     }
8233                   else
8234                     {
8235                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8236                                    op);
8237                       init_stmt
8238                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8239                                                op);
8240                     }
8241                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
8242                   op = new_temp;
8243                 }
8244             }
               /* Slots are filled from the last one down to slot zero.  */
8245           elts[number_of_places_left_in_vector] = op;
8246           if (!CONSTANT_CLASS_P (op))
8247             constant_p = false;
8248           /* For BB vectorization we have to compute an insert location
8249              when a def is inside the analyzed region since we cannot
8250              simply insert at the BB start in this case.  */
8251           stmt_vec_info opdef;
8252           if (TREE_CODE (orig_op) == SSA_NAME
8253               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8254               && is_a <bb_vec_info> (vinfo)
8255               && (opdef = vinfo->lookup_def (orig_op)))
8256             {
8257               if (!insert_after)
8258                 insert_after = opdef;
8259               else
8260                 insert_after = get_later_stmt (insert_after, opdef);
8261             }
8262
               /* All slots of the current vector are filled: build it, emit
                  the statements it needs and reset the per-vector state.  */
8263           if (number_of_places_left_in_vector == 0)
8264             {
8265               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
                   /* Build the vector as a splat of a uniform element, or
                      directly from ELTS when NUNITS matches the number of
                      elements of the type (for all-constant vectors it is
                      enough that the type's element count is a multiple of
                      NUNITS), or else via duplicate_and_interleave.  The
                      latter is run once; its result for this vector is
                      picked by counting J down from the last entry.  */
8266               if (uniform_elt)
8267                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8268                                                         elts[0]);
8269               else if (constant_p
8270                        ? multiple_p (type_nunits, nunits)
8271                        : known_eq (type_nunits, nunits))
8272                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8273               else
8274                 {
8275                   if (permute_results.is_empty ())
8276                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8277                                               elts, number_of_vectors,
8278                                               permute_results);
8279                   vec_cst = permute_results[number_of_vectors - j - 1];
8280                 }
8281               if (!gimple_seq_empty_p (ctor_seq))
8282                 {
8283                   if (insert_after)
8284                     {
                           /* Insert after the latest operand definition: at
                              the start of the block (after labels) when that
                              definition is a PHI, directly after the
                              statement when it does not end its block.  */
8285                       gimple_stmt_iterator gsi;
8286                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8287                         {
8288                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8289                           gsi_insert_seq_before (&gsi, ctor_seq,
8290                                                  GSI_CONTINUE_LINKING);
8291                         }
8292                       else if (!stmt_ends_bb_p (insert_after->stmt))
8293                         {
8294                           gsi = gsi_for_stmt (insert_after->stmt);
8295                           gsi_insert_seq_after (&gsi, ctor_seq,
8296                                                 GSI_CONTINUE_LINKING);
8297                         }
8298                       else
8299                         {
8300                           /* When we want to insert after a def where the
8301                              defining stmt throws then insert on the fallthru
8302                              edge.  */
8303                           edge e = find_fallthru_edge
8304                                      (gimple_bb (insert_after->stmt)->succs);
8305                           basic_block new_bb
8306                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
                               /* The insertion is not expected to have
                                  created a new basic block.  */
8307                           gcc_assert (!new_bb);
8308                         }
8309                     }
8310                   else
8311                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
8312                   ctor_seq = NULL;
8313                 }
                   /* Record the vector and start over with an empty one.  */
8314               voprnds.quick_push (vec_cst);
8315               insert_after = NULL;
8316               number_of_places_left_in_vector = nunits;
8317               constant_p = true;
8318               elts.new_vector (vector_type, nunits, 1);
8319               elts.quick_grow (nunits);
8320             }
8321         }
8322     }
8323
8324   /* Since the vectors are created in the reverse order, we should invert
8325      them.  */
8326   vec_num = voprnds.length ();
8327   for (j = vec_num; j != 0; j--)
8328     {
8329       vop = voprnds[j - 1];
8330       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8331     }
8332
8333   /* In case that VF is greater than the unrolling factor needed for the SLP
8334      group of stmts, NUMBER_OF_VECTORS to be created is greater than
8335      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8336      to replicate the vectors.  Each pass of the inner loop re-pushes the
          first VEC_NUM defs.  */
8337   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8338     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8339          i++)
8340       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8341 }
8342
8343 /* Return the vectorized definition stored at index I of
        SLP_TREE_VEC_DEFS (SLP_NODE).  */
8344
8345 tree
8346 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8347 {
8348   tree ith_def = SLP_TREE_VEC_DEFS (slp_node)[i];
       return ith_def;
8349 }
8350
8351 /* Copy the vectorized definitions of SLP_NODE into *VEC_DEFS.  *VEC_DEFS
        is created here with room for SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_NODE)
        elements before the defs are spliced in.  */
8352
8353 void
8354 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8355 {
8356   const unsigned capacity = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       vec_defs->create (capacity);
8357   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8358 }
8359
8360 /* Push onto *VEC_OPRNDS one vector of vectorized definitions for each of
        the first N children of SLP_NODE.  N == -1U selects all children.
        The vectors are added with quick_push, so the caller is expected to
        have reserved room in *VEC_OPRNDS.  */
8361
8362 void
8363 vect_get_slp_defs (vec_info *,
8364                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8365 {
8366   vec<slp_tree> &kids = SLP_TREE_CHILDREN (slp_node);
8367   const unsigned limit = (n == -1U) ? kids.length () : n;
8368
8369   for (unsigned idx = 0; idx != limit; ++idx)
8370     {
8371       /* Every child gets its own freshly created vector of defs.  */
8372       vec<tree> child_defs = vNULL;
8373       vect_get_slp_defs (kids[idx], &child_defs);
8374       vec_oprnds->quick_push (child_defs);
8375     }
8376 }
8377
8378 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8379    - PERM gives the permutation that the caller wants to use for NODE,
8380      which might be different from SLP_LOAD_PERMUTATION.
8381    - DUMP_P controls whether the function dumps information.

        Return false if the permutation cannot be handled: when masks have to
        be built per vector statement and the number of vector elements or VF
        is not constant, when one mask would need elements from more than two
        input vectors, or when can_vec_perm_const_p rejects a mask.  The
        latter two failures are asserted to happen only when ANALYZE_ONLY.
        Return true otherwise.

        *N_PERMS is always written.  *N_LOADS is written only if N_LOADS is
        nonnull.  When ANALYZE_ONLY is false the results are stored in
        SLP_TREE_VEC_DEFS (NODE).  */
8382
8383 static bool
8384 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8385                                 load_permutation_t &perm,
8386                                 const vec<tree> &dr_chain,
8387                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
8388                                 bool analyze_only, bool dump_p,
8389                                 unsigned *n_perms, unsigned int *n_loads,
8390                                 bool dce_chain)
8391 {
8392   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8393   int vec_index = 0;
8394   tree vectype = SLP_TREE_VECTYPE (node);
8395   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8396   unsigned int mask_element;
8397   unsigned dr_group_size;
8398   machine_mode mode;
8399
       /* A non-grouped access is handled as a group of size one.  Otherwise
          switch STMT_INFO to the first element of the group and use the
          size of that group.  */
8400   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8401     dr_group_size = 1;
8402   else
8403     {
8404       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8405       dr_group_size = DR_GROUP_SIZE (stmt_info);
8406     }
8407
8408   mode = TYPE_MODE (vectype);
8409   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8410   unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8411
8412   /* Initialize the vect stmts of NODE to properly insert the generated
8413      stmts later.  */
8414   if (! analyze_only)
8415     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8416       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8417
8418   /* Generate permutation masks for every NODE. Number of masks for each NODE
8419      is equal to GROUP_SIZE.
8420      E.g., we have a group of three nodes with three loads from the same
8421      location in each node, and the vector size is 4. I.e., we have a
8422      a0b0c0a1b1c1... sequence and we need to create the following vectors:
8423      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8424      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8425      ...
8426
8427      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8428      The last mask is illegal since we assume two operands for permute
8429      operation, and the mask element values can't be outside that range.
8430      Hence, the last mask must be converted into {2,5,5,5}.
8431      For the first two permutations we need the first and the second input
8432      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8433      we need the second and the third vectors: {b1,c1,a2,b2} and
8434      {c2,a3,b3,c3}.  */
8435
       /* VECT_STMTS_COUNTER is the next slot of SLP_TREE_VEC_DEFS (NODE) to
          fill, INDEX the next element of the mask being built.  The two
          vector indices are -1 while no input vector has been chosen for the
          current mask.  NOOP_P is true while every element of the current
          mask equals its own position.  */
8436   int vect_stmts_counter = 0;
8437   unsigned int index = 0;
8438   int first_vec_index = -1;
8439   int second_vec_index = -1;
8440   bool noop_p = true;
8441   *n_perms = 0;
8442
8443   vec_perm_builder mask;
8444   unsigned int nelts_to_build;
8445   unsigned int nvectors_per_build;
8446   unsigned int in_nlanes;
8447   bool repeating_p = (group_size == dr_group_size
8448                       && multiple_p (nunits, group_size));
8449   if (repeating_p)
8450     {
8451       /* A single vector contains a whole number of copies of the node, so:
8452          (a) all permutes can use the same mask; and
8453          (b) the permutes only need a single vector input.  */
8454       mask.new_vector (nunits, group_size, 3);
8455       nelts_to_build = mask.encoded_nelts ();
8456       /* It's possible to obtain zero nstmts during analyze_only, so make
8457          it at least one to ensure the later computation for n_perms
8458          proceed.  */
8459       nvectors_per_build = nstmts > 0 ? nstmts : 1;
8460       in_nlanes = dr_group_size * 3;
8461     }
8462   else
8463     {
8464       /* We need to construct a separate mask for each vector statement.  */
8465       unsigned HOST_WIDE_INT const_nunits, const_vf;
8466       if (!nunits.is_constant (&const_nunits)
8467           || !vf.is_constant (&const_vf))
8468         return false;
8469       mask.new_vector (const_nunits, const_nunits, 1);
8470       nelts_to_build = const_vf * group_size;
8471       nvectors_per_build = 1;
8472       in_nlanes = const_vf * dr_group_size;
8473     }
       /* USED_IN_LANES records the input lanes that the permutation reads;
          it is used to compute *N_LOADS.  USED_DEFS records the DR_CHAIN
          entries used by generated statements, for DCE_CHAIN.  */
8474   auto_sbitmap used_in_lanes (in_nlanes);
8475   bitmap_clear (used_in_lanes);
8476   auto_bitmap used_defs;
8477
       /* COUNT is the number of elements to fill in before a mask is
          complete.  */
8478   unsigned int count = mask.encoded_nelts ();
8479   mask.quick_grow (count);
8480   vec_perm_indices indices;
8481
8482   for (unsigned int j = 0; j < nelts_to_build; j++)
8483     {
           /* Output element J belongs to scalar statement STMT_NUM in
              iteration ITER_NUM and reads lane I of the input.  */
8484       unsigned int iter_num = j / group_size;
8485       unsigned int stmt_num = j % group_size;
8486       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8487       bitmap_set_bit (used_in_lanes, i);
8488       if (repeating_p)
8489         {
8490           first_vec_index = 0;
8491           mask_element = i;
8492         }
8493       else
8494         {
8495           /* Enforced before the loop when !repeating_p.  */
8496           unsigned int const_nunits = nunits.to_constant ();
8497           vec_index = i / const_nunits;
8498           mask_element = i % const_nunits;
               /* A mask may select from at most two distinct input vectors.
                  Elements taken from the second one are offset by
                  CONST_NUNITS.  */
8499           if (vec_index == first_vec_index
8500               || first_vec_index == -1)
8501             {
8502               first_vec_index = vec_index;
8503             }
8504           else if (vec_index == second_vec_index
8505                    || second_vec_index == -1)
8506             {
8507               second_vec_index = vec_index;
8508               mask_element += const_nunits;
8509             }
8510           else
8511             {
8512               if (dump_p)
8513                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8514                                  "permutation requires at "
8515                                  "least three vectors %G",
8516                                  stmt_info->stmt);
8517               gcc_assert (analyze_only);
8518               return false;
8519             }
8520
8521           gcc_assert (mask_element < 2 * const_nunits);
8522         }
8523
8524       if (mask_element != index)
8525         noop_p = false;
8526       mask[index++] = mask_element;
8527
           /* The mask is complete: check it and, unless only analyzing,
              produce the vector defs that use it.  */
8528       if (index == count)
8529         {
8530           if (!noop_p)
8531             {
8532               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8533               if (!can_vec_perm_const_p (mode, mode, indices))
8534                 {
8535                   if (dump_p)
8536                     {
8537                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8538                                        "unsupported vect permute { ");
                           /* This reuses and overwrites the lane index I of
                              the enclosing loop body, which is not needed
                              any more since the function returns below.  */
8539                       for (i = 0; i < count; ++i)
8540                         {
8541                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8542                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8543                         }
8544                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8545                     }
8546                   gcc_assert (analyze_only);
8547                   return false;
8548                 }
8549
8550               tree mask_vec = NULL_TREE;
8551               if (!analyze_only)
8552                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8553
                   /* A permute of a single input uses that input for both
                      operands.  */
8554               if (second_vec_index == -1)
8555                 second_vec_index = first_vec_index;
8556
                   /* The permute is counted even when ANALYZE_ONLY; code is
                      only generated otherwise.  */
8557               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8558                 {
8559                   ++*n_perms;
8560                   if (analyze_only)
8561                     continue;
8562                   /* Generate the permute statement if necessary.  */
8563                   tree first_vec = dr_chain[first_vec_index + ri];
8564                   tree second_vec = dr_chain[second_vec_index + ri];
8565                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8566                   tree perm_dest
8567                     = vect_create_destination_var (gimple_assign_lhs (stmt),
8568                                                    vectype);
8569                   perm_dest = make_ssa_name (perm_dest);
8570                   gimple *perm_stmt
8571                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8572                                            second_vec, mask_vec);
8573                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8574                                                gsi);
8575                   if (dce_chain)
8576                     {
8577                       bitmap_set_bit (used_defs, first_vec_index + ri);
8578                       bitmap_set_bit (used_defs, second_vec_index + ri);
8579                     }
8580
8581                   /* Store the vector statement in NODE.  */
8582                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8583                 }
8584             }
8585           else if (!analyze_only)
8586             {
                   /* The mask is the identity: no permute is emitted or
                      counted and the input vector is used directly.  */
8587               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8588                 {
8589                   tree first_vec = dr_chain[first_vec_index + ri];
8590                   /* If mask was NULL_TREE generate the requested
8591                      identity transform.  */
8592                   if (dce_chain)
8593                     bitmap_set_bit (used_defs, first_vec_index + ri);
8594
8595                   /* Store the vector statement in NODE.  */
8596                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8597                 }
8598             }
8599
               /* Start over for the next mask.  */
8600           index = 0;
8601           first_vec_index = -1;
8602           second_vec_index = -1;
8603           noop_p = true;
8604         }
8605     }
8606
8607   if (n_loads)
8608     {
8609       if (repeating_p)
8610         *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8611       else
8612         {
8613           /* Enforced above when !repeating_p.  */
8614           unsigned int const_nunits = nunits.to_constant ();
               /* Count the groups of CONST_NUNITS consecutive input lanes
                  that contain at least one used lane.  */
8615           *n_loads = 0;
8616           bool load_seen = false;
8617           for (unsigned i = 0; i < in_nlanes; ++i)
8618             {
8619               if (i % const_nunits == 0)
8620                 {
8621                   if (load_seen)
8622                     *n_loads += 1;
8623                   load_seen = false;
8624                 }
8625               if (bitmap_bit_p (used_in_lanes, i))
8626                 load_seen = true;
8627             }
8628           if (load_seen)
8629             *n_loads += 1;
8630         }
8631     }
8632
       /* Remove the definition of every DR_CHAIN entry that was not marked
          as used.  When the removed statement is a VIEW_CONVERT_EXPR or
          CONSTRUCTOR assignment, continue with its single SSA use operand.
          NOTE(review): USED_DEFS is only filled in when statements are
          generated, so presumably DCE_CHAIN is only passed together with
          !ANALYZE_ONLY -- confirm against the callers.  */
8633   if (dce_chain)
8634     for (unsigned i = 0; i < dr_chain.length (); ++i)
8635       if (!bitmap_bit_p (used_defs, i))
8636         {
8637           tree def = dr_chain[i];
8638           do
8639             {
8640               gimple *stmt = SSA_NAME_DEF_STMT (def);
8641               if (is_gimple_assign (stmt)
8642                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8643                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8644                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8645               else
8646                 def = NULL;
8647               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8648               gsi_remove (&rgsi, true);
8649               release_defs (stmt);
8650             }
8651           while (def);
8652         }
8653
8654   return true;
8655 }
8656
8657 /* Build the vector permute statements for the loads in DR_CHAIN that
8658    implement SLP_TREE_LOAD_PERMUTATION (NODE).  With ANALYZE_ONLY set,
8659    only check that valid permute statements can be created for NODE.
8660    The number of vector permute instructions is stored in *N_PERMS and
8661    the number of vector load instructions in *N_LOADS.  If DCE_CHAIN is
8662    true, all definitions that were not needed are removed.

        This forwards to vect_transform_slp_perm_load_1 and dumps whenever
        dumping is enabled.  */
8663
8664 bool
8665 vect_transform_slp_perm_load (vec_info *vinfo,
8666                               slp_tree node, const vec<tree> &dr_chain,
8667                               gimple_stmt_iterator *gsi, poly_uint64 vf,
8668                               bool analyze_only, unsigned *n_perms,
8669                               unsigned int *n_loads, bool dce_chain)
8670 {
8671   load_permutation_t &node_perm = SLP_TREE_LOAD_PERMUTATION (node);
8672   const bool dump_p = dump_enabled_p ();
8673   bool supported
8674     = vect_transform_slp_perm_load_1 (vinfo, node, node_perm, dr_chain, gsi,
8675                                       vf, analyze_only, dump_p, n_perms,
                                           n_loads, dce_chain);
       return supported;
8676 }
8677
8678 /* Produce the next vector result for SLP permutation NODE by adding a vector
8679    statement at GSI.  If MASK_VEC is nonnull, add:
8680
8681       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8682
8683    otherwise add:
8684
8685       <new SSA name> = FIRST_DEF.

        When MASK_VEC is null and the type of FIRST_DEF is not compatible with
        the vector type of NODE, the result is instead either a BIT_FIELD_REF
        that extracts a vector of NODE's type from FIRST_DEF, starting
        IDENTITY_OFFSET elements in (if NODE's vector type has no more
        elements than the type of FIRST_DEF), or a CONSTRUCTOR of FIRST_DEF
        and SECOND_DEF (if it has exactly twice as many).

        An input of the same size as NODE's vector type but of an incompatible
        type is first converted with a VIEW_CONVERT_EXPR.  The new statement
        is recorded with NODE->push_vec_def.  */
8686
8687 static void
8688 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8689                           slp_tree node, tree first_def, tree second_def,
8690                           tree mask_vec, poly_uint64 identity_offset)
8691 {
8692   tree vectype = SLP_TREE_VECTYPE (node);
8693
8694   /* ???  We SLP match existing vector element extracts but
8695      allow punning which we need to re-instantiate at uses
8696      but have no good way of explicitly representing.  */
8697   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8698       && !types_compatible_p (TREE_TYPE (first_def), vectype))
8699     {
8700       gassign *conv_stmt
8701         = gimple_build_assign (make_ssa_name (vectype),
8702                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8703       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8704       first_def = gimple_assign_lhs (conv_stmt);
8705     }
8706   gassign *perm_stmt;
8707   tree perm_dest = make_ssa_name (vectype);
8708   if (mask_vec)
8709     {
           /* NOTE(review): the size test below looks at FIRST_DEF (possibly
              already converted above), not at SECOND_DEF -- confirm that
              this is intended.  */
8710       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8711                            TYPE_SIZE (vectype))
8712           && !types_compatible_p (TREE_TYPE (second_def), vectype))
8713         {
8714           gassign *conv_stmt
8715             = gimple_build_assign (make_ssa_name (vectype),
8716                                    build1 (VIEW_CONVERT_EXPR,
8717                                            vectype, second_def));
8718           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8719           second_def = gimple_assign_lhs (conv_stmt);
8720         }
8721       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8722                                        first_def, second_def,
8723                                        mask_vec);
8724     }
8725   else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8726     {
8727       /* For identity permutes we still need to handle the case
8728          of offsetted extracts or concats.  */
8729       unsigned HOST_WIDE_INT c;
8730       auto first_def_nunits
8731         = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8732       if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8733         {
               /* Extract TYPE_SIZE (VECTYPE) bits of FIRST_DEF starting at
                  bit position IDENTITY_OFFSET times the element size.  */
8734           unsigned HOST_WIDE_INT elsz
8735             = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8736           tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8737                                  TYPE_SIZE (vectype),
8738                                  bitsize_int (identity_offset * elsz));
8739           perm_stmt = gimple_build_assign (perm_dest, lowpart);
8740         }
8741       else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8742                                     first_def_nunits, &c) && c == 2)
8743         {
               /* The result has twice as many elements as FIRST_DEF: build
                  it as the constructor { FIRST_DEF, SECOND_DEF }.  */
8744           tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8745                                             NULL_TREE, second_def);
8746           perm_stmt = gimple_build_assign (perm_dest, ctor);
8747         }
8748       else
8749         gcc_unreachable ();
8750     }
8751   else
8752     {
8753       /* We need a copy here in case the def was external.  */
8754       perm_stmt = gimple_build_assign (perm_dest, first_def);
8755     }
8756   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8757   /* Store the vector statement in NODE.  */
8758   node->push_vec_def (perm_stmt);
8759 }
8760
8761 /* Subroutine of vectorizable_slp_permutation.  Check whether the target
8762    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8763    If GSI is nonnull, emit the permutation there.
8764
8765    When GSI is null, the only purpose of NODE is to give properties
8766    of the result, such as the vector type and number of SLP lanes.
8767    The node does not need to be a VEC_PERM_EXPR.
8768
8769    If the target supports the operation, return the number of individual
8770    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
8771    dump file if DUMP_P is true.  */
8772
8773 static int
8774 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8775                                 slp_tree node, lane_permutation_t &perm,
8776                                 vec<slp_tree> &children, bool dump_p)
8777 {
8778   tree vectype = SLP_TREE_VECTYPE (node);
8779
8780   /* ???  We currently only support all same vector input types
8781      while the SLP IL should really do a concat + select and thus accept
8782      arbitrary mismatches.  */
8783   slp_tree child;
8784   unsigned i;
8785   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8786   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8787   tree op_vectype = NULL_TREE;
8788   FOR_EACH_VEC_ELT (children, i, child)
8789     if (SLP_TREE_VECTYPE (child))
8790       {
8791         op_vectype = SLP_TREE_VECTYPE (child);
8792         break;
8793       }
8794   if (!op_vectype)
8795     op_vectype = vectype;
8796   FOR_EACH_VEC_ELT (children, i, child)
8797     {
8798       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8799            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8800           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8801           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8802         {
8803           if (dump_p)
8804             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8805                              "Unsupported vector types in lane permutation\n");
8806           return -1;
8807         }
8808       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8809         repeating_p = false;
8810     }
8811
8812   gcc_assert (perm.length () == SLP_TREE_LANES (node));
8813   if (dump_p)
8814     {
8815       dump_printf_loc (MSG_NOTE, vect_location,
8816                        "vectorizing permutation");
8817       for (unsigned i = 0; i < perm.length (); ++i)
8818         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8819       if (repeating_p)
8820         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8821       dump_printf (MSG_NOTE, "\n");
8822     }
8823
8824   /* REPEATING_P is true if every output vector is guaranteed to use the
8825      same permute vector.  We can handle that case for both variable-length
8826      and constant-length vectors, but we only handle other cases for
8827      constant-length vectors.
8828
8829      Set:
8830
8831      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8832        mask vector that we want to build.
8833
8834      - NCOPIES to the number of copies of PERM that we need in order
8835        to build the necessary permute mask vectors.
8836
8837      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8838        for each permute mask vector.  This is only relevant when GSI is
8839        nonnull.  */
8840   uint64_t npatterns;
8841   unsigned nelts_per_pattern;
8842   uint64_t ncopies;
8843   unsigned noutputs_per_mask;
8844   if (repeating_p)
8845     {
8846       /* We need a single permute mask vector that has the form:
8847
8848            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8849
8850          In other words, the original n-element permute in PERM is
8851          "unrolled" to fill a full vector.  The stepped vector encoding
8852          that we use for permutes requires 3n elements.  */
8853       npatterns = SLP_TREE_LANES (node);
8854       nelts_per_pattern = ncopies = 3;
8855       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8856     }
8857   else
8858     {
8859       /* Calculate every element of every permute mask vector explicitly,
8860          instead of relying on the pattern described above.  */
8861       if (!nunits.is_constant (&npatterns))
8862         return -1;
8863       nelts_per_pattern = ncopies = 1;
8864       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8865         if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8866           return -1;
8867       noutputs_per_mask = 1;
8868     }
8869   unsigned olanes = ncopies * SLP_TREE_LANES (node);
8870   gcc_assert (repeating_p || multiple_p (olanes, nunits));
8871
8872   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8873      from the { SLP operand, scalar lane } permutation as recorded in the
8874      SLP node as intermediate step.  This part should already work
8875      with SLP children with arbitrary number of lanes.  */
8876   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8877   auto_vec<unsigned> active_lane;
8878   vperm.create (olanes);
8879   active_lane.safe_grow_cleared (children.length (), true);
8880   for (unsigned i = 0; i < ncopies; ++i)
8881     {
8882       for (unsigned pi = 0; pi < perm.length (); ++pi)
8883         {
8884           std::pair<unsigned, unsigned> p = perm[pi];
8885           tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8886           if (repeating_p)
8887             vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8888           else
8889             {
8890               /* We checked above that the vectors are constant-length.  */
8891               unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8892               unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8893               unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8894               vperm.quick_push ({{p.first, vi}, vl});
8895             }
8896         }
8897       /* Advance to the next group.  */
8898       for (unsigned j = 0; j < children.length (); ++j)
8899         active_lane[j] += SLP_TREE_LANES (children[j]);
8900     }
8901
8902   if (dump_p)
8903     {
8904       dump_printf_loc (MSG_NOTE, vect_location,
8905                        "vectorizing permutation");
8906       for (unsigned i = 0; i < perm.length (); ++i)
8907         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8908       if (repeating_p)
8909         dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8910       dump_printf (MSG_NOTE, "\n");
8911       dump_printf_loc (MSG_NOTE, vect_location, "as");
8912       for (unsigned i = 0; i < vperm.length (); ++i)
8913         {
8914           if (i != 0
8915               && (repeating_p
8916                   ? multiple_p (i, npatterns)
8917                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8918             dump_printf (MSG_NOTE, ",");
8919           dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8920                        vperm[i].first.first, vperm[i].first.second,
8921                        vperm[i].second);
8922         }
8923       dump_printf (MSG_NOTE, "\n");
8924     }
8925
8926   /* We can only handle two-vector permutes, everything else should
8927      be lowered on the SLP level.  The following is closely inspired
8928      by vect_transform_slp_perm_load and is supposed to eventually
8929      replace it.
8930      ???   As intermediate step do code-gen in the SLP tree representation
8931      somehow?  */
8932   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8933   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8934   unsigned int index = 0;
8935   poly_uint64 mask_element;
8936   vec_perm_builder mask;
8937   mask.new_vector (nunits, npatterns, nelts_per_pattern);
8938   unsigned int count = mask.encoded_nelts ();
8939   mask.quick_grow (count);
8940   vec_perm_indices indices;
8941   unsigned nperms = 0;
8942   for (unsigned i = 0; i < vperm.length (); ++i)
8943     {
8944       mask_element = vperm[i].second;
8945       if (first_vec.first == -1U
8946           || first_vec == vperm[i].first)
8947         first_vec = vperm[i].first;
8948       else if (second_vec.first == -1U
8949                || second_vec == vperm[i].first)
8950         {
8951           second_vec = vperm[i].first;
8952           mask_element += nunits;
8953         }
8954       else
8955         {
8956           if (dump_p)
8957             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8958                              "permutation requires at "
8959                              "least three vectors\n");
8960           gcc_assert (!gsi);
8961           return -1;
8962         }
8963
8964       mask[index++] = mask_element;
8965
8966       if (index == count)
8967         {
8968           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8969                               TYPE_VECTOR_SUBPARTS (op_vectype));
8970           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8971                              && constant_multiple_p (mask[0], nunits));
8972           machine_mode vmode = TYPE_MODE (vectype);
8973           machine_mode op_vmode = TYPE_MODE (op_vectype);
8974           unsigned HOST_WIDE_INT c;
8975           if ((!identity_p
8976                && !can_vec_perm_const_p (vmode, op_vmode, indices))
8977               || (identity_p
8978                   && !known_le (nunits,
8979                                 TYPE_VECTOR_SUBPARTS (op_vectype))
8980                   && (!constant_multiple_p (nunits,
8981                                             TYPE_VECTOR_SUBPARTS (op_vectype),
8982                                             &c) || c != 2)))
8983             {
8984               if (dump_p)
8985                 {
8986                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8987                                    vect_location,
8988                                    "unsupported vect permute { ");
8989                   for (i = 0; i < count; ++i)
8990                     {
8991                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8992                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8993                     }
8994                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8995                 }
8996               gcc_assert (!gsi);
8997               return -1;
8998             }
8999
9000           if (!identity_p)
9001             nperms++;
9002           if (gsi)
9003             {
9004               if (second_vec.first == -1U)
9005                 second_vec = first_vec;
9006
9007               slp_tree
9008                 first_node = children[first_vec.first],
9009                 second_node = children[second_vec.first];
9010
9011               tree mask_vec = NULL_TREE;
9012               if (!identity_p)
9013                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9014
9015               for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9016                 {
9017                   tree first_def
9018                     = vect_get_slp_vect_def (first_node,
9019                                              first_vec.second + vi);
9020                   tree second_def
9021                     = vect_get_slp_vect_def (second_node,
9022                                              second_vec.second + vi);
9023                   vect_add_slp_permutation (vinfo, gsi, node, first_def,
9024                                             second_def, mask_vec, mask[0]);
9025                 }
9026             }
9027
9028           index = 0;
9029           first_vec = std::make_pair (-1U, -1U);
9030           second_vec = std::make_pair (-1U, -1U);
9031         }
9032     }
9033
9034   return nperms;
9035 }
9036
9037 /* Vectorize the SLP permutations in NODE as specified
9038    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9039    child number and lane number.
9040    Interleaving of two two-lane two-child SLP subtrees (not supported):
9041      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9042    A blend of two four-lane two-child SLP subtrees:
9043      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9044    Highpart of a four-lane one-child SLP subtree (not supported):
9045      [ { 0, 2 }, { 0, 3 } ]
9046    Where currently only a subset is supported by code generating below.  */
9047
9048 static bool
9049 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9050                               slp_tree node, stmt_vector_for_cost *cost_vec)
9051 {
9052   tree vectype = SLP_TREE_VECTYPE (node);
9053   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9054   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9055                                                SLP_TREE_CHILDREN (node),
9056                                                dump_enabled_p ());
9057   if (nperms < 0)
9058     return false;
9059
9060   if (!gsi)
9061     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9062
9063   return true;
9064 }
9065
9066 /* Vectorize SLP NODE.  */
9067
9068 static void
9069 vect_schedule_slp_node (vec_info *vinfo,
9070                         slp_tree node, slp_instance instance)
9071 {
9072   gimple_stmt_iterator si;
9073   int i;
9074   slp_tree child;
9075
9076   /* Vectorize externals and constants.  */
9077   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9078       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9079     {
9080       /* ???  vectorizable_shift can end up using a scalar operand which is
9081          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
9082          node in this case.  */
9083       if (!SLP_TREE_VECTYPE (node))
9084         return;
9085
9086       /* There are two reasons vector defs might already exist.  The first
9087          is that we are vectorizing an existing vector def.  The second is
9088          when performing BB vectorization shared constant/external nodes
9089          are not split apart during partitioning so during the code-gen
9090          DFS walk we can end up visiting them twice.  */
9091       if (! SLP_TREE_VEC_DEFS (node).exists ())
9092         vect_create_constant_vectors (vinfo, node);
9093       return;
9094     }
9095
9096   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9097
9098   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9099
9100   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9101   SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9102
9103   if (dump_enabled_p ())
9104     dump_printf_loc (MSG_NOTE, vect_location,
9105                      "------>vectorizing SLP node starting from: %G",
9106                      stmt_info->stmt);
9107
9108   if (STMT_VINFO_DATA_REF (stmt_info)
9109       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9110     {
9111       /* Vectorized loads go before the first scalar load to make it
9112          ready early, vectorized stores go before the last scalar
9113          stmt which is where all uses are ready.  */
9114       stmt_vec_info last_stmt_info = NULL;
9115       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9116         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9117       else /* DR_IS_WRITE */
9118         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9119       si = gsi_for_stmt (last_stmt_info->stmt);
9120     }
9121   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9122             || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9123             || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9124            && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9125     {
9126       /* For PHI node vectorization we do not use the insertion iterator.  */
9127       si = gsi_none ();
9128     }
9129   else
9130     {
9131       /* Emit other stmts after the children vectorized defs which is
9132          earliest possible.  */
9133       gimple *last_stmt = NULL;
9134       if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9135         if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9136             || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9137           {
9138             /* But avoid scheduling internal defs outside of the loop when
9139                we might have only implicitly tracked loop mask/len defs.  */
9140             gimple_stmt_iterator si
9141               = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9142             last_stmt = *si;
9143           }
9144       bool seen_vector_def = false;
9145       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9146         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9147           {
9148             /* For fold-left reductions we are retaining the scalar
9149                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9150                set so the representation isn't perfect.  Resort to the
9151                last scalar def here.  */
9152             if (SLP_TREE_VEC_DEFS (child).is_empty ())
9153               {
9154                 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9155                             == cycle_phi_info_type);
9156                 gphi *phi = as_a <gphi *>
9157                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9158                 if (!last_stmt
9159                     || vect_stmt_dominates_stmt_p (last_stmt, phi))
9160                   last_stmt = phi;
9161               }
9162             /* We are emitting all vectorized stmts in the same place and
9163                the last one is the last.
9164                ???  Unless we have a load permutation applied and that
9165                figures to re-use an earlier generated load.  */
9166             unsigned j;
9167             tree vdef;
9168             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9169               {
9170                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9171                 if (!last_stmt
9172                     || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9173                   last_stmt = vstmt;
9174               }
9175           }
9176         else if (!SLP_TREE_VECTYPE (child))
9177           {
9178             /* For externals we use unvectorized at all scalar defs.  */
9179             unsigned j;
9180             tree def;
9181             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9182               if (TREE_CODE (def) == SSA_NAME
9183                   && !SSA_NAME_IS_DEFAULT_DEF (def))
9184                 {
9185                   gimple *stmt = SSA_NAME_DEF_STMT (def);
9186                   if (!last_stmt
9187                       || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9188                     last_stmt = stmt;
9189                 }
9190           }
9191         else
9192           {
9193             /* For externals we have to look at all defs since their
9194                insertion place is decided per vector.  But beware
9195                of pre-existing vectors where we need to make sure
9196                we do not insert before the region boundary.  */
9197             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9198                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9199               seen_vector_def = true;
9200             else
9201               {
9202                 unsigned j;
9203                 tree vdef;
9204                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9205                   if (TREE_CODE (vdef) == SSA_NAME
9206                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9207                     {
9208                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9209                       if (!last_stmt
9210                           || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9211                         last_stmt = vstmt;
9212                     }
9213               }
9214           }
9215       /* This can happen when all children are pre-existing vectors or
9216          constants.  */
9217       if (!last_stmt)
9218         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9219       if (!last_stmt)
9220         {
9221           gcc_assert (seen_vector_def);
9222           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9223         }
9224       else if (is_ctrl_altering_stmt (last_stmt))
9225         {
9226           /* We split regions to vectorize at control altering stmts
9227              with a definition so this must be an external which
9228              we can insert at the start of the region.  */
9229           si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9230         }
9231       else if (is_a <bb_vec_info> (vinfo)
9232                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9233                && gimple_could_trap_p (stmt_info->stmt))
9234         {
9235           /* We've constrained possibly trapping operations to all come
9236              from the same basic-block, if vectorized defs would allow earlier
9237              scheduling still force vectorized stmts to the original block.
9238              This is only necessary for BB vectorization since for loop vect
9239              all operations are in a single BB and scalar stmt based
9240              placement doesn't play well with epilogue vectorization.  */
9241           gcc_assert (dominated_by_p (CDI_DOMINATORS,
9242                                       gimple_bb (stmt_info->stmt),
9243                                       gimple_bb (last_stmt)));
9244           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9245         }
9246       else if (is_a <gphi *> (last_stmt))
9247         si = gsi_after_labels (gimple_bb (last_stmt));
9248       else
9249         {
9250           si = gsi_for_stmt (last_stmt);
9251           gsi_next (&si);
9252         }
9253     }
9254
9255   /* Handle purely internal nodes.  */
9256   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9257     {
9258       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
9259          be shared with different SLP nodes (but usually it's the same
9260          operation apart from the case the stmt is only there for denoting
9261          the actual scalar lane defs ...).  So do not call vect_transform_stmt
9262          but open-code it here (partly).  */
9263       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9264       gcc_assert (done);
9265       stmt_vec_info slp_stmt_info;
9266       unsigned int i;
9267       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9268         if (STMT_VINFO_LIVE_P (slp_stmt_info))
9269           {
9270             done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9271                                                 instance, i, true, NULL);
9272             gcc_assert (done);
9273           }
9274     }
9275   else
9276     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9277 }
9278
9279 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9280    For loop vectorization this is done in vectorizable_call, but for SLP
9281    it needs to be deferred until end of vect_schedule_slp, because multiple
9282    SLP instances may refer to the same scalar stmt.  */
9283
9284 static void
9285 vect_remove_slp_scalar_calls (vec_info *vinfo,
9286                               slp_tree node, hash_set<slp_tree> &visited)
9287 {
9288   gimple *new_stmt;
9289   gimple_stmt_iterator gsi;
9290   int i;
9291   slp_tree child;
9292   tree lhs;
9293   stmt_vec_info stmt_info;
9294
9295   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9296     return;
9297
9298   if (visited.add (node))
9299     return;
9300
9301   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9302     vect_remove_slp_scalar_calls (vinfo, child, visited);
9303
9304   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9305     {
9306       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9307       if (!stmt || gimple_bb (stmt) == NULL)
9308         continue;
9309       if (is_pattern_stmt_p (stmt_info)
9310           || !PURE_SLP_STMT (stmt_info))
9311         continue;
9312       lhs = gimple_call_lhs (stmt);
9313       if (lhs)
9314         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9315       else
9316         {
9317           new_stmt = gimple_build_nop ();
9318           unlink_stmt_vdef (stmt_info->stmt);
9319         }
9320       gsi = gsi_for_stmt (stmt);
9321       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9322       if (lhs)
9323         SSA_NAME_DEF_STMT (lhs) = new_stmt;
9324     }
9325 }
9326
9327 static void
9328 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9329 {
9330   hash_set<slp_tree> visited;
9331   vect_remove_slp_scalar_calls (vinfo, node, visited);
9332 }
9333
9334 /* Vectorize the instance root.  */
9335
9336 void
9337 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9338 {
9339   gassign *rstmt = NULL;
9340
9341   if (instance->kind == slp_inst_kind_ctor)
9342     {
9343       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9344         {
9345           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9346           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9347           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9348                                           TREE_TYPE (vect_lhs)))
9349             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9350                                vect_lhs);
9351           rstmt = gimple_build_assign (root_lhs, vect_lhs);
9352         }
9353       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9354         {
9355           int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9356           tree child_def;
9357           int j;
9358           vec<constructor_elt, va_gc> *v;
9359           vec_alloc (v, nelts);
9360
9361           /* A CTOR can handle V16HI composition from VNx8HI so we
9362              do not need to convert vector elements if the types
9363              do not match.  */
9364           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9365             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9366           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9367           tree rtype
9368             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9369           tree r_constructor = build_constructor (rtype, v);
9370           rstmt = gimple_build_assign (lhs, r_constructor);
9371         }
9372     }
9373   else if (instance->kind == slp_inst_kind_bb_reduc)
9374     {
9375       /* Largely inspired by reduction chain epilogue handling in
9376          vect_create_epilog_for_reduction.  */
9377       vec<tree> vec_defs = vNULL;
9378       vect_get_slp_defs (node, &vec_defs);
9379       enum tree_code reduc_code
9380         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9381       /* ???  We actually have to reflect signs somewhere.  */
9382       if (reduc_code == MINUS_EXPR)
9383         reduc_code = PLUS_EXPR;
9384       gimple_seq epilogue = NULL;
9385       /* We may end up with more than one vector result, reduce them
9386          to one vector.  */
9387       tree vec_def = vec_defs[0];
9388       tree vectype = TREE_TYPE (vec_def);
9389       tree compute_vectype = vectype;
9390       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9391                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
9392                                  && operation_can_overflow (reduc_code));
9393       if (pun_for_overflow_p)
9394         {
9395           compute_vectype = unsigned_type_for (vectype);
9396           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9397                                   compute_vectype, vec_def);
9398         }
9399       for (unsigned i = 1; i < vec_defs.length (); ++i)
9400         {
9401           tree def = vec_defs[i];
9402           if (pun_for_overflow_p)
9403             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9404                                 compute_vectype, def);
9405           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9406                                   vec_def, def);
9407         }
9408       vec_defs.release ();
9409       /* ???  Support other schemes than direct internal fn.  */
9410       internal_fn reduc_fn;
9411       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9412           || reduc_fn == IFN_LAST)
9413         gcc_unreachable ();
9414       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9415                                       TREE_TYPE (compute_vectype), vec_def);
9416       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9417         {
9418           tree rem_def = NULL_TREE;
9419           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9420             {
9421               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9422               if (!rem_def)
9423                 rem_def = def;
9424               else
9425                 rem_def = gimple_build (&epilogue, reduc_code,
9426                                         TREE_TYPE (scalar_def),
9427                                         rem_def, def);
9428             }
9429           scalar_def = gimple_build (&epilogue, reduc_code,
9430                                      TREE_TYPE (scalar_def),
9431                                      scalar_def, rem_def);
9432         }
9433       scalar_def = gimple_convert (&epilogue,
9434                                    TREE_TYPE (vectype), scalar_def);
9435       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9436       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9437       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9438       update_stmt (gsi_stmt (rgsi));
9439       return;
9440     }
9441   else
9442     gcc_unreachable ();
9443
9444   gcc_assert (rstmt);
9445
9446   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9447   gsi_replace (&rgsi, rstmt, true);
9448 }
9449
/* Per-node state kept in the SCC_INFO map while vect_schedule_scc walks
   the SLP graph.  */
9450 struct slp_scc_info
9451 {
       /* True from the point the node is pushed on the DFS stack until it
          has been scheduled.  */
9452   bool on_stack;
       /* Value of the running DFS counter when the node was first visited.  */
9453   int dfs;
       /* Minimum of DFS and the values propagated from children (their
          lowlink after recursing, or their dfs when already on the stack).
          A node whose lowlink still equals its dfs schedules its SCC.  */
9454   int lowlink;
9455 };
9456
9457 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
9458
9459 static void
9460 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9461                    hash_map<slp_tree, slp_scc_info> &scc_info,
9462                    int &maxdfs, vec<slp_tree> &stack)
9463 {
9464   bool existed_p;
9465   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9466   gcc_assert (!existed_p);
9467   info->dfs = maxdfs;
9468   info->lowlink = maxdfs;
9469   maxdfs++;
9470
9471   /* Leaf.  */
9472   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9473     {
9474       info->on_stack = false;
9475       vect_schedule_slp_node (vinfo, node, instance);
9476       return;
9477     }
9478
9479   info->on_stack = true;
9480   stack.safe_push (node);
9481
9482   unsigned i;
9483   slp_tree child;
9484   /* DFS recurse.  */
9485   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9486     {
9487       if (!child)
9488         continue;
9489       slp_scc_info *child_info = scc_info.get (child);
9490       if (!child_info)
9491         {
9492           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9493           /* Recursion might have re-allocated the node.  */
9494           info = scc_info.get (node);
9495           child_info = scc_info.get (child);
9496           info->lowlink = MIN (info->lowlink, child_info->lowlink);
9497         }
9498       else if (child_info->on_stack)
9499         info->lowlink = MIN (info->lowlink, child_info->dfs);
9500     }
9501   if (info->lowlink != info->dfs)
9502     return;
9503
9504   auto_vec<slp_tree, 4> phis_to_fixup;
9505
9506   /* Singleton.  */
9507   if (stack.last () == node)
9508     {
9509       stack.pop ();
9510       info->on_stack = false;
9511       vect_schedule_slp_node (vinfo, node, instance);
9512       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9513           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9514         phis_to_fixup.quick_push (node);
9515     }
9516   else
9517     {
9518       /* SCC.  */
9519       int last_idx = stack.length () - 1;
9520       while (stack[last_idx] != node)
9521         last_idx--;
9522       /* We can break the cycle at PHIs who have at least one child
9523          code generated.  Then we could re-start the DFS walk until
9524          all nodes in the SCC are covered (we might have new entries
9525          for only back-reachable nodes).  But it's simpler to just
9526          iterate and schedule those that are ready.  */
9527       unsigned todo = stack.length () - last_idx;
9528       do
9529         {
9530           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9531             {
9532               slp_tree entry = stack[idx];
9533               if (!entry)
9534                 continue;
9535               bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9536                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9537               bool ready = !phi;
9538               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9539                   if (!child)
9540                     {
9541                       gcc_assert (phi);
9542                       ready = true;
9543                       break;
9544                     }
9545                   else if (scc_info.get (child)->on_stack)
9546                     {
9547                       if (!phi)
9548                         {
9549                           ready = false;
9550                           break;
9551                         }
9552                     }
9553                   else
9554                     {
9555                       if (phi)
9556                         {
9557                           ready = true;
9558                           break;
9559                         }
9560                     }
9561               if (ready)
9562                 {
9563                   vect_schedule_slp_node (vinfo, entry, instance);
9564                   scc_info.get (entry)->on_stack = false;
9565                   stack[idx] = NULL;
9566                   todo--;
9567                   if (phi)
9568                     phis_to_fixup.safe_push (entry);
9569                 }
9570             }
9571         }
9572       while (todo != 0);
9573
9574       /* Pop the SCC.  */
9575       stack.truncate (last_idx);
9576     }
9577
9578   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
9579   slp_tree phi_node;
9580   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9581     {
9582       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9583       edge_iterator ei;
9584       edge e;
9585       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9586         {
9587           unsigned dest_idx = e->dest_idx;
9588           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9589           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9590             continue;
9591           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9592           /* Simply fill all args.  */
9593           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9594               != vect_first_order_recurrence)
9595             for (unsigned i = 0; i < n; ++i)
9596               {
9597                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9598                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9599                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9600                              e, gimple_phi_arg_location (phi, dest_idx));
9601               }
9602           else
9603             {
9604               /* Unless it is a first order recurrence which needs
9605                  args filled in for both the PHI node and the permutes.  */
9606               gimple *perm
9607                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9608               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9609               add_phi_arg (as_a <gphi *> (rphi),
9610                            vect_get_slp_vect_def (child, n - 1),
9611                            e, gimple_phi_arg_location (phi, dest_idx));
9612               for (unsigned i = 0; i < n; ++i)
9613                 {
9614                   gimple *perm
9615                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9616                   if (i > 0)
9617                     gimple_assign_set_rhs1 (perm,
9618                                             vect_get_slp_vect_def (child, i - 1));
9619                   gimple_assign_set_rhs2 (perm,
9620                                           vect_get_slp_vect_def (child, i));
9621                   update_stmt (perm);
9622                 }
9623             }
9624         }
9625     }
9626 }
9627
9628 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
9629
9630 void
9631 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9632 {
9633   slp_instance instance;
9634   unsigned int i;
9635
9636   hash_map<slp_tree, slp_scc_info> scc_info;
9637   int maxdfs = 0;
9638   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9639     {
9640       slp_tree node = SLP_INSTANCE_TREE (instance);
9641       if (dump_enabled_p ())
9642         {
9643           dump_printf_loc (MSG_NOTE, vect_location,
9644                            "Vectorizing SLP tree:\n");
9645           /* ???  Dump all?  */
9646           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9647             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9648                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9649           vect_print_slp_graph (MSG_NOTE, vect_location,
9650                                 SLP_INSTANCE_TREE (instance));
9651         }
9652       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9653          have a PHI be the node breaking the cycle.  */
9654       auto_vec<slp_tree> stack;
9655       if (!scc_info.get (node))
9656         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9657
9658       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9659         vectorize_slp_instance_root_stmt (node, instance);
9660
9661       if (dump_enabled_p ())
9662         dump_printf_loc (MSG_NOTE, vect_location,
9663                          "vectorizing stmts using SLP.\n");
9664     }
9665
9666   FOR_EACH_VEC_ELT (slp_instances, i, instance)
9667     {
9668       slp_tree root = SLP_INSTANCE_TREE (instance);
9669       stmt_vec_info store_info;
9670       unsigned int j;
9671
9672       /* Remove scalar call stmts.  Do not do this for basic-block
9673          vectorization as not all uses may be vectorized.
9674          ???  Why should this be necessary?  DCE should be able to
9675          remove the stmts itself.
9676          ???  For BB vectorization we can as well remove scalar
9677          stmts starting from the SLP tree root if they have no
9678          uses.  */
9679       if (is_a <loop_vec_info> (vinfo))
9680         vect_remove_slp_scalar_calls (vinfo, root);
9681
9682       /* Remove vectorized stores original scalar stmts.  */
9683       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9684         {
9685           if (!STMT_VINFO_DATA_REF (store_info)
9686               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9687             break;
9688
9689           store_info = vect_orig_stmt (store_info);
9690           /* Free the attached stmt_vec_info and remove the stmt.  */
9691           vinfo->remove_stmt (store_info);
9692
9693           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9694              to not crash in vect_free_slp_tree later.  */
9695           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9696             SLP_TREE_REPRESENTATIVE (root) = NULL;
9697         }
9698     }
9699 }