Fix ICE with SLP and -fdbg-cnt
gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
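/* The destructor unlinks a node from the slp_first_node list, so deleting
   the list head repeatedly releases every SLP node that is still live.  */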
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
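/* Chain the new node at the head of the global slp_first_node list so that
   vect_slp_fini can reclaim any nodes still live at the end of
   vectorization.  */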
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
194 /* Return a location suitable for dumps related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement: its vector def-type, its type, the
285 operand itself in case it's constant, an indication whether it's a pattern
286 stmt, and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
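/* Each iteration of the loop below halves the number of scalar bytes fused
   into a single integer element and doubles NVECTORS, until the target can
   permute the resulting vector type or the element size no longer divides
   evenly.  */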
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg1_map[] = { 1, 1 };
509 static const int arg2_map[] = { 1, 2 };
510 static const int arg1_arg4_map[] = { 2, 1, 4 };
511 static const int arg3_arg2_map[] = { 2, 3, 2 };
512 static const int op1_op0_map[] = { 2, 1, 0 };
513 static const int off_map[] = { 1, -3 };
514 static const int off_op0_map[] = { 2, -3, 0 };
515 static const int off_arg2_map[] = { 2, -3, 2 };
516 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
517 static const int mask_call_maps[6][7] = {
518 { 1, 1, },
519 { 2, 1, 2, },
520 { 3, 1, 2, 3, },
521 { 4, 1, 2, 3, 4, },
522 { 5, 1, 2, 3, 4, 5, },
523 { 6, 1, 2, 3, 4, 5, 6 },
526 /* For most SLP statements, there is a one-to-one mapping between
527 gimple arguments and child nodes. If that is not true for STMT,
528 return an array that contains:
530 - the number of child nodes, followed by
531 - for each child node, the index of the argument associated with that node.
532 The special index -1 is the first operand of an embedded comparison and
533 the special index -2 is the second operand of an embedded comparison.
534 The special index -3 is the offset of a gather as analyzed by
535 vect_check_gather_scatter.
537 SWAP is as for vect_get_and_check_slp_defs. */
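/* For example, arg2_map above is { 1, 2 }: a single child node which
   corresponds to call argument 2 (the mask operand of IFN_MASK_LOAD).
   Likewise the three cond_expr_maps describe SWAP values 0, 1 and 2: the
   COND_EXPR taken as-is, with the operands of the embedded comparison
   swapped, and with the true/false arms swapped (comparison code
   inverted).  */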
539 static const int *
540 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
541 unsigned char swap = 0)
543 if (auto assign = dyn_cast<const gassign *> (stmt))
545 if (gimple_assign_rhs_code (assign) == COND_EXPR
546 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
547 return cond_expr_maps[swap];
548 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
549 && swap)
550 return op1_op0_map;
551 if (gather_scatter_p)
552 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
553 ? off_op0_map : off_map);
555 gcc_assert (!swap);
556 if (auto call = dyn_cast<const gcall *> (stmt))
558 if (gimple_call_internal_p (call))
559 switch (gimple_call_internal_fn (call))
561 case IFN_MASK_LOAD:
562 return gather_scatter_p ? off_arg2_map : arg2_map;
564 case IFN_GATHER_LOAD:
565 return arg1_map;
567 case IFN_MASK_GATHER_LOAD:
568 case IFN_MASK_LEN_GATHER_LOAD:
569 return arg1_arg4_map;
571 case IFN_MASK_STORE:
572 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574 case IFN_MASK_CALL:
576 unsigned nargs = gimple_call_num_args (call);
577 if (nargs >= 2 && nargs <= 7)
578 return mask_call_maps[nargs-2];
579 else
580 return nullptr;
583 default:
584 break;
587 return nullptr;
590 /* Return the SLP node child index for operand OP of STMT. */
593 vect_slp_child_index_for_operand (const gimple *stmt, int op,
594 bool gather_scatter_p)
596 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
597 if (!opmap)
598 return op;
599 for (int i = 1; i < 1 + opmap[0]; ++i)
600 if (opmap[i] == op)
601 return i - 1;
602 gcc_unreachable ();
605 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
606 they are of a valid type and that they match the defs of the first stmt of
607 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
608 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
609 indicates swap is required for cond_expr stmts. Specifically, SWAP
610 is 1 if STMT is a COND_EXPR and the operands of its comparison need to be swapped;
611 SWAP is 2 if STMT is a COND_EXPR and the code of its comparison needs to be inverted.
613 If there was a fatal error return -1; if the error could be corrected by
614 swapping operands of father node of this one, return 1; if everything is
615 ok return 0. */
616 static int
617 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
618 bool *skip_args,
619 vec<stmt_vec_info> stmts, unsigned stmt_num,
620 vec<slp_oprnd_info> *oprnds_info)
622 stmt_vec_info stmt_info = stmts[stmt_num];
623 tree oprnd;
624 unsigned int i, number_of_oprnds;
625 enum vect_def_type dt = vect_uninitialized_def;
626 slp_oprnd_info oprnd_info;
627 gather_scatter_info gs_info;
628 unsigned int gs_op = -1u;
629 unsigned int commutative_op = -1U;
630 bool first = stmt_num == 0;
632 if (!is_a<gcall *> (stmt_info->stmt)
633 && !is_a<gassign *> (stmt_info->stmt)
634 && !is_a<gphi *> (stmt_info->stmt))
635 return -1;
637 number_of_oprnds = gimple_num_args (stmt_info->stmt);
638 const int *map
639 = vect_get_operand_map (stmt_info->stmt,
640 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
641 if (map)
642 number_of_oprnds = *map++;
643 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
645 if (gimple_call_internal_p (stmt))
647 internal_fn ifn = gimple_call_internal_fn (stmt);
648 commutative_op = first_commutative_argument (ifn);
651 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
653 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
654 commutative_op = 0;
657 bool swapped = (swap != 0);
658 bool backedge = false;
659 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
660 for (i = 0; i < number_of_oprnds; i++)
662 oprnd_info = (*oprnds_info)[i];
663 int opno = map ? map[i] : int (i);
664 if (opno == -3)
666 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
667 if (!is_a <loop_vec_info> (vinfo)
668 || !vect_check_gather_scatter (stmt_info,
669 as_a <loop_vec_info> (vinfo),
670 first ? &oprnd_info->first_gs_info
671 : &gs_info))
672 return -1;
674 if (first)
676 oprnd_info->first_gs_p = true;
677 oprnd = oprnd_info->first_gs_info.offset;
679 else
681 gs_op = i;
682 oprnd = gs_info.offset;
685 else if (opno < 0)
686 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
687 else
689 oprnd = gimple_arg (stmt_info->stmt, opno);
690 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
692 edge e = gimple_phi_arg_edge (stmt, opno);
693 backedge = (is_a <bb_vec_info> (vinfo)
694 ? e->flags & EDGE_DFS_BACK
695 : dominated_by_p (CDI_DOMINATORS, e->src,
696 gimple_bb (stmt_info->stmt)));
699 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
700 oprnd = TREE_OPERAND (oprnd, 0);
702 stmt_vec_info def_stmt_info;
703 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "Build SLP failed: can't analyze def for %T\n",
708 oprnd);
710 return -1;
713 if (skip_args[i])
715 oprnd_info->def_stmts.quick_push (NULL);
716 oprnd_info->ops.quick_push (NULL_TREE);
717 oprnd_info->first_dt = vect_uninitialized_def;
718 continue;
721 oprnd_info->def_stmts.quick_push (def_stmt_info);
722 oprnd_info->ops.quick_push (oprnd);
724 if (def_stmt_info
725 && is_pattern_stmt_p (def_stmt_info))
727 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
728 != def_stmt_info)
729 oprnd_info->any_pattern = true;
730 else
731 /* If we promote this to external, use the original stmt def. */
732 oprnd_info->ops.last ()
733 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
736 /* If there's an extern def on a backedge, make sure we can
737 code-generate at the region start.
738 ??? This is another case that could be fixed by adjusting
739 how we split the function but at the moment we'd have conflicting
740 goals there. */
741 if (backedge
742 && dts[i] == vect_external_def
743 && is_a <bb_vec_info> (vinfo)
744 && TREE_CODE (oprnd) == SSA_NAME
745 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
746 && !dominated_by_p (CDI_DOMINATORS,
747 as_a <bb_vec_info> (vinfo)->bbs[0],
748 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "Build SLP failed: extern def %T only defined "
753 "on backedge\n", oprnd);
754 return -1;
757 if (first)
759 tree type = TREE_TYPE (oprnd);
760 dt = dts[i];
761 if ((dt == vect_constant_def
762 || dt == vect_external_def)
763 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
764 && TREE_CODE (type) != BOOLEAN_TYPE
765 && !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
769 "Build SLP failed: invalid type of def "
770 "for variable-length SLP %T\n", oprnd);
771 return -1;
774 /* For the swapping logic below force vect_reduction_def
775 for the reduction op in a SLP reduction group. */
776 if (!STMT_VINFO_DATA_REF (stmt_info)
777 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
778 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
779 && def_stmt_info)
780 dts[i] = dt = vect_reduction_def;
782 /* Check the types of the definition. */
783 switch (dt)
785 case vect_external_def:
786 case vect_constant_def:
787 case vect_internal_def:
788 case vect_reduction_def:
789 case vect_induction_def:
790 case vect_nested_cycle:
791 case vect_first_order_recurrence:
792 break;
794 default:
795 /* FORNOW: Not supported. */
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 "Build SLP failed: illegal type of def %T\n",
799 oprnd);
800 return -1;
803 oprnd_info->first_dt = dt;
804 oprnd_info->first_op_type = type;
807 if (first)
808 return 0;
810 /* Now match the operand definition types to that of the first stmt. */
811 for (i = 0; i < number_of_oprnds;)
813 if (skip_args[i])
815 ++i;
816 continue;
819 oprnd_info = (*oprnds_info)[i];
820 dt = dts[i];
821 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
822 oprnd = oprnd_info->ops[stmt_num];
823 tree type = TREE_TYPE (oprnd);
825 if (!types_compatible_p (oprnd_info->first_op_type, type))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
829 "Build SLP failed: different operand types\n");
830 return 1;
833 if ((gs_op == i) != oprnd_info->first_gs_p)
835 if (dump_enabled_p ())
836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
837 "Build SLP failed: mixed gather and non-gather\n");
838 return 1;
840 else if (gs_op == i)
842 if (!operand_equal_p (oprnd_info->first_gs_info.base,
843 gs_info.base))
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather base\n");
848 return 1;
850 if (oprnd_info->first_gs_info.scale != gs_info.scale)
852 if (dump_enabled_p ())
853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
854 "Build SLP failed: different gather scale\n");
855 return 1;
859 /* Not first stmt of the group, check that the def-stmt/s match
860 the def-stmt/s of the first stmt. Allow different definition
861 types for reduction chains: the first stmt must be a
862 vect_reduction_def (a phi node), and the rest
863 must end up in the reduction chain. */
864 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
865 && !(oprnd_info->first_dt == vect_reduction_def
866 && !STMT_VINFO_DATA_REF (stmt_info)
867 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
868 && def_stmt_info
869 && !STMT_VINFO_DATA_REF (def_stmt_info)
870 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
871 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
872 || (!STMT_VINFO_DATA_REF (stmt_info)
873 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
874 && ((!def_stmt_info
875 || STMT_VINFO_DATA_REF (def_stmt_info)
876 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
877 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
878 != (oprnd_info->first_dt != vect_reduction_def))))
880 /* Try swapping operands if we got a mismatch. For BB
881 vectorization do this only in case it will clearly improve things. */
882 if (i == commutative_op && !swapped
883 && (!is_a <bb_vec_info> (vinfo)
884 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
885 dts[i+1])
886 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
887 || vect_def_types_match
888 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location,
892 "trying swapped operands\n");
893 std::swap (dts[i], dts[i+1]);
894 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
895 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
896 std::swap ((*oprnds_info)[i]->ops[stmt_num],
897 (*oprnds_info)[i+1]->ops[stmt_num]);
898 swapped = true;
899 continue;
902 if (is_a <bb_vec_info> (vinfo)
903 && !oprnd_info->any_pattern)
905 /* Now for commutative ops we should see whether we can
906 make the other operand match. */
907 if (dump_enabled_p ())
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "treating operand as external\n");
910 oprnd_info->first_dt = dt = vect_external_def;
912 else
914 if (dump_enabled_p ())
915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
916 "Build SLP failed: different types\n");
917 return 1;
921 /* Make sure to demote the overall operand to external. */
922 if (dt == vect_external_def)
923 oprnd_info->first_dt = vect_external_def;
924 /* For a SLP reduction chain we want to duplicate the reduction to
925 each of the chain members. That gets us a sane SLP graph (though
926 the stmts are not 100% correct wrt the initial values). */
927 else if ((dt == vect_internal_def
928 || dt == vect_reduction_def)
929 && oprnd_info->first_dt == vect_reduction_def
930 && !STMT_VINFO_DATA_REF (stmt_info)
931 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
932 && !STMT_VINFO_DATA_REF (def_stmt_info)
933 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
934 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
936 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
937 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
940 ++i;
943 /* Swap operands. */
944 if (swapped)
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "swapped operands to match def types in %G",
949 stmt_info->stmt);
952 return 0;
955 /* Return true if call statements CALL1 and CALL2 are similar enough
956 to be combined into the same SLP group. */
958 bool
959 compatible_calls_p (gcall *call1, gcall *call2)
961 unsigned int nargs = gimple_call_num_args (call1);
962 if (nargs != gimple_call_num_args (call2))
963 return false;
965 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
966 return false;
968 if (gimple_call_internal_p (call1))
970 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
971 TREE_TYPE (gimple_call_lhs (call2))))
972 return false;
973 for (unsigned int i = 0; i < nargs; ++i)
974 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
975 TREE_TYPE (gimple_call_arg (call2, i))))
976 return false;
978 else
980 if (!operand_equal_p (gimple_call_fn (call1),
981 gimple_call_fn (call2), 0))
982 return false;
984 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
985 return false;
988 /* Check that any unvectorized arguments are equal. */
989 if (const int *map = vect_get_operand_map (call1))
991 unsigned int nkept = *map++;
992 unsigned int mapi = 0;
993 for (unsigned int i = 0; i < nargs; ++i)
994 if (mapi < nkept && map[mapi] == int (i))
995 mapi += 1;
996 else if (!operand_equal_p (gimple_call_arg (call1, i),
997 gimple_call_arg (call2, i)))
998 return false;
1001 return true;
1004 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1005 caller's attempt to find the vector type in STMT_INFO with the narrowest
1006 element type. Return true if VECTYPE is nonnull and if it is valid
1007 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1008 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1009 vect_build_slp_tree. */
1011 static bool
1012 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1013 unsigned int group_size,
1014 tree vectype, poly_uint64 *max_nunits)
1016 if (!vectype)
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020 "Build SLP failed: unsupported data-type in %G\n",
1021 stmt_info->stmt);
1022 /* Fatal mismatch. */
1023 return false;
1026 /* If populating the vector type requires unrolling then fail
1027 before adjusting *max_nunits for basic-block vectorization. */
1028 if (is_a <bb_vec_info> (vinfo)
1029 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: unrolling required "
1034 "in basic block SLP\n");
1035 /* Fatal mismatch. */
1036 return false;
1039 /* In case of multiple types we need to detect the smallest type. */
1040 vect_update_max_nunits (max_nunits, vectype);
1041 return true;
1044 /* Verify whether the scalar stmts STMTS are isomorphic to each other, whether
1045 they require a data permutation and whether they are of unsupported types of
1046 operation. Return true if the stmts form a usable group, otherwise return false and indicate in *MATCHES
1047 which stmts are not isomorphic to the first one. If MATCHES[0]
1048 is false then this indicates the comparison could not be
1049 carried out or the stmts will never be vectorized by SLP.
1051 Note COND_EXPR is possibly isomorphic to another one after swapping its
1052 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1053 the first stmt by swapping the two operands of comparison; set SWAP[i]
1054 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1055 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
1056 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1058 static bool
1059 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1060 vec<stmt_vec_info> stmts, unsigned int group_size,
1061 poly_uint64 *max_nunits, bool *matches,
1062 bool *two_operators, tree *node_vectype)
1064 unsigned int i;
1065 stmt_vec_info first_stmt_info = stmts[0];
1066 code_helper first_stmt_code = ERROR_MARK;
1067 code_helper alt_stmt_code = ERROR_MARK;
1068 code_helper rhs_code = ERROR_MARK;
1069 code_helper first_cond_code = ERROR_MARK;
1070 tree lhs;
1071 bool need_same_oprnds = false;
1072 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1073 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1074 bool first_stmt_ldst_p = false, ldst_p = false;
1075 bool first_stmt_phi_p = false, phi_p = false;
1076 bool maybe_soft_fail = false;
1077 tree soft_fail_nunits_vectype = NULL_TREE;
1079 /* For every stmt in NODE find its def stmt/s. */
1080 stmt_vec_info stmt_info;
1081 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1083 gimple *stmt = stmt_info->stmt;
1084 swap[i] = 0;
1085 matches[i] = false;
1087 if (dump_enabled_p ())
1088 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1090 /* Fail to vectorize statements marked as unvectorizable, that can throw
1091 internally or that have volatile operands. */
1092 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1093 || stmt_can_throw_internal (cfun, stmt)
1094 || gimple_has_volatile_ops (stmt))
1096 if (dump_enabled_p ())
1097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098 "Build SLP failed: unvectorizable statement %G",
1099 stmt);
1100 /* ??? For BB vectorization we want to commute operands in a way
1101 that shuffles all unvectorizable defs into one operand and keeps
1102 the other still vectorizable. The following doesn't reliably
1103 work for this, but it's the easiest we can do here. */
1104 if (is_a <bb_vec_info> (vinfo) && i != 0)
1105 continue;
1106 /* Fatal mismatch. */
1107 matches[0] = false;
1108 return false;
1111 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1112 lhs = gimple_get_lhs (stmt);
1113 if (lhs == NULL_TREE
1114 && (!call_stmt
1115 || !gimple_call_internal_p (stmt)
1116 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120 "Build SLP failed: not GIMPLE_ASSIGN nor "
1121 "GIMPLE_CALL %G", stmt);
1122 if (is_a <bb_vec_info> (vinfo) && i != 0)
1123 continue;
1124 /* Fatal mismatch. */
1125 matches[0] = false;
1126 return false;
1129 tree nunits_vectype;
1130 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1131 &nunits_vectype, group_size))
1133 if (is_a <bb_vec_info> (vinfo) && i != 0)
1134 continue;
1135 /* Fatal mismatch. */
1136 matches[0] = false;
1137 return false;
1139 /* Record nunits required but continue analysis, producing matches[]
1140 as if nunits were not an issue. This allows splitting of groups
1141 to happen. */
1142 if (nunits_vectype
1143 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1144 nunits_vectype, max_nunits))
1146 gcc_assert (is_a <bb_vec_info> (vinfo));
1147 maybe_soft_fail = true;
1148 soft_fail_nunits_vectype = nunits_vectype;
1151 gcc_assert (vectype);
1153 if (call_stmt)
1155 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1156 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1157 rhs_code = cfn;
1158 else
1159 rhs_code = CALL_EXPR;
1161 if (cfn == CFN_MASK_LOAD
1162 || cfn == CFN_GATHER_LOAD
1163 || cfn == CFN_MASK_GATHER_LOAD
1164 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1165 ldst_p = true;
1166 else if (cfn == CFN_MASK_STORE)
1168 ldst_p = true;
1169 rhs_code = CFN_MASK_STORE;
1171 else if ((cfn != CFN_LAST
1172 && cfn != CFN_MASK_CALL
1173 && internal_fn_p (cfn)
1174 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1175 || gimple_call_tail_p (call_stmt)
1176 || gimple_call_noreturn_p (call_stmt)
1177 || gimple_call_chain (call_stmt))
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Build SLP failed: unsupported call type %G",
1182 (gimple *) call_stmt);
1183 if (is_a <bb_vec_info> (vinfo) && i != 0)
1184 continue;
1185 /* Fatal mismatch. */
1186 matches[0] = false;
1187 return false;
1190 else if (gimple_code (stmt) == GIMPLE_PHI)
1192 rhs_code = ERROR_MARK;
1193 phi_p = true;
1195 else
1197 rhs_code = gimple_assign_rhs_code (stmt);
1198 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1201 /* Check the operation. */
1202 if (i == 0)
1204 *node_vectype = vectype;
1205 first_stmt_code = rhs_code;
1206 first_stmt_ldst_p = ldst_p;
1207 first_stmt_phi_p = phi_p;
1209 /* Shift arguments should be equal in all the packed stmts for a
1210 vector shift with a scalar shift operand. */
1211 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1212 || rhs_code == LROTATE_EXPR
1213 || rhs_code == RROTATE_EXPR)
1215 /* First see if we have a vector/vector shift. */
1216 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1218 /* No vector/vector shift, try for a vector/scalar shift. */
1219 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: "
1224 "op not supported by target.\n");
1225 if (is_a <bb_vec_info> (vinfo) && i != 0)
1226 continue;
1227 /* Fatal mismatch. */
1228 matches[0] = false;
1229 return false;
1231 need_same_oprnds = true;
1232 first_op1 = gimple_assign_rhs2 (stmt);
1235 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1237 need_same_oprnds = true;
1238 first_op1 = gimple_assign_rhs2 (stmt);
1240 else if (!ldst_p
1241 && rhs_code == BIT_FIELD_REF)
1243 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1244 if (!is_a <bb_vec_info> (vinfo)
1245 || TREE_CODE (vec) != SSA_NAME
1246 /* When the element types are not compatible we pun the
1247 source to the target vectype, which requires equal size. */
1248 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1249 || !types_compatible_p (TREE_TYPE (vectype),
1250 TREE_TYPE (TREE_TYPE (vec))))
1251 && !operand_equal_p (TYPE_SIZE (vectype),
1252 TYPE_SIZE (TREE_TYPE (vec)))))
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "Build SLP failed: "
1257 "BIT_FIELD_REF not supported\n");
1258 /* Fatal mismatch. */
1259 matches[0] = false;
1260 return false;
1263 else if (rhs_code == CFN_DIV_POW2)
1265 need_same_oprnds = true;
1266 first_op1 = gimple_call_arg (call_stmt, 1);
1269 else
1271 if (first_stmt_code != rhs_code
1272 && alt_stmt_code == ERROR_MARK)
1273 alt_stmt_code = rhs_code;
1274 if ((first_stmt_code != rhs_code
1275 && (first_stmt_code != IMAGPART_EXPR
1276 || rhs_code != REALPART_EXPR)
1277 && (first_stmt_code != REALPART_EXPR
1278 || rhs_code != IMAGPART_EXPR)
1279 /* Handle mismatches in plus/minus by computing both
1280 and merging the results. */
1281 && !((first_stmt_code == PLUS_EXPR
1282 || first_stmt_code == MINUS_EXPR)
1283 && (alt_stmt_code == PLUS_EXPR
1284 || alt_stmt_code == MINUS_EXPR)
1285 && rhs_code == alt_stmt_code)
1286 && !(first_stmt_code.is_tree_code ()
1287 && rhs_code.is_tree_code ()
1288 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1289 == tcc_comparison)
1290 && (swap_tree_comparison (tree_code (first_stmt_code))
1291 == tree_code (rhs_code)))
1292 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1293 && (first_stmt_code == ARRAY_REF
1294 || first_stmt_code == BIT_FIELD_REF
1295 || first_stmt_code == INDIRECT_REF
1296 || first_stmt_code == COMPONENT_REF
1297 || first_stmt_code == MEM_REF)
1298 && (rhs_code == ARRAY_REF
1299 || rhs_code == BIT_FIELD_REF
1300 || rhs_code == INDIRECT_REF
1301 || rhs_code == COMPONENT_REF
1302 || rhs_code == MEM_REF)))
1303 || (ldst_p
1304 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1305 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1306 || (ldst_p
1307 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1308 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1309 || first_stmt_ldst_p != ldst_p
1310 || first_stmt_phi_p != phi_p)
1312 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "Build SLP failed: different operation "
1316 "in stmt %G", stmt);
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318 "original stmt %G", first_stmt_info->stmt);
1320 /* Mismatch. */
1321 continue;
1324 if (!ldst_p
1325 && first_stmt_code == BIT_FIELD_REF
1326 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1327 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 "Build SLP failed: different BIT_FIELD_REF "
1332 "arguments in %G", stmt);
1333 /* Mismatch. */
1334 continue;
1337 if (call_stmt
1338 && first_stmt_code != CFN_MASK_LOAD
1339 && first_stmt_code != CFN_MASK_STORE)
1341 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1342 call_stmt))
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "Build SLP failed: different calls in %G",
1347 stmt);
1348 /* Mismatch. */
1349 continue;
1353 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1354 && (gimple_bb (first_stmt_info->stmt)
1355 != gimple_bb (stmt_info->stmt)))
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1359 "Build SLP failed: different BB for PHI "
1360 "or possibly trapping operation in %G", stmt);
1361 /* Mismatch. */
1362 continue;
1365 if (need_same_oprnds)
1367 tree other_op1 = gimple_arg (stmt, 1);
1368 if (!operand_equal_p (first_op1, other_op1, 0))
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 "Build SLP failed: different shift "
1373 "arguments in %G", stmt);
1374 /* Mismatch. */
1375 continue;
1379 if (!types_compatible_p (vectype, *node_vectype))
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383 "Build SLP failed: different vector type "
1384 "in %G", stmt);
1385 /* Mismatch. */
1386 continue;
1390 /* Grouped store or load. */
1391 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1393 gcc_assert (ldst_p);
1394 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1396 /* Store. */
1397 gcc_assert (rhs_code == CFN_MASK_STORE
1398 || REFERENCE_CLASS_P (lhs)
1399 || DECL_P (lhs));
1401 else
1403 /* Load. */
1404 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1405 if (prev_first_load)
1407 /* Check that there are no loads from different interleaving
1408 chains in the same node. */
1409 if (prev_first_load != first_load)
1411 if (dump_enabled_p ())
1412 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1413 vect_location,
1414 "Build SLP failed: different "
1415 "interleaving chains in one node %G",
1416 stmt);
1417 /* Mismatch. */
1418 continue;
1421 else
1422 prev_first_load = first_load;
1425 /* Non-grouped store or load. */
1426 else if (ldst_p)
1428 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1429 && rhs_code != CFN_GATHER_LOAD
1430 && rhs_code != CFN_MASK_GATHER_LOAD
1431 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1432 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1433 /* Non-grouped loads are handled as externals for BB
1434 vectorization. For loop vectorization we can handle
1435 splats the same way we handle single-element interleaving. */
1436 && (is_a <bb_vec_info> (vinfo)
1437 || stmt_info != first_stmt_info))
1439 /* Not grouped load. */
1440 if (dump_enabled_p ())
1441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442 "Build SLP failed: not grouped load %G", stmt);
1444 if (i != 0)
1445 continue;
1446 /* Fatal mismatch. */
1447 matches[0] = false;
1448 return false;
1451 /* Not memory operation. */
1452 else
1454 if (!phi_p
1455 && rhs_code.is_tree_code ()
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1459 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1460 && rhs_code != VIEW_CONVERT_EXPR
1461 && rhs_code != CALL_EXPR
1462 && rhs_code != BIT_FIELD_REF)
1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466 "Build SLP failed: operation unsupported %G",
1467 stmt);
1468 if (is_a <bb_vec_info> (vinfo) && i != 0)
1469 continue;
1470 /* Fatal mismatch. */
1471 matches[0] = false;
1472 return false;
1475 if (rhs_code == COND_EXPR)
1477 tree cond_expr = gimple_assign_rhs1 (stmt);
1478 enum tree_code cond_code = TREE_CODE (cond_expr);
1479 enum tree_code swap_code = ERROR_MARK;
1480 enum tree_code invert_code = ERROR_MARK;
1482 if (i == 0)
1483 first_cond_code = TREE_CODE (cond_expr);
1484 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1486 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1487 swap_code = swap_tree_comparison (cond_code);
1488 invert_code = invert_tree_comparison (cond_code, honor_nans);
1491 if (first_cond_code == cond_code)
1493 /* Isomorphism can be achieved by swapping. */
1494 else if (first_cond_code == swap_code)
1495 swap[i] = 1;
1496 /* Isomorphism can be achieved by inverting. */
1497 else if (first_cond_code == invert_code)
1498 swap[i] = 2;
1499 else
1501 if (dump_enabled_p ())
1502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503 "Build SLP failed: different"
1504 " operation %G", stmt);
1505 /* Mismatch. */
1506 continue;
1510 if (rhs_code.is_tree_code ()
1511 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1512 && (swap_tree_comparison ((tree_code)first_stmt_code)
1513 == (tree_code)rhs_code))
1514 swap[i] = 1;
1517 matches[i] = true;
1520 for (i = 0; i < group_size; ++i)
1521 if (!matches[i])
1522 return false;
1524 /* If we allowed a two-operation SLP node verify the target can cope
1525 with the permute we are going to use. */
1526 if (alt_stmt_code != ERROR_MARK
1527 && (!alt_stmt_code.is_tree_code ()
1528 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1529 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1531 *two_operators = true;
1534 if (maybe_soft_fail)
1536 unsigned HOST_WIDE_INT const_nunits;
1537 if (!TYPE_VECTOR_SUBPARTS
1538 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1539 || const_nunits > group_size)
1540 matches[0] = false;
1541 else
1543 /* With constant vector elements simulate a mismatch at the
1544 point we need to split. */
1545 unsigned tail = group_size & (const_nunits - 1);
1546 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1548 return false;
1551 return true;
1554 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1555 Note we never remove apart from at destruction time so we do not
1556 need a special value for deleted that differs from empty. */
1557 struct bst_traits
1559 typedef vec <stmt_vec_info> value_type;
1560 typedef vec <stmt_vec_info> compare_type;
1561 static inline hashval_t hash (value_type);
1562 static inline bool equal (value_type existing, value_type candidate);
1563 static inline bool is_empty (value_type x) { return !x.exists (); }
1564 static inline bool is_deleted (value_type x) { return !x.exists (); }
1565 static const bool empty_zero_p = true;
1566 static inline void mark_empty (value_type &x) { x.release (); }
1567 static inline void mark_deleted (value_type &x) { x.release (); }
1568 static inline void remove (value_type &x) { x.release (); }
1570 inline hashval_t
1571 bst_traits::hash (value_type x)
1573 inchash::hash h;
1574 for (unsigned i = 0; i < x.length (); ++i)
1575 h.add_int (gimple_uid (x[i]->stmt));
1576 return h.end ();
1578 inline bool
1579 bst_traits::equal (value_type existing, value_type candidate)
1581 if (existing.length () != candidate.length ())
1582 return false;
1583 for (unsigned i = 0; i < existing.length (); ++i)
1584 if (existing[i] != candidate[i])
1585 return false;
1586 return true;
1589 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1590 but then vec::insert does memmove and that's not compatible with
1591 std::pair. */
1592 struct chain_op_t
1594 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1595 : code (code_), dt (dt_), op (op_) {}
1596 tree_code code;
1597 vect_def_type dt;
1598 tree op;
1601 /* Comparator for sorting associatable chains. */
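/* Sort primarily by definition type so that operands of the same kind
   (constants, externals, internal defs) group together, and secondarily by
   operation code.  */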
1603 static int
1604 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1606 auto *op1 = (const chain_op_t *) op1_;
1607 auto *op2 = (const chain_op_t *) op2_;
1608 if (op1->dt != op2->dt)
1609 return (int)op1->dt - (int)op2->dt;
1610 return (int)op1->code - (int)op2->code;
1613 /* Linearize the associatable expression chain at START with the
1614 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1615 filling CHAIN with the result and using WORKLIST as intermediate storage.
1616 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1617 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1618 stmts, starting with START. */
1620 static void
1621 vect_slp_linearize_chain (vec_info *vinfo,
1622 vec<std::pair<tree_code, gimple *> > &worklist,
1623 vec<chain_op_t> &chain,
1624 enum tree_code code, gimple *start,
1625 gimple *&code_stmt, gimple *&alt_code_stmt,
1626 vec<gimple *> *chain_stmts)
1628 /* For each lane linearize the addition/subtraction (or other
1629 uniform associatable operation) expression tree. */
1630 worklist.safe_push (std::make_pair (code, start));
1631 while (!worklist.is_empty ())
1633 auto entry = worklist.pop ();
1634 gassign *stmt = as_a <gassign *> (entry.second);
1635 enum tree_code in_code = entry.first;
1636 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1637 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1638 if (!code_stmt
1639 && gimple_assign_rhs_code (stmt) == code)
1640 code_stmt = stmt;
1641 else if (!alt_code_stmt
1642 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1643 alt_code_stmt = stmt;
1644 if (chain_stmts)
1645 chain_stmts->safe_push (stmt);
1646 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1648 tree op = gimple_op (stmt, opnum);
1649 vect_def_type dt;
1650 stmt_vec_info def_stmt_info;
1651 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1652 gcc_assert (res);
1653 if (dt == vect_internal_def
1654 && is_pattern_stmt_p (def_stmt_info))
1655 op = gimple_get_lhs (def_stmt_info->stmt);
1656 gimple *use_stmt;
1657 use_operand_p use_p;
1658 if (dt == vect_internal_def
1659 && single_imm_use (op, &use_p, &use_stmt)
1660 && is_gimple_assign (def_stmt_info->stmt)
1661 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1662 || (code == PLUS_EXPR
1663 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1664 == MINUS_EXPR))))
1666 tree_code op_def_code = this_code;
1667 if (op_def_code == MINUS_EXPR && opnum == 1)
1668 op_def_code = PLUS_EXPR;
1669 if (in_code == MINUS_EXPR)
1670 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1671 worklist.safe_push (std::make_pair (op_def_code,
1672 def_stmt_info->stmt));
1674 else
1676 tree_code op_def_code = this_code;
1677 if (op_def_code == MINUS_EXPR && opnum == 1)
1678 op_def_code = PLUS_EXPR;
1679 if (in_code == MINUS_EXPR)
1680 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1681 chain.safe_push (chain_op_t (op_def_code, dt, op));
1687 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1688 simple_hashmap_traits <bst_traits, slp_tree> >
1689 scalar_stmts_to_slp_tree_map_t;
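/* The map is keyed by the vector of scalar stmts and caches the result of
   SLP discovery for that stmt set: either the already built node to re-use
   (with its reference count bumped) or a node marked failed whose ->failed
   array records the per-lane match information.  */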
1691 static slp_tree
1692 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map);
1698 static slp_tree
1699 vect_build_slp_tree (vec_info *vinfo,
1700 vec<stmt_vec_info> stmts, unsigned int group_size,
1701 poly_uint64 *max_nunits,
1702 bool *matches, unsigned *limit, unsigned *tree_size,
1703 scalar_stmts_to_slp_tree_map_t *bst_map)
1705 if (slp_tree *leader = bst_map->get (stmts))
1707 if (dump_enabled_p ())
1708 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1709 !(*leader)->failed ? "" : "failed ",
1710 (void *) *leader);
1711 if (!(*leader)->failed)
1713 SLP_TREE_REF_COUNT (*leader)++;
1714 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1715 stmts.release ();
1716 return *leader;
1718 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1719 return NULL;
1722 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1723 so we can pick up backedge destinations during discovery. */
1724 slp_tree res = new _slp_tree;
1725 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1726 SLP_TREE_SCALAR_STMTS (res) = stmts;
1727 bst_map->put (stmts.copy (), res);
1729 if (*limit == 0)
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "SLP discovery limit exceeded\n");
1734 /* Mark the node invalid so we can detect those when still in use
1735 as backedge destinations. */
1736 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1737 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1738 res->failed = XNEWVEC (bool, group_size);
1739 memset (res->failed, 0, sizeof (bool) * group_size);
1740 memset (matches, 0, sizeof (bool) * group_size);
1741 return NULL;
1743 --*limit;
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE, vect_location,
1747 "starting SLP discovery for node %p\n", (void *) res);
1749 poly_uint64 this_max_nunits = 1;
1750 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1751 &this_max_nunits,
1752 matches, limit, tree_size, bst_map);
1753 if (!res_)
1755 if (dump_enabled_p ())
1756 dump_printf_loc (MSG_NOTE, vect_location,
1757 "SLP discovery for node %p failed\n", (void *) res);
1758 /* Mark the node invalid so we can detect those when still in use
1759 as backedge destinations. */
1760 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1761 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1762 res->failed = XNEWVEC (bool, group_size);
1763 if (flag_checking)
1765 unsigned i;
1766 for (i = 0; i < group_size; ++i)
1767 if (!matches[i])
1768 break;
1769 gcc_assert (i < group_size);
1771 memcpy (res->failed, matches, sizeof (bool) * group_size);
1773 else
1775 if (dump_enabled_p ())
1776 dump_printf_loc (MSG_NOTE, vect_location,
1777 "SLP discovery for node %p succeeded\n",
1778 (void *) res);
1779 gcc_assert (res_ == res);
1780 res->max_nunits = this_max_nunits;
1781 vect_update_max_nunits (max_nunits, this_max_nunits);
1782 /* Keep a reference for the bst_map use. */
1783 SLP_TREE_REF_COUNT (res)++;
1785 return res_;
1788 /* Helper for building an associated SLP node chain. */
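/* PERM becomes a VEC_PERM_EXPR node with two new children, each having OP0
   and OP1 as operands; the first child uses OPER1 as its representative, the
   second OPER2, and LPERM selects for every lane which of the two results
   to take.  */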
1790 static void
1791 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1792 slp_tree op0, slp_tree op1,
1793 stmt_vec_info oper1, stmt_vec_info oper2,
1794 vec<std::pair<unsigned, unsigned> > lperm)
1796 unsigned group_size = SLP_TREE_LANES (op1);
1798 slp_tree child1 = new _slp_tree;
1799 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1800 SLP_TREE_VECTYPE (child1) = vectype;
1801 SLP_TREE_LANES (child1) = group_size;
1802 SLP_TREE_CHILDREN (child1).create (2);
1803 SLP_TREE_CHILDREN (child1).quick_push (op0);
1804 SLP_TREE_CHILDREN (child1).quick_push (op1);
1805 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1807 slp_tree child2 = new _slp_tree;
1808 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1809 SLP_TREE_VECTYPE (child2) = vectype;
1810 SLP_TREE_LANES (child2) = group_size;
1811 SLP_TREE_CHILDREN (child2).create (2);
1812 SLP_TREE_CHILDREN (child2).quick_push (op0);
1813 SLP_TREE_REF_COUNT (op0)++;
1814 SLP_TREE_CHILDREN (child2).quick_push (op1);
1815 SLP_TREE_REF_COUNT (op1)++;
1816 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1818 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1819 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1820 SLP_TREE_VECTYPE (perm) = vectype;
1821 SLP_TREE_LANES (perm) = group_size;
1822 /* ??? We should set this to NULL but that's not expected. */
1823 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1824 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1825 SLP_TREE_CHILDREN (perm).quick_push (child1);
1826 SLP_TREE_CHILDREN (perm).quick_push (child2);
1829 /* Recursively build an SLP tree starting from NODE.
1830 Fail (and return NULL) if def-stmts are not
1831 isomorphic, require data permutation or are of unsupported types of
1832 operation. Otherwise, return the SLP node built for STMTS. */
1836 static slp_tree
1837 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1838 vec<stmt_vec_info> stmts, unsigned int group_size,
1839 poly_uint64 *max_nunits,
1840 bool *matches, unsigned *limit, unsigned *tree_size,
1841 scalar_stmts_to_slp_tree_map_t *bst_map)
1843 unsigned nops, i, this_tree_size = 0;
1844 poly_uint64 this_max_nunits = *max_nunits;
1846 matches[0] = false;
1848 stmt_vec_info stmt_info = stmts[0];
1849 if (!is_a<gcall *> (stmt_info->stmt)
1850 && !is_a<gassign *> (stmt_info->stmt)
1851 && !is_a<gphi *> (stmt_info->stmt))
1852 return NULL;
1854 nops = gimple_num_args (stmt_info->stmt);
1855 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1856 STMT_VINFO_GATHER_SCATTER_P
1857 (stmt_info)))
1858 nops = map[0];
1860 /* If the SLP node is a PHI (induction or reduction), terminate
1861 the recursion. */
1862 bool *skip_args = XALLOCAVEC (bool, nops);
1863 memset (skip_args, 0, sizeof (bool) * nops);
1864 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1865 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1867 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1868 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1869 group_size);
1870 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1871 max_nunits))
1872 return NULL;
1874 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1875 if (def_type == vect_induction_def)
1877 /* Induction PHIs are not cycles but walk the initial
1878 value. Only for inner loops though, for outer loops
1879 we need to pick up the value from the actual PHIs
1880 to more easily support peeling and epilogue vectorization. */
1881 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 if (!nested_in_vect_loop_p (loop, stmt_info))
1883 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1884 else
1885 loop = loop->inner;
1886 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1888 else if (def_type == vect_reduction_def
1889 || def_type == vect_double_reduction_def
1890 || def_type == vect_nested_cycle
1891 || def_type == vect_first_order_recurrence)
1893 /* Else def types have to match. */
1894 stmt_vec_info other_info;
1895 bool all_same = true;
1896 FOR_EACH_VEC_ELT (stmts, i, other_info)
1898 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1899 return NULL;
1900 if (other_info != stmt_info)
1901 all_same = false;
1903 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1904 /* Reduction initial values are not explicitly represented. */
1905 if (def_type != vect_first_order_recurrence
1906 && !nested_in_vect_loop_p (loop, stmt_info))
1907 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1908 /* Reduction chain backedge defs are filled manually.
1909 ??? Need a better way to identify a SLP reduction chain PHI.
1910 Or a better overall way to SLP match those. */
1911 if (all_same && def_type == vect_reduction_def)
1912 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1914 else if (def_type != vect_internal_def)
1915 return NULL;
1919 bool two_operators = false;
1920 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1921 tree vectype = NULL_TREE;
1922 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1923 &this_max_nunits, matches, &two_operators,
1924 &vectype))
1925 return NULL;
1927 /* If the SLP node is a load, terminate the recursion unless masked. */
1928 if (STMT_VINFO_DATA_REF (stmt_info)
1929 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1931 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1932 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1933 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1934 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1935 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1936 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1937 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1938 else
1940 *max_nunits = this_max_nunits;
1941 (*tree_size)++;
1942 node = vect_create_new_slp_node (node, stmts, 0);
1943 SLP_TREE_VECTYPE (node) = vectype;
1944 /* And compute the load permutation. Whether it is actually
1945 a permutation depends on the unrolling factor which is
1946 decided later. */
1947 vec<unsigned> load_permutation;
1948 int j;
1949 stmt_vec_info load_info;
1950 load_permutation.create (group_size);
1951 stmt_vec_info first_stmt_info
1952 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1953 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1955 int load_place;
1956 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1957 load_place = vect_get_place_in_interleaving_chain
1958 (load_info, first_stmt_info);
1959 else
1960 load_place = 0;
1961 gcc_assert (load_place != -1);
1962 load_permutation.safe_push (load_place);
1964 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1965 return node;
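/* Illustrative example (editor's note): if the interleaving group loads
   a[0] ... a[3] and the scalar stmts of this node read a[2], a[0], a[3],
   a[1] in lane order, the load permutation recorded above is
   { 2, 0, 3, 1 }.  Whether that really requires a permute is only known
   once the unrolling factor has been decided.  */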
1968 else if (gimple_assign_single_p (stmt_info->stmt)
1969 && !gimple_vuse (stmt_info->stmt)
1970 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1972 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1973 the same SSA name vector with a type compatible with vectype. */
1974 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1975 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1976 stmt_vec_info estmt_info;
1977 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1979 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1980 tree bfref = gimple_assign_rhs1 (estmt);
1981 HOST_WIDE_INT lane;
1982 if (!known_eq (bit_field_size (bfref),
1983 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1984 || !constant_multiple_p (bit_field_offset (bfref),
1985 bit_field_size (bfref), &lane))
1987 lperm.release ();
1988 matches[0] = false;
1989 return NULL;
1991 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1993 slp_tree vnode = vect_create_new_slp_node (vNULL);
1994 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1995 /* ??? We record vectype here but we hide eventually necessary
1996 punning and instead rely on code generation to materialize
1997 VIEW_CONVERT_EXPRs as necessary. We instead should make
1998 this explicit somehow. */
1999 SLP_TREE_VECTYPE (vnode) = vectype;
2000 else
2002 /* For different size but compatible elements we can still
2003 use VEC_PERM_EXPR without punning. */
2004 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2005 && types_compatible_p (TREE_TYPE (vectype),
2006 TREE_TYPE (TREE_TYPE (vec))));
2007 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2009 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2010 unsigned HOST_WIDE_INT const_nunits;
2011 if (nunits.is_constant (&const_nunits))
2012 SLP_TREE_LANES (vnode) = const_nunits;
2013 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2014 /* We are always building a permutation node even if it is an identity
2015 permute to shield the rest of the vectorizer from the odd node
2016 representing an actual vector without any scalar ops.
2017 ??? We could hide it completely by making the permute node
2018 external? */
2019 node = vect_create_new_slp_node (node, stmts, 1);
2020 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2021 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2022 SLP_TREE_VECTYPE (node) = vectype;
2023 SLP_TREE_CHILDREN (node).quick_push (vnode);
2024 return node;
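/* Illustrative example (editor's note): with a V4SI vectype (32-bit
   elements), a lane stmt like _1 = BIT_FIELD_REF <v_2, 32, 64> extracts
   lane 64/32 = 2 of v_2, so the lane permutation entry pushed above is
   (0, 2) and the single child VNODE carries v_2 as its vector def.  */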
2026 /* When discovery reaches an associative operation, see whether we can
2027 improve that to match up lanes in a way superior to the operand
2028 swapping code which at most looks at two defs.
2029 ??? For BB vectorization we cannot do the brute-force search
2030 for matching as we can succeed by means of builds from scalars
2031 and have no good way to "cost" one build against another. */
2032 else if (is_a <loop_vec_info> (vinfo)
2033 /* ??? We don't handle !vect_internal_def defs below. */
2034 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2035 && is_gimple_assign (stmt_info->stmt)
2036 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2037 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2038 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2039 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2040 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2042 /* See if we have a chain of (mixed) adds or subtracts or other
2043 associative ops. */
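/* Illustrative example (editor's note): for two lanes
   x = a + b - c and y = d + e - f, the linearization below yields the
   per-lane chains { (+,a), (+,b), (-,c) } and { (+,d), (+,e), (-,f) };
   a child SLP node is then built for each chain position across all
   lanes and the children are combined left to right.  */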
2044 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2045 if (code == MINUS_EXPR)
2046 code = PLUS_EXPR;
2047 stmt_vec_info other_op_stmt_info = NULL;
2048 stmt_vec_info op_stmt_info = NULL;
2049 unsigned chain_len = 0;
2050 auto_vec<chain_op_t> chain;
2051 auto_vec<std::pair<tree_code, gimple *> > worklist;
2052 auto_vec<vec<chain_op_t> > chains (group_size);
2053 auto_vec<slp_tree, 4> children;
2054 bool hard_fail = true;
2055 for (unsigned lane = 0; lane < group_size; ++lane)
2057 /* For each lane linearize the addition/subtraction (or other
2058 uniform associative operation) expression tree. */
2059 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2060 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2061 stmts[lane]->stmt, op_stmt, other_op_stmt,
2062 NULL);
2063 if (!op_stmt_info && op_stmt)
2064 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2065 if (!other_op_stmt_info && other_op_stmt)
2066 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2067 if (chain.length () == 2)
2069 /* In a chain of just two elements resort to the regular
2070 operand swapping scheme. If we run into a length
2071 mismatch still hard-FAIL. */
2072 if (chain_len == 0)
2073 hard_fail = false;
2074 else
2076 matches[lane] = false;
2077 /* ??? We might want to process the other lanes, but
2078 make sure to not give false matching hints to the
2079 caller for lanes we did not process. */
2080 if (lane != group_size - 1)
2081 matches[0] = false;
2083 break;
2085 else if (chain_len == 0)
2086 chain_len = chain.length ();
2087 else if (chain.length () != chain_len)
2089 /* ??? Here we could slip in magic to compensate with
2090 neutral operands. */
2091 matches[lane] = false;
2092 if (lane != group_size - 1)
2093 matches[0] = false;
2094 break;
2096 chains.quick_push (chain.copy ());
2097 chain.truncate (0);
2099 if (chains.length () == group_size)
2101 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2102 if (!op_stmt_info)
2104 hard_fail = false;
2105 goto out;
2107 /* Now we have a set of chains with the same length. */
2108 /* 1. pre-sort according to def_type and operation. */
2109 for (unsigned lane = 0; lane < group_size; ++lane)
2110 chains[lane].stablesort (dt_sort_cmp, vinfo);
2111 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_NOTE, vect_location,
2114 "pre-sorted chains of %s\n",
2115 get_tree_code_name (code));
2116 for (unsigned lane = 0; lane < group_size; ++lane)
2118 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2119 dump_printf (MSG_NOTE, "%s %T ",
2120 get_tree_code_name (chains[lane][opnum].code),
2121 chains[lane][opnum].op);
2122 dump_printf (MSG_NOTE, "\n");
2125 /* 2. try to build children nodes, associating as necessary. */
2126 for (unsigned n = 0; n < chain_len; ++n)
2128 vect_def_type dt = chains[0][n].dt;
2129 unsigned lane;
2130 for (lane = 0; lane < group_size; ++lane)
2131 if (chains[lane][n].dt != dt)
2133 if (dt == vect_constant_def
2134 && chains[lane][n].dt == vect_external_def)
2135 dt = vect_external_def;
2136 else if (dt == vect_external_def
2137 && chains[lane][n].dt == vect_constant_def)
2139 else
2140 break;
2142 if (lane != group_size)
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "giving up on chain due to mismatched "
2147 "def types\n");
2148 matches[lane] = false;
2149 if (lane != group_size - 1)
2150 matches[0] = false;
2151 goto out;
2153 if (dt == vect_constant_def
2154 || dt == vect_external_def)
2156 /* Check whether we can build the invariant. If we can't,
2157 we never will be able to. */
2158 tree type = TREE_TYPE (chains[0][n].op);
2159 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2160 && (TREE_CODE (type) == BOOLEAN_TYPE
2161 || !can_duplicate_and_interleave_p (vinfo, group_size,
2162 type)))
2164 matches[0] = false;
2165 goto out;
2167 vec<tree> ops;
2168 ops.create (group_size);
2169 for (lane = 0; lane < group_size; ++lane)
2170 ops.quick_push (chains[lane][n].op);
2171 slp_tree child = vect_create_new_slp_node (ops);
2172 SLP_TREE_DEF_TYPE (child) = dt;
2173 children.safe_push (child);
2175 else if (dt != vect_internal_def)
2177 /* Not sure, we might need something special here.
2178 gcc.dg/vect/pr96854.c,
2179 gfortran.dg/vect/fast-math-pr37021.f90
2180 and gfortran.dg/vect/pr61171.f trigger. */
2181 /* Soft-fail for now. */
2182 hard_fail = false;
2183 goto out;
2185 else
2187 vec<stmt_vec_info> op_stmts;
2188 op_stmts.create (group_size);
2189 slp_tree child = NULL;
2190 /* Brute-force our way. We have to consider a lane
2191 failing after fixing an earlier failure higher up in the
2192 SLP discovery recursion. So track the current
2193 permute per lane. */
2194 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2195 memset (perms, 0, sizeof (unsigned) * group_size);
2198 op_stmts.truncate (0);
2199 for (lane = 0; lane < group_size; ++lane)
2200 op_stmts.quick_push
2201 (vinfo->lookup_def (chains[lane][n].op));
2202 child = vect_build_slp_tree (vinfo, op_stmts,
2203 group_size, &this_max_nunits,
2204 matches, limit,
2205 &this_tree_size, bst_map);
2206 /* ??? We're likely getting too many fatal mismatches
2207 here so maybe we want to ignore them (but then we
2208 have no idea which lanes fatally mismatched). */
2209 if (child || !matches[0])
2210 break;
2211 /* Swap another lane we have not yet matched up into
2212 lanes that did not match. If we run out of
2213 permute possibilities for a lane terminate the
2214 search. */
2215 bool term = false;
2216 for (lane = 1; lane < group_size; ++lane)
2217 if (!matches[lane])
2219 if (n + perms[lane] + 1 == chain_len)
2221 term = true;
2222 break;
2224 std::swap (chains[lane][n],
2225 chains[lane][n + perms[lane] + 1]);
2226 perms[lane]++;
2228 if (term)
2229 break;
2231 while (1);
2232 if (!child)
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_NOTE, vect_location,
2236 "failed to match up op %d\n", n);
2237 op_stmts.release ();
2238 if (lane != group_size - 1)
2239 matches[0] = false;
2240 else
2241 matches[lane] = false;
2242 goto out;
2244 if (dump_enabled_p ())
2246 dump_printf_loc (MSG_NOTE, vect_location,
2247 "matched up op %d to\n", n);
2248 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2250 children.safe_push (child);
2253 /* 3. build SLP nodes to combine the chain. */
2254 for (unsigned lane = 0; lane < group_size; ++lane)
2255 if (chains[lane][0].code != code)
2257 /* See if there's any alternate all-PLUS entry. */
2258 unsigned n;
2259 for (n = 1; n < chain_len; ++n)
2261 for (lane = 0; lane < group_size; ++lane)
2262 if (chains[lane][n].code != code)
2263 break;
2264 if (lane == group_size)
2265 break;
2267 if (n != chain_len)
2269 /* Swap that in at first position. */
2270 std::swap (children[0], children[n]);
2271 for (lane = 0; lane < group_size; ++lane)
2272 std::swap (chains[lane][0], chains[lane][n]);
2274 else
2276 /* ??? When this triggers and we end up with two
2277 vect_constant/external_def up-front, things break (ICE)
2278 spectacularly finding an insertion place for the
2279 all-constant op. We should have a fully
2280 vect_internal_def operand though(?) so we can swap
2281 that into first place and then prepend the all-zero
2282 constant. */
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE, vect_location,
2285 "inserting constant zero to compensate "
2286 "for (partially) negated first "
2287 "operand\n");
2288 chain_len++;
2289 for (lane = 0; lane < group_size; ++lane)
2290 chains[lane].safe_insert
2291 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2292 vec<tree> zero_ops;
2293 zero_ops.create (group_size);
2294 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2295 for (lane = 1; lane < group_size; ++lane)
2296 zero_ops.quick_push (zero_ops[0]);
2297 slp_tree zero = vect_create_new_slp_node (zero_ops);
2298 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2299 children.safe_insert (0, zero);
2301 break;
2303 for (unsigned i = 1; i < children.length (); ++i)
2305 slp_tree op0 = children[i - 1];
2306 slp_tree op1 = children[i];
2307 bool this_two_op = false;
2308 for (unsigned lane = 0; lane < group_size; ++lane)
2309 if (chains[lane][i].code != chains[0][i].code)
2311 this_two_op = true;
2312 break;
2314 slp_tree child;
2315 if (i == children.length () - 1)
2316 child = vect_create_new_slp_node (node, stmts, 2);
2317 else
2318 child = vect_create_new_slp_node (2, ERROR_MARK);
2319 if (this_two_op)
2321 vec<std::pair<unsigned, unsigned> > lperm;
2322 lperm.create (group_size);
2323 for (unsigned lane = 0; lane < group_size; ++lane)
2324 lperm.quick_push (std::make_pair
2325 (chains[lane][i].code != chains[0][i].code, lane));
2326 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2327 (chains[0][i].code == code
2328 ? op_stmt_info
2329 : other_op_stmt_info),
2330 (chains[0][i].code == code
2331 ? other_op_stmt_info
2332 : op_stmt_info),
2333 lperm);
2335 else
2337 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2338 SLP_TREE_VECTYPE (child) = vectype;
2339 SLP_TREE_LANES (child) = group_size;
2340 SLP_TREE_CHILDREN (child).quick_push (op0);
2341 SLP_TREE_CHILDREN (child).quick_push (op1);
2342 SLP_TREE_REPRESENTATIVE (child)
2343 = (chains[0][i].code == code
2344 ? op_stmt_info : other_op_stmt_info);
2346 children[i] = child;
2348 *tree_size += this_tree_size + 1;
2349 *max_nunits = this_max_nunits;
2350 while (!chains.is_empty ())
2351 chains.pop ().release ();
2352 return node;
2354 out:
2355 while (!children.is_empty ())
2356 vect_free_slp_tree (children.pop ());
2357 while (!chains.is_empty ())
2358 chains.pop ().release ();
2359 /* Hard-fail, otherwise we might run into quadratic processing of the
2360 chains starting one stmt into the chain again. */
2361 if (hard_fail)
2362 return NULL;
2363 /* Fall thru to normal processing. */
2366 /* Get at the operands, verifying they are compatible. */
2367 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2368 slp_oprnd_info oprnd_info;
2369 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2371 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2372 stmts, i, &oprnds_info);
2373 if (res != 0)
2374 matches[(res == -1) ? 0 : i] = false;
2375 if (!matches[0])
2376 break;
2378 for (i = 0; i < group_size; ++i)
2379 if (!matches[i])
2381 vect_free_oprnd_info (oprnds_info);
2382 return NULL;
2384 swap = NULL;
2386 auto_vec<slp_tree, 4> children;
2388 stmt_info = stmts[0];
2390 /* Create SLP_TREE nodes for the definition node/s. */
2391 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2393 slp_tree child;
2394 unsigned int j;
2396 /* We're skipping certain operands from processing, for example
2397 outer loop reduction initial defs. */
2398 if (skip_args[i])
2400 children.safe_push (NULL);
2401 continue;
2404 if (oprnd_info->first_dt == vect_uninitialized_def)
2406 /* COND_EXPRs may have one operand too many if the condition
2407 is an SSA name. */
2408 gcc_assert (i == 3 && nops == 4);
2409 continue;
2412 if (is_a <bb_vec_info> (vinfo)
2413 && oprnd_info->first_dt == vect_internal_def
2414 && !oprnd_info->any_pattern)
2416 /* For BB vectorization, if all defs are the same do not
2417 bother to continue the build along the single-lane
2418 graph but use a splat of the scalar value. */
2419 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2420 for (j = 1; j < group_size; ++j)
2421 if (oprnd_info->def_stmts[j] != first_def)
2422 break;
2423 if (j == group_size
2424 /* But avoid doing this for loads where we may be
2425 able to CSE things, unless the stmt is not
2426 vectorizable. */
2427 && (!STMT_VINFO_VECTORIZABLE (first_def)
2428 || !gimple_vuse (first_def->stmt)))
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "Using a splat of the uniform operand %G",
2433 first_def->stmt);
2434 oprnd_info->first_dt = vect_external_def;
2438 if (oprnd_info->first_dt == vect_external_def
2439 || oprnd_info->first_dt == vect_constant_def)
2441 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2442 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2443 oprnd_info->ops = vNULL;
2444 children.safe_push (invnode);
2445 continue;
2448 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2449 group_size, &this_max_nunits,
2450 matches, limit,
2451 &this_tree_size, bst_map)) != NULL)
2453 oprnd_info->def_stmts = vNULL;
2454 children.safe_push (child);
2455 continue;
2458 /* If the SLP build for operand zero failed and operand zero
2459 and one can be commuted, try that for the scalar stmts
2460 that failed the match. */
2461 if (i == 0
2462 /* A first scalar stmt mismatch signals a fatal mismatch. */
2463 && matches[0]
2464 /* ??? For COND_EXPRs we can swap the comparison operands
2465 as well as the arms under some constraints. */
2466 && nops == 2
2467 && oprnds_info[1]->first_dt == vect_internal_def
2468 && is_gimple_assign (stmt_info->stmt)
2469 /* Swapping operands for reductions breaks assumptions later on. */
2470 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2471 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2473 /* See whether we can swap the matching or the non-matching
2474 stmt operands. */
2475 bool swap_not_matching = true;
2478 for (j = 0; j < group_size; ++j)
2480 if (matches[j] != !swap_not_matching)
2481 continue;
2482 stmt_vec_info stmt_info = stmts[j];
2483 /* Verify if we can swap operands of this stmt. */
2484 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2485 if (!stmt
2486 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2488 if (!swap_not_matching)
2489 goto fail;
2490 swap_not_matching = false;
2491 break;
2495 while (j != group_size);
2497 /* Swap mismatched definition stmts. */
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "Re-trying with swapped operands of stmts ");
2501 for (j = 0; j < group_size; ++j)
2502 if (matches[j] == !swap_not_matching)
2504 std::swap (oprnds_info[0]->def_stmts[j],
2505 oprnds_info[1]->def_stmts[j]);
2506 std::swap (oprnds_info[0]->ops[j],
2507 oprnds_info[1]->ops[j]);
2508 if (dump_enabled_p ())
2509 dump_printf (MSG_NOTE, "%d ", j);
2511 if (dump_enabled_p ())
2512 dump_printf (MSG_NOTE, "\n");
2513 /* After swapping some operands we lost track whether an
2514 operand has any pattern defs so be conservative here. */
2515 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2516 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2517 /* And try again with scratch 'matches' ... */
2518 bool *tem = XALLOCAVEC (bool, group_size);
2519 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2520 group_size, &this_max_nunits,
2521 tem, limit,
2522 &this_tree_size, bst_map)) != NULL)
2524 oprnd_info->def_stmts = vNULL;
2525 children.safe_push (child);
2526 continue;
2529 fail:
2531 /* If the SLP build failed and we analyze a basic-block
2532 simply treat nodes we fail to build as externally defined
2533 (and thus build vectors from the scalar defs).
2534 The cost model will reject outright expensive cases.
2535 ??? This doesn't treat cases where permutation ultimately
2536 fails (or we don't try permutation below). Ideally we'd
2537 even compute a permutation that will end up with the maximum
2538 SLP tree size... */
2539 if (is_a <bb_vec_info> (vinfo)
2540 /* ??? Rejecting patterns this way doesn't work. We'd have to
2541 do extra work to cancel the pattern so the uses see the
2542 scalar version. */
2543 && !is_pattern_stmt_p (stmt_info)
2544 && !oprnd_info->any_pattern)
2546 /* But if there's a leading vector-sized set of matching stmts,
2547 fail here so we can split the group. This matches the condition
2548 vect_analyze_slp_instance uses. */
2549 /* ??? We might want to split here and combine the results to support
2550 multiple vector sizes better. */
2551 for (j = 0; j < group_size; ++j)
2552 if (!matches[j])
2553 break;
2554 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "Building vector operands from scalars\n");
2559 this_tree_size++;
2560 child = vect_create_new_slp_node (oprnd_info->ops);
2561 children.safe_push (child);
2562 oprnd_info->ops = vNULL;
2563 continue;
2567 gcc_assert (child == NULL);
2568 FOR_EACH_VEC_ELT (children, j, child)
2569 if (child)
2570 vect_free_slp_tree (child);
2571 vect_free_oprnd_info (oprnds_info);
2572 return NULL;
2575 vect_free_oprnd_info (oprnds_info);
2577 /* If all children of this node are built up from uniform scalars,
2578 or if it needs more than one possibly expensive vector construction,
2579 then just throw the node away, causing it to be built up from
2580 scalars instead. The exception is the SLP node for the vector store. */
2581 if (is_a <bb_vec_info> (vinfo)
2582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2583 /* ??? Rejecting patterns this way doesn't work. We'd have to
2584 do extra work to cancel the pattern so the uses see the
2585 scalar version. */
2586 && !is_pattern_stmt_p (stmt_info))
2588 slp_tree child;
2589 unsigned j;
2590 bool all_uniform_p = true;
2591 unsigned n_vector_builds = 0;
2592 FOR_EACH_VEC_ELT (children, j, child)
2594 if (!child)
2596 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2597 all_uniform_p = false;
2598 else if (!vect_slp_tree_uniform_p (child))
2600 all_uniform_p = false;
2601 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2602 n_vector_builds++;
2605 if (all_uniform_p
2606 || n_vector_builds > 1
2607 || (n_vector_builds == children.length ()
2608 && is_a <gphi *> (stmt_info->stmt)))
2610 /* Roll back. */
2611 matches[0] = false;
2612 FOR_EACH_VEC_ELT (children, j, child)
2613 if (child)
2614 vect_free_slp_tree (child);
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "Building parent vector operands from "
2619 "scalars instead\n");
2620 return NULL;
2624 *tree_size += this_tree_size + 1;
2625 *max_nunits = this_max_nunits;
2627 if (two_operators)
2629 /* ??? We'd likely want to either cache in bst_map something like
2630 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2631 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2632 explicit stmts to put in so the keying on 'stmts' doesn't
2633 work (but we have the same issue with nodes that use 'ops'). */
2634 slp_tree one = new _slp_tree;
2635 slp_tree two = new _slp_tree;
2636 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2637 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2638 SLP_TREE_VECTYPE (one) = vectype;
2639 SLP_TREE_VECTYPE (two) = vectype;
2640 SLP_TREE_CHILDREN (one).safe_splice (children);
2641 SLP_TREE_CHILDREN (two).safe_splice (children);
2642 slp_tree child;
2643 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2644 SLP_TREE_REF_COUNT (child)++;
2646 /* Here we record the original defs since this
2647 node represents the final lane configuration. */
2648 node = vect_create_new_slp_node (node, stmts, 2);
2649 SLP_TREE_VECTYPE (node) = vectype;
2650 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2651 SLP_TREE_CHILDREN (node).quick_push (one);
2652 SLP_TREE_CHILDREN (node).quick_push (two);
2653 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2654 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2655 enum tree_code ocode = ERROR_MARK;
2656 stmt_vec_info ostmt_info;
2657 unsigned j = 0;
2658 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2660 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2661 if (gimple_assign_rhs_code (ostmt) != code0)
2663 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2664 ocode = gimple_assign_rhs_code (ostmt);
2665 j = i;
2667 else
2668 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2670 SLP_TREE_CODE (one) = code0;
2671 SLP_TREE_CODE (two) = ocode;
2672 SLP_TREE_LANES (one) = stmts.length ();
2673 SLP_TREE_LANES (two) = stmts.length ();
2674 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2675 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2676 return node;
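/* Illustrative example (editor's note): for the group
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } node ONE gets PLUS_EXPR, node
   TWO gets MINUS_EXPR and the lane permutation built above is
   { (0,0), (1,1), (0,2), (1,3) }, i.e. even lanes come from ONE and odd
   lanes from TWO, a blend a target may be able to match to an
   addsub-style instruction.  */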
2679 node = vect_create_new_slp_node (node, stmts, nops);
2680 SLP_TREE_VECTYPE (node) = vectype;
2681 SLP_TREE_CHILDREN (node).splice (children);
2682 return node;
2685 /* Dump a single SLP tree NODE. */
2687 static void
2688 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2689 slp_tree node)
2691 unsigned i, j;
2692 slp_tree child;
2693 stmt_vec_info stmt_info;
2694 tree op;
2696 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2697 dump_user_location_t user_loc = loc.get_user_location ();
2698 dump_printf_loc (metadata, user_loc,
2699 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2700 ", refcnt=%u)",
2701 SLP_TREE_DEF_TYPE (node) == vect_external_def
2702 ? " (external)"
2703 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2704 ? " (constant)"
2705 : ""), (void *) node,
2706 estimated_poly_value (node->max_nunits),
2707 SLP_TREE_REF_COUNT (node));
2708 if (SLP_TREE_VECTYPE (node))
2709 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2710 dump_printf (metadata, "\n");
2711 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2713 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2714 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2715 else
2716 dump_printf_loc (metadata, user_loc, "op template: %G",
2717 SLP_TREE_REPRESENTATIVE (node)->stmt);
2719 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2720 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2721 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2722 else
2724 dump_printf_loc (metadata, user_loc, "\t{ ");
2725 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2726 dump_printf (metadata, "%T%s ", op,
2727 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2728 dump_printf (metadata, "}\n");
2730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2732 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2733 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2734 dump_printf (dump_kind, " %u", j);
2735 dump_printf (dump_kind, " }\n");
2737 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2739 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2740 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2741 dump_printf (dump_kind, " %u[%u]",
2742 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2743 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2744 dump_printf (dump_kind, " }\n");
2746 if (SLP_TREE_CHILDREN (node).is_empty ())
2747 return;
2748 dump_printf_loc (metadata, user_loc, "\tchildren");
2749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2750 dump_printf (dump_kind, " %p", (void *)child);
2751 dump_printf (dump_kind, "\n");
2754 DEBUG_FUNCTION void
2755 debug (slp_tree node)
2757 debug_dump_context ctx;
2758 vect_print_slp_tree (MSG_NOTE,
2759 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2760 node);
2763 /* Recursive helper for the dot producer below. */
2765 static void
2766 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2768 if (visited.add (node))
2769 return;
2771 fprintf (f, "\"%p\" [label=\"", (void *)node);
2772 vect_print_slp_tree (MSG_NOTE,
2773 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2774 node);
2775 fprintf (f, "\"];\n");
2778 for (slp_tree child : SLP_TREE_CHILDREN (node))
2779 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2781 for (slp_tree child : SLP_TREE_CHILDREN (node))
2782 if (child)
2783 dot_slp_tree (f, child, visited);
2786 DEBUG_FUNCTION void
2787 dot_slp_tree (const char *fname, slp_tree node)
2789 FILE *f = fopen (fname, "w");
2790 fprintf (f, "digraph {\n");
2791 fflush (f);
2793 debug_dump_context ctx (f);
2794 hash_set<slp_tree> visited;
2795 dot_slp_tree (f, node, visited);
2797 fflush (f);
2798 fprintf (f, "}\n");
2799 fclose (f);
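/* Usage sketch (editor's note): the dumper above can be invoked from a
   debugger session, e.g.
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   and the resulting file rendered with Graphviz:
     dot -Tpng /tmp/slp.dot -o slp.png  */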
2802 /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */
2804 static void
2805 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2806 slp_tree node, hash_set<slp_tree> &visited)
2808 unsigned i;
2809 slp_tree child;
2811 if (visited.add (node))
2812 return;
2814 vect_print_slp_tree (dump_kind, loc, node);
2816 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2817 if (child)
2818 vect_print_slp_graph (dump_kind, loc, child, visited);
2821 static void
2822 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2823 slp_tree entry)
2825 hash_set<slp_tree> visited;
2826 vect_print_slp_graph (dump_kind, loc, entry, visited);
2829 /* Mark the tree rooted at NODE with PURE_SLP. */
2831 static void
2832 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2834 int i;
2835 stmt_vec_info stmt_info;
2836 slp_tree child;
2838 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2839 return;
2841 if (visited.add (node))
2842 return;
2844 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2845 STMT_SLP_TYPE (stmt_info) = pure_slp;
2847 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2848 if (child)
2849 vect_mark_slp_stmts (child, visited);
2852 static void
2853 vect_mark_slp_stmts (slp_tree node)
2855 hash_set<slp_tree> visited;
2856 vect_mark_slp_stmts (node, visited);
2859 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2861 static void
2862 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2864 int i;
2865 stmt_vec_info stmt_info;
2866 slp_tree child;
2868 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2869 return;
2871 if (visited.add (node))
2872 return;
2874 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2876 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2877 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2878 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2881 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2882 if (child)
2883 vect_mark_slp_stmts_relevant (child, visited);
2886 static void
2887 vect_mark_slp_stmts_relevant (slp_tree node)
2889 hash_set<slp_tree> visited;
2890 vect_mark_slp_stmts_relevant (node, visited);
2894 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2896 static void
2897 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2898 hash_set<slp_tree> &visited)
2900 if (!node || visited.add (node))
2901 return;
2903 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904 return;
2906 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2908 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2909 if (STMT_VINFO_DATA_REF (stmt_info)
2910 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2911 loads.safe_push (node);
2914 unsigned i;
2915 slp_tree child;
2916 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917 vect_gather_slp_loads (loads, child, visited);
2921 /* Find the last scalar stmt in NODE. */
2923 stmt_vec_info
2924 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2926 stmt_vec_info last = NULL;
2927 stmt_vec_info stmt_vinfo;
2929 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2931 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2932 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2935 return last;
2938 /* Find the first stmt in NODE. */
2940 stmt_vec_info
2941 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2943 stmt_vec_info first = NULL;
2944 stmt_vec_info stmt_vinfo;
2946 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2948 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2949 if (!first
2950 || get_later_stmt (stmt_vinfo, first) == first)
2951 first = stmt_vinfo;
2954 return first;
2957 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2958 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2959 (also containing the first GROUP1_SIZE stmts, since stores are
2960 consecutive), the second containing the remainder.
2961 Return the first stmt in the second group. */
2963 static stmt_vec_info
2964 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2966 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2967 gcc_assert (group1_size > 0);
2968 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2969 gcc_assert (group2_size > 0);
2970 DR_GROUP_SIZE (first_vinfo) = group1_size;
2972 stmt_vec_info stmt_info = first_vinfo;
2973 for (unsigned i = group1_size; i > 1; i--)
2975 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2976 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2978 /* STMT is now the last element of the first group. */
2979 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2980 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2982 DR_GROUP_SIZE (group2) = group2_size;
2983 for (stmt_info = group2; stmt_info;
2984 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2986 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2987 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2990 /* For the second group, the DR_GROUP_GAP is that before the original group,
2991 plus skipping over the first vector. */
2992 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2994 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2995 DR_GROUP_GAP (first_vinfo) += group2_size;
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2999 group1_size, group2_size);
3001 return group2;
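/* Worked example (editor's note): splitting a store group of size 8
   with GROUP1_SIZE 3 and original gap G leaves the first group with
   size 3 and gap G + 5, while the second group of size 5 starts with
   gap G + 3, exactly the adjustments performed above.  */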
3004 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3005 statements and a vector of NUNITS elements. */
3007 static poly_uint64
3008 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3010 return exact_div (common_multiple (nunits, group_size), group_size);
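/* Worked example (editor's note): with NUNITS 4 and GROUP_SIZE 6 the
   least common multiple is 12, so the instance is unrolled by a factor
   of 12 / 6 = 2; when GROUP_SIZE is already a multiple of NUNITS the
   factor is 1.  */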
3013 /* Helper that checks to see if a node is a load node. */
3015 static inline bool
3016 vect_is_slp_load_node (slp_tree root)
3018 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3019 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3020 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3024 /* Helper function of optimize_load_redistribution that performs the operation
3025 recursively. */
3027 static slp_tree
3028 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3029 vec_info *vinfo, unsigned int group_size,
3030 hash_map<slp_tree, slp_tree> *load_map,
3031 slp_tree root)
3033 if (slp_tree *leader = load_map->get (root))
3034 return *leader;
3036 slp_tree node;
3037 unsigned i;
3039 /* For now, we don't know anything about externals so do not do anything. */
3040 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3041 return NULL;
3042 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3044 /* First convert this node into a load node and add it to the leaves
3045 list and flatten the permute from a lane to a load one. If it's
3046 unneeded it will be elided later. */
3047 vec<stmt_vec_info> stmts;
3048 stmts.create (SLP_TREE_LANES (root));
3049 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3050 for (unsigned j = 0; j < lane_perm.length (); j++)
3052 std::pair<unsigned, unsigned> perm = lane_perm[j];
3053 node = SLP_TREE_CHILDREN (root)[perm.first];
3055 if (!vect_is_slp_load_node (node)
3056 || SLP_TREE_CHILDREN (node).exists ())
3058 stmts.release ();
3059 goto next;
3062 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "converting stmts on permute node %p\n",
3068 (void *) root);
3070 bool *matches = XALLOCAVEC (bool, group_size);
3071 poly_uint64 max_nunits = 1;
3072 unsigned tree_size = 0, limit = 1;
3073 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3074 matches, &limit, &tree_size, bst_map);
3075 if (!node)
3076 stmts.release ();
3078 load_map->put (root, node);
3079 return node;
3082 next:
3083 load_map->put (root, NULL);
3085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3087 slp_tree value
3088 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3089 node);
3090 if (value)
3092 SLP_TREE_REF_COUNT (value)++;
3093 SLP_TREE_CHILDREN (root)[i] = value;
3094 /* ??? We know the original leaves of the replaced nodes will
3095 be referenced by bst_map, only the permutes created by
3096 pattern matching are not. */
3097 if (SLP_TREE_REF_COUNT (node) == 1)
3098 load_map->remove (node);
3099 vect_free_slp_tree (node);
3103 return NULL;
3106 /* Temporary workaround for loads not being CSEd during SLP build. This
3107 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3108 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3109 same DR such that the final operation is equal to a permuted load. Such
3110 NODES are then directly converted into LOADS themselves. The nodes are
3111 CSEd using BST_MAP. */
3113 static void
3114 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3115 vec_info *vinfo, unsigned int group_size,
3116 hash_map<slp_tree, slp_tree> *load_map,
3117 slp_tree root)
3119 slp_tree node;
3120 unsigned i;
3122 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3124 slp_tree value
3125 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3126 node);
3127 if (value)
3129 SLP_TREE_REF_COUNT (value)++;
3130 SLP_TREE_CHILDREN (root)[i] = value;
3131 /* ??? We know the original leaves of the replaced nodes will
3132 be referenced by bst_map, only the permutes created by
3133 pattern matching are not. */
3134 if (SLP_TREE_REF_COUNT (node) == 1)
3135 load_map->remove (node);
3136 vect_free_slp_tree (node);
3141 /* Helper function of vect_match_slp_patterns.
3143 Attempts to match patterns against the SLP tree rooted in REF_NODE using
3144 VINFO. Patterns are matched in post-order traversal.
3146 If matching is successful the tree referenced by REF_NODE is updated in
3147 place and true is returned, otherwise it is left unchanged and false is returned. */
3149 static bool
3150 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3151 slp_tree_to_load_perm_map_t *perm_cache,
3152 slp_compat_nodes_map_t *compat_cache,
3153 hash_set<slp_tree> *visited)
3155 unsigned i;
3156 slp_tree node = *ref_node;
3157 bool found_p = false;
3158 if (!node || visited->add (node))
3159 return false;
3161 slp_tree child;
3162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3163 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3164 vinfo, perm_cache, compat_cache,
3165 visited);
3167 for (unsigned x = 0; x < num__slp_patterns; x++)
3169 vect_pattern *pattern
3170 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3171 if (pattern)
3173 pattern->build (vinfo);
3174 delete pattern;
3175 found_p = true;
3179 return found_p;
3182 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
3183 VINFO.
3185 Returns true if any pattern matched; the tree is modified in place.
3186 Patterns are tried in order and multiple patterns may match. */
3188 static bool
3189 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3190 hash_set<slp_tree> *visited,
3191 slp_tree_to_load_perm_map_t *perm_cache,
3192 slp_compat_nodes_map_t *compat_cache)
3194 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3195 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE, vect_location,
3199 "Analyzing SLP tree %p for patterns\n",
3200 (void *) SLP_INSTANCE_TREE (instance));
3202 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3203 visited);
3206 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3207 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3208 Return true if we could use IFN_STORE_LANES instead and if that appears
3209 to be the better approach. */
3211 static bool
3212 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3213 unsigned int group_size,
3214 unsigned int new_group_size)
3216 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3217 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3218 if (!vectype)
3219 return false;
3220 /* Allow the split if one of the two new groups would operate on full
3221 vectors *within* rather than across one scalar loop iteration.
3222 This is purely a heuristic, but it should work well for group
3223 sizes of 3 and 4, where the possible splits are:
3225 3->2+1: OK if the vector has exactly two elements
3226 4->2+2: Likewise
3227 4->3+1: Less clear-cut. */
3228 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3229 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3230 return false;
3231 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
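/* Worked example (editor's note): for a 3->2+1 split with a two-element
   vector type, NEW_GROUP_SIZE 2 is a multiple of the vector length, so
   the function returns false and the split is allowed; for a 4->3+1
   split with the same vector type neither 3 nor 1 is such a multiple,
   so the decision falls through to whether IFN_STORE_LANES is supported
   for the group.  */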
3234 /* Analyze an SLP instance starting from a group of grouped stores. Call
3235 vect_build_slp_tree to build a tree of packed stmts if possible.
3236 Return FALSE if it's impossible to SLP any stmt in the loop. */
3238 static bool
3239 vect_analyze_slp_instance (vec_info *vinfo,
3240 scalar_stmts_to_slp_tree_map_t *bst_map,
3241 stmt_vec_info stmt_info, slp_instance_kind kind,
3242 unsigned max_tree_size, unsigned *limit);
3244 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3245 of KIND. Return true if successful. */
3247 static bool
3248 vect_build_slp_instance (vec_info *vinfo,
3249 slp_instance_kind kind,
3250 vec<stmt_vec_info> &scalar_stmts,
3251 vec<stmt_vec_info> &root_stmt_infos,
3252 vec<tree> &remain,
3253 unsigned max_tree_size, unsigned *limit,
3254 scalar_stmts_to_slp_tree_map_t *bst_map,
3255 /* ??? We need stmt_info for group splitting. */
3256 stmt_vec_info stmt_info_)
3258 if (kind == slp_inst_kind_ctor)
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Analyzing vectorizable constructor: %G\n",
3263 root_stmt_infos[0]->stmt);
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Starting SLP discovery for\n");
3270 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3271 dump_printf_loc (MSG_NOTE, vect_location,
3272 " %G", scalar_stmts[i]->stmt);
3275 /* When a BB reduction doesn't have an even number of lanes
3276 strip it down, treating the remaining lane as scalar.
3277 ??? Selecting the optimal set of lanes to vectorize would be nice
3278 but SLP build for all lanes will fail quickly because we think
3279 we're going to need unrolling. */
3280 if (kind == slp_inst_kind_bb_reduc
3281 && (scalar_stmts.length () & 1))
3282 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3284 /* Build the tree for the SLP instance. */
3285 unsigned int group_size = scalar_stmts.length ();
3286 bool *matches = XALLOCAVEC (bool, group_size);
3287 poly_uint64 max_nunits = 1;
3288 unsigned tree_size = 0;
3289 unsigned i;
3290 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3291 &max_nunits, matches, limit,
3292 &tree_size, bst_map);
3293 if (node != NULL)
3295 /* Calculate the unrolling factor based on the smallest type. */
3296 poly_uint64 unrolling_factor
3297 = calculate_unrolling_factor (max_nunits, group_size);
3299 if (maybe_ne (unrolling_factor, 1U)
3300 && is_a <bb_vec_info> (vinfo))
3302 unsigned HOST_WIDE_INT const_max_nunits;
3303 if (!max_nunits.is_constant (&const_max_nunits)
3304 || const_max_nunits > group_size)
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "Build SLP failed: store group "
3309 "size not a multiple of the vector size "
3310 "in basic block SLP\n");
3311 vect_free_slp_tree (node);
3312 return false;
3314 /* Fatal mismatch. */
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "SLP discovery succeeded but node needs "
3318 "splitting\n");
3319 memset (matches, true, group_size);
3320 matches[group_size / const_max_nunits * const_max_nunits] = false;
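/* Editor's note (illustrative): e.g. with GROUP_SIZE 7 and
   CONST_MAX_NUNITS 4 the index is 7 / 4 * 4 = 4, so lane 4 is marked
   mismatched and the group-splitting code below splits off the first
   full vector of four lanes.  */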
3321 vect_free_slp_tree (node);
3323 else
3325 /* Create a new SLP instance. */
3326 slp_instance new_instance = XNEW (class _slp_instance);
3327 SLP_INSTANCE_TREE (new_instance) = node;
3328 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3329 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3330 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3331 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3332 SLP_INSTANCE_KIND (new_instance) = kind;
3333 new_instance->reduc_phis = NULL;
3334 new_instance->cost_vec = vNULL;
3335 new_instance->subgraph_entries = vNULL;
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "SLP size %u vs. limit %u.\n",
3340 tree_size, max_tree_size);
3342 /* Fixup SLP reduction chains. */
3343 if (kind == slp_inst_kind_reduc_chain)
3345 /* If this is a reduction chain with a conversion in front
3346 amend the SLP tree with a node for that. */
3347 gimple *scalar_def
3348 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3349 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3351 /* Get at the conversion stmt - we know it's the single use
3352 of the last stmt of the reduction chain. */
3353 use_operand_p use_p;
3354 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3355 &use_p, &scalar_def);
3356 gcc_assert (r);
3357 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3358 next_info = vect_stmt_to_vectorize (next_info);
3359 scalar_stmts = vNULL;
3360 scalar_stmts.create (group_size);
3361 for (unsigned i = 0; i < group_size; ++i)
3362 scalar_stmts.quick_push (next_info);
3363 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3364 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3365 SLP_TREE_CHILDREN (conv).quick_push (node);
3366 SLP_INSTANCE_TREE (new_instance) = conv;
3367 /* We also have to fake this conversion stmt as SLP reduction
3368 group so we don't have to mess with too much code
3369 elsewhere. */
3370 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3371 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3373 /* Fill the backedge child of the PHI SLP node. The
3374 general matching code cannot find it because the
3375 scalar code does not reflect how we vectorize the
3376 reduction. */
3377 use_operand_p use_p;
3378 imm_use_iterator imm_iter;
3379 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3380 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3381 gimple_get_lhs (scalar_def))
3382 /* There are exactly two non-debug uses, the reduction
3383 PHI and the loop-closed PHI node. */
3384 if (!is_gimple_debug (USE_STMT (use_p))
3385 && gimple_bb (USE_STMT (use_p)) == loop->header)
3387 auto_vec<stmt_vec_info, 64> phis (group_size);
3388 stmt_vec_info phi_info
3389 = vinfo->lookup_stmt (USE_STMT (use_p));
3390 for (unsigned i = 0; i < group_size; ++i)
3391 phis.quick_push (phi_info);
3392 slp_tree *phi_node = bst_map->get (phis);
3393 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3394 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3395 = SLP_INSTANCE_TREE (new_instance);
3396 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3400 vinfo->slp_instances.safe_push (new_instance);
3402 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3403 the number of scalar stmts in the root in a few places.
3404 Verify that assumption holds. */
3405 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3406 .length () == group_size);
3408 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_NOTE, vect_location,
3411 "Final SLP tree for instance %p:\n",
3412 (void *) new_instance);
3413 vect_print_slp_graph (MSG_NOTE, vect_location,
3414 SLP_INSTANCE_TREE (new_instance));
3417 return true;
3420 else
3422 /* Failed to SLP. */
3423 /* Free the allocated memory. */
3424 scalar_stmts.release ();
3427 stmt_vec_info stmt_info = stmt_info_;
3428 /* Try to break the group up into pieces. */
3429 if (kind == slp_inst_kind_store)
3431 /* ??? We could delay all the actual splitting of store-groups
3432 until after SLP discovery of the original group completed.
3433 Then we can recurse to vect_build_slp_instance directly. */
3434 for (i = 0; i < group_size; i++)
3435 if (!matches[i])
3436 break;
3438 /* For basic block SLP, try to break the group up into multiples of
3439 a vector size. */
3440 if (is_a <bb_vec_info> (vinfo)
3441 && (i > 1 && i < group_size))
3443 tree scalar_type
3444 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3445 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3446 1 << floor_log2 (i));
3447 unsigned HOST_WIDE_INT const_nunits;
3448 if (vectype
3449 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3451 /* Split into two groups at the first vector boundary. */
3452 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3453 unsigned group1_size = i & ~(const_nunits - 1);
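/* Editor's note: the masking rounds I down to a multiple of
   CONST_NUNITS, e.g. I = 5 with 4-element vectors gives
   GROUP1_SIZE 4.  */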
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "Splitting SLP group at stmt %u\n", i);
3458 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3459 group1_size);
3460 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3461 kind, max_tree_size,
3462 limit);
3463 /* Split the rest at the failure point and possibly
3464 re-analyze the remaining matching part if it has
3465 at least two lanes. */
3466 if (group1_size < i
3467 && (i + 1 < group_size
3468 || i - group1_size > 1))
3470 stmt_vec_info rest2 = rest;
3471 rest = vect_split_slp_store_group (rest, i - group1_size);
3472 if (i - group1_size > 1)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3474 kind, max_tree_size,
3475 limit);
3477 /* Re-analyze the non-matching tail if it has at least
3478 two lanes. */
3479 if (i + 1 < group_size)
3480 res |= vect_analyze_slp_instance (vinfo, bst_map,
3481 rest, kind, max_tree_size,
3482 limit);
3483 return res;
3487 /* For loop vectorization split into arbitrary pieces of size > 1. */
3488 if (is_a <loop_vec_info> (vinfo)
3489 && (i > 1 && i < group_size)
3490 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3492 unsigned group1_size = i;
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_NOTE, vect_location,
3496 "Splitting SLP group at stmt %u\n", i);
3498 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3499 group1_size);
3500 /* Loop vectorization cannot handle gaps in stores, make sure
3501 the split group appears as strided. */
3502 STMT_VINFO_STRIDED_P (rest) = 1;
3503 DR_GROUP_GAP (rest) = 0;
3504 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3505 DR_GROUP_GAP (stmt_info) = 0;
3507 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3508 kind, max_tree_size, limit);
3509 if (i + 1 < group_size)
3510 res |= vect_analyze_slp_instance (vinfo, bst_map,
3511 rest, kind, max_tree_size, limit);
3513 return res;
3516 /* Even though the first vector did not all match, we might be able to SLP
3517 (some) of the remainder. FORNOW ignore this possibility. */
3520 /* Failed to SLP. */
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3523 return false;
3527 /* Analyze an SLP instance starting from a group of grouped stores. Call
3528 vect_build_slp_tree to build a tree of packed stmts if possible.
3529 Return FALSE if it's impossible to SLP any stmt in the loop. */
3531 static bool
3532 vect_analyze_slp_instance (vec_info *vinfo,
3533 scalar_stmts_to_slp_tree_map_t *bst_map,
3534 stmt_vec_info stmt_info,
3535 slp_instance_kind kind,
3536 unsigned max_tree_size, unsigned *limit)
3538 unsigned int i;
3539 vec<stmt_vec_info> scalar_stmts;
3541 if (is_a <bb_vec_info> (vinfo))
3542 vect_location = stmt_info->stmt;
3544 stmt_vec_info next_info = stmt_info;
3545 if (kind == slp_inst_kind_store)
3547 /* Collect the stores and store them in scalar_stmts. */
3548 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3549 while (next_info)
3551 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3552 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3555 else if (kind == slp_inst_kind_reduc_chain)
3557 /* Collect the reduction stmts and store them in scalar_stmts. */
3558 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3559 while (next_info)
3561 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3562 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3564 /* Mark the first element of the reduction chain as reduction to properly
3565 transform the node. In the reduction analysis phase only the last
3566 element of the chain is marked as reduction. */
3567 STMT_VINFO_DEF_TYPE (stmt_info)
3568 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3569 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3570 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3572 else if (kind == slp_inst_kind_reduc_group)
3574 /* Collect reduction statements. */
3575 const vec<stmt_vec_info> &reductions
3576 = as_a <loop_vec_info> (vinfo)->reductions;
3577 scalar_stmts.create (reductions.length ());
3578 for (i = 0; reductions.iterate (i, &next_info); i++)
3579 if ((STMT_VINFO_RELEVANT_P (next_info)
3580 || STMT_VINFO_LIVE_P (next_info))
3581 /* ??? Make sure we didn't skip a conversion around a reduction
3582 path. In that case we'd have to reverse engineer that conversion
3583 stmt following the chain using reduc_idx and from the PHI
3584 using reduc_def. */
3585 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3586 scalar_stmts.quick_push (next_info);
3587 /* If less than two were relevant/live there's nothing to SLP. */
3588 if (scalar_stmts.length () < 2)
3589 return false;
3591 else
3592 gcc_unreachable ();
3594 vec<stmt_vec_info> roots = vNULL;
3595 vec<tree> remain = vNULL;
3596 /* Build the tree for the SLP instance. */
3597 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3598 roots, remain,
3599 max_tree_size, limit, bst_map,
3600 kind == slp_inst_kind_store
3601 ? stmt_info : NULL);
3603 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3604 where we should do store group splitting. */
3606 return res;
3609 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3610 trees of packed scalar stmts if SLP is possible. */
3612 opt_result
3613 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3615 unsigned int i;
3616 stmt_vec_info first_element;
3617 slp_instance instance;
3619 DUMP_VECT_SCOPE ("vect_analyze_slp");
3621 unsigned limit = max_tree_size;
3623 scalar_stmts_to_slp_tree_map_t *bst_map
3624 = new scalar_stmts_to_slp_tree_map_t ();
3626 /* Find SLP sequences starting from groups of grouped stores. */
3627 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3628 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3629 slp_inst_kind_store, max_tree_size, &limit);
3631 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3633 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3635 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3636 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3637 bb_vinfo->roots[i].stmts,
3638 bb_vinfo->roots[i].roots,
3639 bb_vinfo->roots[i].remain,
3640 max_tree_size, &limit, bst_map, NULL))
3642 bb_vinfo->roots[i].stmts = vNULL;
3643 bb_vinfo->roots[i].roots = vNULL;
3644 bb_vinfo->roots[i].remain = vNULL;
3649 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3651 /* Find SLP sequences starting from reduction chains. */
3652 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3653 if (! STMT_VINFO_RELEVANT_P (first_element)
3654 && ! STMT_VINFO_LIVE_P (first_element))
3656 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3657 slp_inst_kind_reduc_chain,
3658 max_tree_size, &limit))
3660 /* Dissolve reduction chain group. */
3661 stmt_vec_info vinfo = first_element;
3662 stmt_vec_info last = NULL;
3663 while (vinfo)
3665 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3666 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3667 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3668 last = vinfo;
3669 vinfo = next;
3671 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3672 /* It can still be vectorized as part of an SLP reduction. */
3673 loop_vinfo->reductions.safe_push (last);
3676 /* Find SLP sequences starting from groups of reductions. */
3677 if (loop_vinfo->reductions.length () > 1)
3678 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3679 slp_inst_kind_reduc_group, max_tree_size,
3680 &limit);
3683 hash_set<slp_tree> visited_patterns;
3684 slp_tree_to_load_perm_map_t perm_cache;
3685 slp_compat_nodes_map_t compat_cache;
3687 /* See if any patterns can be found in the SLP tree. */
3688 bool pattern_found = false;
3689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3690 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3691 &visited_patterns, &perm_cache,
3692 &compat_cache);
3694 /* If any were found optimize permutations of loads. */
3695 if (pattern_found)
3697 hash_map<slp_tree, slp_tree> load_map;
3698 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3700 slp_tree root = SLP_INSTANCE_TREE (instance);
3701 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3702 &load_map, root);
3708 /* The map keeps a reference to the SLP nodes built; release that. */
3709 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3710 it != bst_map->end (); ++it)
3711 if ((*it).second)
3712 vect_free_slp_tree ((*it).second);
3713 delete bst_map;
3715 if (pattern_found && dump_enabled_p ())
3717 dump_printf_loc (MSG_NOTE, vect_location,
3718 "Pattern matched SLP tree\n");
3719 hash_set<slp_tree> visited;
3720 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3721 vect_print_slp_graph (MSG_NOTE, vect_location,
3722 SLP_INSTANCE_TREE (instance), visited);
3725 return opt_result::success ();
3728 /* Estimates the cost of inserting layout changes into the SLP graph.
3729 It can also say that the insertion is impossible. */
3731 struct slpg_layout_cost
3733 slpg_layout_cost () = default;
3734 slpg_layout_cost (sreal, bool);
3736 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3737 bool is_possible () const { return depth != sreal::max (); }
3739 bool operator== (const slpg_layout_cost &) const;
3740 bool operator!= (const slpg_layout_cost &) const;
3742 bool is_better_than (const slpg_layout_cost &, bool) const;
3744 void add_parallel_cost (const slpg_layout_cost &);
3745 void add_serial_cost (const slpg_layout_cost &);
3746 void split (unsigned int);
3748 /* The longest sequence of layout changes needed during any traversal
3749 of the partition dag, weighted by execution frequency.
3751 This is the most important metric when optimizing for speed, since
3752 it helps to ensure that we keep the number of operations on
3753 critical paths to a minimum. */
3754 sreal depth = 0;
3756 /* An estimate of the total number of operations needed. It is weighted by
3757 execution frequency when optimizing for speed but not when optimizing for
3758 size. In order to avoid double-counting, a node with a fanout of N will
3759 distribute 1/N of its total cost to each successor.
3761 This is the most important metric when optimizing for size, since
3762 it helps to keep the total number of operations to a minimum. */
3763 sreal total = 0;
3766 /* Construct costs for a node with weight WEIGHT. A higher weight
3767 indicates more frequent execution. IS_FOR_SIZE is true if we are
3768 optimizing for size rather than speed. */
3770 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3771 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3775 bool
3776 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3778 return depth == other.depth && total == other.total;
3781 bool
3782 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3784 return !operator== (other);
3787 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3788 true if we are optimizing for size rather than speed. */
3790 bool
3791 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3792 bool is_for_size) const
3794 if (is_for_size)
3796 if (total != other.total)
3797 return total < other.total;
3798 return depth < other.depth;
3800 else
3802 if (depth != other.depth)
3803 return depth < other.depth;
3804 return total < other.total;
3808 /* Increase the costs to account for something with cost INPUT_COST
3809 happening in parallel with the current costs. */
3811 void
3812 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3814 depth = std::max (depth, input_cost.depth);
3815 total += input_cost.total;
3818 /* Increase the costs to account for something with cost INPUT_COST
3819 happening in series with the current costs. */
3821 void
3822 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3824 depth += other.depth;
3825 total += other.total;
3828 /* Split the total cost among TIMES successors or predecessors. */
3830 void
3831 slpg_layout_cost::split (unsigned int times)
3833 if (times > 1)
3834 total /= times;
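/* A worked example of how these costs combine (illustrative numbers only,
   assuming we are optimizing for speed so that TOTAL is frequency-weighted
   like DEPTH).  Starting each time from A = { depth 2, total 2 } and
   B = { depth 3, total 3 }:

     A.add_parallel_cost (B)  ->  A = { depth max (2, 3) = 3, total 2 + 3 = 5 }
     A.add_serial_cost (B)    ->  A = { depth 2 + 3 = 5,      total 2 + 3 = 5 }
     A.split (2)              ->  A = { depth 2,              total 2 / 2 = 1 }

   Only TOTAL is divided by split; DEPTH tracks the longest path and is not
   reduced by spreading the cost over several successors.  */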
3837 /* Information about one node in the SLP graph, for use during
3838 vect_optimize_slp_pass. */
3840 struct slpg_vertex
3842 slpg_vertex (slp_tree node_) : node (node_) {}
3844 /* The node itself. */
3845 slp_tree node;
3847 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3848 partitions are flexible; they can have whichever layout consumers
3849 want them to have. */
3850 int partition = -1;
3852 /* The number of nodes that directly use the result of this one
3853 (i.e. the number of nodes that count this one as a child). */
3854 unsigned int out_degree = 0;
3856 /* The execution frequency of the node. */
3857 sreal weight = 0;
3859 /* The total execution frequency of all nodes that directly use the
3860 result of this one. */
3861 sreal out_weight = 0;
3864 /* Information about one partition of the SLP graph, for use during
3865 vect_optimize_slp_pass. */
3867 struct slpg_partition_info
3869 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3870 of m_partitioned_nodes. */
3871 unsigned int node_begin = 0;
3872 unsigned int node_end = 0;
3874 /* Which layout we've chosen to use for this partition, or -1 if
3875 we haven't picked one yet. */
3876 int layout = -1;
3878 /* The number of predecessors and successors in the partition dag.
3879 The predecessors always have lower partition numbers and the
3880 successors always have higher partition numbers.
3882 Note that the directions of these edges are not necessarily the
3883 same as in the data flow graph. For example, if an SCC has separate
3884 partitions for an inner loop and an outer loop, the inner loop's
3885 partition will have at least two incoming edges from the outer loop's
3886 partition: one for a live-in value and one for a live-out value.
3887 In data flow terms, one of these edges would also be from the outer loop
3888 to the inner loop, but the other would be in the opposite direction. */
3889 unsigned int in_degree = 0;
3890 unsigned int out_degree = 0;
3893 /* Information about the costs of using a particular layout for a
3894 particular partition. It can also say that the combination is
3895 impossible. */
3897 struct slpg_partition_layout_costs
3899 bool is_possible () const { return internal_cost.is_possible (); }
3900 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3902 /* The costs inherited from predecessor partitions. */
3903 slpg_layout_cost in_cost;
3905 /* The inherent cost of the layout within the node itself. For example,
3906 this is nonzero for a load if choosing a particular layout would require
3907 the load to permute the loaded elements. It is nonzero for a
3908 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3909 to full-vector moves. */
3910 slpg_layout_cost internal_cost;
3912 /* The costs inherited from successor partitions. */
3913 slpg_layout_cost out_cost;
3916 /* This class tries to optimize the layout of vectors in order to avoid
3917 unnecessary shuffling. At the moment, the set of possible layouts is
3918 restricted to bijective permutations.
3920 The goal of the pass depends on whether we're optimizing for size or
3921 for speed. When optimizing for size, the goal is to reduce the overall
3922 number of layout changes (including layout changes implied by things
3923 like load permutations). When optimizing for speed, the goal is to
3924 reduce the maximum latency attributable to layout changes on any
3925 non-cyclical path through the data flow graph.
3927 For example, when optimizing a loop nest for speed, we will prefer
3928 to make layout changes outside of a loop rather than inside of a loop,
3929 and will prefer to make layout changes in parallel rather than serially,
3930 even if that increases the overall number of layout changes.
3932 The high-level procedure is:
3934 (1) Build a graph in which edges go from uses (parents) to definitions
3935 (children).
3937 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3939 (3) When optimizing for speed, partition the nodes in each SCC based
3940 on their containing cfg loop. When optimizing for size, treat
3941 each SCC as a single partition.
3943 This gives us a dag of partitions. The goal is now to assign a
3944 layout to each partition.
3946 (4) Construct a set of vector layouts that are worth considering.
3947 Record which nodes must keep their current layout.
3949 (5) Perform a forward walk over the partition dag (from loads to stores)
3950 accumulating the "forward" cost of using each layout. When visiting
3951 each partition, assign a tentative choice of layout to the partition
3952 and use that choice when calculating the cost of using a different
3953 layout in successor partitions.
3955 (6) Perform a backward walk over the partition dag (from stores to loads),
3956 accumulating the "backward" cost of using each layout. When visiting
3957 each partition, make a final choice of layout for that partition based
3958 on the accumulated forward costs (from (5)) and backward costs
3959 (from (6)).
3961 (7) Apply the chosen layouts to the SLP graph.
3963 For example, consider the SLP statements:
3965 S1: a_1 = load
3966 loop:
3967 S2: a_2 = PHI<a_1, a_3>
3968 S3: b_1 = load
3969 S4: a_3 = a_2 + b_1
3970 exit:
3971 S5: a_4 = PHI<a_3>
3972 S6: store a_4
3974 S2 and S4 form an SCC and are part of the same loop. Every other
3975 statement is in a singleton SCC. In this example there is a one-to-one
3976 mapping between SCCs and partitions and the partition dag looks like this:
3978    S1     S3
3979     \     /
3980      S2+S4
3981        |
3982       S5
3983        |
3984       S6
3986 S2, S3 and S4 will have a higher execution frequency than the other
3987 statements, so when optimizing for speed, the goal is to avoid any
3988 layout changes:
3990 - within S3
3991 - within S2+S4
3992 - on the S3->S2+S4 edge
3994 For example, if S3 was originally a reversing load, the goal of the
3995 pass is to make it an unreversed load and change the layout on the
3996 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3997 on S1->S2+S4 and S5->S6 would also be acceptable.)
3999 The difference between SCCs and partitions becomes important if we
4000 add an outer loop:
4002 S1: a_1 = ...
4003 loop1:
4004 S2: a_2 = PHI<a_1, a_6>
4005 S3: b_1 = load
4006 S4: a_3 = a_2 + b_1
4007 loop2:
4008 S5: a_4 = PHI<a_3, a_5>
4009 S6: c_1 = load
4010 S7: a_5 = a_4 + c_1
4011 exit2:
4012 S8: a_6 = PHI<a_5>
4013 S9: store a_6
4014 exit1:
4016 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4017 for speed, we usually do not want restrictions in the outer loop to "infect"
4018 the decision for the inner loop. For example, if an outer-loop node
4019 in the SCC contains a statement with a fixed layout, that should not
4020 prevent the inner loop from using a different layout. Conversely,
4021 the inner loop should not dictate a layout to the outer loop: if the
4022 outer loop does a lot of computation, then it may not be efficient to
4023 do all of that computation in the inner loop's preferred layout.
4025 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4026 and S5+S7 (inner). We also try to arrange partitions so that:
4028 - the partition for an outer loop comes before the partition for
4029 an inner loop
4031 - if a sibling loop A dominates a sibling loop B, A's partition
4032 comes before B's
4034 This gives the following partition dag for the example above:
4036    S1        S3
4037     \        /
4038      S2+S4+S8   S6
4039        |   \\    /
4040        |    S5+S7
4041        |
4042       S9
4044 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4045 one for a reversal of the edge S7->S8.
4047 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4048 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4049 preferred layout against the cost of changing the layout on entry to the
4050 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4052 Although this works well when optimizing for speed, it has the downside
4053 when optimizing for size that the choice of layout for S5+S7 is completely
4054 independent of S9, which lessens the chance of reducing the overall number
4055 of permutations. We therefore do not partition SCCs when optimizing
4056 for size.
4058 To give a concrete example of the difference between optimizing
4059 for size and speed, consider:
4061 a[0] = (b[1] << c[3]) - d[1];
4062 a[1] = (b[0] << c[2]) - d[0];
4063 a[2] = (b[3] << c[1]) - d[3];
4064 a[3] = (b[2] << c[0]) - d[2];
4066 There are three different layouts here: one for a, one for b and d,
4067 and one for c. When optimizing for speed it is better to permute each
4068 of b, c and d into the order required by a, since those permutations
4069 happen in parallel. But when optimizing for size, it is better to:
4071 - permute c into the same order as b
4072 - do the arithmetic
4073 - permute the result into the order required by a
4075 This gives 2 permutations rather than 3. */
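/* A sketch of the two strategies for the a/b/c/d example above (editorial
   pseudo-code with illustrative temporaries b', c', d' and t; the pass
   itself expresses these permutations as VEC_PERM_EXPR nodes):

     speed:  b' = permute (b);  c' = permute (c);  d' = permute (d);
	     a = (b' << c') - d';
	     three permutations, but independent of each other, so the
	     layout-change depth is 1.

     size:   c' = permute (c);		// match b's and d's order
	     t = (b << c') - d;
	     a = permute (t);		// match a's order
	     two permutations, executed serially, so depth 2 but a lower
	     total count.  */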
4077 class vect_optimize_slp_pass
4079 public:
4080 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4081 void run ();
4083 private:
4084 /* Graph building. */
4085 struct loop *containing_loop (slp_tree);
4086 bool is_cfg_latch_edge (graph_edge *);
4087 void build_vertices (hash_set<slp_tree> &, slp_tree);
4088 void build_vertices ();
4089 void build_graph ();
4091 /* Partitioning. */
4092 void create_partitions ();
4093 template<typename T> void for_each_partition_edge (unsigned int, T);
4095 /* Layout selection. */
4096 bool is_compatible_layout (slp_tree, unsigned int);
4097 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4098 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4099 unsigned int);
4100 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4101 int, unsigned int);
4102 int internal_node_cost (slp_tree, int, unsigned int);
4103 void start_choosing_layouts ();
4105 /* Cost propagation. */
4106 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4107 unsigned int, unsigned int);
4108 slpg_layout_cost total_in_cost (unsigned int);
4109 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4110 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4111 void forward_pass ();
4112 void backward_pass ();
4114 /* Rematerialization. */
4115 slp_tree get_result_with_layout (slp_tree, unsigned int);
4116 void materialize ();
4118 /* Clean-up. */
4119 void remove_redundant_permutations ();
4121 void dump ();
4123 vec_info *m_vinfo;
4125 /* True if we should optimize the graph for size, false if we should
4126 optimize it for speed. (It wouldn't be easy to make this decision
4127 more locally.) */
4128 bool m_optimize_size;
4130 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4131 In other words, a node's predecessors are its slp_tree parents and
4132 a node's successors are its slp_tree children. */
4133 graph *m_slpg = nullptr;
4135 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4136 auto_vec<slpg_vertex> m_vertices;
4138 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4139 and loads. */
4140 auto_vec<int> m_leafs;
4142 /* This array has one entry for every vector layout that we're considering.
4143 Element 0 is null and indicates "no change". Other entries describe
4144 permutations that are inherent in the current graph and that we would
4145 like to reverse if possible.
4147 For example, a permutation { 1, 2, 3, 0 } means that something has
4148 effectively been permuted in that way, such as a load group
4149 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4150 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4151 in order to put things "back" in order. */
4152 auto_vec<vec<unsigned> > m_perms;
4154 /* A partitioning of the nodes for which a layout must be chosen.
4155 Each partition represents an <SCC, cfg loop> pair; that is,
4156 nodes in different SCCs belong to different partitions, and nodes
4157 within an SCC can be further partitioned according to a containing
4158 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4160 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4161 from leaves (such as loads) to roots (such as stores).
4163 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4164 auto_vec<slpg_partition_info> m_partitions;
4166 /* The list of all nodes for which a layout must be chosen. Nodes for
4167 partition P come before the nodes for partition P+1. Nodes within a
4168 partition are in reverse postorder. */
4169 auto_vec<unsigned int> m_partitioned_nodes;
4171 /* Index P * num-layouts + L contains the cost of using layout L
4172 for partition P. */
4173 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4175 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4176 original output of node N adjusted to have layout L. */
4177 auto_vec<slp_tree> m_node_layouts;
4180 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4181 Also record whether we should optimize anything for speed rather
4182 than size. */
4184 void
4185 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4186 slp_tree node)
4188 unsigned i;
4189 slp_tree child;
4191 if (visited.add (node))
4192 return;
4194 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4196 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4197 if (optimize_bb_for_speed_p (bb))
4198 m_optimize_size = false;
4201 node->vertex = m_vertices.length ();
4202 m_vertices.safe_push (slpg_vertex (node));
4204 bool leaf = true;
4205 bool force_leaf = false;
4206 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4207 if (child)
4209 leaf = false;
4210 build_vertices (visited, child);
4212 else
4213 force_leaf = true;
4214 /* Since SLP discovery works along use-def edges, all cycles have an
4215 entry - but there's the exception of cycles where we do not handle
4216 the entry explicitly (but with a NULL SLP node), like some reductions
4217 and inductions. Force those SLP PHIs to act as leaves to make them
4218 backwards reachable. */
4219 if (leaf || force_leaf)
4220 m_leafs.safe_push (node->vertex);
4223 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4225 void
4226 vect_optimize_slp_pass::build_vertices ()
4228 hash_set<slp_tree> visited;
4229 unsigned i;
4230 slp_instance instance;
4231 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4232 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4235 /* Apply (reverse) bijective PERM to VEC. */
4237 template <class T>
4238 static void
4239 vect_slp_permute (vec<unsigned> perm,
4240 vec<T> &vec, bool reverse)
4242 auto_vec<T, 64> saved;
4243 saved.create (vec.length ());
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 saved.quick_push (vec[i]);
4247 if (reverse)
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[perm[i]] = saved[i];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[perm[i]] == saved[i]);
4254 else
4256 for (unsigned i = 0; i < vec.length (); ++i)
4257 vec[i] = saved[perm[i]];
4258 for (unsigned i = 0; i < vec.length (); ++i)
4259 gcc_assert (vec[i] == saved[perm[i]]);
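/* For example (illustrative values), with PERM = { 1, 2, 3, 0 } and
   VEC = { a, b, c, d }:

     vect_slp_permute (perm, vec, false)  ->  VEC = { b, c, d, a }
	(vec[i] = saved[perm[i]], i.e. element I is taken from lane PERM[I])

     vect_slp_permute (perm, vec, true)   ->  VEC = { d, a, b, c }
	(vec[perm[i]] = saved[i], the inverse of the above).  */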
4263 /* Return the cfg loop that contains NODE. */
4265 struct loop *
4266 vect_optimize_slp_pass::containing_loop (slp_tree node)
4268 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4269 if (!rep)
4270 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4271 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4274 /* Return true if UD (an edge from a use to a definition) is associated
4275 with a loop latch edge in the cfg. */
4277 bool
4278 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4280 slp_tree use = m_vertices[ud->src].node;
4281 slp_tree def = m_vertices[ud->dest].node;
4282 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4283 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4284 return false;
4286 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4287 return (is_a<gphi *> (use_rep->stmt)
4288 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4289 && containing_loop (def) == containing_loop (use));
4292 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4293 a nonnull data field. */
4295 void
4296 vect_optimize_slp_pass::build_graph ()
4298 m_optimize_size = true;
4299 build_vertices ();
4301 m_slpg = new_graph (m_vertices.length ());
4302 for (slpg_vertex &v : m_vertices)
4303 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4304 if (child)
4306 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4307 if (is_cfg_latch_edge (ud))
4308 ud->data = this;
4312 /* Return true if E corresponds to a loop latch edge in the cfg. */
4314 static bool
4315 skip_cfg_latch_edges (graph_edge *e)
4317 return e->data;
4320 /* Create the node partitions. */
4322 void
4323 vect_optimize_slp_pass::create_partitions ()
4325 /* Calculate a postorder of the graph, ignoring edges that correspond
4326 to natural latch edges in the cfg. Reading the vector from the end
4327 to the beginning gives the reverse postorder. */
4328 auto_vec<int> initial_rpo;
4329 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4330 false, NULL, skip_cfg_latch_edges);
4331 gcc_assert (initial_rpo.length () == m_vertices.length ());
4333 /* Calculate the strongly connected components of the graph. */
4334 auto_vec<int> scc_grouping;
4335 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4337 /* Create a new index order in which all nodes from the same SCC are
4338 consecutive. Use scc_pos to record the index of the first node in
4339 each SCC. */
4340 auto_vec<unsigned int> scc_pos (num_sccs);
4341 int last_component = -1;
4342 unsigned int node_count = 0;
4343 for (unsigned int node_i : scc_grouping)
4345 if (last_component != m_slpg->vertices[node_i].component)
4347 last_component = m_slpg->vertices[node_i].component;
4348 gcc_assert (last_component == int (scc_pos.length ()));
4349 scc_pos.quick_push (node_count);
4351 node_count += 1;
4353 gcc_assert (node_count == initial_rpo.length ()
4354 && last_component + 1 == int (num_sccs));
4356 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4357 inside each SCC following the RPO we calculated above. The fact that
4358 we ignored natural latch edges when calculating the RPO should ensure
4359 that, for natural loop nests:
4361 - the first node that we encounter in a cfg loop is the loop header phi
4362 - the loop header phis are in dominance order
4364 Arranging for this is an optimization (see below) rather than a
4365 correctness issue. Unnatural loops with a tangled mess of backedges
4366 will still work correctly, but might give poorer results.
4368 Also update scc_pos so that it gives 1 + the index of the last node
4369 in the SCC. */
4370 m_partitioned_nodes.safe_grow (node_count);
4371 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4373 unsigned int node_i = initial_rpo[old_i];
4374 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4375 m_partitioned_nodes[new_i] = node_i;
4378 /* When optimizing for speed, partition each SCC based on the containing
4379 cfg loop. The order we constructed above should ensure that, for natural
4380 cfg loops, we'll create sub-SCC partitions for outer loops before
4381 the corresponding sub-SCC partitions for inner loops. Similarly,
4382 when one sibling loop A dominates another sibling loop B, we should
4383 create a sub-SCC partition for A before a sub-SCC partition for B.
4385 As above, nothing depends for correctness on whether this achieves
4386 a natural nesting, but we should get better results when it does. */
4387 m_partitions.reserve (m_vertices.length ());
4388 unsigned int next_partition_i = 0;
4389 hash_map<struct loop *, int> loop_partitions;
4390 unsigned int rpo_begin = 0;
4391 unsigned int num_partitioned_nodes = 0;
4392 for (unsigned int rpo_end : scc_pos)
4394 loop_partitions.empty ();
4395 unsigned int partition_i = next_partition_i;
4396 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4398 /* Handle externals and constants optimistically throughout.
4399 But treat existing vectors as fixed since we do not handle
4400 permuting them. */
4401 unsigned int node_i = m_partitioned_nodes[rpo_i];
4402 auto &vertex = m_vertices[node_i];
4403 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4404 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4405 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4406 vertex.partition = -1;
4407 else
4409 bool existed;
4410 if (m_optimize_size)
4411 existed = next_partition_i > partition_i;
4412 else
4414 struct loop *loop = containing_loop (vertex.node);
4415 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4416 if (!existed)
4417 entry = next_partition_i;
4418 partition_i = entry;
4420 if (!existed)
4422 m_partitions.quick_push (slpg_partition_info ());
4423 next_partition_i += 1;
4425 vertex.partition = partition_i;
4426 num_partitioned_nodes += 1;
4427 m_partitions[partition_i].node_end += 1;
4430 rpo_begin = rpo_end;
4433 /* Assign ranges of consecutive node indices to each partition,
4434 in partition order. Start with node_end being the same as
4435 node_begin so that the next loop can use it as a counter. */
4436 unsigned int node_begin = 0;
4437 for (auto &partition : m_partitions)
4439 partition.node_begin = node_begin;
4440 node_begin += partition.node_end;
4441 partition.node_end = partition.node_begin;
4443 gcc_assert (node_begin == num_partitioned_nodes);
4445 /* Finally build the list of nodes in partition order. */
4446 m_partitioned_nodes.truncate (num_partitioned_nodes);
4447 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4449 int partition_i = m_vertices[node_i].partition;
4450 if (partition_i >= 0)
4452 unsigned int order_i = m_partitions[partition_i].node_end++;
4453 m_partitioned_nodes[order_i] = node_i;
4458 /* Look for edges from earlier partitions into node NODE_I and edges from
4459 node NODE_I into later partitions. Call:
4461 FN (ud, other_node_i)
4463 for each such use-to-def edge ud, where other_node_i is the node at the
4464 other end of the edge. */
4466 template<typename T>
4467 void
4468 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4470 int partition_i = m_vertices[node_i].partition;
4471 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4472 pred; pred = pred->pred_next)
4474 int src_partition_i = m_vertices[pred->src].partition;
4475 if (src_partition_i >= 0 && src_partition_i != partition_i)
4476 fn (pred, pred->src);
4478 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4479 succ; succ = succ->succ_next)
4481 int dest_partition_i = m_vertices[succ->dest].partition;
4482 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4483 fn (succ, succ->dest);
4487 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4488 that NODE would operate on. This test is independent of NODE's actual
4489 operation. */
4491 bool
4492 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4493 unsigned int layout_i)
4495 if (layout_i == 0)
4496 return true;
4498 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4499 return false;
4501 return true;
4504 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4505 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4506 layouts is incompatible with NODE or if the change is not possible for
4507 some other reason.
4509 The properties taken from NODE include the number of lanes and the
4510 vector type. The actual operation doesn't matter. */
4513 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4514 unsigned int from_layout_i,
4515 unsigned int to_layout_i)
4517 if (!is_compatible_layout (node, from_layout_i)
4518 || !is_compatible_layout (node, to_layout_i))
4519 return -1;
4521 if (from_layout_i == to_layout_i)
4522 return 0;
4524 auto_vec<slp_tree, 1> children (1);
4525 children.quick_push (node);
4526 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4527 if (from_layout_i > 0)
4528 for (unsigned int i : m_perms[from_layout_i])
4529 perm.quick_push ({ 0, i });
4530 else
4531 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4532 perm.quick_push ({ 0, i });
4533 if (to_layout_i > 0)
4534 vect_slp_permute (m_perms[to_layout_i], perm, true);
4535 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4536 children, false);
4537 if (count >= 0)
4538 return MAX (count, 1);
4540 /* ??? In principle we could try changing via layout 0, giving two
4541 layout changes rather than 1. Doing that would require
4542 corresponding support in get_result_with_layout. */
4543 return -1;
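/* For example (illustrative only): changing a four-lane NODE from layout
   { 1, 2, 3, 0 } to layout 0 builds the single-input lane permutation

     { (0,1), (0,2), (0,3), (0,0) }

   and asks vectorizable_slp_permutation_1 (with a null gsi, so nothing is
   emitted) whether the target can perform it; the number of vector permutes
   it reports, clamped to at least 1, becomes the cost.  */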
4546 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4548 inline slpg_partition_layout_costs &
4549 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4550 unsigned int layout_i)
4552 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4555 /* Change PERM in one of two ways:
4557 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4558 chosen for child I of NODE.
4560 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4562 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4564 void
4565 vect_optimize_slp_pass::
4566 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4567 int in_layout_i, unsigned int out_layout_i)
4569 for (auto &entry : perm)
4571 int this_in_layout_i = in_layout_i;
4572 if (this_in_layout_i < 0)
4574 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4575 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4576 this_in_layout_i = m_partitions[in_partition_i].layout;
4578 if (this_in_layout_i > 0)
4579 entry.second = m_perms[this_in_layout_i][entry.second];
4581 if (out_layout_i > 0)
4582 vect_slp_permute (m_perms[out_layout_i], perm, true);
4585 /* Check whether the target allows NODE to be rearranged so that the node's
4586 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4587 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4589 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4590 NODE can adapt to the layout changes that have (perhaps provisionally)
4591 been chosen for NODE's children, so that no extra permutations are
4592 needed on either the input or the output of NODE.
4594 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4595 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4597 IN_LAYOUT_I has no meaning for other types of node.
4599 Keeping the node as-is is always valid. If the target doesn't appear
4600 to support the node as-is, but might realistically support other layouts,
4601 then layout 0 instead has the cost of a worst-case permutation. On the
4602 one hand, this ensures that every node has at least one valid layout,
4603 avoiding what would otherwise be an awkward special case. On the other,
4604 it still encourages the pass to change an invalid pre-existing layout
4605 choice into a valid one. */
4608 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4609 unsigned int out_layout_i)
4611 const int fallback_cost = 1;
4613 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4615 auto_lane_permutation_t tmp_perm;
4616 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4618 /* Check that the child nodes support the chosen layout. Checking
4619 the first child is enough, since any second child would have the
4620 same shape. */
4621 auto first_child = SLP_TREE_CHILDREN (node)[0];
4622 if (in_layout_i > 0
4623 && !is_compatible_layout (first_child, in_layout_i))
4624 return -1;
4626 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4627 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4628 node, tmp_perm,
4629 SLP_TREE_CHILDREN (node),
4630 false);
4631 if (count < 0)
4633 if (in_layout_i == 0 && out_layout_i == 0)
4635 /* Use the fallback cost if the node could in principle support
4636 some nonzero layout for both the inputs and the outputs.
4637 Otherwise assume that the node will be rejected later
4638 and rebuilt from scalars. */
4639 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4640 return fallback_cost;
4641 return 0;
4643 return -1;
4646 /* We currently have no way of telling whether the new layout is cheaper
4647 or more expensive than the old one. But at least in principle,
4648 it should be worth making zero permutations (whole-vector shuffles)
4649 cheaper than real permutations, in case the pass is able to remove
4650 the latter. */
4651 return count == 0 ? 0 : 1;
4654 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4655 if (rep
4656 && STMT_VINFO_DATA_REF (rep)
4657 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4658 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4660 auto_load_permutation_t tmp_perm;
4661 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4662 if (out_layout_i > 0)
4663 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4665 poly_uint64 vf = 1;
4666 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4667 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4668 unsigned int n_perms;
4669 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4670 nullptr, vf, true, false, &n_perms))
4672 auto rep = SLP_TREE_REPRESENTATIVE (node);
4673 if (out_layout_i == 0)
4675 /* Use the fallback cost if the load is an N-to-N permutation.
4676 Otherwise assume that the node will be rejected later
4677 and rebuilt from scalars. */
4678 if (STMT_VINFO_GROUPED_ACCESS (rep)
4679 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4680 == SLP_TREE_LANES (node)))
4681 return fallback_cost;
4682 return 0;
4684 return -1;
4687 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4688 return n_perms == 0 ? 0 : 1;
4691 return 0;
4694 /* Decide which element layouts we should consider using. Calculate the
4695 weights associated with inserting layout changes on partition edges.
4696 Also mark partitions that cannot change layout, by setting their
4697 layout to zero. */
4699 void
4700 vect_optimize_slp_pass::start_choosing_layouts ()
4702 /* Used to assign unique permutation indices. */
4703 using perm_hash = unbounded_hashmap_traits<
4704 vec_free_hash_base<int_hash_base<unsigned>>,
4705 int_hash<int, -1, -2>
4707 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4709 /* Layout 0 is "no change". */
4710 m_perms.safe_push (vNULL);
4712 /* Create layouts from existing permutations. */
4713 auto_load_permutation_t tmp_perm;
4714 for (unsigned int node_i : m_partitioned_nodes)
4716 /* Leaves also double as entries to the reverse graph. Allow the
4717 layout of those to be changed. */
4718 auto &vertex = m_vertices[node_i];
4719 auto &partition = m_partitions[vertex.partition];
4720 if (!m_slpg->vertices[node_i].succ)
4721 partition.layout = 0;
4723 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4724 slp_tree node = vertex.node;
4725 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4726 slp_tree child;
4727 unsigned HOST_WIDE_INT imin, imax = 0;
4728 bool any_permute = false;
4729 tmp_perm.truncate (0);
4730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4732 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4733 unpermuted, record a layout that reverses this permutation.
4735 We would need more work to cope with loads that are internally
4736 permuted and also have inputs (such as masks for
4737 IFN_MASK_LOADs). */
4738 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4739 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4741 partition.layout = -1;
4742 continue;
4744 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4745 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4746 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4748 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4749 && SLP_TREE_CHILDREN (node).length () == 1
4750 && (child = SLP_TREE_CHILDREN (node)[0])
4751 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4752 .is_constant (&imin)))
4754 /* If the child has the same vector size as this node,
4755 reversing the permutation can make the permutation a no-op.
4756 In other cases it can change a true permutation into a
4757 full-vector extract. */
4758 tmp_perm.reserve (SLP_TREE_LANES (node));
4759 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4762 else
4763 continue;
4765 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4767 unsigned idx = tmp_perm[j];
4768 imin = MIN (imin, idx);
4769 imax = MAX (imax, idx);
4770 if (idx - tmp_perm[0] != j)
4771 any_permute = true;
4773 /* If the span doesn't match we'd disrupt VF computation; avoid
4774 that for now. */
4775 if (imax - imin + 1 != SLP_TREE_LANES (node))
4776 continue;
4777 /* If there's no permute no need to split one out. In this case
4778 we can consider turning a load into a permuted load, if that
4779 turns out to be cheaper than alternatives. */
4780 if (!any_permute)
4782 partition.layout = -1;
4783 continue;
4786 /* For now only handle true permutes, like
4787 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4788 when permuting constants and invariants, keeping the permute
4789 bijective. */
4790 auto_sbitmap load_index (SLP_TREE_LANES (node));
4791 bitmap_clear (load_index);
4792 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4793 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4794 unsigned j;
4795 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4796 if (!bitmap_bit_p (load_index, j))
4797 break;
4798 if (j != SLP_TREE_LANES (node))
4799 continue;
4801 vec<unsigned> perm = vNULL;
4802 perm.safe_grow (SLP_TREE_LANES (node), true);
4803 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4804 perm[j] = tmp_perm[j] - imin;
4806 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4808 /* Continue to use existing layouts, but don't add any more. */
4809 int *entry = layout_ids.get (perm);
4810 partition.layout = entry ? *entry : 0;
4811 perm.release ();
4813 else
4815 bool existed;
4816 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4817 if (existed)
4818 perm.release ();
4819 else
4821 layout_i = m_perms.length ();
4822 m_perms.safe_push (perm);
4824 partition.layout = layout_i;
4828 /* Initially assume that every layout is possible and has zero cost
4829 in every partition. */
4830 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4831 * m_perms.length ());
4833 /* We have to mark outgoing permutations facing non-associating-reduction
4834 graph entries that are not represented as to be materialized.
4835 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4836 for (slp_instance instance : m_vinfo->slp_instances)
4837 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4839 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4840 m_partitions[m_vertices[node_i].partition].layout = 0;
4842 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4844 stmt_vec_info stmt_info
4845 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4846 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4847 if (needs_fold_left_reduction_p (TREE_TYPE
4848 (gimple_get_lhs (stmt_info->stmt)),
4849 STMT_VINFO_REDUC_CODE (reduc_info)))
4851 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4852 m_partitions[m_vertices[node_i].partition].layout = 0;
4856 /* Check which layouts each node and partition can handle. Calculate the
4857 weights associated with inserting layout changes on edges. */
4858 for (unsigned int node_i : m_partitioned_nodes)
4860 auto &vertex = m_vertices[node_i];
4861 auto &partition = m_partitions[vertex.partition];
4862 slp_tree node = vertex.node;
4864 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4866 vertex.weight = vect_slp_node_weight (node);
4868 /* We do not handle stores with a permutation, so all
4869 incoming permutations must have been materialized.
4871 We also don't handle masked grouped loads, which lack a
4872 permutation vector. In this case the memory locations
4873 form an implicit second input to the loads, on top of the
4874 explicit mask input, and the memory input's layout cannot
4875 be changed.
4877 On the other hand, we do support permuting gather loads and
4878 masked gather loads, where each scalar load is independent
4879 of the others. This can be useful if the address/index input
4880 benefits from permutation. */
4881 if (STMT_VINFO_DATA_REF (rep)
4882 && STMT_VINFO_GROUPED_ACCESS (rep)
4883 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4884 partition.layout = 0;
4886 /* We cannot change the layout of an operation that is
4887 not independent of its lanes. Note this is an explicit
4888 negative list since that's much shorter than the respective
4889 positive one, but it's critical to keep maintaining it. */
4890 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4891 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4893 case CFN_COMPLEX_ADD_ROT90:
4894 case CFN_COMPLEX_ADD_ROT270:
4895 case CFN_COMPLEX_MUL:
4896 case CFN_COMPLEX_MUL_CONJ:
4897 case CFN_VEC_ADDSUB:
4898 case CFN_VEC_FMADDSUB:
4899 case CFN_VEC_FMSUBADD:
4900 partition.layout = 0;
4901 default:;
4905 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4907 auto &other_vertex = m_vertices[other_node_i];
4909 /* Count the number of edges from earlier partitions and the number
4910 of edges to later partitions. */
4911 if (other_vertex.partition < vertex.partition)
4912 partition.in_degree += 1;
4913 else
4914 partition.out_degree += 1;
4916 /* If the current node uses the result of OTHER_NODE_I, accumulate
4917 the effects of that. */
4918 if (ud->src == int (node_i))
4920 other_vertex.out_weight += vertex.weight;
4921 other_vertex.out_degree += 1;
4924 for_each_partition_edge (node_i, process_edge);
4928 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4929 its current (provisional) choice of layout. The inputs do not necessarily
4930 have the same layout as each other. */
4932 slpg_layout_cost
4933 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4935 auto &vertex = m_vertices[node_i];
4936 slpg_layout_cost cost;
4937 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4939 auto &other_vertex = m_vertices[other_node_i];
4940 if (other_vertex.partition < vertex.partition)
4942 auto &other_partition = m_partitions[other_vertex.partition];
4943 auto &other_costs = partition_layout_costs (other_vertex.partition,
4944 other_partition.layout);
4945 slpg_layout_cost this_cost = other_costs.in_cost;
4946 this_cost.add_serial_cost (other_costs.internal_cost);
4947 this_cost.split (other_partition.out_degree);
4948 cost.add_parallel_cost (this_cost);
4951 for_each_partition_edge (node_i, add_cost);
4952 return cost;
4955 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4956 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4957 slpg_layout_cost::impossible () if the change isn't possible. */
4959 slpg_layout_cost
4960 vect_optimize_slp_pass::
4961 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4962 unsigned int layout2_i)
4964 auto &def_vertex = m_vertices[ud->dest];
4965 auto &use_vertex = m_vertices[ud->src];
4966 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4967 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4968 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4969 use_layout_i);
4970 if (factor < 0)
4971 return slpg_layout_cost::impossible ();
4973 /* We have a choice of putting the layout change at the site of the
4974 definition or at the site of the use. Prefer the former when
4975 optimizing for size or when the execution frequency of the
4976 definition is no greater than the combined execution frequencies of
4977 the uses. When putting the layout change at the site of the definition,
4978 divvy up the cost among all consumers. */
4979 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4981 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4982 cost.split (def_vertex.out_degree);
4983 return cost;
4985 return { use_vertex.weight * factor, m_optimize_size };
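/* For example (illustrative weights, FACTOR 1, optimizing for speed):
   if the definition has weight 10 (inside a loop) and its uses have a
   combined out_weight of 2 (outside the loop), the change is costed at the
   use, giving { depth 1, total 1 } for a use of weight 1.  If instead the
   definition has weight 1, out_weight 10 and out_degree 2, the change is
   costed at the definition and shared between the two consumers, giving
   { depth 1, total 0.5 }.  */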
4988 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4989 partition; FROM_NODE_I could be the definition node or the use node.
4990 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4991 Return the cost of any necessary fix-ups on edge UD, or return
4992 slpg_layout_cost::impossible () if the change isn't possible.
4994 At this point, FROM_NODE_I's partition has chosen the cheapest
4995 layout based on the information available so far, but this choice
4996 is only provisional. */
4998 slpg_layout_cost
4999 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5000 unsigned int to_layout_i)
5002 auto &from_vertex = m_vertices[from_node_i];
5003 unsigned int from_partition_i = from_vertex.partition;
5004 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5005 gcc_assert (from_partition.layout >= 0);
5007 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5008 with its current layout preference. */
5009 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5010 auto edge_cost = edge_layout_cost (ud, from_node_i,
5011 from_partition.layout, to_layout_i);
5012 if (edge_cost.is_possible ())
5014 auto &from_costs = partition_layout_costs (from_partition_i,
5015 from_partition.layout);
5016 cost = from_costs.in_cost;
5017 cost.add_serial_cost (from_costs.internal_cost);
5018 cost.split (from_partition.out_degree);
5019 cost.add_serial_cost (edge_cost);
5022 /* Take the minimum of that cost and the cost that applies if
5023 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5024 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5025 to_layout_i);
5026 if (direct_layout_costs.is_possible ())
5028 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5029 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5030 direct_cost.split (from_partition.out_degree);
5031 if (!cost.is_possible ()
5032 || direct_cost.is_better_than (cost, m_optimize_size))
5033 cost = direct_cost;
5036 return cost;
5039 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5040 partition; TO_NODE_I could be the definition node or the use node.
5041 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5042 return the cost of any necessary fix-ups on edge UD, or
5043 slpg_layout_cost::impossible () if the choice cannot be made.
5045 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5047 slpg_layout_cost
5048 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5049 unsigned int from_layout_i)
5051 auto &to_vertex = m_vertices[to_node_i];
5052 unsigned int to_partition_i = to_vertex.partition;
5053 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5054 gcc_assert (to_partition.layout >= 0);
5056 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5057 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5058 any other inputs keep their current choice of layout. */
5059 auto &to_costs = partition_layout_costs (to_partition_i,
5060 to_partition.layout);
5061 if (ud->src == int (to_node_i)
5062 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5064 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5065 auto old_layout = from_partition.layout;
5066 from_partition.layout = from_layout_i;
5067 int factor = internal_node_cost (to_vertex.node, -1,
5068 to_partition.layout);
5069 from_partition.layout = old_layout;
5070 if (factor >= 0)
5072 slpg_layout_cost cost = to_costs.out_cost;
5073 cost.add_serial_cost ({ to_vertex.weight * factor,
5074 m_optimize_size });
5075 cost.split (to_partition.in_degree);
5076 return cost;
5080 /* Compute the cost if we insert any necessary layout change on edge UD. */
5081 auto edge_cost = edge_layout_cost (ud, to_node_i,
5082 to_partition.layout, from_layout_i);
5083 if (edge_cost.is_possible ())
5085 slpg_layout_cost cost = to_costs.out_cost;
5086 cost.add_serial_cost (to_costs.internal_cost);
5087 cost.split (to_partition.in_degree);
5088 cost.add_serial_cost (edge_cost);
5089 return cost;
5092 return slpg_layout_cost::impossible ();
5095 /* Make a forward pass through the partitions, accumulating input costs.
5096 Make a tentative (provisional) choice of layout for each partition,
5097 ensuring that this choice still allows later partitions to keep
5098 their original layout. */
5100 void
5101 vect_optimize_slp_pass::forward_pass ()
5103 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5104 ++partition_i)
5106 auto &partition = m_partitions[partition_i];
5108 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5109 the incoming cost that would apply if every predecessor partition
5110 keeps its current layout. This is used within the loop below. */
5111 slpg_layout_cost in_cost;
5112 slp_tree single_node = nullptr;
5113 if (partition.node_end == partition.node_begin + 1)
5115 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5116 single_node = m_vertices[node_i].node;
5117 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5118 in_cost = total_in_cost (node_i);
5121 /* Go through the possible layouts. Decide which ones are valid
5122 for this partition and record which of the valid layouts has
5123 the lowest cost. */
5124 unsigned int min_layout_i = 0;
5125 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5126 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5128 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5129 if (!layout_costs.is_possible ())
5130 continue;
5132 /* If the recorded layout is already 0 then the layout cannot
5133 change. */
5134 if (partition.layout == 0 && layout_i != 0)
5136 layout_costs.mark_impossible ();
5137 continue;
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5147 /* Reject the layout if it is individually incompatible
5148 with any node in the partition. */
5149 if (!is_compatible_layout (vertex.node, layout_i))
5151 is_possible = false;
5152 break;
5155 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5157 auto &other_vertex = m_vertices[other_node_i];
5158 if (other_vertex.partition < vertex.partition)
5160 /* Accumulate the incoming costs from earlier
5161 partitions, plus the cost of any layout changes
5162 on UD itself. */
5163 auto cost = forward_cost (ud, other_node_i, layout_i);
5164 if (!cost.is_possible ())
5165 is_possible = false;
5166 else
5167 layout_costs.in_cost.add_parallel_cost (cost);
5169 else
5170 /* Reject the layout if it would make layout 0 impossible
5171 for later partitions. This amounts to testing that the
5172 target supports reversing the layout change on edges
5173 to later partitions.
5175 In principle, it might be possible to push a layout
5176 change all the way down a graph, so that it never
5177 needs to be reversed and so that the target doesn't
5178 need to support the reverse operation. But it would
5179 be awkward to bail out if we hit a partition that
5180 does not support the new layout, especially since
5181 we are not dealing with a lattice. */
5182 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5183 layout_i).is_possible ();
5185 for_each_partition_edge (node_i, add_cost);
5187 /* Accumulate the cost of using LAYOUT_I within NODE,
5188 both for the inputs and the outputs. */
5189 int factor = internal_node_cost (vertex.node, layout_i,
5190 layout_i);
5191 if (factor < 0)
5193 is_possible = false;
5194 break;
5196 else if (factor)
5197 layout_costs.internal_cost.add_serial_cost
5198 ({ vertex.weight * factor, m_optimize_size });
5200 if (!is_possible)
5202 layout_costs.mark_impossible ();
5203 continue;
5206 /* Combine the incoming and partition-internal costs. */
5207 slpg_layout_cost combined_cost = layout_costs.in_cost;
5208 combined_cost.add_serial_cost (layout_costs.internal_cost);
5210 /* If this partition consists of a single VEC_PERM_EXPR, see
5211 if the VEC_PERM_EXPR can be changed to support output layout
5212 LAYOUT_I while keeping all the provisional choices of input
5213 layout. */
5214 if (single_node
5215 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5217 int factor = internal_node_cost (single_node, -1, layout_i);
5218 if (factor >= 0)
5220 auto weight = m_vertices[single_node->vertex].weight;
5221 slpg_layout_cost internal_cost
5222 = { weight * factor, m_optimize_size };
5224 slpg_layout_cost alt_cost = in_cost;
5225 alt_cost.add_serial_cost (internal_cost);
5226 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5228 combined_cost = alt_cost;
5229 layout_costs.in_cost = in_cost;
5230 layout_costs.internal_cost = internal_cost;
5235 /* Record the layout with the lowest cost. Prefer layout 0 in
5236 the event of a tie between it and another layout. */
5237 if (!min_layout_cost.is_possible ()
5238 || combined_cost.is_better_than (min_layout_cost,
5239 m_optimize_size))
5241 min_layout_i = layout_i;
5242 min_layout_cost = combined_cost;
5246 /* This loop's handling of earlier partitions should ensure that
5247 choosing the original layout for the current partition is no
5248 less valid than it was in the original graph, even with the
5249 provisional layout choices for those earlier partitions. */
5250 gcc_assert (min_layout_cost.is_possible ());
5251 partition.layout = min_layout_i;
5255 /* Make a backward pass through the partitions, accumulating output costs.
5256 Make a final choice of layout for each partition. */
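/* Roughly: the forward pass above chose a provisional layout for each
   partition using only costs flowing in from earlier partitions; this
   pass adds the costs flowing back from later partitions and makes the
   final choice, while checking (via edge_layout_cost) that earlier
   partitions can keep the layouts they chose in the forward pass.  */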
5258 void
5259 vect_optimize_slp_pass::backward_pass ()
5261 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5263 auto &partition = m_partitions[partition_i];
5265 unsigned int min_layout_i = 0;
5266 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5267 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5269 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5270 if (!layout_costs.is_possible ())
5271 continue;
5273 /* Accumulate the costs from successor partitions. */
5274 bool is_possible = true;
5275 for (unsigned int order_i = partition.node_begin;
5276 order_i < partition.node_end; ++order_i)
5278 unsigned int node_i = m_partitioned_nodes[order_i];
5279 auto &vertex = m_vertices[node_i];
5280 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5282 auto &other_vertex = m_vertices[other_node_i];
5283 auto &other_partition = m_partitions[other_vertex.partition];
5284 if (other_vertex.partition > vertex.partition)
5286 /* Accumulate the incoming costs from later
5287 partitions, plus the cost of any layout changes
5288 on UD itself. */
5289 auto cost = backward_cost (ud, other_node_i, layout_i);
5290 if (!cost.is_possible ())
5291 is_possible = false;
5292 else
5293 layout_costs.out_cost.add_parallel_cost (cost);
5295 else
5296 /* Make sure that earlier partitions can (if necessary
5297 or beneficial) keep the layout that they chose in
5298 the forward pass. This ensures that there is at
5299 least one valid choice of layout. */
5300 is_possible &= edge_layout_cost (ud, other_node_i,
5301 other_partition.layout,
5302 layout_i).is_possible ();
5304 for_each_partition_edge (node_i, add_cost);
5306 if (!is_possible)
5308 layout_costs.mark_impossible ();
5309 continue;
5312 /* Locally combine the costs from the forward and backward passes.
5313 (This combined cost is not passed on, since that would lead
5314 to double counting.) */
5315 slpg_layout_cost combined_cost = layout_costs.in_cost;
5316 combined_cost.add_serial_cost (layout_costs.internal_cost);
5317 combined_cost.add_serial_cost (layout_costs.out_cost);
5319 /* Record the layout with the lowest cost. Prefer layout 0 in
5320 the event of a tie between it and another layout. */
5321 if (!min_layout_cost.is_possible ()
5322 || combined_cost.is_better_than (min_layout_cost,
5323 m_optimize_size))
5325 min_layout_i = layout_i;
5326 min_layout_cost = combined_cost;
5330 gcc_assert (min_layout_cost.is_possible ());
5331 partition.layout = min_layout_i;
5335 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5336 NODE already has the layout that was selected for its partition. */
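/* Results are cached in m_node_layouts, indexed by
   node->vertex * m_perms.length () + to_layout_i, so each (node, layout)
   pair is materialized at most once.  */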
5338 slp_tree
5339 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5340 unsigned int to_layout_i)
5342 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5343 slp_tree result = m_node_layouts[result_i];
5344 if (result)
5345 return result;
5347 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5348 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5349 /* We can't permute vector defs in place. */
5350 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5352 /* If the vector is uniform or unchanged, there's nothing to do. */
5353 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5354 result = node;
5355 else
5357 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5358 result = vect_create_new_slp_node (scalar_ops);
5359 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5362 else
5364 unsigned int partition_i = m_vertices[node->vertex].partition;
5365 unsigned int from_layout_i = m_partitions[partition_i].layout;
5366 if (from_layout_i == to_layout_i)
5367 return node;
5369 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5370 permutation instead of a serial one. Leave the new permutation
5371 in TMP_PERM on success. */
5372 auto_lane_permutation_t tmp_perm;
5373 unsigned int num_inputs = 1;
5374 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5376 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5377 if (from_layout_i != 0)
5378 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5379 if (to_layout_i != 0)
5380 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5381 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5382 tmp_perm,
5383 SLP_TREE_CHILDREN (node),
5384 false) >= 0)
5385 num_inputs = SLP_TREE_CHILDREN (node).length ();
5386 else
5387 tmp_perm.truncate (0);
5390 if (dump_enabled_p ())
5392 if (tmp_perm.length () > 0)
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "duplicating permutation node %p with"
5395 " layout %d\n",
5396 (void *) node, to_layout_i);
5397 else
5398 dump_printf_loc (MSG_NOTE, vect_location,
5399 "inserting permutation node in place of %p\n",
5400 (void *) node);
5403 unsigned int num_lanes = SLP_TREE_LANES (node);
5404 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5405 if (SLP_TREE_SCALAR_STMTS (node).length ())
5407 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5408 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5409 if (from_layout_i != 0)
5410 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5411 if (to_layout_i != 0)
5412 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5414 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5415 SLP_TREE_LANES (result) = num_lanes;
5416 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5417 result->vertex = -1;
5419 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5420 if (tmp_perm.length ())
5422 lane_perm.safe_splice (tmp_perm);
5423 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5425 else
5427 lane_perm.create (num_lanes);
5428 for (unsigned j = 0; j < num_lanes; ++j)
5429 lane_perm.quick_push ({ 0, j });
5430 if (from_layout_i != 0)
5431 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5432 if (to_layout_i != 0)
5433 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5434 SLP_TREE_CHILDREN (result).safe_push (node);
5436 for (slp_tree child : SLP_TREE_CHILDREN (result))
5437 child->refcnt++;
5439 m_node_layouts[result_i] = result;
5440 return result;
5443 /* Apply the chosen vector layouts to the SLP graph. */
5445 void
5446 vect_optimize_slp_pass::materialize ()
5448 /* We no longer need the costs, so avoid having two O(N * P) arrays
5449 live at the same time. */
5450 m_partition_layout_costs.release ();
5451 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5453 auto_sbitmap fully_folded (m_vertices.length ());
5454 bitmap_clear (fully_folded);
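/* FULLY_FOLDED collects the VEC_PERM nodes below whose lane permutation
   absorbed the input layouts; the second loop further down skips them so
   that their children are not replaced with re-laid-out copies.  */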
5455 for (unsigned int node_i : m_partitioned_nodes)
5457 auto &vertex = m_vertices[node_i];
5458 slp_tree node = vertex.node;
5459 int layout_i = m_partitions[vertex.partition].layout;
5460 gcc_assert (layout_i >= 0);
5462 /* Rearrange the scalar statements to match the chosen layout. */
5463 if (layout_i > 0)
5464 vect_slp_permute (m_perms[layout_i],
5465 SLP_TREE_SCALAR_STMTS (node), true);
5467 /* Update load and lane permutations. */
5468 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5470 /* First try to absorb the input vector layouts. If that fails,
5471 force the inputs to have layout LAYOUT_I too. We checked that
5472 that was possible before deciding to use nonzero output layouts.
5473 (Note that at this stage we don't really have any guarantee that
5474 the target supports the original VEC_PERM_EXPR.) */
5475 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5476 auto_lane_permutation_t tmp_perm;
5477 tmp_perm.safe_splice (perm);
5478 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5479 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5480 tmp_perm,
5481 SLP_TREE_CHILDREN (node),
5482 false) >= 0)
5484 if (dump_enabled_p ()
5485 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5486 perm.begin ()))
5487 dump_printf_loc (MSG_NOTE, vect_location,
5488 "absorbing input layouts into %p\n",
5489 (void *) node);
5490 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5491 bitmap_set_bit (fully_folded, node_i);
5493 else
5495 /* Not MSG_MISSED because it would make no sense to users. */
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "failed to absorb input layouts into %p\n",
5499 (void *) node);
5500 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5503 else
5505 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5506 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5507 if (layout_i > 0)
5508 /* ??? When we handle non-bijective permutes the idea
5509 is that we can force the load-permutation to be
5510 { min, min + 1, min + 2, ... max }. But then the
5511 scalar defs might no longer match the lane content,
5512 which means wrong-code with live lane vectorization.
5513 So we may need NULL entries for those. */
5514 vect_slp_permute (m_perms[layout_i], load_perm, true);
5518 /* Do this before any nodes disappear, since it involves a walk
5519 over the leaves. */
5520 remove_redundant_permutations ();
5522 /* Replace each child with a correctly laid-out version. */
5523 for (unsigned int node_i : m_partitioned_nodes)
5525 /* Skip nodes that have already been handled above. */
5526 if (bitmap_bit_p (fully_folded, node_i))
5527 continue;
5529 auto &vertex = m_vertices[node_i];
5530 int in_layout_i = m_partitions[vertex.partition].layout;
5531 gcc_assert (in_layout_i >= 0);
5533 unsigned j;
5534 slp_tree child;
5535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5537 if (!child)
5538 continue;
5540 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5541 if (new_child != child)
5543 vect_free_slp_tree (child);
5544 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5545 new_child->refcnt += 1;
5551 /* Elide load permutations that are not necessary. Such permutations might
5552 be pre-existing, rather than created by the layout optimizations. */
5554 void
5555 vect_optimize_slp_pass::remove_redundant_permutations ()
5557 for (unsigned int node_i : m_leafs)
5559 slp_tree node = m_vertices[node_i].node;
5560 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5561 continue;
5563 /* In basic block vectorization we allow any subchain of an interleaving
5564 chain.
5565 FORNOW: not in loop SLP because of realignment complications. */
5566 if (is_a <bb_vec_info> (m_vinfo))
5568 bool subchain_p = true;
5569 stmt_vec_info next_load_info = NULL;
5570 stmt_vec_info load_info;
5571 unsigned j;
5572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5574 if (j != 0
5575 && (next_load_info != load_info
5576 || DR_GROUP_GAP (load_info) != 1))
5578 subchain_p = false;
5579 break;
5581 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5583 if (subchain_p)
5585 SLP_TREE_LOAD_PERMUTATION (node).release ();
5586 continue;
5589 else
5591 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5592 stmt_vec_info load_info;
5593 bool this_load_permuted = false;
5594 unsigned j;
5595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5596 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5598 this_load_permuted = true;
5599 break;
5601 /* When this isn't a grouped access we know it's a single element
5602 and contiguous. */
5603 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5605 if (!this_load_permuted
5606 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5607 || SLP_TREE_LANES (node) == 1))
5608 SLP_TREE_LOAD_PERMUTATION (node).release ();
5609 continue;
5611 stmt_vec_info first_stmt_info
5612 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5613 if (!this_load_permuted
5614 /* The load requires permutation when unrolling exposes
5615 a gap either because the group is larger than the SLP
5616 group-size or because there is a gap between the groups. */
5617 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5618 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5619 && DR_GROUP_GAP (first_stmt_info) == 0)))
5621 SLP_TREE_LOAD_PERMUTATION (node).release ();
5622 continue;
5628 /* Print the partition graph and layout information to the dump file. */
5630 void
5631 vect_optimize_slp_pass::dump ()
5633 dump_printf_loc (MSG_NOTE, vect_location,
5634 "SLP optimize permutations:\n");
5635 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5637 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5638 const char *sep = "";
5639 for (unsigned int idx : m_perms[layout_i])
5641 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5642 sep = ", ";
5644 dump_printf (MSG_NOTE, " }\n");
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "SLP optimize partitions:\n");
5648 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5649 ++partition_i)
5651 auto &partition = m_partitions[partition_i];
5652 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5653 dump_printf_loc (MSG_NOTE, vect_location,
5654 " partition %d (layout %d):\n",
5655 partition_i, partition.layout);
5656 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5657 for (unsigned int order_i = partition.node_begin;
5658 order_i < partition.node_end; ++order_i)
5660 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5661 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5662 (void *) vertex.node);
5663 dump_printf_loc (MSG_NOTE, vect_location,
5664 " weight: %f\n",
5665 vertex.weight.to_double ());
5666 if (vertex.out_degree)
5667 dump_printf_loc (MSG_NOTE, vect_location,
5668 " out weight: %f (degree %d)\n",
5669 vertex.out_weight.to_double (),
5670 vertex.out_degree);
5671 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 " op: VEC_PERM_EXPR\n");
5674 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 " op template: %G", rep->stmt);
5678 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5679 for (unsigned int order_i = partition.node_begin;
5680 order_i < partition.node_end; ++order_i)
5682 unsigned int node_i = m_partitioned_nodes[order_i];
5683 auto &vertex = m_vertices[node_i];
5684 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5686 auto &other_vertex = m_vertices[other_node_i];
5687 if (other_vertex.partition < vertex.partition)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " - %p [%d] --> %p\n",
5690 (void *) other_vertex.node,
5691 other_vertex.partition,
5692 (void *) vertex.node);
5693 else
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 " - %p --> [%d] %p\n",
5696 (void *) vertex.node,
5697 other_vertex.partition,
5698 (void *) other_vertex.node);
5700 for_each_partition_edge (node_i, print_edge);
5703 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5705 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5706 if (layout_costs.is_possible ())
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 " layout %d:%s\n", layout_i,
5710 partition.layout == int (layout_i)
5711 ? " (*)" : "");
5712 slpg_layout_cost combined_cost = layout_costs.in_cost;
5713 combined_cost.add_serial_cost (layout_costs.internal_cost);
5714 combined_cost.add_serial_cost (layout_costs.out_cost);
5715 #define TEMPLATE "{depth: %f, total: %f}"
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 " " TEMPLATE "\n",
5718 layout_costs.in_cost.depth.to_double (),
5719 layout_costs.in_cost.total.to_double ());
5720 dump_printf_loc (MSG_NOTE, vect_location,
5721 " + " TEMPLATE "\n",
5722 layout_costs.internal_cost.depth.to_double (),
5723 layout_costs.internal_cost.total.to_double ());
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " + " TEMPLATE "\n",
5726 layout_costs.out_cost.depth.to_double (),
5727 layout_costs.out_cost.total.to_double ());
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " = " TEMPLATE "\n",
5730 combined_cost.depth.to_double (),
5731 combined_cost.total.to_double ());
5732 #undef TEMPLATE
5734 else
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " layout %d: rejected\n", layout_i);
5741 /* Main entry point for the SLP graph optimization pass. */
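/* In outline: build_graph and create_partitions set up the SLP graph and
   its partitions, start_choosing_layouts collects the candidate layouts,
   and only when more than one layout exists do we run the forward and
   backward passes and materialize the result; otherwise we just remove
   redundant load permutations.  */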
5743 void
5744 vect_optimize_slp_pass::run ()
5746 build_graph ();
5747 create_partitions ();
5748 start_choosing_layouts ();
5749 if (m_perms.length () > 1)
5751 forward_pass ();
5752 backward_pass ();
5753 if (dump_enabled_p ())
5754 dump ();
5755 materialize ();
5756 while (!m_perms.is_empty ())
5757 m_perms.pop ().release ();
5759 else
5760 remove_redundant_permutations ();
5761 free_graph (m_slpg);
5764 /* Optimize the SLP graph of VINFO. */
5766 void
5767 vect_optimize_slp (vec_info *vinfo)
5769 if (vinfo->slp_instances.is_empty ())
5770 return;
5771 vect_optimize_slp_pass (vinfo).run ();
5774 /* Gather loads reachable from the individual SLP graph entries. */
5776 void
5777 vect_gather_slp_loads (vec_info *vinfo)
5779 unsigned i;
5780 slp_instance instance;
5781 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5783 hash_set<slp_tree> visited;
5784 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5785 SLP_INSTANCE_TREE (instance), visited);
5790 /* For each possible SLP instance decide whether to SLP it and calculate overall
5791 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5792 least one instance. */
5794 bool
5795 vect_make_slp_decision (loop_vec_info loop_vinfo)
5797 unsigned int i;
5798 poly_uint64 unrolling_factor = 1;
5799 const vec<slp_instance> &slp_instances
5800 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5801 slp_instance instance;
5802 int decided_to_slp = 0;
5804 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5806 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5808 /* FORNOW: SLP if you can. */
5809 /* All unroll factors have the form:
5811 GET_MODE_SIZE (vinfo->vector_mode) * X
5813 for some rational X, so they must have a common multiple. */
5814 unrolling_factor
5815 = force_common_multiple (unrolling_factor,
5816 SLP_INSTANCE_UNROLLING_FACTOR (instance));
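/* For example (illustrative): instance unrolling factors of 2 and 3 would
   combine here to 6, force_common_multiple presumably computing the least
   common multiple of the two poly_uint64 values.  */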
5818 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5819 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5820 loop-based vectorization. Such stmts will be marked as HYBRID. */
5821 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5822 decided_to_slp++;
5825 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5827 if (decided_to_slp && dump_enabled_p ())
5829 dump_printf_loc (MSG_NOTE, vect_location,
5830 "Decided to SLP %d instances. Unrolling factor ",
5831 decided_to_slp);
5832 dump_dec (MSG_NOTE, unrolling_factor);
5833 dump_printf (MSG_NOTE, "\n");
5836 return (decided_to_slp > 0);
5839 /* Private data for vect_detect_hybrid_slp. */
5840 struct vdhs_data
5842 loop_vec_info loop_vinfo;
5843 vec<stmt_vec_info> *worklist;
5846 /* Walker for walk_gimple_op. */
5848 static tree
5849 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5851 walk_stmt_info *wi = (walk_stmt_info *)data;
5852 vdhs_data *dat = (vdhs_data *)wi->info;
5854 if (wi->is_lhs)
5855 return NULL_TREE;
5857 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5858 if (!def_stmt_info)
5859 return NULL_TREE;
5860 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5861 if (PURE_SLP_STMT (def_stmt_info))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5865 def_stmt_info->stmt);
5866 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5867 dat->worklist->safe_push (def_stmt_info);
5870 return NULL_TREE;
5873 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it
5874 pure_slp if so; otherwise push it to WORKLIST. */
5876 static void
5877 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5878 vec<stmt_vec_info> &worklist,
5879 stmt_vec_info stmt_info)
5881 if (dump_enabled_p ())
5882 dump_printf_loc (MSG_NOTE, vect_location,
5883 "Processing hybrid candidate : %G", stmt_info->stmt);
5884 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5885 imm_use_iterator iter2;
5886 ssa_op_iter iter1;
5887 use_operand_p use_p;
5888 def_operand_p def_p;
5889 bool any_def = false;
5890 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5892 any_def = true;
5893 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5895 if (is_gimple_debug (USE_STMT (use_p)))
5896 continue;
5897 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5898 /* An out-of-loop use means this is a loop_vect sink. */
5899 if (!use_info)
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Found loop_vect sink: %G", stmt_info->stmt);
5904 worklist.safe_push (stmt_info);
5905 return;
5907 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5909 if (dump_enabled_p ())
5910 dump_printf_loc (MSG_NOTE, vect_location,
5911 "Found loop_vect use: %G", use_info->stmt);
5912 worklist.safe_push (stmt_info);
5913 return;
5917 /* No def means this is a loop_vect sink. */
5918 if (!any_def)
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_NOTE, vect_location,
5922 "Found loop_vect sink: %G", stmt_info->stmt);
5923 worklist.safe_push (stmt_info);
5924 return;
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_NOTE, vect_location,
5928 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5929 STMT_SLP_TYPE (stmt_info) = pure_slp;
5932 /* Find stmts that must be both vectorized and SLPed. */
5934 void
5935 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5937 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5939 /* All stmts participating in SLP are marked pure_slp, all other
5940 stmts are loop_vect.
5941 First collect all loop_vect stmts into a worklist.
5942 SLP patterns cause not all original scalar stmts to appear in
5943 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5944 Rectify this here and do a backward walk over the IL, only considering
5945 stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
5946 marking them as pure_slp. */
5947 auto_vec<stmt_vec_info> worklist;
5948 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5950 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5951 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5952 gsi_next (&gsi))
5954 gphi *phi = gsi.phi ();
5955 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5956 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5957 maybe_push_to_hybrid_worklist (loop_vinfo,
5958 worklist, stmt_info);
5960 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5961 gsi_prev (&gsi))
5963 gimple *stmt = gsi_stmt (gsi);
5964 if (is_gimple_debug (stmt))
5965 continue;
5966 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5969 for (gimple_stmt_iterator gsi2
5970 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5971 !gsi_end_p (gsi2); gsi_next (&gsi2))
5973 stmt_vec_info patt_info
5974 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5975 if (!STMT_SLP_TYPE (patt_info)
5976 && STMT_VINFO_RELEVANT (patt_info))
5977 maybe_push_to_hybrid_worklist (loop_vinfo,
5978 worklist, patt_info);
5980 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5982 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5983 maybe_push_to_hybrid_worklist (loop_vinfo,
5984 worklist, stmt_info);
5988 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5989 mark any SLP vectorized stmt as hybrid.
5990 ??? We're visiting def stmts N times (once for each non-SLP and
5991 once for each hybrid-SLP use). */
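/* The walker below marks each still pure-SLP def reached from a loop_vect
   use as hybrid and pushes it back onto the worklist, so the marking
   propagates transitively along use->def chains.  */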
5992 walk_stmt_info wi;
5993 vdhs_data dat;
5994 dat.worklist = &worklist;
5995 dat.loop_vinfo = loop_vinfo;
5996 memset (&wi, 0, sizeof (wi));
5997 wi.info = (void *)&dat;
5998 while (!worklist.is_empty ())
6000 stmt_vec_info stmt_info = worklist.pop ();
6001 /* Since SSA operands are not set up for pattern stmts we need
6002 to use walk_gimple_op. */
6003 wi.is_lhs = 0;
6004 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6005 /* For gather/scatter make sure to walk the offset operand, that
6006 can be a scaling and conversion away. */
6007 gather_scatter_info gs_info;
6008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6009 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6011 int dummy;
6012 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6018 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
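/* A uid of 0 appears to serve as the "inside the region" marker here;
   the destructor below resets the uids to -1.  */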
6020 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6021 : vec_info (vec_info::bb, shared),
6022 bbs (_bbs),
6023 roots (vNULL)
6025 for (unsigned i = 0; i < bbs.length (); ++i)
6027 if (i != 0)
6028 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6029 gsi_next (&si))
6031 gphi *phi = si.phi ();
6032 gimple_set_uid (phi, 0);
6033 add_stmt (phi);
6035 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6036 !gsi_end_p (gsi); gsi_next (&gsi))
6038 gimple *stmt = gsi_stmt (gsi);
6039 gimple_set_uid (stmt, 0);
6040 if (is_gimple_debug (stmt))
6041 continue;
6042 add_stmt (stmt);
6048 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6049 stmts in the basic block. */
6051 _bb_vec_info::~_bb_vec_info ()
6053 /* Reset region marker. */
6054 for (unsigned i = 0; i < bbs.length (); ++i)
6056 if (i != 0)
6057 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6058 gsi_next (&si))
6060 gphi *phi = si.phi ();
6061 gimple_set_uid (phi, -1);
6063 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6064 !gsi_end_p (gsi); gsi_next (&gsi))
6066 gimple *stmt = gsi_stmt (gsi);
6067 gimple_set_uid (stmt, -1);
6071 for (unsigned i = 0; i < roots.length (); ++i)
6073 roots[i].stmts.release ();
6074 roots[i].roots.release ();
6075 roots[i].remain.release ();
6077 roots.release ();
6080 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6081 given that child nodes have already been processed, and that
6082 their def types currently match their SLP node's def type. */
6084 static bool
6085 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6086 slp_instance node_instance,
6087 stmt_vector_for_cost *cost_vec)
6089 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6091 /* Calculate the number of vector statements to be created for the
6092 scalar stmts in this node. For SLP reductions it is equal to the
6093 number of vector statements in the children (which has already been
6094 calculated by the recursive call). Otherwise it is the number of
6095 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6096 VF divided by the number of elements in a vector. */
6097 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6098 && !STMT_VINFO_DATA_REF (stmt_info)
6099 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6101 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6102 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6104 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6105 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6106 break;
6109 else
6111 poly_uint64 vf;
6112 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6113 vf = loop_vinfo->vectorization_factor;
6114 else
6115 vf = 1;
6116 unsigned int group_size = SLP_TREE_LANES (node);
6117 tree vectype = SLP_TREE_VECTYPE (node);
6118 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6119 = vect_get_num_vectors (vf * group_size, vectype);
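/* For example (illustrative): with group_size == 4, VF == 8 and a vector
   type holding 8 elements this yields (4 * 8) / 8 == 4 vector stmts.  */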
6122 /* Handle purely internal nodes. */
6123 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6125 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6126 return false;
6128 stmt_vec_info slp_stmt_info;
6129 unsigned int i;
6130 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6132 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6133 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6134 node_instance, i,
6135 false, cost_vec))
6136 return false;
6138 return true;
6141 bool dummy;
6142 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6143 node, node_instance, cost_vec);
6146 /* Try to build NODE from scalars, returning true on success.
6147 NODE_INSTANCE is the SLP instance that contains NODE. */
6149 static bool
6150 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance)
6153 stmt_vec_info stmt_info;
6154 unsigned int i;
6156 if (!is_a <bb_vec_info> (vinfo)
6157 || node == SLP_INSTANCE_TREE (node_instance)
6158 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6159 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6160 /* Force the mask use to be built from scalars instead. */
6161 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6162 return false;
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "Building vector operands of %p from scalars instead\n",
6167 (void *) node);
6169 /* Don't remove and free the child nodes here, since they could be
6170 referenced by other structures. The analysis and scheduling phases
6171 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6172 unsigned int group_size = SLP_TREE_LANES (node);
6173 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6174 /* Invariants get their vector type from the uses. */
6175 SLP_TREE_VECTYPE (node) = NULL_TREE;
6176 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6177 SLP_TREE_LOAD_PERMUTATION (node).release ();
6178 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6180 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6181 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6183 return true;
6186 /* Return true if all elements of the slice are the same. */
6187 bool
6188 vect_scalar_ops_slice::all_same_p () const
6190 for (unsigned int i = 1; i < length; ++i)
6191 if (!operand_equal_p (op (0), op (i)))
6192 return false;
6193 return true;
6196 hashval_t
6197 vect_scalar_ops_slice_hash::hash (const value_type &s)
6199 hashval_t hash = 0;
6200 for (unsigned i = 0; i < s.length; ++i)
6201 hash = iterative_hash_expr (s.op (i), hash);
6202 return hash;
6205 bool
6206 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6207 const compare_type &s2)
6209 if (s1.length != s2.length)
6210 return false;
6211 for (unsigned i = 0; i < s1.length; ++i)
6212 if (!operand_equal_p (s1.op (i), s2.op (i)))
6213 return false;
6214 return true;
6217 /* Compute the prologue cost for invariant or constant operands represented
6218 by NODE. */
6220 static void
6221 vect_prologue_cost_for_slp (slp_tree node,
6222 stmt_vector_for_cost *cost_vec)
6224 /* There's a special case of an existing vector, which costs nothing. */
6225 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6226 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6227 return;
6228 /* Without looking at the actual initializer a vector of
6229 constants can be implemented as a load from the constant pool.
6230 When all elements are the same we can use a splat. */
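/* Concretely, the loop below records one prologue cost per vector that
   needs constructing: vector_load for constant defs, scalar_to_vec when
   all elements of the slice are the same (a splat), and vec_construct
   otherwise.  */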
6231 tree vectype = SLP_TREE_VECTYPE (node);
6232 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6233 unsigned HOST_WIDE_INT const_nunits;
6234 unsigned nelt_limit;
6235 auto ops = &SLP_TREE_SCALAR_OPS (node);
6236 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6237 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6238 && ! multiple_p (const_nunits, group_size))
6240 nelt_limit = const_nunits;
6241 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6242 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6243 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6244 starts.quick_push (i * const_nunits);
6246 else
6248 /* If either the vector has variable length or the vectors
6249 are composed of repeated whole groups we only need to
6250 cost construction once. All vectors will be the same. */
6251 nelt_limit = group_size;
6252 starts.quick_push (0);
6254 /* ??? We're just tracking whether vectors in a single node are the same.
6255 Ideally we'd do something more global. */
6256 bool passed = false;
6257 for (unsigned int start : starts)
6259 vect_cost_for_stmt kind;
6260 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6261 kind = vector_load;
6262 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6263 kind = scalar_to_vec;
6264 else
6265 kind = vec_construct;
6266 /* The target cost hook has no idea which part of the SLP node
6267 we are costing so avoid passing it down more than once. Pass
6268 it to the first vec_construct or scalar_to_vec part since for those
6269 the x86 backend tries to account for GPR to XMM register moves. */
6270 record_stmt_cost (cost_vec, 1, kind,
6271 (kind != vector_load && !passed) ? node : nullptr,
6272 vectype, 0, vect_prologue);
6273 if (kind != vector_load)
6274 passed = true;
6278 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6279 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6281 Return true if the operations are supported. */
6283 static bool
6284 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6285 slp_instance node_instance,
6286 hash_set<slp_tree> &visited_set,
6287 vec<slp_tree> &visited_vec,
6288 stmt_vector_for_cost *cost_vec)
6290 int i, j;
6291 slp_tree child;
6293 /* Assume we can code-generate all invariants. */
6294 if (!node
6295 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6296 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6297 return true;
6299 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_NOTE, vect_location,
6303 "Failed cyclic SLP reference in %p\n", (void *) node);
6304 return false;
6306 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6308 /* If we already analyzed the exact same set of scalar stmts we're done.
6309 We share the generated vector stmts for those. */
6310 if (visited_set.add (node))
6311 return true;
6312 visited_vec.safe_push (node);
6314 bool res = true;
6315 unsigned visited_rec_start = visited_vec.length ();
6316 unsigned cost_vec_rec_start = cost_vec->length ();
6317 bool seen_non_constant_child = false;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6320 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6321 visited_set, visited_vec,
6322 cost_vec);
6323 if (!res)
6324 break;
6325 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6326 seen_non_constant_child = true;
6328 /* We have difficulty scheduling nodes with just constant
6329 operands and no scalar stmts, since we then cannot compute a stmt
6330 insertion place. */
6331 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6333 if (dump_enabled_p ())
6334 dump_printf_loc (MSG_NOTE, vect_location,
6335 "Cannot vectorize all-constant op node %p\n",
6336 (void *) node);
6337 res = false;
6340 if (res)
6341 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6342 cost_vec);
6343 /* If analysis failed we have to pop all recursive visited nodes
6344 plus ourselves. */
6345 if (!res)
6347 while (visited_vec.length () >= visited_rec_start)
6348 visited_set.remove (visited_vec.pop ());
6349 cost_vec->truncate (cost_vec_rec_start);
6352 /* When the node can be vectorized, cost the invariant nodes it references.
6353 This is not done in DFS order so that the referring node's
6354 vectorizable_* calls can nail down the invariant nodes' vector type
6355 and possibly unshare it if it needs a different vector type than
6356 other referrers. */
6357 if (res)
6358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6359 if (child
6360 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6361 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6362 /* Perform usual caching, note code-generation still
6363 code-gens these nodes multiple times but we expect
6364 to CSE them later. */
6365 && !visited_set.add (child))
6367 visited_vec.safe_push (child);
6368 /* ??? After auditing more code paths make a "default"
6369 and push the vector type from NODE to all children
6370 if it is not already set. */
6371 /* Compute the number of vectors to be generated. */
6372 tree vector_type = SLP_TREE_VECTYPE (child);
6373 if (!vector_type)
6375 /* For shifts with a scalar argument we don't need
6376 to cost or code-generate anything.
6377 ??? Represent this more explicitly. */
6378 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6379 == shift_vec_info_type)
6380 && j == 1);
6381 continue;
6383 unsigned group_size = SLP_TREE_LANES (child);
6384 poly_uint64 vf = 1;
6385 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6386 vf = loop_vinfo->vectorization_factor;
6387 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6388 = vect_get_num_vectors (vf * group_size, vector_type);
6389 /* And cost them. */
6390 vect_prologue_cost_for_slp (child, cost_vec);
6393 /* If this node or any of its children can't be vectorized, try pruning
6394 the tree here rather than felling the whole thing. */
6395 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6397 /* We'll need to revisit this for invariant costing and for setting
6398 the number of vectorized stmts. */
6399 res = true;
6402 return res;
6405 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6406 region and that can be vectorized using vectorizable_live_operation
6407 with STMT_VINFO_LIVE_P. Live operations that are not handled cause
6408 the scalar code computing them to be retained. */
6410 static void
6411 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6412 slp_instance instance,
6413 stmt_vector_for_cost *cost_vec,
6414 hash_set<stmt_vec_info> &svisited,
6415 hash_set<slp_tree> &visited)
6417 if (visited.add (node))
6418 return;
6420 unsigned i;
6421 stmt_vec_info stmt_info;
6422 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6423 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6425 if (svisited.contains (stmt_info))
6426 continue;
6427 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6428 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6429 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6430 /* Only the pattern root stmt computes the original scalar value. */
6431 continue;
6432 bool mark_visited = true;
6433 gimple *orig_stmt = orig_stmt_info->stmt;
6434 ssa_op_iter op_iter;
6435 def_operand_p def_p;
6436 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6438 imm_use_iterator use_iter;
6439 gimple *use_stmt;
6440 stmt_vec_info use_stmt_info;
6441 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6442 if (!is_gimple_debug (use_stmt))
6444 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6445 if (!use_stmt_info
6446 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6448 STMT_VINFO_LIVE_P (stmt_info) = true;
6449 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6450 node, instance, i,
6451 false, cost_vec))
6452 /* ??? So we know we can vectorize the live stmt
6453 from one SLP node. If we cannot do so from all
6454 or none consistently we'd have to record which
6455 SLP node (and lane) we want to use for the live
6456 operation. So make sure we can code-generate
6457 from all nodes. */
6458 mark_visited = false;
6459 else
6460 STMT_VINFO_LIVE_P (stmt_info) = false;
6461 break;
6464 /* We have to verify whether we can insert the lane extract
6465 before all uses. The following is a conservative approximation.
6466 We cannot put this into vectorizable_live_operation because
6467 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6468 doesn't work.
6469 Note that while the fact that we emit code for loads at the
6470 first load should make this a non-problem, leaves we construct
6471 from scalars are vectorized after the last scalar def.
6472 ??? If we'd actually compute the insert location during
6473 analysis we could use sth less conservative than the last
6474 scalar stmt in the node for the dominance check. */
6475 /* ??? What remains is "live" uses in vector CTORs in the same
6476 SLP graph which is where those uses can end up code-generated
6477 right after their definition instead of close to their original
6478 use. But that would restrict us to code-generate lane-extracts
6479 from the latest stmt in a node. So we compensate for this
6480 during code-generation, simply not replacing uses for those
6481 hopefully rare cases. */
6482 if (STMT_VINFO_LIVE_P (stmt_info))
6483 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6484 if (!is_gimple_debug (use_stmt)
6485 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6486 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6487 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "Cannot determine insertion place for "
6492 "lane extract\n");
6493 STMT_VINFO_LIVE_P (stmt_info) = false;
6494 mark_visited = true;
6497 if (mark_visited)
6498 svisited.add (stmt_info);
6501 slp_tree child;
6502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6503 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6504 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6505 cost_vec, svisited, visited);
6508 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6510 static bool
6511 vectorizable_bb_reduc_epilogue (slp_instance instance,
6512 stmt_vector_for_cost *cost_vec)
6514 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6515 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6516 if (reduc_code == MINUS_EXPR)
6517 reduc_code = PLUS_EXPR;
6518 internal_fn reduc_fn;
6519 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6520 if (!vectype
6521 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6522 || reduc_fn == IFN_LAST
6523 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6524 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6525 TREE_TYPE (vectype)))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "not vectorized: basic block reduction epilogue "
6530 "operation unsupported.\n");
6531 return false;
6534 /* There's no way to cost a horizontal vector reduction via REDUC_FN, so
6535 cost log2 vector operations plus shuffles and one extraction. */
6536 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6537 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6538 vectype, 0, vect_body);
6539 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6540 vectype, 0, vect_body);
6541 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6542 vectype, 0, vect_body);
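/* For example (illustrative): a 4-lane vector gives
   steps == floor_log2 (4) == 2, so we cost 2 vector_stmts, 2 vec_perms
   and a single vec_to_scalar extraction.  */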
6544 /* Since we replace all stmts of a possibly longer scalar reduction
6545 chain, account for the extra scalar stmts for that. */
6546 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6547 instance->root_stmts[0], 0, vect_body);
6548 return true;
6551 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6552 and recurse to children. */
6554 static void
6555 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6556 hash_set<slp_tree> &visited)
6558 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6559 || visited.add (node))
6560 return;
6562 stmt_vec_info stmt;
6563 unsigned i;
6564 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6565 roots.remove (vect_orig_stmt (stmt));
6567 slp_tree child;
6568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6569 if (child)
6570 vect_slp_prune_covered_roots (child, roots, visited);
6573 /* Analyze statements in SLP instances of VINFO. Return true if the
6574 operations are supported. */
6576 bool
6577 vect_slp_analyze_operations (vec_info *vinfo)
6579 slp_instance instance;
6580 int i;
6582 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6584 hash_set<slp_tree> visited;
6585 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6587 auto_vec<slp_tree> visited_vec;
6588 stmt_vector_for_cost cost_vec;
6589 cost_vec.create (2);
6590 if (is_a <bb_vec_info> (vinfo))
6591 vect_location = instance->location ();
6592 if (!vect_slp_analyze_node_operations (vinfo,
6593 SLP_INSTANCE_TREE (instance),
6594 instance, visited, visited_vec,
6595 &cost_vec)
6596 /* CTOR instances require vectorized defs for the SLP tree root. */
6597 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6598 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6599 != vect_internal_def
6600 /* Make sure we vectorized with the expected type. */
6601 || !useless_type_conversion_p
6602 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6603 (instance->root_stmts[0]->stmt))),
6604 TREE_TYPE (SLP_TREE_VECTYPE
6605 (SLP_INSTANCE_TREE (instance))))))
6606 /* Check we can vectorize the reduction. */
6607 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6608 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6610 slp_tree node = SLP_INSTANCE_TREE (instance);
6611 stmt_vec_info stmt_info;
6612 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6613 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6614 else
6615 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6616 if (dump_enabled_p ())
6617 dump_printf_loc (MSG_NOTE, vect_location,
6618 "removing SLP instance operations starting from: %G",
6619 stmt_info->stmt);
6620 vect_free_slp_instance (instance);
6621 vinfo->slp_instances.ordered_remove (i);
6622 cost_vec.release ();
6623 while (!visited_vec.is_empty ())
6624 visited.remove (visited_vec.pop ());
6626 else
6628 i++;
6629 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6631 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6632 cost_vec.release ();
6634 else
6635 /* For BB vectorization remember the SLP graph entry
6636 cost for later. */
6637 instance->cost_vec = cost_vec;
6641 /* Now look for SLP instances with a root that are covered by other
6642 instances and remove them. */
6643 hash_set<stmt_vec_info> roots;
6644 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6645 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6646 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6647 if (!roots.is_empty ())
6649 visited.empty ();
6650 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6651 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6652 visited);
6653 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6654 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6655 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6657 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_NOTE, vect_location,
6660 "removing SLP instance operations starting "
6661 "from: %G", root->stmt);
6662 vect_free_slp_instance (instance);
6663 vinfo->slp_instances.ordered_remove (i);
6665 else
6666 ++i;
6669 /* Compute vectorizable live stmts. */
6670 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6672 hash_set<stmt_vec_info> svisited;
6673 hash_set<slp_tree> visited;
6674 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6676 vect_location = instance->location ();
6677 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6678 instance, &instance->cost_vec, svisited,
6679 visited);
6683 return !vinfo->slp_instances.is_empty ();
6686 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6687 transitively compressing the leader chain along the way. */
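/* For example (illustrative): with a leader chain A -> B -> C -> C this
   returns C and, union-find style, rewrites the entries for A and B to
   point directly at C.  */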
6689 static slp_instance
6690 get_ultimate_leader (slp_instance instance,
6691 hash_map<slp_instance, slp_instance> &instance_leader)
6693 auto_vec<slp_instance *, 8> chain;
6694 slp_instance *tem;
6695 while (*(tem = instance_leader.get (instance)) != instance)
6697 chain.safe_push (tem);
6698 instance = *tem;
6700 while (!chain.is_empty ())
6701 *chain.pop () = instance;
6702 return instance;
6705 namespace {
6706 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6707 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6708 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6710 INSTANCE_LEADER is as for get_ultimate_leader. */
6712 template<typename T>
6713 bool
6714 vect_map_to_instance (slp_instance instance, T key,
6715 hash_map<T, slp_instance> &key_to_instance,
6716 hash_map<slp_instance, slp_instance> &instance_leader)
6718 bool existed_p;
6719 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6720 if (!existed_p)
6722 else if (key_instance != instance)
6724 /* If we run into a previously marked key, make us the leader
6725 of its current ultimate leader. This keeps the
6726 leader chain acyclic and works even when the current instance
6727 connects two previously independent graph parts. */
6728 slp_instance key_leader
6729 = get_ultimate_leader (key_instance, instance_leader);
6730 if (key_leader != instance)
6731 instance_leader.put (key_leader, instance);
6733 key_instance = instance;
6734 return existed_p;
6738 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6740 static void
6741 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6742 slp_instance instance, slp_tree node,
6743 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6744 hash_map<slp_tree, slp_instance> &node_to_instance,
6745 hash_map<slp_instance, slp_instance> &instance_leader)
6747 stmt_vec_info stmt_info;
6748 unsigned i;
6750 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6751 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6752 instance_leader);
6754 if (vect_map_to_instance (instance, node, node_to_instance,
6755 instance_leader))
6756 return;
6758 slp_tree child;
6759 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6760 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6761 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6762 node_to_instance, instance_leader);
6765 /* Partition the SLP graph into pieces that can be costed independently. */
6767 static void
6768 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6770 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6772 /* First walk the SLP graph, assigning each involved scalar stmt a
6773 corresponding SLP graph entry, and upon visiting a previously
6774 marked stmt, make that stmt's leader the current SLP graph entry. */
6775 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6776 hash_map<slp_tree, slp_instance> node_to_instance;
6777 hash_map<slp_instance, slp_instance> instance_leader;
6778 slp_instance instance;
6779 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6781 instance_leader.put (instance, instance);
6782 vect_bb_partition_graph_r (bb_vinfo,
6783 instance, SLP_INSTANCE_TREE (instance),
6784 stmt_to_instance, node_to_instance,
6785 instance_leader);
6788 /* Then collect entries to each independent subgraph. */
6789 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6791 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6792 leader->subgraph_entries.safe_push (instance);
6793 if (dump_enabled_p ()
6794 && leader != instance)
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 "instance %p is leader of %p\n",
6797 (void *) leader, (void *) instance);
6801 /* Compute the set of scalar stmts participating in internal and external
6802 nodes. */
6804 static void
6805 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6806 hash_set<slp_tree> &visited,
6807 hash_set<stmt_vec_info> &vstmts,
6808 hash_set<stmt_vec_info> &estmts)
6810 int i;
6811 stmt_vec_info stmt_info;
6812 slp_tree child;
6814 if (visited.add (node))
6815 return;
6817 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6819 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6820 vstmts.add (stmt_info);
6822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6823 if (child)
6824 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6825 vstmts, estmts);
6827 else
6828 for (tree def : SLP_TREE_SCALAR_OPS (node))
6830 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6831 if (def_stmt)
6832 estmts.add (def_stmt);
6837 /* Compute the scalar cost of the SLP node NODE and its children,
6838 recording it in COST_VEC. Do not account defs that are marked in
6839 LIFE, and update LIFE according to uses of NODE. */
6841 static void
6842 vect_bb_slp_scalar_cost (vec_info *vinfo,
6843 slp_tree node, vec<bool, va_heap> *life,
6844 stmt_vector_for_cost *cost_vec,
6845 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6846 hash_set<slp_tree> &visited)
6848 unsigned i;
6849 stmt_vec_info stmt_info;
6850 slp_tree child;
6852 if (visited.add (node))
6853 return;
6855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6857 ssa_op_iter op_iter;
6858 def_operand_p def_p;
6860 if ((*life)[i])
6861 continue;
6863 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6864 gimple *orig_stmt = orig_stmt_info->stmt;
6866 /* If there is a non-vectorized use of the defs then the scalar
6867 stmt is kept live in which case we do not account it or any
6868 required defs in the SLP children in the scalar cost. This
6869 way we make the vectorization more costly when compared to
6870 the scalar cost. */
6871 if (!STMT_VINFO_LIVE_P (stmt_info))
6873 auto_vec<gimple *, 8> worklist;
6874 hash_set<gimple *> *worklist_visited = NULL;
6875 worklist.quick_push (orig_stmt);
6878 gimple *work_stmt = worklist.pop ();
6879 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6881 imm_use_iterator use_iter;
6882 gimple *use_stmt;
6883 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6884 DEF_FROM_PTR (def_p))
6885 if (!is_gimple_debug (use_stmt))
6887 stmt_vec_info use_stmt_info
6888 = vinfo->lookup_stmt (use_stmt);
6889 if (!use_stmt_info
6890 || !vectorized_scalar_stmts.contains (use_stmt_info))
6892 if (use_stmt_info
6893 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6895 /* For stmts participating in patterns we have
6896 to check its uses recursively. */
6897 if (!worklist_visited)
6898 worklist_visited = new hash_set<gimple *> ();
6899 if (!worklist_visited->add (use_stmt))
6900 worklist.safe_push (use_stmt);
6901 continue;
6903 (*life)[i] = true;
6904 goto next_lane;
6909 while (!worklist.is_empty ());
6910 next_lane:
6911 if (worklist_visited)
6912 delete worklist_visited;
6913 if ((*life)[i])
6914 continue;
6917 /* Count scalar stmts only once. */
6918 if (gimple_visited_p (orig_stmt))
6919 continue;
6920 gimple_set_visited (orig_stmt, true);
6922 vect_cost_for_stmt kind;
6923 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6925 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6926 kind = scalar_load;
6927 else
6928 kind = scalar_store;
6930 else if (vect_nop_conversion_p (orig_stmt_info))
6931 continue;
6932 /* For single-argument PHIs assume coalescing which means zero cost
6933 for the scalar and the vector PHIs. This avoids artificially
6934 favoring the vector path (but may pessimize it in some cases). */
6935 else if (is_a <gphi *> (orig_stmt_info->stmt)
6936 && gimple_phi_num_args
6937 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6938 continue;
6939 else
6940 kind = scalar_stmt;
6941 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6942 SLP_TREE_VECTYPE (node), 0, vect_body);
6945 auto_vec<bool, 20> subtree_life;
6946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6948 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6950 /* Do not directly pass LIFE to the recursive call, copy it to
6951 confine changes in the callee to the current child/subtree. */
6952 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6954 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6955 for (unsigned j = 0;
6956 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6958 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6959 if (perm.first == i)
6960 subtree_life[perm.second] = (*life)[j];
6963 else
6965 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6966 subtree_life.safe_splice (*life);
6968 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6969 vectorized_scalar_stmts, visited);
6970 subtree_life.truncate (0);
6975 /* Comparator for the loop-index sorted cost vectors. */
6977 static int
6978 li_cost_vec_cmp (const void *a_, const void *b_)
6980 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6981 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6982 if (a->first < b->first)
6983 return -1;
6984 else if (a->first == b->first)
6985 return 0;
6986 return 1;
6989 /* Check if vectorization of the basic block is profitable for the
6990 subgraph denoted by SLP_INSTANCES. */
6992 static bool
6993 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6994 vec<slp_instance> slp_instances,
6995 loop_p orig_loop)
6997 slp_instance instance;
6998 int i;
6999 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7000 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7002 if (dump_enabled_p ())
7004 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7005 hash_set<slp_tree> visited;
7006 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7007 vect_print_slp_graph (MSG_NOTE, vect_location,
7008 SLP_INSTANCE_TREE (instance), visited);
7011 /* Compute the set of scalar stmts we know will go away 'locally' when
7012 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7013 not accurate for nodes promoted extern late or for scalar stmts that
7014 are used both in extern defs and in vectorized defs. */
7015 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7016 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7017 hash_set<slp_tree> visited;
7018 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7020 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7021 SLP_INSTANCE_TREE (instance),
7022 visited,
7023 vectorized_scalar_stmts,
7024 scalar_stmts_in_externs);
7025 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7026 vectorized_scalar_stmts.add (rstmt);
7028 /* Scalar stmts used as defs in external nodes need to be preserved, so
7029 remove them from vectorized_scalar_stmts. */
7030 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7031 vectorized_scalar_stmts.remove (stmt);
7033 /* Calculate scalar cost and sum the cost for the vector stmts
7034 previously collected. */
7035 stmt_vector_for_cost scalar_costs = vNULL;
7036 stmt_vector_for_cost vector_costs = vNULL;
7037 visited.empty ();
7038 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7040 auto_vec<bool, 20> life;
7041 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7042 true);
7043 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7044 record_stmt_cost (&scalar_costs,
7045 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7046 scalar_stmt,
7047 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7048 vect_bb_slp_scalar_cost (bb_vinfo,
7049 SLP_INSTANCE_TREE (instance),
7050 &life, &scalar_costs, vectorized_scalar_stmts,
7051 visited);
7052 vector_costs.safe_splice (instance->cost_vec);
7053 instance->cost_vec.release ();
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7059 /* When costing non-loop vectorization we need to consider each covered
7060 loop independently and make sure vectorization is profitable. For
7061 now we assume a loop may not be entered or may be executed an
7062 arbitrary number of iterations (??? static information can provide
7063 more precise info here), which means we can simply cost each
7064 containing loop's stmts separately. */
7066 /* First produce cost vectors sorted by loop index. */
7067 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7068 li_scalar_costs (scalar_costs.length ());
7069 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7070 li_vector_costs (vector_costs.length ());
7071 stmt_info_for_cost *cost;
7072 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7074 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7075 li_scalar_costs.quick_push (std::make_pair (l, cost));
7077 /* Use an arbitrary used loop as fallback in case the first vector_costs
7078 entry does not have a stmt_info associated with it. */
7079 unsigned l = li_scalar_costs[0].first;
7080 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7082 /* We inherit the loop from the previous COST; invariants, externals
7083 and extracts immediately follow the cost for the related stmt. */
7084 if (cost->stmt_info)
7085 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7086 li_vector_costs.quick_push (std::make_pair (l, cost));
7088 li_scalar_costs.qsort (li_cost_vec_cmp);
7089 li_vector_costs.qsort (li_cost_vec_cmp);
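/* Illustrative sketch of the partitioning (made-up loop numbers): if the
   scalar entries were tagged with loop numbers {1, 1, 2} and the vector
   entries with {1, 2}, the loop below compares the two scalar costs of
   loop 1 against the single vector cost of loop 1, then loop 2 against
   loop 2; a scalar loop number without a vector counterpart is skipped
   as described in the mismatch case further down.  */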
7091 /* Now cost the portions individually. */
7092 unsigned vi = 0;
7093 unsigned si = 0;
7094 bool profitable = true;
7095 while (si < li_scalar_costs.length ()
7096 && vi < li_vector_costs.length ())
7098 unsigned sl = li_scalar_costs[si].first;
7099 unsigned vl = li_vector_costs[vi].first;
7100 if (sl != vl)
7102 if (dump_enabled_p ())
7103 dump_printf_loc (MSG_NOTE, vect_location,
7104 "Scalar %d and vector %d loop part do not "
7105 "match up, skipping scalar part\n", sl, vl);
7106 /* Skip the scalar part, assuming zero cost on the vector side. */
7109 si++;
7111 while (si < li_scalar_costs.length ()
7112 && li_scalar_costs[si].first == sl);
7113 continue;
7116 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7119 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7120 si++;
7122 while (si < li_scalar_costs.length ()
7123 && li_scalar_costs[si].first == sl);
7124 unsigned dummy;
7125 finish_cost (scalar_target_cost_data, nullptr,
7126 &dummy, &scalar_cost, &dummy);
7128 /* Complete the target-specific vector cost calculation. */
7129 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7132 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7133 vi++;
7135 while (vi < li_vector_costs.length ()
7136 && li_vector_costs[vi].first == vl);
7137 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7138 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7139 delete scalar_target_cost_data;
7140 delete vect_target_cost_data;
7142 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7144 if (dump_enabled_p ())
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 "Cost model analysis for part in loop %d:\n", sl);
7148 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7149 vec_inside_cost + vec_outside_cost);
7150 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7153 /* Vectorization is profitable if its cost is not more than the cost of
7154 the scalar version. Note that we err on the vector side for equal
7155 cost because the cost estimate is otherwise quite pessimistic
7156 (constant uses are free on the scalar side but cost a load on the
7157 vector side for example). */
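/* As a made-up numeric example of the test below: with scalar_cost == 6,
   vec_inside_cost == 4 and vec_prologue_cost == vec_epilogue_cost == 1
   the vector side sums to 6, which is not greater than the scalar cost,
   so this part stays profitable; bumping vec_inside_cost to 5 makes the
   sum 7 > 6 and the whole subgraph is given up on.  */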
7158 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7160 profitable = false;
7161 break;
7164 if (profitable && vi < li_vector_costs.length ())
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "Excess vector cost for part in loop %d:\n",
7169 li_vector_costs[vi].first);
7170 profitable = false;
7173 /* Unset visited flag. This is delayed when the subgraph is profitable
7174 and we process the loop for remaining unvectorized if-converted code. */
7175 if (!orig_loop || !profitable)
7176 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7177 gimple_set_visited (cost->stmt_info->stmt, false);
7179 scalar_costs.release ();
7180 vector_costs.release ();
7182 return profitable;
7185 /* qsort comparator for lane defs. */
7187 static int
7188 vld_cmp (const void *a_, const void *b_)
7190 auto *a = (const std::pair<unsigned, tree> *)a_;
7191 auto *b = (const std::pair<unsigned, tree> *)b_;
7192 return a->first - b->first;
7195 /* Return true if USE_STMT is a vector lane insert into VEC and set
7196 *THIS_LANE to the lane number that is set. */
7198 static bool
7199 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7201 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7202 if (!use_ass
7203 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7204 || (vec
7205 ? gimple_assign_rhs1 (use_ass) != vec
7206 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7207 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7208 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7209 || !constant_multiple_p
7210 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7211 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7212 this_lane))
7213 return false;
7214 return true;
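/* Illustrative example (hypothetical SSA names, V4SImode destination):
   the statement

     vec_1 = BIT_INSERT_EXPR <vec_0, val_2, 64>;

   inserts a 32-bit element at bit position 64, so the function above
   returns true and sets *THIS_LANE to 2.  A bit position that is not a
   multiple of the element size fails the constant_multiple_p check and
   makes it return false.  */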
7217 /* Find any vectorizable constructors and other SLP roots in the basic
7218 blocks of BB_VINFO and record them in its roots array. */
7220 static void
7221 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7223 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7225 !gsi_end_p (gsi); gsi_next (&gsi))
7227 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7228 if (!assign)
7229 continue;
7231 tree rhs = gimple_assign_rhs1 (assign);
7232 enum tree_code code = gimple_assign_rhs_code (assign);
7233 use_operand_p use_p;
7234 gimple *use_stmt;
7235 if (code == CONSTRUCTOR)
7237 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7238 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7239 CONSTRUCTOR_NELTS (rhs))
7240 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7241 || uniform_vector_p (rhs))
7242 continue;
7244 unsigned j;
7245 tree val;
7246 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7247 if (TREE_CODE (val) != SSA_NAME
7248 || !bb_vinfo->lookup_def (val))
7249 break;
7250 if (j != CONSTRUCTOR_NELTS (rhs))
7251 continue;
7253 vec<stmt_vec_info> roots = vNULL;
7254 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7255 vec<stmt_vec_info> stmts;
7256 stmts.create (CONSTRUCTOR_NELTS (rhs));
7257 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7258 stmts.quick_push
7259 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7260 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7261 stmts, roots));
7263 else if (code == BIT_INSERT_EXPR
7264 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7265 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7266 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7267 && integer_zerop (gimple_assign_rhs3 (assign))
7268 && useless_type_conversion_p
7269 (TREE_TYPE (TREE_TYPE (rhs)),
7270 TREE_TYPE (gimple_assign_rhs2 (assign)))
7271 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7273 /* We start matching on an insert to lane zero, but since the
7274 inserts need not be ordered we have to search both
7275 the def and the use chains. */
7276 tree vectype = TREE_TYPE (rhs);
7277 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7278 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7279 auto_sbitmap lanes (nlanes);
7280 bitmap_clear (lanes);
7281 bitmap_set_bit (lanes, 0);
7282 tree def = gimple_assign_lhs (assign);
7283 lane_defs.quick_push
7284 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7285 unsigned lanes_found = 1;
7286 /* Start with the use chains, the last stmt will be the root. */
7287 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7288 vec<stmt_vec_info> roots = vNULL;
7289 roots.safe_push (last);
7292 use_operand_p use_p;
7293 gimple *use_stmt;
7294 if (!single_imm_use (def, &use_p, &use_stmt))
7295 break;
7296 unsigned this_lane;
7297 if (!bb_vinfo->lookup_stmt (use_stmt)
7298 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7299 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7300 break;
7301 if (bitmap_bit_p (lanes, this_lane))
7302 break;
7303 lanes_found++;
7304 bitmap_set_bit (lanes, this_lane);
7305 gassign *use_ass = as_a <gassign *> (use_stmt);
7306 lane_defs.quick_push (std::make_pair
7307 (this_lane, gimple_assign_rhs2 (use_ass)));
7308 last = bb_vinfo->lookup_stmt (use_ass);
7309 roots.safe_push (last);
7310 def = gimple_assign_lhs (use_ass);
7312 while (lanes_found < nlanes);
7313 if (roots.length () > 1)
7314 std::swap (roots[0], roots[roots.length () - 1]);
7315 if (lanes_found < nlanes)
7317 /* Now search the def chain. */
7318 def = gimple_assign_rhs1 (assign);
7321 if (TREE_CODE (def) != SSA_NAME
7322 || !has_single_use (def))
7323 break;
7324 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7325 unsigned this_lane;
7326 if (!bb_vinfo->lookup_stmt (def_stmt)
7327 || !vect_slp_is_lane_insert (def_stmt,
7328 NULL_TREE, &this_lane)
7329 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7330 break;
7331 if (bitmap_bit_p (lanes, this_lane))
7332 break;
7333 lanes_found++;
7334 bitmap_set_bit (lanes, this_lane);
7335 lane_defs.quick_push (std::make_pair
7336 (this_lane,
7337 gimple_assign_rhs2 (def_stmt)));
7338 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7339 def = gimple_assign_rhs1 (def_stmt);
7341 while (lanes_found < nlanes);
7343 if (lanes_found == nlanes)
7345 /* Sort lane_defs by lane index and register the root. */
7346 lane_defs.qsort (vld_cmp);
7347 vec<stmt_vec_info> stmts;
7348 stmts.create (nlanes);
7349 for (unsigned i = 0; i < nlanes; ++i)
7350 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7351 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7352 stmts, roots));
7354 else
7355 roots.release ();
7357 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7358 && (associative_tree_code (code) || code == MINUS_EXPR)
7359 /* ??? This pessimizes a two-element reduction. PR54400.
7360 ??? In-order reduction could be handled if we only
7361 traverse one operand chain in vect_slp_linearize_chain. */
7362 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7363 /* Ops with constants at the tail can be stripped here. */
7364 && TREE_CODE (rhs) == SSA_NAME
7365 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7366 /* Should be the chain end. */
7367 && (!single_imm_use (gimple_assign_lhs (assign),
7368 &use_p, &use_stmt)
7369 || !is_gimple_assign (use_stmt)
7370 || (gimple_assign_rhs_code (use_stmt) != code
7371 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7372 || (gimple_assign_rhs_code (use_stmt)
7373 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7375 /* We start the match at the end of a possible association
7376 chain. */
7377 auto_vec<chain_op_t> chain;
7378 auto_vec<std::pair<tree_code, gimple *> > worklist;
7379 auto_vec<gimple *> chain_stmts;
7380 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7381 if (code == MINUS_EXPR)
7382 code = PLUS_EXPR;
7383 internal_fn reduc_fn;
7384 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7385 || reduc_fn == IFN_LAST)
7386 continue;
7387 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7388 /* ??? */
7389 code_stmt, alt_code_stmt, &chain_stmts);
7390 if (chain.length () > 1)
7392 /* Sort the chain according to def_type and operation. */
7393 chain.sort (dt_sort_cmp, bb_vinfo);
7394 /* ??? Now we'd want to strip externals and constants
7395 but record those to be handled in the epilogue. */
7396 /* ??? For now do not allow mixing ops or externs/constants. */
7397 bool invalid = false;
7398 unsigned remain_cnt = 0;
7399 for (unsigned i = 0; i < chain.length (); ++i)
7401 if (chain[i].code != code)
7403 invalid = true;
7404 break;
7406 if (chain[i].dt != vect_internal_def)
7407 remain_cnt++;
7409 if (!invalid && chain.length () - remain_cnt > 1)
7411 vec<stmt_vec_info> stmts;
7412 vec<tree> remain = vNULL;
7413 stmts.create (chain.length ());
7414 if (remain_cnt > 0)
7415 remain.create (remain_cnt);
7416 for (unsigned i = 0; i < chain.length (); ++i)
7418 if (chain[i].dt == vect_internal_def)
7419 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7420 else
7421 remain.quick_push (chain[i].op);
7423 vec<stmt_vec_info> roots;
7424 roots.create (chain_stmts.length ());
7425 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7426 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7427 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7428 stmts, roots, remain));
7435 /* Walk the grouped store chains and replace entries with their
7436 pattern variant if any. */
7438 static void
7439 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7441 stmt_vec_info first_element;
7442 unsigned i;
7444 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7446 /* We also have CTORs in this array. */
7447 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7448 continue;
7449 if (STMT_VINFO_IN_PATTERN_P (first_element))
7451 stmt_vec_info orig = first_element;
7452 first_element = STMT_VINFO_RELATED_STMT (first_element);
7453 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7454 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7455 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7456 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7457 vinfo->grouped_stores[i] = first_element;
7459 stmt_vec_info prev = first_element;
7460 while (DR_GROUP_NEXT_ELEMENT (prev))
7462 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7463 if (STMT_VINFO_IN_PATTERN_P (elt))
7465 stmt_vec_info orig = elt;
7466 elt = STMT_VINFO_RELATED_STMT (elt);
7467 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7468 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7469 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7471 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7472 prev = elt;
7477 /* Check if the region described by BB_VINFO can be vectorized, returning
7478 true if so. When returning false, set FATAL to true if the same failure
7479 would prevent vectorization at other vector sizes, false if it is still
7480 worth trying other sizes. N_STMTS is the number of statements in the
7481 region. */
7483 static bool
7484 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7485 vec<int> *dataref_groups)
7487 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7489 slp_instance instance;
7490 int i;
7491 poly_uint64 min_vf = 2;
7493 /* The first group of checks is independent of the vector size. */
7494 fatal = true;
7496 /* Analyze the data references. */
7498 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7500 if (dump_enabled_p ())
7501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7502 "not vectorized: unhandled data-ref in basic "
7503 "block.\n");
7504 return false;
7507 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7509 if (dump_enabled_p ())
7510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 "not vectorized: unhandled data access in "
7512 "basic block.\n");
7513 return false;
7516 vect_slp_check_for_roots (bb_vinfo);
7518 /* If there are no grouped stores and no constructors in the region
7519 there is no need to continue with pattern recog as vect_analyze_slp
7520 will fail anyway. */
7521 if (bb_vinfo->grouped_stores.is_empty ()
7522 && bb_vinfo->roots.is_empty ())
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "not vectorized: no grouped stores in "
7527 "basic block.\n");
7528 return false;
7531 /* The rest of the analysis below depends on the vector size in some way. */
7532 fatal = false;
7534 vect_pattern_recog (bb_vinfo);
7536 /* Update store groups from pattern processing. */
7537 vect_fixup_store_groups_with_patterns (bb_vinfo);
7539 /* Check the SLP opportunities in the basic block, analyze and build SLP
7540 trees. */
7541 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7543 if (dump_enabled_p ())
7545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 "Failed to SLP the basic block.\n");
7547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7548 "not vectorized: failed to find SLP opportunities "
7549 "in basic block.\n");
7551 return false;
7554 /* Optimize permutations. */
7555 vect_optimize_slp (bb_vinfo);
7557 /* Gather the loads reachable from the SLP graph entries. */
7558 vect_gather_slp_loads (bb_vinfo);
7560 vect_record_base_alignments (bb_vinfo);
7562 /* Analyze and verify the alignment of data references and the
7563 dependence in the SLP instances. */
7564 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7566 vect_location = instance->location ();
7567 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7568 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7570 slp_tree node = SLP_INSTANCE_TREE (instance);
7571 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_NOTE, vect_location,
7574 "removing SLP instance operations starting from: %G",
7575 stmt_info->stmt);
7576 vect_free_slp_instance (instance);
7577 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7578 continue;
7581 /* Mark all the statements that we want to vectorize as pure SLP and
7582 relevant. */
7583 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7584 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7585 unsigned j;
7586 stmt_vec_info root;
7587 /* Likewise consider instance root stmts as vectorized. */
7588 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7589 STMT_SLP_TYPE (root) = pure_slp;
7591 i++;
7593 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7594 return false;
7596 if (!vect_slp_analyze_operations (bb_vinfo))
7598 if (dump_enabled_p ())
7599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600 "not vectorized: bad operation in basic block.\n");
7601 return false;
7604 vect_bb_partition_graph (bb_vinfo);
7606 return true;
7609 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7610 basic blocks in BBS, returning true on success.
7611 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7613 static bool
7614 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7615 vec<int> *dataref_groups, unsigned int n_stmts,
7616 loop_p orig_loop)
7618 bb_vec_info bb_vinfo;
7619 auto_vector_modes vector_modes;
7621 /* Autodetect first vector size we try. */
7622 machine_mode next_vector_mode = VOIDmode;
7623 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7624 unsigned int mode_i = 0;
7626 vec_info_shared shared;
7628 machine_mode autodetected_vector_mode = VOIDmode;
7629 while (1)
7631 bool vectorized = false;
7632 bool fatal = false;
7633 bb_vinfo = new _bb_vec_info (bbs, &shared);
7635 bool first_time_p = shared.datarefs.is_empty ();
7636 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7637 if (first_time_p)
7638 bb_vinfo->shared->save_datarefs ();
7639 else
7640 bb_vinfo->shared->check_datarefs ();
7641 bb_vinfo->vector_mode = next_vector_mode;
7643 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7645 if (dump_enabled_p ())
7647 dump_printf_loc (MSG_NOTE, vect_location,
7648 "***** Analysis succeeded with vector mode"
7649 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7650 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7653 bb_vinfo->shared->check_datarefs ();
7655 bool force_clear = false;
7656 auto_vec<slp_instance> profitable_subgraphs;
7657 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7659 if (instance->subgraph_entries.is_empty ())
7660 continue;
7662 dump_user_location_t saved_vect_location = vect_location;
7663 vect_location = instance->location ();
7664 if (!unlimited_cost_model (NULL)
7665 && !vect_bb_vectorization_profitable_p
7666 (bb_vinfo, instance->subgraph_entries, orig_loop))
7668 if (dump_enabled_p ())
7669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7670 "not vectorized: vectorization is not "
7671 "profitable.\n");
7672 vect_location = saved_vect_location;
7673 continue;
7676 vect_location = saved_vect_location;
7677 if (!dbg_cnt (vect_slp))
7679 force_clear = true;
7680 continue;
7683 profitable_subgraphs.safe_push (instance);
7686 /* When we're vectorizing an if-converted loop body make sure
7687 we vectorized all if-converted code. */
7688 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7690 gcc_assert (bb_vinfo->bbs.length () == 1);
7691 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7692 !gsi_end_p (gsi); gsi_next (&gsi))
7694 /* The costing above left us with DCEable vectorized scalar
7695 stmts having the visited flag set on profitable
7696 subgraphs. Do the delayed clearing of the flag here. */
7697 if (gimple_visited_p (gsi_stmt (gsi)))
7699 gimple_set_visited (gsi_stmt (gsi), false);
7700 continue;
7702 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7703 continue;
7705 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7706 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7708 if (!profitable_subgraphs.is_empty ()
7709 && dump_enabled_p ())
7710 dump_printf_loc (MSG_NOTE, vect_location,
7711 "not profitable because of "
7712 "unprofitable if-converted scalar "
7713 "code\n");
7714 profitable_subgraphs.truncate (0);
7719 /* Finally schedule the profitable subgraphs. */
7720 for (slp_instance instance : profitable_subgraphs)
7722 if (!vectorized && dump_enabled_p ())
7723 dump_printf_loc (MSG_NOTE, vect_location,
7724 "Basic block will be vectorized "
7725 "using SLP\n");
7726 vectorized = true;
7728 /* Dump before scheduling as store vectorization will remove
7729 the original stores and mess with the instance tree
7730 so querying its location will eventually ICE. */
7731 if (flag_checking)
7732 for (slp_instance sub : instance->subgraph_entries)
7733 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7734 unsigned HOST_WIDE_INT bytes;
7735 if (dump_enabled_p ())
7736 for (slp_instance sub : instance->subgraph_entries)
7738 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7739 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7740 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7741 sub->location (),
7742 "basic block part vectorized using %wu "
7743 "byte vectors\n", bytes);
7744 else
7745 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7746 sub->location (),
7747 "basic block part vectorized using "
7748 "variable length vectors\n");
7751 dump_user_location_t saved_vect_location = vect_location;
7752 vect_location = instance->location ();
7754 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7756 vect_location = saved_vect_location;
7759 else
7761 if (dump_enabled_p ())
7762 dump_printf_loc (MSG_NOTE, vect_location,
7763 "***** Analysis failed with vector mode %s\n",
7764 GET_MODE_NAME (bb_vinfo->vector_mode));
7767 if (mode_i == 0)
7768 autodetected_vector_mode = bb_vinfo->vector_mode;
7770 if (!fatal)
7771 while (mode_i < vector_modes.length ()
7772 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7774 if (dump_enabled_p ())
7775 dump_printf_loc (MSG_NOTE, vect_location,
7776 "***** The result for vector mode %s would"
7777 " be the same\n",
7778 GET_MODE_NAME (vector_modes[mode_i]));
7779 mode_i += 1;
7782 delete bb_vinfo;
7784 if (mode_i < vector_modes.length ()
7785 && VECTOR_MODE_P (autodetected_vector_mode)
7786 && (related_vector_mode (vector_modes[mode_i],
7787 GET_MODE_INNER (autodetected_vector_mode))
7788 == autodetected_vector_mode)
7789 && (related_vector_mode (autodetected_vector_mode,
7790 GET_MODE_INNER (vector_modes[mode_i]))
7791 == vector_modes[mode_i]))
7793 if (dump_enabled_p ())
7794 dump_printf_loc (MSG_NOTE, vect_location,
7795 "***** Skipping vector mode %s, which would"
7796 " repeat the analysis for %s\n",
7797 GET_MODE_NAME (vector_modes[mode_i]),
7798 GET_MODE_NAME (autodetected_vector_mode));
7799 mode_i += 1;
7802 if (vectorized
7803 || mode_i == vector_modes.length ()
7804 || autodetected_vector_mode == VOIDmode
7805 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7806 vector sizes will fail do not bother iterating. */
7807 || fatal)
7808 return vectorized;
7810 /* Try the next biggest vector size. */
7811 next_vector_mode = vector_modes[mode_i++];
7812 if (dump_enabled_p ())
7813 dump_printf_loc (MSG_NOTE, vect_location,
7814 "***** Re-trying analysis with vector mode %s\n",
7815 GET_MODE_NAME (next_vector_mode));
7820 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7821 true if anything in the basic-block was vectorized. */
7823 static bool
7824 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7826 vec<data_reference_p> datarefs = vNULL;
7827 auto_vec<int> dataref_groups;
7828 int insns = 0;
7829 int current_group = 0;
7831 for (unsigned i = 0; i < bbs.length (); i++)
7833 basic_block bb = bbs[i];
7834 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7835 gsi_next (&gsi))
7837 gimple *stmt = gsi_stmt (gsi);
7838 if (is_gimple_debug (stmt))
7839 continue;
7841 insns++;
7843 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7844 vect_location = stmt;
7846 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7847 &dataref_groups, current_group))
7848 ++current_group;
7850 /* New BBs always start a new DR group. */
7851 ++current_group;
7854 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7857 /* Special entry for the BB vectorizer. Analyze and transform a single
7858 if-converted BB with ORIG_LOOP's body being the not-if-converted
7859 representation. Returns true if anything in the basic-block was
7860 vectorized. */
7862 bool
7863 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7865 auto_vec<basic_block> bbs;
7866 bbs.safe_push (bb);
7867 return vect_slp_bbs (bbs, orig_loop);
7870 /* Main entry for the BB vectorizer. Analyze and transform the basic
7871 blocks of FUN, returning true if anything was vectorized. */
7873 bool
7874 vect_slp_function (function *fun)
7876 bool r = false;
7877 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7878 auto_bitmap exit_bbs;
7879 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7880 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7881 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7882 true, rpo, NULL);
7884 /* For the moment split the function into pieces to avoid making
7885 the iteration on the vector mode moot. Split at points we know
7886 to not handle well which is CFG merges (SLP discovery doesn't
7887 handle non-loop-header PHIs) and loop exits. Since pattern
7888 recog requires reverse iteration to visit uses before defs
7889 simply chop RPO into pieces. */
7890 auto_vec<basic_block> bbs;
7891 for (unsigned i = 0; i < n; i++)
7893 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7894 bool split = false;
7896 /* Split when a BB is not dominated by the first block. */
7897 if (!bbs.is_empty ()
7898 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7900 if (dump_enabled_p ())
7901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7902 "splitting region at dominance boundary bb%d\n",
7903 bb->index);
7904 split = true;
7906 /* Split when the loop determined by the first block
7907 is exited. This is because we eventually insert
7908 invariants at region begin. */
7909 else if (!bbs.is_empty ()
7910 && bbs[0]->loop_father != bb->loop_father
7911 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7913 if (dump_enabled_p ())
7914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915 "splitting region at loop %d exit at bb%d\n",
7916 bbs[0]->loop_father->num, bb->index);
7917 split = true;
7919 else if (!bbs.is_empty ()
7920 && bb->loop_father->header == bb
7921 && bb->loop_father->dont_vectorize)
7923 if (dump_enabled_p ())
7924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7925 "splitting region at dont-vectorize loop %d "
7926 "entry at bb%d\n",
7927 bb->loop_father->num, bb->index);
7928 split = true;
7931 if (split && !bbs.is_empty ())
7933 r |= vect_slp_bbs (bbs, NULL);
7934 bbs.truncate (0);
7937 if (bbs.is_empty ())
7939 /* We need to be able to insert at the head of the region, which
7940 we cannot do for a region starting with a returns-twice call. */
7941 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7942 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7944 if (dump_enabled_p ())
7945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7946 "skipping bb%d as start of region as it "
7947 "starts with returns-twice call\n",
7948 bb->index);
7949 continue;
7951 /* If the loop this BB belongs to is marked as not to be vectorized
7952 honor that also for BB vectorization. */
7953 if (bb->loop_father->dont_vectorize)
7954 continue;
7957 bbs.safe_push (bb);
7959 /* When the stmt ending this block defines a value, inserting
7960 a vector containing its definition after it would require
7961 inserting on edges. Avoid this for now. */
7962 if (gimple *last = *gsi_last_bb (bb))
7963 if (gimple_get_lhs (last)
7964 && is_ctrl_altering_stmt (last))
7966 if (dump_enabled_p ())
7967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7968 "splitting region at control altering "
7969 "definition %G", last);
7970 r |= vect_slp_bbs (bbs, NULL);
7971 bbs.truncate (0);
7975 if (!bbs.is_empty ())
7976 r |= vect_slp_bbs (bbs, NULL);
7978 free (rpo);
7980 return r;
7983 /* Build a variable-length vector in which the elements in ELTS are repeated
7984 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7985 RESULTS and add any new instructions to SEQ.
7987 The approach we use is:
7989 (1) Find a vector mode VM with integer elements of mode IM.
7991 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7992 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7993 from small vectors to IM.
7995 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7997 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7998 correct byte contents.
8000 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8002 We try to find the largest IM for which this sequence works, in order
8003 to cut down on the number of interleaves. */
8005 void
8006 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8007 const vec<tree> &elts, unsigned int nresults,
8008 vec<tree> &results)
8010 unsigned int nelts = elts.length ();
8011 tree element_type = TREE_TYPE (vector_type);
8013 /* (1) Find a vector mode VM with integer elements of mode IM. */
8014 unsigned int nvectors = 1;
8015 tree new_vector_type;
8016 tree permutes[2];
8017 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8018 &nvectors, &new_vector_type,
8019 permutes))
8020 gcc_unreachable ();
8022 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8023 unsigned int partial_nelts = nelts / nvectors;
8024 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8026 tree_vector_builder partial_elts;
8027 auto_vec<tree, 32> pieces (nvectors * 2);
8028 pieces.quick_grow_cleared (nvectors * 2);
8029 for (unsigned int i = 0; i < nvectors; ++i)
8031 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8032 ELTS' has mode IM. */
8033 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8034 for (unsigned int j = 0; j < partial_nelts; ++j)
8035 partial_elts.quick_push (elts[i * partial_nelts + j]);
8036 tree t = gimple_build_vector (seq, &partial_elts);
8037 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8038 TREE_TYPE (new_vector_type), t);
8040 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8041 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8044 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8045 correct byte contents.
8047 Conceptually, we need to repeat the following operation log2(nvectors)
8048 times, where hi_start = nvectors / 2:
8050 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8051 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8053 However, if each input repeats every N elements and the VF is
8054 a multiple of N * 2, the HI result is the same as the LO result.
8055 This will be true for the first N1 iterations of the outer loop,
8056 followed by N2 iterations for which both the LO and HI results
8057 are needed. I.e.:
8059 N1 + N2 = log2(nvectors)
8061 Each "N1 iteration" doubles the number of redundant vectors and the
8062 effect of the process as a whole is to have a sequence of nvectors/2**N1
8063 vectors that repeats 2**N1 times. Rather than generate these redundant
8064 vectors, we halve the number of vectors for each N1 iteration. */
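/* A worked instance of the counts above (illustrative only): for
   nvectors == 4 there are log2(4) == 2 iterations.  With N1 == 1 and
   N2 == 1 the first iteration keeps only the LO results, halving the
   number of vectors from 4 to 2, and the second computes both LO and
   HI, ending with nvectors/2**N1 == 2 distinct vectors that
   conceptually repeat 2**N1 == 2 times.  */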
8065 unsigned int in_start = 0;
8066 unsigned int out_start = nvectors;
8067 unsigned int new_nvectors = nvectors;
8068 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8070 unsigned int hi_start = new_nvectors / 2;
8071 unsigned int out_i = 0;
8072 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8074 if ((in_i & 1) != 0
8075 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8076 2 * in_repeat))
8077 continue;
8079 tree output = make_ssa_name (new_vector_type);
8080 tree input1 = pieces[in_start + (in_i / 2)];
8081 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8082 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8083 input1, input2,
8084 permutes[in_i & 1]);
8085 gimple_seq_add_stmt (seq, stmt);
8086 pieces[out_start + out_i] = output;
8087 out_i += 1;
8089 std::swap (in_start, out_start);
8090 new_nvectors = out_i;
8093 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8094 results.reserve (nresults);
8095 for (unsigned int i = 0; i < nresults; ++i)
8096 if (i < new_nvectors)
8097 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8098 pieces[in_start + i]));
8099 else
8100 results.quick_push (results[i - new_nvectors]);
8104 /* For constant and loop invariant defs in OP_NODE this function creates
8105 vector defs that will be used in the vectorized stmts and stores them
8106 to SLP_TREE_VEC_DEFS of OP_NODE. */
8108 static void
8109 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8111 unsigned HOST_WIDE_INT nunits;
8112 tree vec_cst;
8113 unsigned j, number_of_places_left_in_vector;
8114 tree vector_type;
8115 tree vop;
8116 int group_size = op_node->ops.length ();
8117 unsigned int vec_num, i;
8118 unsigned number_of_copies = 1;
8119 bool constant_p;
8120 gimple_seq ctor_seq = NULL;
8121 auto_vec<tree, 16> permute_results;
8123 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8124 vector_type = SLP_TREE_VECTYPE (op_node);
8126 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8127 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8128 auto_vec<tree> voprnds (number_of_vectors);
8130 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8131 created vectors. It is greater than 1 if unrolling is performed.
8133 For example, we have two scalar operands, s1 and s2 (e.g., group of
8134 strided accesses of size two), while NUNITS is four (i.e., four scalars
8135 of this type can be packed in a vector). The output vector will contain
8136 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8137 will be 2).
8139 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8140 containing the operands.
8142 For example, NUNITS is four as before, and the group size is 8
8143 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8144 {s5, s6, s7, s8}. */
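/* The same examples in terms of the computation below (assuming
   SLP_TREE_NUMBER_OF_VEC_STMTS is 1 resp. 2): with NUNITS == 4,
   GROUP_SIZE == 2 and one vector to create, NUMBER_OF_COPIES is
   4 * 1 / 2 == 2, giving {s1, s2, s1, s2}; with GROUP_SIZE == 8 and two
   vectors it is 4 * 2 / 8 == 1, giving {s1, s2, s3, s4} and
   {s5, s6, s7, s8}.  */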
8146 /* When using duplicate_and_interleave, we just need one element for
8147 each scalar statement. */
8148 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8149 nunits = group_size;
8151 number_of_copies = nunits * number_of_vectors / group_size;
8153 number_of_places_left_in_vector = nunits;
8154 constant_p = true;
8155 tree_vector_builder elts (vector_type, nunits, 1);
8156 elts.quick_grow (nunits);
8157 stmt_vec_info insert_after = NULL;
8158 for (j = 0; j < number_of_copies; j++)
8160 tree op;
8161 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8163 /* Create 'vect_ = {op0,op1,...,opn}'. */
8164 number_of_places_left_in_vector--;
8165 tree orig_op = op;
8166 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8168 if (CONSTANT_CLASS_P (op))
8170 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8172 /* Can't use VIEW_CONVERT_EXPR for booleans because
8173 of possibly different sizes of scalar value and
8174 vector element. */
8175 if (integer_zerop (op))
8176 op = build_int_cst (TREE_TYPE (vector_type), 0);
8177 else if (integer_onep (op))
8178 op = build_all_ones_cst (TREE_TYPE (vector_type));
8179 else
8180 gcc_unreachable ();
8182 else
8183 op = fold_unary (VIEW_CONVERT_EXPR,
8184 TREE_TYPE (vector_type), op);
8185 gcc_assert (op && CONSTANT_CLASS_P (op));
8187 else
8189 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8190 gimple *init_stmt;
8191 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8193 tree true_val
8194 = build_all_ones_cst (TREE_TYPE (vector_type));
8195 tree false_val
8196 = build_zero_cst (TREE_TYPE (vector_type));
8197 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8198 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8199 op, true_val,
8200 false_val);
8202 else
8204 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8205 op);
8206 init_stmt
8207 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8208 op);
8210 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8211 op = new_temp;
8214 elts[number_of_places_left_in_vector] = op;
8215 if (!CONSTANT_CLASS_P (op))
8216 constant_p = false;
8217 /* For BB vectorization we have to compute an insert location
8218 when a def is inside the analyzed region since we cannot
8219 simply insert at the BB start in this case. */
8220 stmt_vec_info opdef;
8221 if (TREE_CODE (orig_op) == SSA_NAME
8222 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8223 && is_a <bb_vec_info> (vinfo)
8224 && (opdef = vinfo->lookup_def (orig_op)))
8226 if (!insert_after)
8227 insert_after = opdef;
8228 else
8229 insert_after = get_later_stmt (insert_after, opdef);
8232 if (number_of_places_left_in_vector == 0)
8234 if (constant_p
8235 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8236 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8237 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8238 else
8240 if (permute_results.is_empty ())
8241 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8242 elts, number_of_vectors,
8243 permute_results);
8244 vec_cst = permute_results[number_of_vectors - j - 1];
8246 if (!gimple_seq_empty_p (ctor_seq))
8248 if (insert_after)
8250 gimple_stmt_iterator gsi;
8251 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8253 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8254 gsi_insert_seq_before (&gsi, ctor_seq,
8255 GSI_CONTINUE_LINKING);
8257 else if (!stmt_ends_bb_p (insert_after->stmt))
8259 gsi = gsi_for_stmt (insert_after->stmt);
8260 gsi_insert_seq_after (&gsi, ctor_seq,
8261 GSI_CONTINUE_LINKING);
8263 else
8265 /* When we want to insert after a def whose
8266 defining stmt throws, insert on the fallthru
8267 edge. */
8268 edge e = find_fallthru_edge
8269 (gimple_bb (insert_after->stmt)->succs);
8270 basic_block new_bb
8271 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8272 gcc_assert (!new_bb);
8275 else
8276 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8277 ctor_seq = NULL;
8279 voprnds.quick_push (vec_cst);
8280 insert_after = NULL;
8281 number_of_places_left_in_vector = nunits;
8282 constant_p = true;
8283 elts.new_vector (vector_type, nunits, 1);
8284 elts.quick_grow (nunits);
8289 /* Since the vectors are created in the reverse order, we should invert
8290 them. */
8291 vec_num = voprnds.length ();
8292 for (j = vec_num; j != 0; j--)
8294 vop = voprnds[j - 1];
8295 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8298 /* When VF is greater than the unrolling factor needed for the SLP
8299 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
8300 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8301 to replicate the vectors. */
8302 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8303 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8304 i++)
8305 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8308 /* Get the Ith vectorized definition from SLP_NODE. */
8310 tree
8311 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8313 return SLP_TREE_VEC_DEFS (slp_node)[i];
8316 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8318 void
8319 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8321 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8322 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8325 /* Get N vectorized definitions for SLP_NODE. */
8327 void
8328 vect_get_slp_defs (vec_info *,
8329 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8331 if (n == -1U)
8332 n = SLP_TREE_CHILDREN (slp_node).length ();
8334 for (unsigned i = 0; i < n; ++i)
8336 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8337 vec<tree> vec_defs = vNULL;
8338 vect_get_slp_defs (child, &vec_defs);
8339 vec_oprnds->quick_push (vec_defs);
8343 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8344 - PERM gives the permutation that the caller wants to use for NODE,
8345 which might be different from SLP_LOAD_PERMUTATION.
8346 - DUMP_P controls whether the function dumps information. */
8348 static bool
8349 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8350 load_permutation_t &perm,
8351 const vec<tree> &dr_chain,
8352 gimple_stmt_iterator *gsi, poly_uint64 vf,
8353 bool analyze_only, bool dump_p,
8354 unsigned *n_perms, unsigned int *n_loads,
8355 bool dce_chain)
8357 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8358 int vec_index = 0;
8359 tree vectype = SLP_TREE_VECTYPE (node);
8360 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8361 unsigned int mask_element;
8362 unsigned dr_group_size;
8363 machine_mode mode;
8365 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8366 dr_group_size = 1;
8367 else
8369 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8370 dr_group_size = DR_GROUP_SIZE (stmt_info);
8373 mode = TYPE_MODE (vectype);
8374 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8375 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8377 /* Initialize the vect stmts of NODE to properly insert the generated
8378 stmts later. */
8379 if (! analyze_only)
8380 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8381 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8383 /* Generate permutation masks for every NODE. Number of masks for each NODE
8384 is equal to GROUP_SIZE.
8385 E.g., we have a group of three nodes with three loads from the same
8386 location in each node, and the vector size is 4. I.e., we have an
8387 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8388 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8389 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8392 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8393 The last mask is illegal since we assume two operands for permute
8394 operation, and the mask element values can't be outside that range.
8395 Hence, the last mask must be converted into {2,5,5,5}.
8396 For the first two permutations we need the first and the second input
8397 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8398 we need the second and the third vectors: {b1,c1,a2,b2} and
8399 {c2,a3,b3,c3}. */
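/* Restating the last mask of the example above in the code's terms
   (const_nunits == 4): element 6 lives in vector 6 / 4 == 1 at lane
   6 % 4 == 2 and element 9 in vector 9 / 4 == 2 at lane 1; vector 1
   becomes the first permute input (mask value 2) and vector 2 the
   second (mask value 1 + 4 == 5), which yields {2,5,5,5}.  */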
8401 int vect_stmts_counter = 0;
8402 unsigned int index = 0;
8403 int first_vec_index = -1;
8404 int second_vec_index = -1;
8405 bool noop_p = true;
8406 *n_perms = 0;
8408 vec_perm_builder mask;
8409 unsigned int nelts_to_build;
8410 unsigned int nvectors_per_build;
8411 unsigned int in_nlanes;
8412 bool repeating_p = (group_size == dr_group_size
8413 && multiple_p (nunits, group_size));
8414 if (repeating_p)
8416 /* A single vector contains a whole number of copies of the node, so:
8417 (a) all permutes can use the same mask; and
8418 (b) the permutes only need a single vector input. */
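/* Illustrative example: for GROUP_SIZE == DR_GROUP_SIZE == 2 and a
   permutation swapping the two lanes, the loop over NELTS_TO_BUILD below
   fills the GROUP_SIZE * 3 == 6 encoded elements {1,0, 3,2, 5,4} and the
   stepped encoding extends that to however many lanes one vector holds.  */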
8419 mask.new_vector (nunits, group_size, 3);
8420 nelts_to_build = mask.encoded_nelts ();
8421 /* It's possible to obtain zero nstmts during analyze_only, so make
8422 it at least one to ensure the later computation for n_perms
8423 proceeds. */
8424 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8425 in_nlanes = dr_group_size * 3;
8427 else
8429 /* We need to construct a separate mask for each vector statement. */
8430 unsigned HOST_WIDE_INT const_nunits, const_vf;
8431 if (!nunits.is_constant (&const_nunits)
8432 || !vf.is_constant (&const_vf))
8433 return false;
8434 mask.new_vector (const_nunits, const_nunits, 1);
8435 nelts_to_build = const_vf * group_size;
8436 nvectors_per_build = 1;
8437 in_nlanes = const_vf * dr_group_size;
8439 auto_sbitmap used_in_lanes (in_nlanes);
8440 bitmap_clear (used_in_lanes);
8441 auto_bitmap used_defs;
8443 unsigned int count = mask.encoded_nelts ();
8444 mask.quick_grow (count);
8445 vec_perm_indices indices;
8447 for (unsigned int j = 0; j < nelts_to_build; j++)
8449 unsigned int iter_num = j / group_size;
8450 unsigned int stmt_num = j % group_size;
8451 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8452 bitmap_set_bit (used_in_lanes, i);
8453 if (repeating_p)
8455 first_vec_index = 0;
8456 mask_element = i;
8458 else
8460 /* Enforced before the loop when !repeating_p. */
8461 unsigned int const_nunits = nunits.to_constant ();
8462 vec_index = i / const_nunits;
8463 mask_element = i % const_nunits;
8464 if (vec_index == first_vec_index
8465 || first_vec_index == -1)
8467 first_vec_index = vec_index;
8469 else if (vec_index == second_vec_index
8470 || second_vec_index == -1)
8472 second_vec_index = vec_index;
8473 mask_element += const_nunits;
8475 else
8477 if (dump_p)
8478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8479 "permutation requires at "
8480 "least three vectors %G",
8481 stmt_info->stmt);
8482 gcc_assert (analyze_only);
8483 return false;
8486 gcc_assert (mask_element < 2 * const_nunits);
8489 if (mask_element != index)
8490 noop_p = false;
8491 mask[index++] = mask_element;
8493 if (index == count)
8495 if (!noop_p)
8497 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8498 if (!can_vec_perm_const_p (mode, mode, indices))
8500 if (dump_p)
8502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8503 "unsupported vect permute { ");
8504 for (i = 0; i < count; ++i)
8506 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8507 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8509 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8511 gcc_assert (analyze_only);
8512 return false;
8515 tree mask_vec = NULL_TREE;
8516 if (!analyze_only)
8517 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8519 if (second_vec_index == -1)
8520 second_vec_index = first_vec_index;
8522 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8524 ++*n_perms;
8525 if (analyze_only)
8526 continue;
8527 /* Generate the permute statement if necessary. */
8528 tree first_vec = dr_chain[first_vec_index + ri];
8529 tree second_vec = dr_chain[second_vec_index + ri];
8530 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8531 tree perm_dest
8532 = vect_create_destination_var (gimple_assign_lhs (stmt),
8533 vectype);
8534 perm_dest = make_ssa_name (perm_dest);
8535 gimple *perm_stmt
8536 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8537 second_vec, mask_vec);
8538 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8539 gsi);
8540 if (dce_chain)
8542 bitmap_set_bit (used_defs, first_vec_index + ri);
8543 bitmap_set_bit (used_defs, second_vec_index + ri);
8546 /* Store the vector statement in NODE. */
8547 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8550 else if (!analyze_only)
8552 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8554 tree first_vec = dr_chain[first_vec_index + ri];
8555 /* If mask was NULL_TREE generate the requested
8556 identity transform. */
8557 if (dce_chain)
8558 bitmap_set_bit (used_defs, first_vec_index + ri);
8560 /* Store the vector statement in NODE. */
8561 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8565 index = 0;
8566 first_vec_index = -1;
8567 second_vec_index = -1;
8568 noop_p = true;
8572 if (n_loads)
8574 if (repeating_p)
8575 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8576 else
8578 /* Enforced above when !repeating_p. */
8579 unsigned int const_nunits = nunits.to_constant ();
8580 *n_loads = 0;
8581 bool load_seen = false;
8582 for (unsigned i = 0; i < in_nlanes; ++i)
8584 if (i % const_nunits == 0)
8586 if (load_seen)
8587 *n_loads += 1;
8588 load_seen = false;
8590 if (bitmap_bit_p (used_in_lanes, i))
8591 load_seen = true;
8593 if (load_seen)
8594 *n_loads += 1;
8598 if (dce_chain)
8599 for (unsigned i = 0; i < dr_chain.length (); ++i)
8600 if (!bitmap_bit_p (used_defs, i))
8602 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8603 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8604 gsi_remove (&rgsi, true);
8605 release_defs (stmt);
8608 return true;
8611 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8612 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8613 permute statements for the SLP node NODE. Store the number of vector
8614 permute instructions in *N_PERMS and the number of vector load
8615 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8616 that were not needed. */
8618 bool
8619 vect_transform_slp_perm_load (vec_info *vinfo,
8620 slp_tree node, const vec<tree> &dr_chain,
8621 gimple_stmt_iterator *gsi, poly_uint64 vf,
8622 bool analyze_only, unsigned *n_perms,
8623 unsigned int *n_loads, bool dce_chain)
8625 return vect_transform_slp_perm_load_1 (vinfo, node,
8626 SLP_TREE_LOAD_PERMUTATION (node),
8627 dr_chain, gsi, vf, analyze_only,
8628 dump_enabled_p (), n_perms, n_loads,
8629 dce_chain);
8632 /* Produce the next vector result for SLP permutation NODE by adding a vector
8633 statement at GSI. If MASK_VEC is nonnull, add:
8635 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8637 otherwise add:
8639 <new SSA name> = FIRST_DEF. */
8641 static void
8642 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8643 slp_tree node, tree first_def, tree second_def,
8644 tree mask_vec, poly_uint64 identity_offset)
8646 tree vectype = SLP_TREE_VECTYPE (node);
8648 /* ??? We SLP match existing vector element extracts but
8649 allow punning which we need to re-instantiate at uses
8650 but have no good way of explicitly representing. */
8651 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8652 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8654 gassign *conv_stmt
8655 = gimple_build_assign (make_ssa_name (vectype),
8656 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8657 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8658 first_def = gimple_assign_lhs (conv_stmt);
8660 gassign *perm_stmt;
8661 tree perm_dest = make_ssa_name (vectype);
8662 if (mask_vec)
8664 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8665 TYPE_SIZE (vectype))
8666 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8668 gassign *conv_stmt
8669 = gimple_build_assign (make_ssa_name (vectype),
8670 build1 (VIEW_CONVERT_EXPR,
8671 vectype, second_def));
8672 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8673 second_def = gimple_assign_lhs (conv_stmt);
8675 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8676 first_def, second_def,
8677 mask_vec);
8679 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8681 /* For identity permutes we still need to handle the case
8682 of offsetted extracts or concats. */
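/* Illustrative example (the types are assumptions): extracting a V4SI
   vectype from a V8SI FIRST_DEF with IDENTITY_OFFSET == 4 and 32-bit
   elements emits BIT_FIELD_REF <first_def, 128, 128>, i.e. the upper
   half; in the opposite direction, when vectype has twice the lanes of
   FIRST_DEF, a two-element CONSTRUCTOR concatenating FIRST_DEF and
   SECOND_DEF is emitted instead.  */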
8683 unsigned HOST_WIDE_INT c;
8684 auto first_def_nunits
8685 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8686 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8688 unsigned HOST_WIDE_INT elsz
8689 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8690 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8691 TYPE_SIZE (vectype),
8692 bitsize_int (identity_offset * elsz));
8693 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8695 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8696 first_def_nunits, &c) && c == 2)
8698 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8699 NULL_TREE, second_def);
8700 perm_stmt = gimple_build_assign (perm_dest, ctor);
8702 else
8703 gcc_unreachable ();
8705 else
8707 /* We need a copy here in case the def was external. */
8708 perm_stmt = gimple_build_assign (perm_dest, first_def);
8710 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8711 /* Store the vector statement in NODE. */
8712 node->push_vec_def (perm_stmt);
8715 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8716 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8717 If GSI is nonnull, emit the permutation there.
8719 When GSI is null, the only purpose of NODE is to give properties
8720 of the result, such as the vector type and number of SLP lanes.
8721 The node does not need to be a VEC_PERM_EXPR.
8723 If the target supports the operation, return the number of individual
8724 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8725 dump file if DUMP_P is true. */
8727 static int
8728 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8729 slp_tree node, lane_permutation_t &perm,
8730 vec<slp_tree> &children, bool dump_p)
8732 tree vectype = SLP_TREE_VECTYPE (node);
8734 /* ??? We currently only support inputs that all have the same vector
8735 type, while the SLP IL should really do a concat + select and thus
8736 accept arbitrary mismatches. */
8737 slp_tree child;
8738 unsigned i;
8739 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8740 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8741 tree op_vectype = NULL_TREE;
8742 FOR_EACH_VEC_ELT (children, i, child)
8743 if (SLP_TREE_VECTYPE (child))
8745 op_vectype = SLP_TREE_VECTYPE (child);
8746 break;
8748 if (!op_vectype)
8749 op_vectype = vectype;
8750 FOR_EACH_VEC_ELT (children, i, child)
8752 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8753 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8754 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8755 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8757 if (dump_p)
8758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8759 "Unsupported vector types in lane permutation\n");
8760 return -1;
8762 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8763 repeating_p = false;
8766 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8767 if (dump_p)
8769 dump_printf_loc (MSG_NOTE, vect_location,
8770 "vectorizing permutation");
8771 for (unsigned i = 0; i < perm.length (); ++i)
8772 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8773 if (repeating_p)
8774 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8775 dump_printf (MSG_NOTE, "\n");
8778 /* REPEATING_P is true if every output vector is guaranteed to use the
8779 same permute vector. We can handle that case for both variable-length
8780 and constant-length vectors, but we only handle other cases for
8781 constant-length vectors.
8783 Set:
8785 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8786 mask vector that we want to build.
8788 - NCOPIES to the number of copies of PERM that we need in order
8789 to build the necessary permute mask vectors.
8791 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8792 for each permute mask vector. This is only relevant when GSI is
8793 nonnull. */
8794 uint64_t npatterns;
8795 unsigned nelts_per_pattern;
8796 uint64_t ncopies;
8797 unsigned noutputs_per_mask;
8798 if (repeating_p)
8800 /* We need a single permute mask vector that has the form:
8802 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8804 In other words, the original n-element permute in PERM is
8805 "unrolled" to fill a full vector. The stepped vector encoding
8806 that we use for permutes requires 3n elements. */
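/* For illustration (hypothetical values): a 2-lane node with
PERM = { op0[1], op0[0] } gives npatterns = 2 and nelts_per_pattern = 3,
i.e. the encoded elements { 1, 0, 3, 2, 5, 4 }, which for, say,
8-element vectors expand to the full mask { 1, 0, 3, 2, 5, 4, 7, 6 }. */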
8807 npatterns = SLP_TREE_LANES (node);
8808 nelts_per_pattern = ncopies = 3;
8809 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8811 else
8813 /* Calculate every element of every permute mask vector explicitly,
8814 instead of relying on the pattern described above. */
8815 if (!nunits.is_constant (&npatterns))
8816 return -1;
8817 nelts_per_pattern = ncopies = 1;
8818 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8819 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8820 return -1;
8821 noutputs_per_mask = 1;
8823 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8824 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8826 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
8827 from the { SLP operand, scalar lane } permutation as recorded in the
8828 SLP node as an intermediate step. This part should already work
8829 with SLP children with an arbitrary number of lanes. */
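/* As an illustration (hypothetical numbers): with V4SI child vectors a
reference to scalar lane 6 of a child (active_lane + p.second == 6)
becomes { { child, 1 }, 2 }, i.e. lane 2 of that child's second
vector def. */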
8830 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8831 auto_vec<unsigned> active_lane;
8832 vperm.create (olanes);
8833 active_lane.safe_grow_cleared (children.length (), true);
8834 for (unsigned i = 0; i < ncopies; ++i)
8836 for (unsigned pi = 0; pi < perm.length (); ++pi)
8838 std::pair<unsigned, unsigned> p = perm[pi];
8839 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8840 if (repeating_p)
8841 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8842 else
8844 /* We checked above that the vectors are constant-length. */
8845 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8846 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8847 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8848 vperm.quick_push ({{p.first, vi}, vl});
8851 /* Advance to the next group. */
8852 for (unsigned j = 0; j < children.length (); ++j)
8853 active_lane[j] += SLP_TREE_LANES (children[j]);
8856 if (dump_p)
8858 dump_printf_loc (MSG_NOTE, vect_location,
8859 "vectorizing permutation");
8860 for (unsigned i = 0; i < perm.length (); ++i)
8861 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8862 if (repeating_p)
8863 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8864 dump_printf (MSG_NOTE, "\n");
8865 dump_printf_loc (MSG_NOTE, vect_location, "as");
8866 for (unsigned i = 0; i < vperm.length (); ++i)
8868 if (i != 0
8869 && (repeating_p
8870 ? multiple_p (i, npatterns)
8871 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8872 dump_printf (MSG_NOTE, ",");
8873 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8874 vperm[i].first.first, vperm[i].first.second,
8875 vperm[i].second);
8877 dump_printf (MSG_NOTE, "\n");
8880 /* We can only handle two-vector permutes; everything else should
8881 be lowered on the SLP level. The following is closely inspired
8882 by vect_transform_slp_perm_load and is supposed to eventually
8883 replace it.
8884 ??? As an intermediate step, do code-gen in the SLP tree
8885 representation somehow? */
8886 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8887 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8888 unsigned int index = 0;
8889 poly_uint64 mask_element;
8890 vec_perm_builder mask;
8891 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8892 unsigned int count = mask.encoded_nelts ();
8893 mask.quick_grow (count);
8894 vec_perm_indices indices;
8895 unsigned nperms = 0;
8896 for (unsigned i = 0; i < vperm.length (); ++i)
8898 mask_element = vperm[i].second;
8899 if (first_vec.first == -1U
8900 || first_vec == vperm[i].first)
8901 first_vec = vperm[i].first;
8902 else if (second_vec.first == -1U
8903 || second_vec == vperm[i].first)
8905 second_vec = vperm[i].first;
8906 mask_element += nunits;
8908 else
8910 if (dump_p)
8911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8912 "permutation requires at "
8913 "least three vectors\n");
8914 gcc_assert (!gsi);
8915 return -1;
8918 mask[index++] = mask_element;
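/* Once all encoded elements of the current mask are filled in, check
whether the permute is supported (or is an identity) and, when GSI
is nonnull, emit the permute statements for it. */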
8920 if (index == count)
8922 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8923 TYPE_VECTOR_SUBPARTS (op_vectype));
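/* An identity mask is the series { k * nunits, k * nunits + 1, ... } for
some constant k, e.g. { 0, 1, 2, 3 } or { 4, 5, 6, 7 } with nunits == 4
(illustrative values); such a mask just selects an aligned chunk of the
input and is handled by vect_add_slp_permutation as an extract, concat
or plain copy rather than a VEC_PERM_EXPR. */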
8924 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8925 && constant_multiple_p (mask[0], nunits));
8926 machine_mode vmode = TYPE_MODE (vectype);
8927 machine_mode op_vmode = TYPE_MODE (op_vectype);
8928 unsigned HOST_WIDE_INT c;
8929 if ((!identity_p
8930 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8931 || (identity_p
8932 && !known_le (nunits,
8933 TYPE_VECTOR_SUBPARTS (op_vectype))
8934 && (!constant_multiple_p (nunits,
8935 TYPE_VECTOR_SUBPARTS (op_vectype),
8936 &c) || c != 2)))
8938 if (dump_p)
8940 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8941 vect_location,
8942 "unsupported vect permute { ");
8943 for (i = 0; i < count; ++i)
8945 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8946 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8948 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8950 gcc_assert (!gsi);
8951 return -1;
8954 if (!identity_p)
8955 nperms++;
8956 if (gsi)
8958 if (second_vec.first == -1U)
8959 second_vec = first_vec;
8961 slp_tree
8962 first_node = children[first_vec.first],
8963 second_node = children[second_vec.first];
8965 tree mask_vec = NULL_TREE;
8966 if (!identity_p)
8967 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8969 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8971 tree first_def
8972 = vect_get_slp_vect_def (first_node,
8973 first_vec.second + vi);
8974 tree second_def
8975 = vect_get_slp_vect_def (second_node,
8976 second_vec.second + vi);
8977 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8978 second_def, mask_vec, mask[0]);
8982 index = 0;
8983 first_vec = std::make_pair (-1U, -1U);
8984 second_vec = std::make_pair (-1U, -1U);
8988 return nperms;
8991 /* Vectorize the SLP permutations in NODE as specified
8992 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8993 child number and lane number.
8994 Interleaving of two two-lane two-child SLP subtrees (not supported):
8995 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8996 A blend of two four-lane two-child SLP subtrees:
8997 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8998 Highpart of a four-lane one-child SLP subtree (not supported):
8999 [ { 0, 2 }, { 0, 3 } ]
9000 Currently only a subset of these is supported by the code generation below. */
9002 static bool
9003 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9004 slp_tree node, stmt_vector_for_cost *cost_vec)
9006 tree vectype = SLP_TREE_VECTYPE (node);
9007 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9008 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9009 SLP_TREE_CHILDREN (node),
9010 dump_enabled_p ());
9011 if (nperms < 0)
9012 return false;
9014 if (!gsi)
9015 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9017 return true;
9020 /* Vectorize SLP NODE. */
9022 static void
9023 vect_schedule_slp_node (vec_info *vinfo,
9024 slp_tree node, slp_instance instance)
9026 gimple_stmt_iterator si;
9027 int i;
9028 slp_tree child;
9030 /* For existing vectors there's nothing to do. */
9031 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9032 && SLP_TREE_VEC_DEFS (node).exists ())
9033 return;
9035 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9037 /* Vectorize externals and constants. */
9038 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9039 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9041 /* ??? vectorizable_shift can end up using a scalar operand which is
9042 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9043 node in this case. */
9044 if (!SLP_TREE_VECTYPE (node))
9045 return;
9047 vect_create_constant_vectors (vinfo, node);
9048 return;
9051 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9053 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9054 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9056 if (dump_enabled_p ())
9057 dump_printf_loc (MSG_NOTE, vect_location,
9058 "------>vectorizing SLP node starting from: %G",
9059 stmt_info->stmt);
9061 if (STMT_VINFO_DATA_REF (stmt_info)
9062 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9064 /* Vectorized loads go before the first scalar load to make it
9065 ready early; vectorized stores go before the last scalar
9066 stmt, which is where all uses are ready. */
9067 stmt_vec_info last_stmt_info = NULL;
9068 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9069 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9070 else /* DR_IS_WRITE */
9071 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9072 si = gsi_for_stmt (last_stmt_info->stmt);
9074 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9075 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9076 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9077 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9079 /* For PHI node vectorization we do not use the insertion iterator. */
9080 si = gsi_none ();
9082 else
9084 /* Emit other stmts after the children's vectorized defs, which is
9085 the earliest possible place. */
9086 gimple *last_stmt = NULL;
9087 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9088 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9089 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9091 /* But avoid scheduling internal defs outside of the loop when
9092 we might have only implicitly tracked loop mask/len defs. */
9093 gimple_stmt_iterator si
9094 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9095 last_stmt = *si;
9097 bool seen_vector_def = false;
9098 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9099 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9101 /* For fold-left reductions we are retaining the scalar
9102 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9103 set so the representation isn't perfect. Resort to the
9104 last scalar def here. */
9105 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9107 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9108 == cycle_phi_info_type);
9109 gphi *phi = as_a <gphi *>
9110 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9111 if (!last_stmt
9112 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9113 last_stmt = phi;
9115 /* We emit all vectorized stmts of a child in the same place, so
9116 the last def in SLP_TREE_VEC_DEFS is also the last one emitted.
9117 ??? Unless a load permutation is applied and it chooses to
9118 re-use an earlier generated load. */
9119 unsigned j;
9120 tree vdef;
9121 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9123 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9124 if (!last_stmt
9125 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9126 last_stmt = vstmt;
9129 else if (!SLP_TREE_VECTYPE (child))
9131 /* For externals without a vector type we use the scalar defs unvectorized, so look at all of them. */
9132 unsigned j;
9133 tree def;
9134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9135 if (TREE_CODE (def) == SSA_NAME
9136 && !SSA_NAME_IS_DEFAULT_DEF (def))
9138 gimple *stmt = SSA_NAME_DEF_STMT (def);
9139 if (!last_stmt
9140 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9141 last_stmt = stmt;
9144 else
9146 /* For externals we have to look at all defs since their
9147 insertion place is decided per vector. But beware
9148 of pre-existing vectors where we need to make sure
9149 we do not insert before the region boundary. */
9150 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9151 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9152 seen_vector_def = true;
9153 else
9155 unsigned j;
9156 tree vdef;
9157 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9158 if (TREE_CODE (vdef) == SSA_NAME
9159 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9161 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9162 if (!last_stmt
9163 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9164 last_stmt = vstmt;
9168 /* This can happen when all children are pre-existing vectors or
9169 constants. */
9170 if (!last_stmt)
9171 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9172 if (!last_stmt)
9174 gcc_assert (seen_vector_def);
9175 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9177 else if (is_ctrl_altering_stmt (last_stmt))
9179 /* We split regions to vectorize at control-altering stmts
9180 with a definition, so this must be an external which
9181 we can insert at the start of the region. */
9182 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9184 else if (is_a <bb_vec_info> (vinfo)
9185 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9186 && gimple_could_trap_p (stmt_info->stmt))
9188 /* We've constrained possibly trapping operations to all come
9189 from the same basic-block; if vectorized defs would allow earlier
9190 scheduling, still force the vectorized stmts into the original block.
9191 This is only necessary for BB vectorization since for loop vect
9192 all operations are in a single BB and scalar-stmt-based
9193 placement doesn't play well with epilogue vectorization. */
9194 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9195 gimple_bb (stmt_info->stmt),
9196 gimple_bb (last_stmt)));
9197 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9199 else if (is_a <gphi *> (last_stmt))
9200 si = gsi_after_labels (gimple_bb (last_stmt));
9201 else
9203 si = gsi_for_stmt (last_stmt);
9204 gsi_next (&si);
9208 /* Handle purely internal nodes. */
9209 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9211 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9212 be shared with different SLP nodes (but usually it's the same
9213 operation, apart from the case where the stmt is only there to denote
9214 the actual scalar lane defs ...). So do not call vect_transform_stmt
9215 but open-code it here (partly). */
9216 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9217 gcc_assert (done);
9218 stmt_vec_info slp_stmt_info;
9219 unsigned int i;
9220 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9221 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9223 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9224 instance, i, true, NULL);
9225 gcc_assert (done);
9228 else
9229 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9232 /* Replace the scalar calls in SLP node NODE with assignments setting their
9233 lhs to zero. For loop vectorization this is done in vectorizable_call, but
9234 for SLP it needs to be deferred until the end of vect_schedule_slp, because
9235 multiple SLP instances may refer to the same scalar stmt. */
9237 static void
9238 vect_remove_slp_scalar_calls (vec_info *vinfo,
9239 slp_tree node, hash_set<slp_tree> &visited)
9241 gimple *new_stmt;
9242 gimple_stmt_iterator gsi;
9243 int i;
9244 slp_tree child;
9245 tree lhs;
9246 stmt_vec_info stmt_info;
9248 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9249 return;
9251 if (visited.add (node))
9252 return;
9254 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9255 vect_remove_slp_scalar_calls (vinfo, child, visited);
9257 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9259 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9260 if (!stmt || gimple_bb (stmt) == NULL)
9261 continue;
9262 if (is_pattern_stmt_p (stmt_info)
9263 || !PURE_SLP_STMT (stmt_info))
9264 continue;
9265 lhs = gimple_call_lhs (stmt);
9266 if (lhs)
9267 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9268 else
9270 new_stmt = gimple_build_nop ();
9271 unlink_stmt_vdef (stmt_info->stmt);
9273 gsi = gsi_for_stmt (stmt);
9274 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9275 if (lhs)
9276 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9280 static void
9281 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9283 hash_set<slp_tree> visited;
9284 vect_remove_slp_scalar_calls (vinfo, node, visited);
9287 /* Vectorize the instance root. */
9289 void
9290 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9292 gassign *rstmt = NULL;
9294 if (instance->kind == slp_inst_kind_ctor)
9296 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9298 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9299 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9300 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9301 TREE_TYPE (vect_lhs)))
9302 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9303 vect_lhs);
9304 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9306 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9308 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9309 tree child_def;
9310 int j;
9311 vec<constructor_elt, va_gc> *v;
9312 vec_alloc (v, nelts);
9314 /* A CTOR can handle V16HI composition from VNx8HI so we
9315 do not need to convert vector elements if the types
9316 do not match. */
9317 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9318 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9319 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9320 tree rtype
9321 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9322 tree r_constructor = build_constructor (rtype, v);
9323 rstmt = gimple_build_assign (lhs, r_constructor);
9326 else if (instance->kind == slp_inst_kind_bb_reduc)
9328 /* Largely inspired by reduction chain epilogue handling in
9329 vect_create_epilog_for_reduction. */
9330 vec<tree> vec_defs = vNULL;
9331 vect_get_slp_defs (node, &vec_defs);
9332 enum tree_code reduc_code
9333 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9334 /* ??? We actually have to reflect signs somewhere. */
9335 if (reduc_code == MINUS_EXPR)
9336 reduc_code = PLUS_EXPR;
9337 gimple_seq epilogue = NULL;
9338 /* We may end up with more than one vector result; reduce them
9339 to one vector. */
9340 tree vec_def = vec_defs[0];
9341 tree vectype = TREE_TYPE (vec_def);
9342 tree compute_vectype = vectype;
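/* Reassociating a signed reduction with undefined overflow below could
introduce intermediate overflows, so compute in the corresponding
unsigned type and convert the result back. */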
9343 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9344 && TYPE_OVERFLOW_UNDEFINED (vectype)
9345 && operation_can_overflow (reduc_code));
9346 if (pun_for_overflow_p)
9348 compute_vectype = unsigned_type_for (vectype);
9349 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9350 compute_vectype, vec_def);
9352 for (unsigned i = 1; i < vec_defs.length (); ++i)
9354 tree def = vec_defs[i];
9355 if (pun_for_overflow_p)
9356 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9357 compute_vectype, def);
9358 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9359 vec_def, def);
9361 vec_defs.release ();
9362 /* ??? Support other schemes than direct internal fn. */
9363 internal_fn reduc_fn;
9364 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9365 || reduc_fn == IFN_LAST)
9366 gcc_unreachable ();
9367 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9368 TREE_TYPE (compute_vectype), vec_def);
9369 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9371 tree rem_def = NULL_TREE;
9372 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9374 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9375 if (!rem_def)
9376 rem_def = def;
9377 else
9378 rem_def = gimple_build (&epilogue, reduc_code,
9379 TREE_TYPE (scalar_def),
9380 rem_def, def);
9382 scalar_def = gimple_build (&epilogue, reduc_code,
9383 TREE_TYPE (scalar_def),
9384 scalar_def, rem_def);
9386 scalar_def = gimple_convert (&epilogue,
9387 TREE_TYPE (vectype), scalar_def);
9388 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9389 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9390 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9391 update_stmt (gsi_stmt (rgsi));
9392 return;
9394 else
9395 gcc_unreachable ();
9397 gcc_assert (rstmt);
9399 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9400 gsi_replace (&rgsi, rstmt, true);
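/* Per-node state for the Tarjan-style SCC walk in vect_schedule_scc:
DFS is the DFS pre-order number, LOWLINK the smallest DFS number
reachable from this node, and ON_STACK whether the node is still on
the SCC stack. */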
9403 struct slp_scc_info
9405 bool on_stack;
9406 int dfs;
9407 int lowlink;
9410 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9412 static void
9413 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9414 hash_map<slp_tree, slp_scc_info> &scc_info,
9415 int &maxdfs, vec<slp_tree> &stack)
9417 bool existed_p;
9418 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9419 gcc_assert (!existed_p);
9420 info->dfs = maxdfs;
9421 info->lowlink = maxdfs;
9422 maxdfs++;
9424 /* Leaf. */
9425 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9427 info->on_stack = false;
9428 vect_schedule_slp_node (vinfo, node, instance);
9429 return;
9432 info->on_stack = true;
9433 stack.safe_push (node);
9435 unsigned i;
9436 slp_tree child;
9437 /* DFS recurse. */
9438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9440 if (!child)
9441 continue;
9442 slp_scc_info *child_info = scc_info.get (child);
9443 if (!child_info)
9445 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9446 /* Recursion might have re-allocated the hash map, invalidating the pointers; re-fetch them. */
9447 info = scc_info.get (node);
9448 child_info = scc_info.get (child);
9449 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9451 else if (child_info->on_stack)
9452 info->lowlink = MIN (info->lowlink, child_info->dfs);
9454 if (info->lowlink != info->dfs)
9455 return;
9457 auto_vec<slp_tree, 4> phis_to_fixup;
9459 /* Singleton. */
9460 if (stack.last () == node)
9462 stack.pop ();
9463 info->on_stack = false;
9464 vect_schedule_slp_node (vinfo, node, instance);
9465 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9466 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9467 phis_to_fixup.quick_push (node);
9469 else
9471 /* SCC. */
9472 int last_idx = stack.length () - 1;
9473 while (stack[last_idx] != node)
9474 last_idx--;
9475 /* We can break the cycle at PHIs that have at least one
9476 code-generated child. Then we could re-start the DFS walk until
9477 all nodes in the SCC are covered (we might have new entries
9478 for only back-reachable nodes). But it's simpler to just
9479 iterate and schedule those that are ready. */
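/* For example, in a reduction cycle the PHI becomes ready first (its
backedge argument is filled in by the fixup loop below), which in
turn makes the statements feeding the backedge ready in a later
iteration. */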
9480 unsigned todo = stack.length () - last_idx;
9483 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9485 slp_tree entry = stack[idx];
9486 if (!entry)
9487 continue;
9488 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9489 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9490 bool ready = !phi;
9491 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9492 if (!child)
9494 gcc_assert (phi);
9495 ready = true;
9496 break;
9498 else if (scc_info.get (child)->on_stack)
9500 if (!phi)
9502 ready = false;
9503 break;
9506 else
9508 if (phi)
9510 ready = true;
9511 break;
9514 if (ready)
9516 vect_schedule_slp_node (vinfo, entry, instance);
9517 scc_info.get (entry)->on_stack = false;
9518 stack[idx] = NULL;
9519 todo--;
9520 if (phi)
9521 phis_to_fixup.safe_push (entry);
9525 while (todo != 0);
9527 /* Pop the SCC. */
9528 stack.truncate (last_idx);
9531 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9532 slp_tree phi_node;
9533 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9535 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9536 edge_iterator ei;
9537 edge e;
9538 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9540 unsigned dest_idx = e->dest_idx;
9541 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9542 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9543 continue;
9544 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9545 /* Simply fill all args. */
9546 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9547 != vect_first_order_recurrence)
9548 for (unsigned i = 0; i < n; ++i)
9550 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9551 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9552 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9553 e, gimple_phi_arg_location (phi, dest_idx));
9555 else
9557 /* Unless it is a first order recurrence which needs
9558 args filled in for both the PHI node and the permutes. */
9559 gimple *perm
9560 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9561 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9562 add_phi_arg (as_a <gphi *> (rphi),
9563 vect_get_slp_vect_def (child, n - 1),
9564 e, gimple_phi_arg_location (phi, dest_idx));
9565 for (unsigned i = 0; i < n; ++i)
9567 gimple *perm
9568 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9569 if (i > 0)
9570 gimple_assign_set_rhs1 (perm,
9571 vect_get_slp_vect_def (child, i - 1));
9572 gimple_assign_set_rhs2 (perm,
9573 vect_get_slp_vect_def (child, i));
9574 update_stmt (perm);
9581 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9583 void
9584 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9586 slp_instance instance;
9587 unsigned int i;
9589 hash_map<slp_tree, slp_scc_info> scc_info;
9590 int maxdfs = 0;
9591 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9593 slp_tree node = SLP_INSTANCE_TREE (instance);
9594 if (dump_enabled_p ())
9596 dump_printf_loc (MSG_NOTE, vect_location,
9597 "Vectorizing SLP tree:\n");
9598 /* ??? Dump all? */
9599 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9600 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9601 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9602 vect_print_slp_graph (MSG_NOTE, vect_location,
9603 SLP_INSTANCE_TREE (instance));
9605 /* Schedule the tree of INSTANCE, scheduling SCCs such that a
9606 PHI is the node breaking the cycle. */
9607 auto_vec<slp_tree> stack;
9608 if (!scc_info.get (node))
9609 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9611 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9612 vectorize_slp_instance_root_stmt (node, instance);
9614 if (dump_enabled_p ())
9615 dump_printf_loc (MSG_NOTE, vect_location,
9616 "vectorizing stmts using SLP.\n");
9619 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9621 slp_tree root = SLP_INSTANCE_TREE (instance);
9622 stmt_vec_info store_info;
9623 unsigned int j;
9625 /* Remove scalar call stmts. Do not do this for basic-block
9626 vectorization as not all uses may be vectorized.
9627 ??? Why should this be necessary? DCE should be able to
9628 remove the stmts itself.
9629 ??? For BB vectorization we can as well remove scalar
9630 stmts starting from the SLP tree root if they have no
9631 uses. */
9632 if (is_a <loop_vec_info> (vinfo))
9633 vect_remove_slp_scalar_calls (vinfo, root);
9635 /* Remove the original scalar stmts of vectorized stores. */
9636 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9638 if (!STMT_VINFO_DATA_REF (store_info)
9639 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9640 break;
9642 store_info = vect_orig_stmt (store_info);
9643 /* Free the attached stmt_vec_info and remove the stmt. */
9644 vinfo->remove_stmt (store_info);
9646 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9647 so we do not crash in vect_free_slp_tree later. */
9648 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9649 SLP_TREE_REPRESENTATIVE (root) = NULL;