gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
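/* Set up the object pool that backs allocation of SLP tree nodes.  */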
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
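/* Delete any SLP nodes that are still live and release the node pool.  */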
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
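/* Allocate storage for an SLP node from the pool.  */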
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
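/* Return an SLP node's storage to the pool.  */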
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
194 /* Return a location suitable for dumps related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
391 int
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
401 do
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
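/* Operand-index maps returned by vect_get_operand_map; the encoding of the
   entries is described in the comment before that function.  */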
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg1_map[] = { 1, 1 };
509 static const int arg2_map[] = { 1, 2 };
510 static const int arg1_arg4_map[] = { 2, 1, 4 };
511 static const int arg3_arg2_map[] = { 2, 3, 2 };
512 static const int op1_op0_map[] = { 2, 1, 0 };
513 static const int off_map[] = { 1, -3 };
514 static const int off_op0_map[] = { 2, -3, 0 };
515 static const int off_arg2_map[] = { 2, -3, 2 };
516 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
517 static const int mask_call_maps[6][7] = {
518 { 1, 1, },
519 { 2, 1, 2, },
520 { 3, 1, 2, 3, },
521 { 4, 1, 2, 3, 4, },
522 { 5, 1, 2, 3, 4, 5, },
523 { 6, 1, 2, 3, 4, 5, 6 },
526 /* For most SLP statements, there is a one-to-one mapping between
527 gimple arguments and child nodes. If that is not true for STMT,
528 return an array that contains:
530 - the number of child nodes, followed by
531 - for each child node, the index of the argument associated with that node.
532 The special index -1 is the first operand of an embedded comparison and
533 the special index -2 is the second operand of an embedded comparison.
534 The special index -3 is the offset of a gather as analyzed by
535 vect_check_gather_scatter.
537 SWAP is as for vect_get_and_check_slp_defs. */
539 static const int *
540 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
541 unsigned char swap = 0)
543 if (auto assign = dyn_cast<const gassign *> (stmt))
545 if (gimple_assign_rhs_code (assign) == COND_EXPR
546 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
547 return cond_expr_maps[swap];
548 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
549 && swap)
550 return op1_op0_map;
551 if (gather_scatter_p)
552 return gimple_vdef (stmt) ? off_op0_map : off_map;
554 gcc_assert (!swap);
555 if (auto call = dyn_cast<const gcall *> (stmt))
557 if (gimple_call_internal_p (call))
558 switch (gimple_call_internal_fn (call))
560 case IFN_MASK_LOAD:
561 return gather_scatter_p ? off_arg2_map : arg2_map;
563 case IFN_GATHER_LOAD:
564 return arg1_map;
566 case IFN_MASK_GATHER_LOAD:
567 return arg1_arg4_map;
569 case IFN_MASK_STORE:
570 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
572 case IFN_MASK_CALL:
574 unsigned nargs = gimple_call_num_args (call);
575 if (nargs >= 2 && nargs <= 7)
576 return mask_call_maps[nargs-2];
577 else
578 return nullptr;
581 default:
582 break;
585 return nullptr;
588 /* Return the SLP node child index for operand OP of STMT. */
590 int
591 vect_slp_child_index_for_operand (const gimple *stmt, int op)
593 const int *opmap = vect_get_operand_map (stmt);
594 if (!opmap)
595 return op;
596 for (int i = 1; i < 1 + opmap[0]; ++i)
597 if (opmap[i] == op)
598 return i - 1;
599 gcc_unreachable ();
602 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
603 they are of a valid type and that they match the defs of the first stmt of
604 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
605 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
606 indicates swap is required for cond_expr stmts. Specifically, SWAP
607 is 1 if STMT is cond and operands of comparison need to be swapped;
608 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
610 If there was a fatal error return -1; if the error could be corrected by
611 swapping operands of father node of this one, return 1; if everything is
612 ok return 0. */
613 static int
614 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
615 bool *skip_args,
616 vec<stmt_vec_info> stmts, unsigned stmt_num,
617 vec<slp_oprnd_info> *oprnds_info)
619 stmt_vec_info stmt_info = stmts[stmt_num];
620 tree oprnd;
621 unsigned int i, number_of_oprnds;
622 enum vect_def_type dt = vect_uninitialized_def;
623 slp_oprnd_info oprnd_info;
624 gather_scatter_info gs_info;
625 unsigned int gs_op = -1u;
626 unsigned int commutative_op = -1U;
627 bool first = stmt_num == 0;
629 if (!is_a<gcall *> (stmt_info->stmt)
630 && !is_a<gassign *> (stmt_info->stmt)
631 && !is_a<gphi *> (stmt_info->stmt))
632 return -1;
634 number_of_oprnds = gimple_num_args (stmt_info->stmt);
635 const int *map
636 = vect_get_operand_map (stmt_info->stmt,
637 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
638 if (map)
639 number_of_oprnds = *map++;
640 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
642 if (gimple_call_internal_p (stmt))
644 internal_fn ifn = gimple_call_internal_fn (stmt);
645 commutative_op = first_commutative_argument (ifn);
648 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
650 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
651 commutative_op = 0;
654 bool swapped = (swap != 0);
655 bool backedge = false;
656 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
657 for (i = 0; i < number_of_oprnds; i++)
659 oprnd_info = (*oprnds_info)[i];
660 int opno = map ? map[i] : int (i);
661 if (opno == -3)
663 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
664 if (!is_a <loop_vec_info> (vinfo)
665 || !vect_check_gather_scatter (stmt_info,
666 as_a <loop_vec_info> (vinfo),
667 first ? &oprnd_info->first_gs_info
668 : &gs_info))
669 return -1;
671 if (first)
673 oprnd_info->first_gs_p = true;
674 oprnd = oprnd_info->first_gs_info.offset;
676 else
678 gs_op = i;
679 oprnd = gs_info.offset;
682 else if (opno < 0)
683 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
684 else
686 oprnd = gimple_arg (stmt_info->stmt, opno);
687 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
689 edge e = gimple_phi_arg_edge (stmt, opno);
690 backedge = (is_a <bb_vec_info> (vinfo)
691 ? e->flags & EDGE_DFS_BACK
692 : dominated_by_p (CDI_DOMINATORS, e->src,
693 gimple_bb (stmt_info->stmt)));
696 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
697 oprnd = TREE_OPERAND (oprnd, 0);
699 stmt_vec_info def_stmt_info;
700 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Build SLP failed: can't analyze def for %T\n",
705 oprnd);
707 return -1;
710 if (skip_args[i])
712 oprnd_info->def_stmts.quick_push (NULL);
713 oprnd_info->ops.quick_push (NULL_TREE);
714 oprnd_info->first_dt = vect_uninitialized_def;
715 continue;
718 oprnd_info->def_stmts.quick_push (def_stmt_info);
719 oprnd_info->ops.quick_push (oprnd);
721 if (def_stmt_info
722 && is_pattern_stmt_p (def_stmt_info))
724 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
725 != def_stmt_info)
726 oprnd_info->any_pattern = true;
727 else
728 /* If we promote this to external, use the original stmt def. */
729 oprnd_info->ops.last ()
730 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
733 /* If there's an extern def on a backedge make sure we can
734 code-generate at the region start.
735 ??? This is another case that could be fixed by adjusting
736 how we split the function but at the moment we'd have conflicting
737 goals there. */
738 if (backedge
739 && dts[i] == vect_external_def
740 && is_a <bb_vec_info> (vinfo)
741 && TREE_CODE (oprnd) == SSA_NAME
742 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
743 && !dominated_by_p (CDI_DOMINATORS,
744 as_a <bb_vec_info> (vinfo)->bbs[0],
745 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
749 "Build SLP failed: extern def %T only defined "
750 "on backedge\n", oprnd);
751 return -1;
754 if (first)
756 tree type = TREE_TYPE (oprnd);
757 dt = dts[i];
758 if ((dt == vect_constant_def
759 || dt == vect_external_def)
760 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
761 && (TREE_CODE (type) == BOOLEAN_TYPE
762 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
763 type)))
765 if (dump_enabled_p ())
766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
767 "Build SLP failed: invalid type of def "
768 "for variable-length SLP %T\n", oprnd);
769 return -1;
772 /* For the swapping logic below force vect_reduction_def
773 for the reduction op in a SLP reduction group. */
774 if (!STMT_VINFO_DATA_REF (stmt_info)
775 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
776 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
777 && def_stmt_info)
778 dts[i] = dt = vect_reduction_def;
780 /* Check the types of the definition. */
781 switch (dt)
783 case vect_external_def:
784 case vect_constant_def:
785 case vect_internal_def:
786 case vect_reduction_def:
787 case vect_induction_def:
788 case vect_nested_cycle:
789 case vect_first_order_recurrence:
790 break;
792 default:
793 /* FORNOW: Not supported. */
794 if (dump_enabled_p ())
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
796 "Build SLP failed: illegal type of def %T\n",
797 oprnd);
798 return -1;
801 oprnd_info->first_dt = dt;
802 oprnd_info->first_op_type = type;
805 if (first)
806 return 0;
808 /* Now match the operand definition types to that of the first stmt. */
809 for (i = 0; i < number_of_oprnds;)
811 if (skip_args[i])
813 ++i;
814 continue;
817 oprnd_info = (*oprnds_info)[i];
818 dt = dts[i];
819 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
820 oprnd = oprnd_info->ops[stmt_num];
821 tree type = TREE_TYPE (oprnd);
823 if (!types_compatible_p (oprnd_info->first_op_type, type))
825 if (dump_enabled_p ())
826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
827 "Build SLP failed: different operand types\n");
828 return 1;
831 if ((gs_op == i) != oprnd_info->first_gs_p)
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: mixed gather and non-gather\n");
836 return 1;
838 else if (gs_op == i)
840 if (!operand_equal_p (oprnd_info->first_gs_info.base,
841 gs_info.base))
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845 "Build SLP failed: different gather base\n");
846 return 1;
848 if (oprnd_info->first_gs_info.scale != gs_info.scale)
850 if (dump_enabled_p ())
851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
852 "Build SLP failed: different gather scale\n");
853 return 1;
857 /* Not first stmt of the group, check that the def-stmt/s match
858 the def-stmt/s of the first stmt. Allow different definition
859 types for reduction chains: the first stmt must be a
860 vect_reduction_def (a phi node), and the rest
861 end in the reduction chain. */
862 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
863 && !(oprnd_info->first_dt == vect_reduction_def
864 && !STMT_VINFO_DATA_REF (stmt_info)
865 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
866 && def_stmt_info
867 && !STMT_VINFO_DATA_REF (def_stmt_info)
868 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
869 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
870 || (!STMT_VINFO_DATA_REF (stmt_info)
871 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
872 && ((!def_stmt_info
873 || STMT_VINFO_DATA_REF (def_stmt_info)
874 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
875 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
876 != (oprnd_info->first_dt != vect_reduction_def))))
878 /* Try swapping operands if we got a mismatch. For BB
879 vectorization only in case it will clearly improve things. */
880 if (i == commutative_op && !swapped
881 && (!is_a <bb_vec_info> (vinfo)
882 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
883 dts[i+1])
884 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
885 || vect_def_types_match
886 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location,
890 "trying swapped operands\n");
891 std::swap (dts[i], dts[i+1]);
892 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
893 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
894 std::swap ((*oprnds_info)[i]->ops[stmt_num],
895 (*oprnds_info)[i+1]->ops[stmt_num]);
896 swapped = true;
897 continue;
900 if (is_a <bb_vec_info> (vinfo)
901 && !oprnd_info->any_pattern)
903 /* Now for commutative ops we should see whether we can
904 make the other operand match. */
905 if (dump_enabled_p ())
906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 "treating operand as external\n");
908 oprnd_info->first_dt = dt = vect_external_def;
910 else
912 if (dump_enabled_p ())
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "Build SLP failed: different types\n");
915 return 1;
919 /* Make sure to demote the overall operand to external. */
920 if (dt == vect_external_def)
921 oprnd_info->first_dt = vect_external_def;
922 /* For a SLP reduction chain we want to duplicate the reduction to
923 each of the chain members. That gets us a sane SLP graph (still
924 the stmts are not 100% correct wrt the initial values). */
925 else if ((dt == vect_internal_def
926 || dt == vect_reduction_def)
927 && oprnd_info->first_dt == vect_reduction_def
928 && !STMT_VINFO_DATA_REF (stmt_info)
929 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
930 && !STMT_VINFO_DATA_REF (def_stmt_info)
931 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
932 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
934 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
935 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
938 ++i;
941 /* Swap operands. */
942 if (swapped)
944 if (dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE, vect_location,
946 "swapped operands to match def types in %G",
947 stmt_info->stmt);
950 return 0;
953 /* Return true if call statements CALL1 and CALL2 are similar enough
954 to be combined into the same SLP group. */
956 bool
957 compatible_calls_p (gcall *call1, gcall *call2)
959 unsigned int nargs = gimple_call_num_args (call1);
960 if (nargs != gimple_call_num_args (call2))
961 return false;
963 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
964 return false;
966 if (gimple_call_internal_p (call1))
968 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
969 TREE_TYPE (gimple_call_lhs (call2))))
970 return false;
971 for (unsigned int i = 0; i < nargs; ++i)
972 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
973 TREE_TYPE (gimple_call_arg (call2, i))))
974 return false;
976 else
978 if (!operand_equal_p (gimple_call_fn (call1),
979 gimple_call_fn (call2), 0))
980 return false;
982 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
983 return false;
986 /* Check that any unvectorized arguments are equal. */
987 if (const int *map = vect_get_operand_map (call1))
989 unsigned int nkept = *map++;
990 unsigned int mapi = 0;
991 for (unsigned int i = 0; i < nargs; ++i)
992 if (mapi < nkept && map[mapi] == int (i))
993 mapi += 1;
994 else if (!operand_equal_p (gimple_call_arg (call1, i),
995 gimple_call_arg (call2, i)))
996 return false;
999 return true;
1002 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1003 caller's attempt to find the vector type in STMT_INFO with the narrowest
1004 element type. Return true if VECTYPE is nonnull and if it is valid
1005 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1006 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1007 vect_build_slp_tree. */
1009 static bool
1010 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1011 unsigned int group_size,
1012 tree vectype, poly_uint64 *max_nunits)
1014 if (!vectype)
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1018 "Build SLP failed: unsupported data-type in %G\n",
1019 stmt_info->stmt);
1020 /* Fatal mismatch. */
1021 return false;
1024 /* If populating the vector type requires unrolling then fail
1025 before adjusting *max_nunits for basic-block vectorization. */
1026 if (is_a <bb_vec_info> (vinfo)
1027 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1031 "Build SLP failed: unrolling required "
1032 "in basic block SLP\n");
1033 /* Fatal mismatch. */
1034 return false;
1037 /* In case of multiple types we need to detect the smallest type. */
1038 vect_update_max_nunits (max_nunits, vectype);
1039 return true;
1042 /* Check whether the scalar stmts STMTS are isomorphic, require data
1043 permutation or are of unsupported types of operation. Return true
1044 if they can form an SLP group, otherwise return false and indicate in *MATCHES
1045 which stmts are not isomorphic to the first one. If MATCHES[0]
1046 is false then this indicates the comparison could not be
1047 carried out or the stmts will never be vectorized by SLP.
1049 Note COND_EXPR is possibly isomorphic to another one after swapping its
1050 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1051 the first stmt by swapping the two operands of comparison; set SWAP[i]
1052 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1053 of comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
1054 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1056 static bool
1057 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1058 vec<stmt_vec_info> stmts, unsigned int group_size,
1059 poly_uint64 *max_nunits, bool *matches,
1060 bool *two_operators, tree *node_vectype)
1062 unsigned int i;
1063 stmt_vec_info first_stmt_info = stmts[0];
1064 code_helper first_stmt_code = ERROR_MARK;
1065 code_helper alt_stmt_code = ERROR_MARK;
1066 code_helper rhs_code = ERROR_MARK;
1067 code_helper first_cond_code = ERROR_MARK;
1068 tree lhs;
1069 bool need_same_oprnds = false;
1070 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1071 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1072 bool first_stmt_ldst_p = false, ldst_p = false;
1073 bool first_stmt_phi_p = false, phi_p = false;
1074 bool maybe_soft_fail = false;
1075 tree soft_fail_nunits_vectype = NULL_TREE;
1077 /* For every stmt in NODE find its def stmt/s. */
1078 stmt_vec_info stmt_info;
1079 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1081 gimple *stmt = stmt_info->stmt;
1082 swap[i] = 0;
1083 matches[i] = false;
1085 if (dump_enabled_p ())
1086 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1088 /* Fail to vectorize statements marked as unvectorizable, throw
1089 or are volatile. */
1090 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1091 || stmt_can_throw_internal (cfun, stmt)
1092 || gimple_has_volatile_ops (stmt))
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "Build SLP failed: unvectorizable statement %G",
1097 stmt);
1098 /* ??? For BB vectorization we want to commutate operands in a way
1099 to shuffle all unvectorizable defs into one operand and have
1100 the other still vectorized. The following doesn't reliably
1101 work for this though, but it's the easiest we can do here. */
1102 if (is_a <bb_vec_info> (vinfo) && i != 0)
1103 continue;
1104 /* Fatal mismatch. */
1105 matches[0] = false;
1106 return false;
1109 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1110 lhs = gimple_get_lhs (stmt);
1111 if (lhs == NULL_TREE
1112 && (!call_stmt
1113 || !gimple_call_internal_p (stmt)
1114 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: not GIMPLE_ASSIGN nor "
1119 "GIMPLE_CALL %G", stmt);
1120 if (is_a <bb_vec_info> (vinfo) && i != 0)
1121 continue;
1122 /* Fatal mismatch. */
1123 matches[0] = false;
1124 return false;
1127 tree nunits_vectype;
1128 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1129 &nunits_vectype, group_size))
1131 if (is_a <bb_vec_info> (vinfo) && i != 0)
1132 continue;
1133 /* Fatal mismatch. */
1134 matches[0] = false;
1135 return false;
1137 /* Record nunits required but continue analysis, producing matches[]
1138 as if nunits was not an issue. This allows splitting of groups
1139 to happen. */
1140 if (nunits_vectype
1141 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1142 nunits_vectype, max_nunits))
1144 gcc_assert (is_a <bb_vec_info> (vinfo));
1145 maybe_soft_fail = true;
1146 soft_fail_nunits_vectype = nunits_vectype;
1149 gcc_assert (vectype);
1151 if (call_stmt)
1153 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1154 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1155 rhs_code = cfn;
1156 else
1157 rhs_code = CALL_EXPR;
1159 if (cfn == CFN_MASK_LOAD
1160 || cfn == CFN_GATHER_LOAD
1161 || cfn == CFN_MASK_GATHER_LOAD)
1162 ldst_p = true;
1163 else if (cfn == CFN_MASK_STORE)
1165 ldst_p = true;
1166 rhs_code = CFN_MASK_STORE;
1168 else if ((cfn != CFN_LAST
1169 && cfn != CFN_MASK_CALL
1170 && internal_fn_p (cfn)
1171 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1172 || gimple_call_tail_p (call_stmt)
1173 || gimple_call_noreturn_p (call_stmt)
1174 || gimple_call_chain (call_stmt))
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "Build SLP failed: unsupported call type %G",
1179 (gimple *) call_stmt);
1180 if (is_a <bb_vec_info> (vinfo) && i != 0)
1181 continue;
1182 /* Fatal mismatch. */
1183 matches[0] = false;
1184 return false;
1187 else if (gimple_code (stmt) == GIMPLE_PHI)
1189 rhs_code = ERROR_MARK;
1190 phi_p = true;
1192 else
1194 rhs_code = gimple_assign_rhs_code (stmt);
1195 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1198 /* Check the operation. */
1199 if (i == 0)
1201 *node_vectype = vectype;
1202 first_stmt_code = rhs_code;
1203 first_stmt_ldst_p = ldst_p;
1204 first_stmt_phi_p = phi_p;
1206 /* Shift arguments should be equal in all the packed stmts for a
1207 vector shift with scalar shift operand. */
1208 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1209 || rhs_code == LROTATE_EXPR
1210 || rhs_code == RROTATE_EXPR)
1212 /* First see if we have a vector/vector shift. */
1213 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1215 /* No vector/vector shift, try for a vector/scalar shift. */
1216 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1218 if (dump_enabled_p ())
1219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1220 "Build SLP failed: "
1221 "op not supported by target.\n");
1222 if (is_a <bb_vec_info> (vinfo) && i != 0)
1223 continue;
1224 /* Fatal mismatch. */
1225 matches[0] = false;
1226 return false;
1228 need_same_oprnds = true;
1229 first_op1 = gimple_assign_rhs2 (stmt);
1232 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1234 need_same_oprnds = true;
1235 first_op1 = gimple_assign_rhs2 (stmt);
1237 else if (!ldst_p
1238 && rhs_code == BIT_FIELD_REF)
1240 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1241 if (!is_a <bb_vec_info> (vinfo)
1242 || TREE_CODE (vec) != SSA_NAME
1243 /* When the element types are not compatible we pun the
1244 source to the target vectype which requires equal size. */
1245 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1246 || !types_compatible_p (TREE_TYPE (vectype),
1247 TREE_TYPE (TREE_TYPE (vec))))
1248 && !operand_equal_p (TYPE_SIZE (vectype),
1249 TYPE_SIZE (TREE_TYPE (vec)))))
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253 "Build SLP failed: "
1254 "BIT_FIELD_REF not supported\n");
1255 /* Fatal mismatch. */
1256 matches[0] = false;
1257 return false;
1260 else if (rhs_code == CFN_DIV_POW2)
1262 need_same_oprnds = true;
1263 first_op1 = gimple_call_arg (call_stmt, 1);
1266 else
1268 if (first_stmt_code != rhs_code
1269 && alt_stmt_code == ERROR_MARK)
1270 alt_stmt_code = rhs_code;
1271 if ((first_stmt_code != rhs_code
1272 && (first_stmt_code != IMAGPART_EXPR
1273 || rhs_code != REALPART_EXPR)
1274 && (first_stmt_code != REALPART_EXPR
1275 || rhs_code != IMAGPART_EXPR)
1276 /* Handle mismatches in plus/minus by computing both
1277 and merging the results. */
1278 && !((first_stmt_code == PLUS_EXPR
1279 || first_stmt_code == MINUS_EXPR)
1280 && (alt_stmt_code == PLUS_EXPR
1281 || alt_stmt_code == MINUS_EXPR)
1282 && rhs_code == alt_stmt_code)
1283 && !(first_stmt_code.is_tree_code ()
1284 && rhs_code.is_tree_code ()
1285 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1286 == tcc_comparison)
1287 && (swap_tree_comparison (tree_code (first_stmt_code))
1288 == tree_code (rhs_code)))
1289 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1290 && (first_stmt_code == ARRAY_REF
1291 || first_stmt_code == BIT_FIELD_REF
1292 || first_stmt_code == INDIRECT_REF
1293 || first_stmt_code == COMPONENT_REF
1294 || first_stmt_code == MEM_REF)
1295 && (rhs_code == ARRAY_REF
1296 || rhs_code == BIT_FIELD_REF
1297 || rhs_code == INDIRECT_REF
1298 || rhs_code == COMPONENT_REF
1299 || rhs_code == MEM_REF)))
1300 || (ldst_p
1301 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1302 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1303 || first_stmt_ldst_p != ldst_p
1304 || first_stmt_phi_p != phi_p)
1306 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1309 "Build SLP failed: different operation "
1310 "in stmt %G", stmt);
1311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1312 "original stmt %G", first_stmt_info->stmt);
1314 /* Mismatch. */
1315 continue;
1318 if (!ldst_p
1319 && first_stmt_code == BIT_FIELD_REF
1320 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1321 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "Build SLP failed: different BIT_FIELD_REF "
1326 "arguments in %G", stmt);
1327 /* Mismatch. */
1328 continue;
1331 if (call_stmt
1332 && first_stmt_code != CFN_MASK_LOAD
1333 && first_stmt_code != CFN_MASK_STORE)
1335 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1336 call_stmt))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "Build SLP failed: different calls in %G",
1341 stmt);
1342 /* Mismatch. */
1343 continue;
1347 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1348 && (gimple_bb (first_stmt_info->stmt)
1349 != gimple_bb (stmt_info->stmt)))
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353 "Build SLP failed: different BB for PHI "
1354 "or possibly trapping operation in %G", stmt);
1355 /* Mismatch. */
1356 continue;
1359 if (need_same_oprnds)
1361 tree other_op1 = gimple_arg (stmt, 1);
1362 if (!operand_equal_p (first_op1, other_op1, 0))
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "Build SLP failed: different shift "
1367 "arguments in %G", stmt);
1368 /* Mismatch. */
1369 continue;
1373 if (!types_compatible_p (vectype, *node_vectype))
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "Build SLP failed: different vector type "
1378 "in %G", stmt);
1379 /* Mismatch. */
1380 continue;
1384 /* Grouped store or load. */
1385 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1387 gcc_assert (ldst_p);
1388 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1390 /* Store. */
1391 gcc_assert (rhs_code == CFN_MASK_STORE
1392 || REFERENCE_CLASS_P (lhs)
1393 || DECL_P (lhs));
1395 else
1397 /* Load. */
1398 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1399 if (prev_first_load)
1401 /* Check that there are no loads from different interleaving
1402 chains in the same node. */
1403 if (prev_first_load != first_load)
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1407 vect_location,
1408 "Build SLP failed: different "
1409 "interleaving chains in one node %G",
1410 stmt);
1411 /* Mismatch. */
1412 continue;
1415 else
1416 prev_first_load = first_load;
1419 /* Non-grouped store or load. */
1420 else if (ldst_p)
1422 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1423 && rhs_code != CFN_GATHER_LOAD
1424 && rhs_code != CFN_MASK_GATHER_LOAD
1425 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426 /* Not grouped loads are handled as externals for BB
1427 vectorization. For loop vectorization we can handle
1428 splats the same way we handle single element interleaving. */
1429 && (is_a <bb_vec_info> (vinfo)
1430 || stmt_info != first_stmt_info))
1432 /* Not grouped load. */
1433 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "Build SLP failed: not grouped load %G", stmt);
1437 if (i != 0)
1438 continue;
1439 /* Fatal mismatch. */
1440 matches[0] = false;
1441 return false;
1444 /* Not memory operation. */
1445 else
1447 if (!phi_p
1448 && rhs_code.is_tree_code ()
1449 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453 && rhs_code != VIEW_CONVERT_EXPR
1454 && rhs_code != CALL_EXPR
1455 && rhs_code != BIT_FIELD_REF)
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459 "Build SLP failed: operation unsupported %G",
1460 stmt);
1461 if (is_a <bb_vec_info> (vinfo) && i != 0)
1462 continue;
1463 /* Fatal mismatch. */
1464 matches[0] = false;
1465 return false;
1468 if (rhs_code == COND_EXPR)
1470 tree cond_expr = gimple_assign_rhs1 (stmt);
1471 enum tree_code cond_code = TREE_CODE (cond_expr);
1472 enum tree_code swap_code = ERROR_MARK;
1473 enum tree_code invert_code = ERROR_MARK;
1475 if (i == 0)
1476 first_cond_code = TREE_CODE (cond_expr);
1477 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1479 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480 swap_code = swap_tree_comparison (cond_code);
1481 invert_code = invert_tree_comparison (cond_code, honor_nans);
1484 if (first_cond_code == cond_code)
1486 /* Isomorphic can be achieved by swapping. */
1487 else if (first_cond_code == swap_code)
1488 swap[i] = 1;
1489 /* Isomorphic can be achieved by inverting. */
1490 else if (first_cond_code == invert_code)
1491 swap[i] = 2;
1492 else
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496 "Build SLP failed: different"
1497 " operation %G", stmt);
1498 /* Mismatch. */
1499 continue;
1503 if (rhs_code.is_tree_code ()
1504 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505 && (swap_tree_comparison ((tree_code)first_stmt_code)
1506 == (tree_code)rhs_code))
1507 swap[i] = 1;
1510 matches[i] = true;
1513 for (i = 0; i < group_size; ++i)
1514 if (!matches[i])
1515 return false;
1517 /* If we allowed a two-operation SLP node verify the target can cope
1518 with the permute we are going to use. */
1519 if (alt_stmt_code != ERROR_MARK
1520 && (!alt_stmt_code.is_tree_code ()
1521 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1524 *two_operators = true;
1527 if (maybe_soft_fail)
1529 unsigned HOST_WIDE_INT const_nunits;
1530 if (!TYPE_VECTOR_SUBPARTS
1531 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532 || const_nunits > group_size)
1533 matches[0] = false;
1534 else
1536 /* With constant vector elements simulate a mismatch at the
1537 point we need to split. */
1538 unsigned tail = group_size & (const_nunits - 1);
1539 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1541 return false;
1544 return true;
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548 Note we never remove apart from at destruction time so we do not
1549 need a special value for deleted that differs from empty. */
1550 struct bst_traits
1552 typedef vec <stmt_vec_info> value_type;
1553 typedef vec <stmt_vec_info> compare_type;
1554 static inline hashval_t hash (value_type);
1555 static inline bool equal (value_type existing, value_type candidate);
1556 static inline bool is_empty (value_type x) { return !x.exists (); }
1557 static inline bool is_deleted (value_type x) { return !x.exists (); }
1558 static const bool empty_zero_p = true;
1559 static inline void mark_empty (value_type &x) { x.release (); }
1560 static inline void mark_deleted (value_type &x) { x.release (); }
1561 static inline void remove (value_type &x) { x.release (); }
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1566 inchash::hash h;
1567 for (unsigned i = 0; i < x.length (); ++i)
1568 h.add_int (gimple_uid (x[i]->stmt));
1569 return h.end ();
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1574 if (existing.length () != candidate.length ())
1575 return false;
1576 for (unsigned i = 0; i < existing.length (); ++i)
1577 if (existing[i] != candidate[i])
1578 return false;
1579 return true;
1582 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583 but then vec::insert does memmove and that's not compatible with
1584 std::pair. */
1585 struct chain_op_t
1587 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588 : code (code_), dt (dt_), op (op_) {}
1589 tree_code code;
1590 vect_def_type dt;
1591 tree op;
1594 /* Comparator for sorting associatable chains. */
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1599 auto *op1 = (const chain_op_t *) op1_;
1600 auto *op2 = (const chain_op_t *) op2_;
1601 if (op1->dt != op2->dt)
1602 return (int)op1->dt - (int)op2->dt;
1603 return (int)op1->code - (int)op2->code;
1606 /* Linearize the associatable expression chain at START with the
1607 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608 filling CHAIN with the result and using WORKLIST as intermediate storage.
1609 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1611 stmts, starting with START. */
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615 vec<std::pair<tree_code, gimple *> > &worklist,
1616 vec<chain_op_t> &chain,
1617 enum tree_code code, gimple *start,
1618 gimple *&code_stmt, gimple *&alt_code_stmt,
1619 vec<gimple *> *chain_stmts)
1621 /* For each lane linearize the addition/subtraction (or other
1622 uniform associatable operation) expression tree. */
1623 worklist.safe_push (std::make_pair (code, start));
1624 while (!worklist.is_empty ())
1626 auto entry = worklist.pop ();
1627 gassign *stmt = as_a <gassign *> (entry.second);
1628 enum tree_code in_code = entry.first;
1629 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1631 if (!code_stmt
1632 && gimple_assign_rhs_code (stmt) == code)
1633 code_stmt = stmt;
1634 else if (!alt_code_stmt
1635 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636 alt_code_stmt = stmt;
1637 if (chain_stmts)
1638 chain_stmts->safe_push (stmt);
1639 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1641 tree op = gimple_op (stmt, opnum);
1642 vect_def_type dt;
1643 stmt_vec_info def_stmt_info;
1644 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645 gcc_assert (res);
1646 if (dt == vect_internal_def
1647 && is_pattern_stmt_p (def_stmt_info))
1648 op = gimple_get_lhs (def_stmt_info->stmt);
1649 gimple *use_stmt;
1650 use_operand_p use_p;
1651 if (dt == vect_internal_def
1652 && single_imm_use (op, &use_p, &use_stmt)
1653 && is_gimple_assign (def_stmt_info->stmt)
1654 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655 || (code == PLUS_EXPR
1656 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657 == MINUS_EXPR))))
1659 tree_code op_def_code = this_code;
1660 if (op_def_code == MINUS_EXPR && opnum == 1)
1661 op_def_code = PLUS_EXPR;
1662 if (in_code == MINUS_EXPR)
1663 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664 worklist.safe_push (std::make_pair (op_def_code,
1665 def_stmt_info->stmt));
1667 else
1669 tree_code op_def_code = this_code;
1670 if (op_def_code == MINUS_EXPR && opnum == 1)
1671 op_def_code = PLUS_EXPR;
1672 if (in_code == MINUS_EXPR)
1673 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674 chain.safe_push (chain_op_t (op_def_code, dt, op));
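/* Map from a set of scalar stmts to the SLP node built for them (possibly
   recording a failed discovery attempt), used to share sub-trees and avoid
   re-analysis.  */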
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681 simple_hashmap_traits <bst_traits, slp_tree> >
1682 scalar_stmts_to_slp_tree_map_t;
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 vec<stmt_vec_info> stmts, unsigned int group_size,
1687 poly_uint64 *max_nunits,
1688 bool *matches, unsigned *limit, unsigned *tree_size,
1689 scalar_stmts_to_slp_tree_map_t *bst_map);
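/* Build an SLP node for STMTS, re-using a previously built node recorded in
   BST_MAP when possible.  Returns the node, or NULL on failure in which case
   MATCHES is filled in as for vect_build_slp_tree_2; updates MAX_NUNITS.  */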
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map)
1698 if (slp_tree *leader = bst_map->get (stmts))
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702 !(*leader)->failed ? "" : "failed ",
1703 (void *) *leader);
1704 if (!(*leader)->failed)
1706 SLP_TREE_REF_COUNT (*leader)++;
1707 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708 stmts.release ();
1709 return *leader;
1711 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712 return NULL;
1715 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716 so we can pick up backedge destinations during discovery. */
1717 slp_tree res = new _slp_tree;
1718 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719 SLP_TREE_SCALAR_STMTS (res) = stmts;
1720 bst_map->put (stmts.copy (), res);
1722 if (*limit == 0)
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "SLP discovery limit exceeded\n");
1727 /* Mark the node invalid so we can detect those when still in use
1728 as backedge destinations. */
1729 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731 res->failed = XNEWVEC (bool, group_size);
1732 memset (res->failed, 0, sizeof (bool) * group_size);
1733 memset (matches, 0, sizeof (bool) * group_size);
1734 return NULL;
1736 --*limit;
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "starting SLP discovery for node %p\n", (void *) res);
1742 poly_uint64 this_max_nunits = 1;
1743 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744 &this_max_nunits,
1745 matches, limit, tree_size, bst_map);
1746 if (!res_)
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "SLP discovery for node %p failed\n", (void *) res);
1751 /* Mark the node invalid so we can detect those when still in use
1752 as backedge destinations. */
1753 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755 res->failed = XNEWVEC (bool, group_size);
1756 if (flag_checking)
1758 unsigned i;
1759 for (i = 0; i < group_size; ++i)
1760 if (!matches[i])
1761 break;
1762 gcc_assert (i < group_size);
1764 memcpy (res->failed, matches, sizeof (bool) * group_size);
1766 else
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "SLP discovery for node %p succeeded\n",
1771 (void *) res);
1772 gcc_assert (res_ == res);
1773 res->max_nunits = this_max_nunits;
1774 vect_update_max_nunits (max_nunits, this_max_nunits);
1775 /* Keep a reference for the bst_map use. */
1776 SLP_TREE_REF_COUNT (res)++;
1778 return res_;
1781 /* Helper for building an associated SLP node chain. */
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785 slp_tree op0, slp_tree op1,
1786 stmt_vec_info oper1, stmt_vec_info oper2,
1787 vec<std::pair<unsigned, unsigned> > lperm)
1789 unsigned group_size = SLP_TREE_LANES (op1);
1791 slp_tree child1 = new _slp_tree;
1792 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793 SLP_TREE_VECTYPE (child1) = vectype;
1794 SLP_TREE_LANES (child1) = group_size;
1795 SLP_TREE_CHILDREN (child1).create (2);
1796 SLP_TREE_CHILDREN (child1).quick_push (op0);
1797 SLP_TREE_CHILDREN (child1).quick_push (op1);
1798 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1800 slp_tree child2 = new _slp_tree;
1801 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802 SLP_TREE_VECTYPE (child2) = vectype;
1803 SLP_TREE_LANES (child2) = group_size;
1804 SLP_TREE_CHILDREN (child2).create (2);
1805 SLP_TREE_CHILDREN (child2).quick_push (op0);
1806 SLP_TREE_REF_COUNT (op0)++;
1807 SLP_TREE_CHILDREN (child2).quick_push (op1);
1808 SLP_TREE_REF_COUNT (op1)++;
1809 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1811 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813 SLP_TREE_VECTYPE (perm) = vectype;
1814 SLP_TREE_LANES (perm) = group_size;
1815 /* ??? We should set this NULL but that's not expected. */
1816 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818 SLP_TREE_CHILDREN (perm).quick_push (child1);
1819 SLP_TREE_CHILDREN (perm).quick_push (child2);
1822 /* Recursively build an SLP tree starting from NODE.
1823 Fail (and return NULL) if the def-stmts are not isomorphic, require
1824 data permutation or are of unsupported types of operation, recording
1825 in MATCHES which stmts caused the mismatch.
1826 Otherwise fill in NODE and its children and return NODE. */
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831 vec<stmt_vec_info> stmts, unsigned int group_size,
1832 poly_uint64 *max_nunits,
1833 bool *matches, unsigned *limit, unsigned *tree_size,
1834 scalar_stmts_to_slp_tree_map_t *bst_map)
1836 unsigned nops, i, this_tree_size = 0;
1837 poly_uint64 this_max_nunits = *max_nunits;
1839 matches[0] = false;
1841 stmt_vec_info stmt_info = stmts[0];
1842 if (!is_a<gcall *> (stmt_info->stmt)
1843 && !is_a<gassign *> (stmt_info->stmt)
1844 && !is_a<gphi *> (stmt_info->stmt))
1845 return NULL;
1847 nops = gimple_num_args (stmt_info->stmt);
1848 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849 STMT_VINFO_GATHER_SCATTER_P
1850 (stmt_info)))
1851 nops = map[0];
1853 /* If the SLP node is a PHI (induction or reduction), terminate
1854 the recursion. */
1855 bool *skip_args = XALLOCAVEC (bool, nops);
1856 memset (skip_args, 0, sizeof (bool) * nops);
1857 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1860 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862 group_size);
1863 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864 max_nunits))
1865 return NULL;
1867 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868 if (def_type == vect_induction_def)
1870 /* Induction PHIs are not cycles but walk the initial
1871 value. Only for inner loops though, for outer loops
1872 we need to pick up the value from the actual PHIs
1873 to more easily support peeling and epilogue vectorization. */
1874 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875 if (!nested_in_vect_loop_p (loop, stmt_info))
1876 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877 else
1878 loop = loop->inner;
1879 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1881 else if (def_type == vect_reduction_def
1882 || def_type == vect_double_reduction_def
1883 || def_type == vect_nested_cycle
1884 || def_type == vect_first_order_recurrence)
1886 /* Else def types have to match. */
1887 stmt_vec_info other_info;
1888 bool all_same = true;
1889 FOR_EACH_VEC_ELT (stmts, i, other_info)
1891 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892 return NULL;
1893 if (other_info != stmt_info)
1894 all_same = false;
1896 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897 /* Reduction initial values are not explicitly represented. */
1898 if (def_type != vect_first_order_recurrence
1899 && !nested_in_vect_loop_p (loop, stmt_info))
1900 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901 /* Reduction chain backedge defs are filled manually.
1902 ??? Need a better way to identify a SLP reduction chain PHI.
1903 Or a better overall way to SLP match those. */
1904 if (all_same && def_type == vect_reduction_def)
1905 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1907 else if (def_type != vect_internal_def)
1908 return NULL;
1912 bool two_operators = false;
1913 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914 tree vectype = NULL_TREE;
1915 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916 &this_max_nunits, matches, &two_operators,
1917 &vectype))
1918 return NULL;
1920 /* If the SLP node is a load, terminate the recursion unless masked. */
1921 if (STMT_VINFO_DATA_REF (stmt_info)
1922 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1924 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1925 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1926 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1927 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1928 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1929 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1930 else
1932 *max_nunits = this_max_nunits;
1933 (*tree_size)++;
1934 node = vect_create_new_slp_node (node, stmts, 0);
1935 SLP_TREE_VECTYPE (node) = vectype;
1936 /* And compute the load permutation. Whether it is actually
1937 a permutation depends on the unrolling factor which is
1938 decided later. */
1939 vec<unsigned> load_permutation;
1940 int j;
1941 stmt_vec_info load_info;
1942 load_permutation.create (group_size);
1943 stmt_vec_info first_stmt_info
1944 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1945 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1947 int load_place;
1948 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1949 load_place = vect_get_place_in_interleaving_chain
1950 (load_info, first_stmt_info);
1951 else
1952 load_place = 0;
1953 gcc_assert (load_place != -1);
1954 load_permutation.safe_push (load_place);
1956 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1957 return node;
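/* For illustration (hypothetical, gap-less group headed by a load of a[0]):
   if the node's scalar stmts read a[1], a[0], a[3], a[2] in that lane
   order, the recorded load permutation is { 1, 0, 3, 2 }, i.e. each entry
   is the lane's position within the interleaving chain relative to
   DR_GROUP_FIRST_ELEMENT, not (yet) a vector permutation by itself.  */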
1960 else if (gimple_assign_single_p (stmt_info->stmt)
1961 && !gimple_vuse (stmt_info->stmt)
1962 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1964 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1965 the same SSA name vector of a type compatible with vectype. */
1966 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1967 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1968 stmt_vec_info estmt_info;
1969 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1971 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1972 tree bfref = gimple_assign_rhs1 (estmt);
1973 HOST_WIDE_INT lane;
1974 if (!known_eq (bit_field_size (bfref),
1975 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1976 || !constant_multiple_p (bit_field_offset (bfref),
1977 bit_field_size (bfref), &lane))
1979 lperm.release ();
1980 matches[0] = false;
1981 return NULL;
1983 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1985 slp_tree vnode = vect_create_new_slp_node (vNULL);
1986 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1987 /* ??? We record vectype here but we hide eventually necessary
1988 punning and instead rely on code generation to materialize
1989 VIEW_CONVERT_EXPRs as necessary. We should instead make
1990 this explicit somehow. */
1991 SLP_TREE_VECTYPE (vnode) = vectype;
1992 else
1994 /* For different size but compatible elements we can still
1995 use VEC_PERM_EXPR without punning. */
1996 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1997 && types_compatible_p (TREE_TYPE (vectype),
1998 TREE_TYPE (TREE_TYPE (vec))));
1999 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2001 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2002 unsigned HOST_WIDE_INT const_nunits;
2003 if (nunits.is_constant (&const_nunits))
2004 SLP_TREE_LANES (vnode) = const_nunits;
2005 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2006 /* We are always building a permutation node even if it is an identity
2007 permute to shield the rest of the vectorizer from the odd node
2008 representing an actual vector without any scalar ops.
2009 ??? We could hide it completely by making the permute node
2010 external? */
2011 node = vect_create_new_slp_node (node, stmts, 1);
2012 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2013 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2014 SLP_TREE_VECTYPE (node) = vectype;
2015 SLP_TREE_CHILDREN (node).quick_push (vnode);
2016 return node;
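/* For illustration (hypothetical V4SF source vector v_2): a group member
   like

     x_1 = BIT_FIELD_REF <v_2, 32, 64>;

   extracts the 32-bit element at bit offset 64, i.e. lane 64 / 32 == 2, so
   the lane permutation entry pushed for it is (0, 2) - operand 0 being the
   single VNODE child that carries v_2 in its SLP_TREE_VEC_DEFS.  */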
2018 /* When discovery reaches an associatable operation see whether we can
2019 improve that to match up lanes in a way superior to the operand
2020 swapping code which at most looks at two defs.
2021 ??? For BB vectorization we cannot do the brute-force search
2022 for matching as we can succeed by means of builds from scalars
2023 and have no good way to "cost" one build against another. */
2024 else if (is_a <loop_vec_info> (vinfo)
2025 /* ??? We don't handle !vect_internal_def defs below. */
2026 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2027 && is_gimple_assign (stmt_info->stmt)
2028 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2029 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2030 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2031 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2032 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2034 /* See if we have a chain of (mixed) adds or subtracts or other
2035 associatable ops. */
2036 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2037 if (code == MINUS_EXPR)
2038 code = PLUS_EXPR;
2039 stmt_vec_info other_op_stmt_info = NULL;
2040 stmt_vec_info op_stmt_info = NULL;
2041 unsigned chain_len = 0;
2042 auto_vec<chain_op_t> chain;
2043 auto_vec<std::pair<tree_code, gimple *> > worklist;
2044 auto_vec<vec<chain_op_t> > chains (group_size);
2045 auto_vec<slp_tree, 4> children;
2046 bool hard_fail = true;
2047 for (unsigned lane = 0; lane < group_size; ++lane)
2049 /* For each lane linearize the addition/subtraction (or other
2050 uniform associatable operation) expression tree. */
2051 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2052 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2053 stmts[lane]->stmt, op_stmt, other_op_stmt,
2054 NULL);
2055 if (!op_stmt_info && op_stmt)
2056 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2057 if (!other_op_stmt_info && other_op_stmt)
2058 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2059 if (chain.length () == 2)
2061 /* In a chain of just two elements resort to the regular
2062 operand swapping scheme. If we run into a length
2063 mismatch still hard-FAIL. */
2064 if (chain_len == 0)
2065 hard_fail = false;
2066 else
2068 matches[lane] = false;
2069 /* ??? We might want to process the other lanes, but
2070 make sure to not give false matching hints to the
2071 caller for lanes we did not process. */
2072 if (lane != group_size - 1)
2073 matches[0] = false;
2075 break;
2077 else if (chain_len == 0)
2078 chain_len = chain.length ();
2079 else if (chain.length () != chain_len)
2081 /* ??? Here we could slip in magic to compensate with
2082 neutral operands. */
2083 matches[lane] = false;
2084 if (lane != group_size - 1)
2085 matches[0] = false;
2086 break;
2088 chains.quick_push (chain.copy ());
2089 chain.truncate (0);
2091 if (chains.length () == group_size)
2093 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2094 if (!op_stmt_info)
2096 hard_fail = false;
2097 goto out;
2099 /* Now we have a set of chains with the same length. */
2100 /* 1. pre-sort according to def_type and operation. */
2101 for (unsigned lane = 0; lane < group_size; ++lane)
2102 chains[lane].stablesort (dt_sort_cmp, vinfo);
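/* Rough illustration (hypothetical two-lane group): lane 0 computing
     x0 = (a0 - b0) + c0
   and lane 1 computing
     x1 = (a1 + c1) - b1
   both linearize to a three-entry chain of one minus and two plus terms.
   After the presort above, entry N of every lane's chain is matched up in
   step 2 below to form one SLP child per chain position, with a
   brute-force per-lane swap as fallback when the entries do not line up
   immediately.  */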
2103 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "pre-sorted chains of %s\n",
2107 get_tree_code_name (code));
2108 for (unsigned lane = 0; lane < group_size; ++lane)
2110 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2111 dump_printf (MSG_NOTE, "%s %T ",
2112 get_tree_code_name (chains[lane][opnum].code),
2113 chains[lane][opnum].op);
2114 dump_printf (MSG_NOTE, "\n");
2117 /* 2. try to build children nodes, associating as necessary. */
2118 for (unsigned n = 0; n < chain_len; ++n)
2120 vect_def_type dt = chains[0][n].dt;
2121 unsigned lane;
2122 for (lane = 0; lane < group_size; ++lane)
2123 if (chains[lane][n].dt != dt)
2125 if (dt == vect_constant_def
2126 && chains[lane][n].dt == vect_external_def)
2127 dt = vect_external_def;
2128 else if (dt == vect_external_def
2129 && chains[lane][n].dt == vect_constant_def)
2131 else
2132 break;
2134 if (lane != group_size)
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "giving up on chain due to mismatched "
2139 "def types\n");
2140 matches[lane] = false;
2141 if (lane != group_size - 1)
2142 matches[0] = false;
2143 goto out;
2145 if (dt == vect_constant_def
2146 || dt == vect_external_def)
2148 /* Check whether we can build the invariant. If we can't
2149 we never will be able to. */
2150 tree type = TREE_TYPE (chains[0][n].op);
2151 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2152 && (TREE_CODE (type) == BOOLEAN_TYPE
2153 || !can_duplicate_and_interleave_p (vinfo, group_size,
2154 type)))
2156 matches[0] = false;
2157 goto out;
2159 vec<tree> ops;
2160 ops.create (group_size);
2161 for (lane = 0; lane < group_size; ++lane)
2162 ops.quick_push (chains[lane][n].op);
2163 slp_tree child = vect_create_new_slp_node (ops);
2164 SLP_TREE_DEF_TYPE (child) = dt;
2165 children.safe_push (child);
2167 else if (dt != vect_internal_def)
2169 /* Not sure, we might need something special.
2170 gcc.dg/vect/pr96854.c,
2171 gfortran.dg/vect/fast-math-pr37021.f90
2172 and gfortran.dg/vect/pr61171.f trigger. */
2173 /* Soft-fail for now. */
2174 hard_fail = false;
2175 goto out;
2177 else
2179 vec<stmt_vec_info> op_stmts;
2180 op_stmts.create (group_size);
2181 slp_tree child = NULL;
2182 /* Brute-force our way. We have to consider a lane
2183 failing after fixing an earlier fail up in the
2184 SLP discovery recursion. So track the current
2185 permute per lane. */
2186 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2187 memset (perms, 0, sizeof (unsigned) * group_size);
2190 op_stmts.truncate (0);
2191 for (lane = 0; lane < group_size; ++lane)
2192 op_stmts.quick_push
2193 (vinfo->lookup_def (chains[lane][n].op));
2194 child = vect_build_slp_tree (vinfo, op_stmts,
2195 group_size, &this_max_nunits,
2196 matches, limit,
2197 &this_tree_size, bst_map);
2198 /* ??? We're likely getting too many fatal mismatches
2199 here so maybe we want to ignore them (but then we
2200 have no idea which lanes fatally mismatched). */
2201 if (child || !matches[0])
2202 break;
2203 /* Swap another lane we have not yet matched up into
2204 lanes that did not match. If we run out of
2205 permute possibilities for a lane terminate the
2206 search. */
2207 bool term = false;
2208 for (lane = 1; lane < group_size; ++lane)
2209 if (!matches[lane])
2211 if (n + perms[lane] + 1 == chain_len)
2213 term = true;
2214 break;
2216 std::swap (chains[lane][n],
2217 chains[lane][n + perms[lane] + 1]);
2218 perms[lane]++;
2220 if (term)
2221 break;
2223 while (1);
2224 if (!child)
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_NOTE, vect_location,
2228 "failed to match up op %d\n", n);
2229 op_stmts.release ();
2230 if (lane != group_size - 1)
2231 matches[0] = false;
2232 else
2233 matches[lane] = false;
2234 goto out;
2236 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "matched up op %d to\n", n);
2240 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2242 children.safe_push (child);
2245 /* 3. build SLP nodes to combine the chain. */
2246 for (unsigned lane = 0; lane < group_size; ++lane)
2247 if (chains[lane][0].code != code)
2249 /* See if there's any alternate all-PLUS entry. */
2250 unsigned n;
2251 for (n = 1; n < chain_len; ++n)
2253 for (lane = 0; lane < group_size; ++lane)
2254 if (chains[lane][n].code != code)
2255 break;
2256 if (lane == group_size)
2257 break;
2259 if (n != chain_len)
2261 /* Swap that in at first position. */
2262 std::swap (children[0], children[n]);
2263 for (lane = 0; lane < group_size; ++lane)
2264 std::swap (chains[lane][0], chains[lane][n]);
2266 else
2268 /* ??? When this triggers and we end up with two
2269 vect_constant/external_def up-front things break (ICE)
2270 spectacularly finding an insertion place for the
2271 all-constant op. We should have a fully
2272 vect_internal_def operand though(?) so we can swap
2273 that into first place and then prepend the all-zero
2274 constant. */
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_NOTE, vect_location,
2277 "inserting constant zero to compensate "
2278 "for (partially) negated first "
2279 "operand\n");
2280 chain_len++;
2281 for (lane = 0; lane < group_size; ++lane)
2282 chains[lane].safe_insert
2283 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2284 vec<tree> zero_ops;
2285 zero_ops.create (group_size);
2286 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2287 for (lane = 1; lane < group_size; ++lane)
2288 zero_ops.quick_push (zero_ops[0]);
2289 slp_tree zero = vect_create_new_slp_node (zero_ops);
2290 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2291 children.safe_insert (0, zero);
2293 break;
2295 for (unsigned i = 1; i < children.length (); ++i)
2297 slp_tree op0 = children[i - 1];
2298 slp_tree op1 = children[i];
2299 bool this_two_op = false;
2300 for (unsigned lane = 0; lane < group_size; ++lane)
2301 if (chains[lane][i].code != chains[0][i].code)
2303 this_two_op = true;
2304 break;
2306 slp_tree child;
2307 if (i == children.length () - 1)
2308 child = vect_create_new_slp_node (node, stmts, 2);
2309 else
2310 child = vect_create_new_slp_node (2, ERROR_MARK);
2311 if (this_two_op)
2313 vec<std::pair<unsigned, unsigned> > lperm;
2314 lperm.create (group_size);
2315 for (unsigned lane = 0; lane < group_size; ++lane)
2316 lperm.quick_push (std::make_pair
2317 (chains[lane][i].code != chains[0][i].code, lane));
2318 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2319 (chains[0][i].code == code
2320 ? op_stmt_info
2321 : other_op_stmt_info),
2322 (chains[0][i].code == code
2323 ? other_op_stmt_info
2324 : op_stmt_info),
2325 lperm);
2327 else
2329 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2330 SLP_TREE_VECTYPE (child) = vectype;
2331 SLP_TREE_LANES (child) = group_size;
2332 SLP_TREE_CHILDREN (child).quick_push (op0);
2333 SLP_TREE_CHILDREN (child).quick_push (op1);
2334 SLP_TREE_REPRESENTATIVE (child)
2335 = (chains[0][i].code == code
2336 ? op_stmt_info : other_op_stmt_info);
2338 children[i] = child;
2340 *tree_size += this_tree_size + 1;
2341 *max_nunits = this_max_nunits;
2342 while (!chains.is_empty ())
2343 chains.pop ().release ();
2344 return node;
2346 out:
2347 while (!children.is_empty ())
2348 vect_free_slp_tree (children.pop ());
2349 while (!chains.is_empty ())
2350 chains.pop ().release ();
2351 /* Hard-fail, otherwise we might run into quadratic processing of the
2352 chains starting one stmt into the chain again. */
2353 if (hard_fail)
2354 return NULL;
2355 /* Fall thru to normal processing. */
2358 /* Get at the operands, verifying they are compatible. */
2359 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2360 slp_oprnd_info oprnd_info;
2361 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2363 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2364 stmts, i, &oprnds_info);
2365 if (res != 0)
2366 matches[(res == -1) ? 0 : i] = false;
2367 if (!matches[0])
2368 break;
2370 for (i = 0; i < group_size; ++i)
2371 if (!matches[i])
2373 vect_free_oprnd_info (oprnds_info);
2374 return NULL;
2376 swap = NULL;
2378 auto_vec<slp_tree, 4> children;
2380 stmt_info = stmts[0];
2382 /* Create SLP_TREE nodes for the definition node/s. */
2383 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2385 slp_tree child;
2386 unsigned int j;
2388 /* We're skipping certain operands from processing, for example
2389 outer loop reduction initial defs. */
2390 if (skip_args[i])
2392 children.safe_push (NULL);
2393 continue;
2396 if (oprnd_info->first_dt == vect_uninitialized_def)
2398 /* COND_EXPRs eventually have one operand too many if the condition
2399 is an SSA name. */
2400 gcc_assert (i == 3 && nops == 4);
2401 continue;
2404 if (is_a <bb_vec_info> (vinfo)
2405 && oprnd_info->first_dt == vect_internal_def
2406 && !oprnd_info->any_pattern)
2408 /* For BB vectorization, if all defs are the same do not
2409 bother to continue the build along the single-lane
2410 graph but use a splat of the scalar value. */
2411 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2412 for (j = 1; j < group_size; ++j)
2413 if (oprnd_info->def_stmts[j] != first_def)
2414 break;
2415 if (j == group_size
2416 /* But avoid doing this for loads where we may be
2417 able to CSE things, unless the stmt is not
2418 vectorizable. */
2419 && (!STMT_VINFO_VECTORIZABLE (first_def)
2420 || !gimple_vuse (first_def->stmt)))
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE, vect_location,
2424 "Using a splat of the uniform operand %G",
2425 first_def->stmt);
2426 oprnd_info->first_dt = vect_external_def;
2430 if (oprnd_info->first_dt == vect_external_def
2431 || oprnd_info->first_dt == vect_constant_def)
2433 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2434 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2435 oprnd_info->ops = vNULL;
2436 children.safe_push (invnode);
2437 continue;
2440 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2441 group_size, &this_max_nunits,
2442 matches, limit,
2443 &this_tree_size, bst_map)) != NULL)
2445 oprnd_info->def_stmts = vNULL;
2446 children.safe_push (child);
2447 continue;
2450 /* If the SLP build for operand zero failed and operand zero
2451 and one can be commuted try that for the scalar stmts
2452 that failed the match. */
2453 if (i == 0
2454 /* A first scalar stmt mismatch signals a fatal mismatch. */
2455 && matches[0]
2456 /* ??? For COND_EXPRs we can swap the comparison operands
2457 as well as the arms under some constraints. */
2458 && nops == 2
2459 && oprnds_info[1]->first_dt == vect_internal_def
2460 && is_gimple_assign (stmt_info->stmt)
2461 /* Swapping operands for reductions breaks assumptions later on. */
2462 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2463 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2465 /* See whether we can swap the matching or the non-matching
2466 stmt operands. */
2467 bool swap_not_matching = true;
2470 for (j = 0; j < group_size; ++j)
2472 if (matches[j] != !swap_not_matching)
2473 continue;
2474 stmt_vec_info stmt_info = stmts[j];
2475 /* Verify if we can swap operands of this stmt. */
2476 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2477 if (!stmt
2478 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2480 if (!swap_not_matching)
2481 goto fail;
2482 swap_not_matching = false;
2483 break;
2487 while (j != group_size);
2489 /* Swap mismatched definition stmts. */
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_NOTE, vect_location,
2492 "Re-trying with swapped operands of stmts ");
2493 for (j = 0; j < group_size; ++j)
2494 if (matches[j] == !swap_not_matching)
2496 std::swap (oprnds_info[0]->def_stmts[j],
2497 oprnds_info[1]->def_stmts[j]);
2498 std::swap (oprnds_info[0]->ops[j],
2499 oprnds_info[1]->ops[j]);
2500 if (dump_enabled_p ())
2501 dump_printf (MSG_NOTE, "%d ", j);
2503 if (dump_enabled_p ())
2504 dump_printf (MSG_NOTE, "\n");
2505 /* After swapping some operands we lost track of whether an
2506 operand has any pattern defs so be conservative here. */
2507 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2508 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2509 /* And try again with scratch 'matches' ... */
2510 bool *tem = XALLOCAVEC (bool, group_size);
2511 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2512 group_size, &this_max_nunits,
2513 tem, limit,
2514 &this_tree_size, bst_map)) != NULL)
2516 oprnd_info->def_stmts = vNULL;
2517 children.safe_push (child);
2518 continue;
2521 fail:
2523 /* If the SLP build failed and we analyze a basic-block
2524 simply treat nodes we fail to build as externally defined
2525 (and thus build vectors from the scalar defs).
2526 The cost model will reject outright expensive cases.
2527 ??? This doesn't treat cases where permutation ultimately
2528 fails (or we don't try permutation below). Ideally we'd
2529 even compute a permutation that will end up with the maximum
2530 SLP tree size... */
2531 if (is_a <bb_vec_info> (vinfo)
2532 /* ??? Rejecting patterns this way doesn't work. We'd have to
2533 do extra work to cancel the pattern so the uses see the
2534 scalar version. */
2535 && !is_pattern_stmt_p (stmt_info)
2536 && !oprnd_info->any_pattern)
2538 /* But if there's a leading vector sized set of matching stmts
2539 fail here so we can split the group. This matches the condition
2540 vect_analyze_slp_instance uses. */
2541 /* ??? We might want to split here and combine the results to support
2542 multiple vector sizes better. */
2543 for (j = 0; j < group_size; ++j)
2544 if (!matches[j])
2545 break;
2546 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE, vect_location,
2550 "Building vector operands from scalars\n");
2551 this_tree_size++;
2552 child = vect_create_new_slp_node (oprnd_info->ops);
2553 children.safe_push (child);
2554 oprnd_info->ops = vNULL;
2555 continue;
2559 gcc_assert (child == NULL);
2560 FOR_EACH_VEC_ELT (children, j, child)
2561 if (child)
2562 vect_free_slp_tree (child);
2563 vect_free_oprnd_info (oprnds_info);
2564 return NULL;
2567 vect_free_oprnd_info (oprnds_info);
2569 /* If all children of a node are built up from uniform scalars, or if
2570 building the node requires more than one possibly expensive vector
2571 construction, just throw it away and cause it to be built from scalars.
2572 The exception is the SLP node for the vector store. */
2573 if (is_a <bb_vec_info> (vinfo)
2574 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2575 /* ??? Rejecting patterns this way doesn't work. We'd have to
2576 do extra work to cancel the pattern so the uses see the
2577 scalar version. */
2578 && !is_pattern_stmt_p (stmt_info))
2580 slp_tree child;
2581 unsigned j;
2582 bool all_uniform_p = true;
2583 unsigned n_vector_builds = 0;
2584 FOR_EACH_VEC_ELT (children, j, child)
2586 if (!child)
2588 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2589 all_uniform_p = false;
2590 else if (!vect_slp_tree_uniform_p (child))
2592 all_uniform_p = false;
2593 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2594 n_vector_builds++;
2597 if (all_uniform_p
2598 || n_vector_builds > 1
2599 || (n_vector_builds == children.length ()
2600 && is_a <gphi *> (stmt_info->stmt)))
2602 /* Roll back. */
2603 matches[0] = false;
2604 FOR_EACH_VEC_ELT (children, j, child)
2605 if (child)
2606 vect_free_slp_tree (child);
2608 if (dump_enabled_p ())
2609 dump_printf_loc (MSG_NOTE, vect_location,
2610 "Building parent vector operands from "
2611 "scalars instead\n");
2612 return NULL;
2616 *tree_size += this_tree_size + 1;
2617 *max_nunits = this_max_nunits;
2619 if (two_operators)
2621 /* ??? We'd likely want to either cache in bst_map something like
2622 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2623 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2624 explicit stmts to put in so the keying on 'stmts' doesn't
2625 work (but we have the same issue with nodes that use 'ops'). */
2626 slp_tree one = new _slp_tree;
2627 slp_tree two = new _slp_tree;
2628 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2629 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2630 SLP_TREE_VECTYPE (one) = vectype;
2631 SLP_TREE_VECTYPE (two) = vectype;
2632 SLP_TREE_CHILDREN (one).safe_splice (children);
2633 SLP_TREE_CHILDREN (two).safe_splice (children);
2634 slp_tree child;
2635 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2636 SLP_TREE_REF_COUNT (child)++;
2638 /* Here we record the original defs since this
2639 node represents the final lane configuration. */
2640 node = vect_create_new_slp_node (node, stmts, 2);
2641 SLP_TREE_VECTYPE (node) = vectype;
2642 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2643 SLP_TREE_CHILDREN (node).quick_push (one);
2644 SLP_TREE_CHILDREN (node).quick_push (two);
2645 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2646 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2647 enum tree_code ocode = ERROR_MARK;
2648 stmt_vec_info ostmt_info;
2649 unsigned j = 0;
2650 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2652 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2653 if (gimple_assign_rhs_code (ostmt) != code0)
2655 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2656 ocode = gimple_assign_rhs_code (ostmt);
2657 j = i;
2659 else
2660 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2662 SLP_TREE_CODE (one) = code0;
2663 SLP_TREE_CODE (two) = ocode;
2664 SLP_TREE_LANES (one) = stmts.length ();
2665 SLP_TREE_LANES (two) = stmts.length ();
2666 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2667 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2668 return node;
2671 node = vect_create_new_slp_node (node, stmts, nops);
2672 SLP_TREE_VECTYPE (node) = vectype;
2673 SLP_TREE_CHILDREN (node).splice (children);
2674 return node;
2677 /* Dump a single SLP tree NODE. */
2679 static void
2680 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2681 slp_tree node)
2683 unsigned i, j;
2684 slp_tree child;
2685 stmt_vec_info stmt_info;
2686 tree op;
2688 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2689 dump_user_location_t user_loc = loc.get_user_location ();
2690 dump_printf_loc (metadata, user_loc,
2691 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2692 ", refcnt=%u)",
2693 SLP_TREE_DEF_TYPE (node) == vect_external_def
2694 ? " (external)"
2695 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2696 ? " (constant)"
2697 : ""), (void *) node,
2698 estimated_poly_value (node->max_nunits),
2699 SLP_TREE_REF_COUNT (node));
2700 if (SLP_TREE_VECTYPE (node))
2701 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2702 dump_printf (metadata, "\n");
2703 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2705 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2706 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2707 else
2708 dump_printf_loc (metadata, user_loc, "op template: %G",
2709 SLP_TREE_REPRESENTATIVE (node)->stmt);
2711 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2712 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2713 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2714 else
2716 dump_printf_loc (metadata, user_loc, "\t{ ");
2717 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2718 dump_printf (metadata, "%T%s ", op,
2719 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2720 dump_printf (metadata, "}\n");
2722 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2724 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2725 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2726 dump_printf (dump_kind, " %u", j);
2727 dump_printf (dump_kind, " }\n");
2729 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2731 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2732 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2733 dump_printf (dump_kind, " %u[%u]",
2734 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2735 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2736 dump_printf (dump_kind, " }\n");
2738 if (SLP_TREE_CHILDREN (node).is_empty ())
2739 return;
2740 dump_printf_loc (metadata, user_loc, "\tchildren");
2741 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2742 dump_printf (dump_kind, " %p", (void *)child);
2743 dump_printf (dump_kind, "\n");
2746 DEBUG_FUNCTION void
2747 debug (slp_tree node)
2749 debug_dump_context ctx;
2750 vect_print_slp_tree (MSG_NOTE,
2751 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2752 node);
2755 /* Recursive helper for the dot producer below. */
2757 static void
2758 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2760 if (visited.add (node))
2761 return;
2763 fprintf (f, "\"%p\" [label=\"", (void *)node);
2764 vect_print_slp_tree (MSG_NOTE,
2765 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2766 node);
2767 fprintf (f, "\"];\n");
2770 for (slp_tree child : SLP_TREE_CHILDREN (node))
2771 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2773 for (slp_tree child : SLP_TREE_CHILDREN (node))
2774 if (child)
2775 dot_slp_tree (f, child, visited);
2778 DEBUG_FUNCTION void
2779 dot_slp_tree (const char *fname, slp_tree node)
2781 FILE *f = fopen (fname, "w");
2782 fprintf (f, "digraph {\n");
2783 fflush (f);
2785 debug_dump_context ctx (f);
2786 hash_set<slp_tree> visited;
2787 dot_slp_tree (f, node, visited);
2789 fflush (f);
2790 fprintf (f, "}\n");
2791 fclose (f);
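/* Example use from a debugging session (file name picked arbitrarily):

     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)

   followed by rendering the emitted digraph with Graphviz, e.g.
   "dot -Tsvg /tmp/slp.dot -o /tmp/slp.svg".  Each SLP node becomes a
   vertex labelled with its textual dump, with edges to its children.  */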
2794 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2796 static void
2797 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2798 slp_tree node, hash_set<slp_tree> &visited)
2800 unsigned i;
2801 slp_tree child;
2803 if (visited.add (node))
2804 return;
2806 vect_print_slp_tree (dump_kind, loc, node);
2808 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2809 if (child)
2810 vect_print_slp_graph (dump_kind, loc, child, visited);
2813 static void
2814 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2815 slp_tree entry)
2817 hash_set<slp_tree> visited;
2818 vect_print_slp_graph (dump_kind, loc, entry, visited);
2821 /* Mark the tree rooted at NODE with PURE_SLP. */
2823 static void
2824 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2826 int i;
2827 stmt_vec_info stmt_info;
2828 slp_tree child;
2830 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2831 return;
2833 if (visited.add (node))
2834 return;
2836 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2837 STMT_SLP_TYPE (stmt_info) = pure_slp;
2839 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2840 if (child)
2841 vect_mark_slp_stmts (child, visited);
2844 static void
2845 vect_mark_slp_stmts (slp_tree node)
2847 hash_set<slp_tree> visited;
2848 vect_mark_slp_stmts (node, visited);
2851 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2853 static void
2854 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2856 int i;
2857 stmt_vec_info stmt_info;
2858 slp_tree child;
2860 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2861 return;
2863 if (visited.add (node))
2864 return;
2866 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2868 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2869 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2870 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2873 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2874 if (child)
2875 vect_mark_slp_stmts_relevant (child, visited);
2878 static void
2879 vect_mark_slp_stmts_relevant (slp_tree node)
2881 hash_set<slp_tree> visited;
2882 vect_mark_slp_stmts_relevant (node, visited);
2886 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2888 static void
2889 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2890 hash_set<slp_tree> &visited)
2892 if (!node || visited.add (node))
2893 return;
2895 if (SLP_TREE_CHILDREN (node).length () == 0)
2897 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2898 return;
2899 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2900 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2901 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2902 loads.safe_push (node);
2904 else
2906 unsigned i;
2907 slp_tree child;
2908 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2909 vect_gather_slp_loads (loads, child, visited);
2914 /* Find the last scalar stmt in NODE. */
2916 stmt_vec_info
2917 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2919 stmt_vec_info last = NULL;
2920 stmt_vec_info stmt_vinfo;
2922 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2924 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2925 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2928 return last;
2931 /* Find the first stmt in NODE. */
2933 stmt_vec_info
2934 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2936 stmt_vec_info first = NULL;
2937 stmt_vec_info stmt_vinfo;
2939 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2941 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2942 if (!first
2943 || get_later_stmt (stmt_vinfo, first) == first)
2944 first = stmt_vinfo;
2947 return first;
2950 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2951 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2952 (also containing the first GROUP1_SIZE stmts, since stores are
2953 consecutive), the second containing the remainder.
2954 Return the first stmt in the second group. */
2956 static stmt_vec_info
2957 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2959 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2960 gcc_assert (group1_size > 0);
2961 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2962 gcc_assert (group2_size > 0);
2963 DR_GROUP_SIZE (first_vinfo) = group1_size;
2965 stmt_vec_info stmt_info = first_vinfo;
2966 for (unsigned i = group1_size; i > 1; i--)
2968 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2969 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2971 /* STMT is now the last element of the first group. */
2972 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2973 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2975 DR_GROUP_SIZE (group2) = group2_size;
2976 for (stmt_info = group2; stmt_info;
2977 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2979 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2980 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2983 /* For the second group, the DR_GROUP_GAP is that before the original group,
2984 plus skipping over the first vector. */
2985 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2987 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2988 DR_GROUP_GAP (first_vinfo) += group2_size;
2990 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2992 group1_size, group2_size);
2994 return group2;
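/* Worked example of the bookkeeping above (hypothetical numbers): for a
   store group of size 6 with DR_GROUP_GAP (FIRST_VINFO) == 0, splitting
   with GROUP1_SIZE == 4 leaves groups of sizes 4 and 2 with
   DR_GROUP_GAP (group2) == 0 + 4 and, afterwards,
   DR_GROUP_GAP (first_vinfo) == 0 + 2.  */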
2997 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2998 statements and a vector of NUNITS elements. */
3000 static poly_uint64
3001 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3003 return exact_div (common_multiple (nunits, group_size), group_size);
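/* Worked example (hypothetical sizes): with NUNITS == 4 and GROUP_SIZE == 6
   this is common_multiple (4, 6) / 6 == 12 / 6 == 2, i.e. two copies of the
   group are needed to fill whole vectors; with GROUP_SIZE == 8 the factor
   is 8 / 8 == 1 and no unrolling is necessary.  */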
3006 /* Helper that checks to see if a node is a load node. */
3008 static inline bool
3009 vect_is_slp_load_node (slp_tree root)
3011 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3012 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3013 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3017 /* Helper function of optimize_load_redistribution that performs the operation
3018 recursively. */
3020 static slp_tree
3021 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3022 vec_info *vinfo, unsigned int group_size,
3023 hash_map<slp_tree, slp_tree> *load_map,
3024 slp_tree root)
3026 if (slp_tree *leader = load_map->get (root))
3027 return *leader;
3029 slp_tree node;
3030 unsigned i;
3032 /* For now, we don't know anything about externals so do not do anything. */
3033 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3034 return NULL;
3035 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3037 /* First convert this node into a load node and add it to the leaves
3038 list and flatten the permute from a lane to a load one. If it's
3039 unneeded it will be elided later. */
3040 vec<stmt_vec_info> stmts;
3041 stmts.create (SLP_TREE_LANES (root));
3042 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3043 for (unsigned j = 0; j < lane_perm.length (); j++)
3045 std::pair<unsigned, unsigned> perm = lane_perm[j];
3046 node = SLP_TREE_CHILDREN (root)[perm.first];
3048 if (!vect_is_slp_load_node (node)
3049 || SLP_TREE_CHILDREN (node).exists ())
3051 stmts.release ();
3052 goto next;
3055 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 "converting stmts on permute node %p\n",
3061 (void *) root);
3063 bool *matches = XALLOCAVEC (bool, group_size);
3064 poly_uint64 max_nunits = 1;
3065 unsigned tree_size = 0, limit = 1;
3066 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3067 matches, &limit, &tree_size, bst_map);
3068 if (!node)
3069 stmts.release ();
3071 load_map->put (root, node);
3072 return node;
3075 next:
3076 load_map->put (root, NULL);
3078 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3080 slp_tree value
3081 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3082 node);
3083 if (value)
3085 SLP_TREE_REF_COUNT (value)++;
3086 SLP_TREE_CHILDREN (root)[i] = value;
3087 /* ??? We know the original leaves of the replaced nodes will
3088 be referenced by bst_map, only the permutes created by
3089 pattern matching are not. */
3090 if (SLP_TREE_REF_COUNT (node) == 1)
3091 load_map->remove (node);
3092 vect_free_slp_tree (node);
3096 return NULL;
3099 /* Temporary workaround for loads not being CSEd during SLP build. This
3100 function will traverse the SLP tree rooted in ROOT and find
3101 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3102 same DR such that the final operation is equal to a permuted load. Such
3103 NODES are then directly converted into LOADS themselves. The nodes are
3104 CSEd using BST_MAP. */
3106 static void
3107 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3108 vec_info *vinfo, unsigned int group_size,
3109 hash_map<slp_tree, slp_tree> *load_map,
3110 slp_tree root)
3112 slp_tree node;
3113 unsigned i;
3115 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3117 slp_tree value
3118 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3119 node);
3120 if (value)
3122 SLP_TREE_REF_COUNT (value)++;
3123 SLP_TREE_CHILDREN (root)[i] = value;
3124 /* ??? We know the original leaves of the replaced nodes will
3125 be referenced by bst_map, only the permutes created by
3126 pattern matching are not. */
3127 if (SLP_TREE_REF_COUNT (node) == 1)
3128 load_map->remove (node);
3129 vect_free_slp_tree (node);
3134 /* Helper function of vect_match_slp_patterns.
3136 Attempts to match patterns against the slp tree rooted in REF_NODE using
3137 VINFO. Patterns are matched in post-order traversal.
3139 If matching is successful the value in REF_NODE is updated and true is
3140 returned, otherwise REF_NODE is left unchanged and false is returned. */
3142 static bool
3143 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3144 slp_tree_to_load_perm_map_t *perm_cache,
3145 slp_compat_nodes_map_t *compat_cache,
3146 hash_set<slp_tree> *visited)
3148 unsigned i;
3149 slp_tree node = *ref_node;
3150 bool found_p = false;
3151 if (!node || visited->add (node))
3152 return false;
3154 slp_tree child;
3155 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3156 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3157 vinfo, perm_cache, compat_cache,
3158 visited);
3160 for (unsigned x = 0; x < num__slp_patterns; x++)
3162 vect_pattern *pattern
3163 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3164 if (pattern)
3166 pattern->build (vinfo);
3167 delete pattern;
3168 found_p = true;
3172 return found_p;
3175 /* Applies pattern matching to the SLP tree of INSTANCE using
3176 vec_info VINFO.
3178 Returns true if any pattern matched; the tree of INSTANCE is updated
3179 in place. Patterns are tried in order and multiple patterns may match. */
3181 static bool
3182 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3183 hash_set<slp_tree> *visited,
3184 slp_tree_to_load_perm_map_t *perm_cache,
3185 slp_compat_nodes_map_t *compat_cache)
3187 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3188 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3190 if (dump_enabled_p ())
3191 dump_printf_loc (MSG_NOTE, vect_location,
3192 "Analyzing SLP tree %p for patterns\n",
3193 (void *) SLP_INSTANCE_TREE (instance));
3195 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3196 visited);
3199 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3200 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3201 Return true if we could use IFN_STORE_LANES instead and if that appears
3202 to be the better approach. */
3204 static bool
3205 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3206 unsigned int group_size,
3207 unsigned int new_group_size)
3209 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3210 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3211 if (!vectype)
3212 return false;
3213 /* Allow the split if one of the two new groups would operate on full
3214 vectors *within* rather than across one scalar loop iteration.
3215 This is purely a heuristic, but it should work well for group
3216 sizes of 3 and 4, where the possible splits are:
3218 3->2+1: OK if the vector has exactly two elements
3219 4->2+2: Likewise
3220 4->3+1: Less clear-cut. */
3221 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3222 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3223 return false;
3224 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
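/* Worked example (hypothetically two-element vectors): splitting a group
   of 4 as 3+1 leaves 1 and 3, neither a multiple of the 2 subparts, so
   store-lanes is preferred iff vect_store_lanes_supported reports an IFN
   for the whole group; splitting 4 as 2+2 satisfies the multiple_p check
   and the split is allowed (false is returned).  */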
3227 /* Analyze an SLP instance starting from a group of grouped stores. Call
3228 vect_build_slp_tree to build a tree of packed stmts if possible.
3229 Return FALSE if it's impossible to SLP any stmt in the loop. */
3231 static bool
3232 vect_analyze_slp_instance (vec_info *vinfo,
3233 scalar_stmts_to_slp_tree_map_t *bst_map,
3234 stmt_vec_info stmt_info, slp_instance_kind kind,
3235 unsigned max_tree_size, unsigned *limit);
3237 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3238 of KIND. Return true if successful. */
3240 static bool
3241 vect_build_slp_instance (vec_info *vinfo,
3242 slp_instance_kind kind,
3243 vec<stmt_vec_info> &scalar_stmts,
3244 vec<stmt_vec_info> &root_stmt_infos,
3245 vec<tree> &remain,
3246 unsigned max_tree_size, unsigned *limit,
3247 scalar_stmts_to_slp_tree_map_t *bst_map,
3248 /* ??? We need stmt_info for group splitting. */
3249 stmt_vec_info stmt_info_)
3251 if (kind == slp_inst_kind_ctor)
3253 if (dump_enabled_p ())
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "Analyzing vectorizable constructor: %G\n",
3256 root_stmt_infos[0]->stmt);
3259 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Starting SLP discovery for\n");
3263 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3264 dump_printf_loc (MSG_NOTE, vect_location,
3265 " %G", scalar_stmts[i]->stmt);
3268 /* When a BB reduction doesn't have an even number of lanes
3269 strip it down, treating the remaining lane as scalar.
3270 ??? Selecting the optimal set of lanes to vectorize would be nice
3271 but SLP build for all lanes will fail quickly because we think
3272 we're going to need unrolling. */
3273 if (kind == slp_inst_kind_bb_reduc
3274 && (scalar_stmts.length () & 1))
3275 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
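/* For illustration (hypothetical 5-lane BB reduction): the last scalar
   stmt is popped and its lhs recorded at the front of REMAIN, so SLP
   discovery runs on the even 4-lane group while the leftover def is kept
   to be handled as a scalar, per the comment above.  */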
3277 /* Build the tree for the SLP instance. */
3278 unsigned int group_size = scalar_stmts.length ();
3279 bool *matches = XALLOCAVEC (bool, group_size);
3280 poly_uint64 max_nunits = 1;
3281 unsigned tree_size = 0;
3282 unsigned i;
3283 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3284 &max_nunits, matches, limit,
3285 &tree_size, bst_map);
3286 if (node != NULL)
3288 /* Calculate the unrolling factor based on the smallest type. */
3289 poly_uint64 unrolling_factor
3290 = calculate_unrolling_factor (max_nunits, group_size);
3292 if (maybe_ne (unrolling_factor, 1U)
3293 && is_a <bb_vec_info> (vinfo))
3295 unsigned HOST_WIDE_INT const_max_nunits;
3296 if (!max_nunits.is_constant (&const_max_nunits)
3297 || const_max_nunits > group_size)
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "Build SLP failed: store group "
3302 "size not a multiple of the vector size "
3303 "in basic block SLP\n");
3304 vect_free_slp_tree (node);
3305 return false;
3307 /* Fatal mismatch. */
3308 if (dump_enabled_p ())
3309 dump_printf_loc (MSG_NOTE, vect_location,
3310 "SLP discovery succeeded but node needs "
3311 "splitting\n");
3312 memset (matches, true, group_size);
3313 matches[group_size / const_max_nunits * const_max_nunits] = false;
3314 vect_free_slp_tree (node);
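/* Worked example (hypothetical sizes): with GROUP_SIZE == 7 and
   CONST_MAX_NUNITS == 4 the index 7 / 4 * 4 == 4 is marked as mismatched
   above, so the splitting code below breaks the group after the first
   full vector's worth of lanes.  */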
3316 else
3318 /* Create a new SLP instance. */
3319 slp_instance new_instance = XNEW (class _slp_instance);
3320 SLP_INSTANCE_TREE (new_instance) = node;
3321 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3322 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3323 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3324 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3325 SLP_INSTANCE_KIND (new_instance) = kind;
3326 new_instance->reduc_phis = NULL;
3327 new_instance->cost_vec = vNULL;
3328 new_instance->subgraph_entries = vNULL;
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 "SLP size %u vs. limit %u.\n",
3333 tree_size, max_tree_size);
3335 /* Fixup SLP reduction chains. */
3336 if (kind == slp_inst_kind_reduc_chain)
3338 /* If this is a reduction chain with a conversion in front
3339 amend the SLP tree with a node for that. */
3340 gimple *scalar_def
3341 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3342 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3344 /* Get at the conversion stmt - we know it's the single use
3345 of the last stmt of the reduction chain. */
3346 use_operand_p use_p;
3347 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3348 &use_p, &scalar_def);
3349 gcc_assert (r);
3350 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3351 next_info = vect_stmt_to_vectorize (next_info);
3352 scalar_stmts = vNULL;
3353 scalar_stmts.create (group_size);
3354 for (unsigned i = 0; i < group_size; ++i)
3355 scalar_stmts.quick_push (next_info);
3356 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3357 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3358 SLP_TREE_CHILDREN (conv).quick_push (node);
3359 SLP_INSTANCE_TREE (new_instance) = conv;
3360 /* We also have to fake this conversion stmt as SLP reduction
3361 group so we don't have to mess with too much code
3362 elsewhere. */
3363 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3364 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3366 /* Fill the backedge child of the PHI SLP node. The
3367 general matching code cannot find it because the
3368 scalar code does not reflect how we vectorize the
3369 reduction. */
3370 use_operand_p use_p;
3371 imm_use_iterator imm_iter;
3372 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3373 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3374 gimple_get_lhs (scalar_def))
3375 /* There are exactly two non-debug uses, the reduction
3376 PHI and the loop-closed PHI node. */
3377 if (!is_gimple_debug (USE_STMT (use_p))
3378 && gimple_bb (USE_STMT (use_p)) == loop->header)
3380 auto_vec<stmt_vec_info, 64> phis (group_size);
3381 stmt_vec_info phi_info
3382 = vinfo->lookup_stmt (USE_STMT (use_p));
3383 for (unsigned i = 0; i < group_size; ++i)
3384 phis.quick_push (phi_info);
3385 slp_tree *phi_node = bst_map->get (phis);
3386 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3387 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3388 = SLP_INSTANCE_TREE (new_instance);
3389 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3393 vinfo->slp_instances.safe_push (new_instance);
3395 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3396 the number of scalar stmts in the root in a few places.
3397 Verify that assumption holds. */
3398 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3399 .length () == group_size);
3401 if (dump_enabled_p ())
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "Final SLP tree for instance %p:\n",
3405 (void *) new_instance);
3406 vect_print_slp_graph (MSG_NOTE, vect_location,
3407 SLP_INSTANCE_TREE (new_instance));
3410 return true;
3413 else
3415 /* Failed to SLP. */
3416 /* Free the allocated memory. */
3417 scalar_stmts.release ();
3420 stmt_vec_info stmt_info = stmt_info_;
3421 /* Try to break the group up into pieces. */
3422 if (kind == slp_inst_kind_store)
3424 /* ??? We could delay all the actual splitting of store-groups
3425 until after SLP discovery of the original group completed.
3426 Then we can recurse to vect_build_slp_instance directly. */
3427 for (i = 0; i < group_size; i++)
3428 if (!matches[i])
3429 break;
3431 /* For basic block SLP, try to break the group up into multiples of
3432 a vector size. */
3433 if (is_a <bb_vec_info> (vinfo)
3434 && (i > 1 && i < group_size))
3436 tree scalar_type
3437 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3438 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3439 1 << floor_log2 (i));
3440 unsigned HOST_WIDE_INT const_nunits;
3441 if (vectype
3442 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3444 /* Split into two groups at the first vector boundary. */
3445 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3446 unsigned group1_size = i & ~(const_nunits - 1);
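/* E.g. (hypothetical numbers) with i == 5 matching stmts and
   const_nunits == 4 this rounds down to group1_size == 4, the largest
   whole-vector prefix.  */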
3448 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location,
3450 "Splitting SLP group at stmt %u\n", i);
3451 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3452 group1_size);
3453 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3454 kind, max_tree_size,
3455 limit);
3456 /* Split the rest at the failure point and possibly
3457 re-analyze the remaining matching part if it has
3458 at least two lanes. */
3459 if (group1_size < i
3460 && (i + 1 < group_size
3461 || i - group1_size > 1))
3463 stmt_vec_info rest2 = rest;
3464 rest = vect_split_slp_store_group (rest, i - group1_size);
3465 if (i - group1_size > 1)
3466 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3467 kind, max_tree_size,
3468 limit);
3470 /* Re-analyze the non-matching tail if it has at least
3471 two lanes. */
3472 if (i + 1 < group_size)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map,
3474 rest, kind, max_tree_size,
3475 limit);
3476 return res;
3480 /* For loop vectorization split into arbitrary pieces of size > 1. */
3481 if (is_a <loop_vec_info> (vinfo)
3482 && (i > 1 && i < group_size)
3483 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3485 unsigned group1_size = i;
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "Splitting SLP group at stmt %u\n", i);
3491 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3492 group1_size);
3493 /* Loop vectorization cannot handle gaps in stores, make sure
3494 the split group appears as strided. */
3495 STMT_VINFO_STRIDED_P (rest) = 1;
3496 DR_GROUP_GAP (rest) = 0;
3497 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3498 DR_GROUP_GAP (stmt_info) = 0;
3500 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3501 kind, max_tree_size, limit);
3502 if (i + 1 < group_size)
3503 res |= vect_analyze_slp_instance (vinfo, bst_map,
3504 rest, kind, max_tree_size, limit);
3506 return res;
3509 /* Even though the first vector did not all match, we might be able to SLP
3510 (some) of the remainder. FORNOW ignore this possibility. */
3513 /* Failed to SLP. */
3514 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3516 return false;
3520 /* Analyze an SLP instance starting from a group of grouped stores. Call
3521 vect_build_slp_tree to build a tree of packed stmts if possible.
3522 Return FALSE if it's impossible to SLP any stmt in the loop. */
3524 static bool
3525 vect_analyze_slp_instance (vec_info *vinfo,
3526 scalar_stmts_to_slp_tree_map_t *bst_map,
3527 stmt_vec_info stmt_info,
3528 slp_instance_kind kind,
3529 unsigned max_tree_size, unsigned *limit)
3531 unsigned int i;
3532 vec<stmt_vec_info> scalar_stmts;
3534 if (is_a <bb_vec_info> (vinfo))
3535 vect_location = stmt_info->stmt;
3537 stmt_vec_info next_info = stmt_info;
3538 if (kind == slp_inst_kind_store)
3540 /* Collect the stores and store them in scalar_stmts. */
3541 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3542 while (next_info)
3544 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3545 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3548 else if (kind == slp_inst_kind_reduc_chain)
3550 /* Collect the reduction stmts and store them in scalar_stmts. */
3551 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3552 while (next_info)
3554 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3555 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3557 /* Mark the first element of the reduction chain as reduction to properly
3558 transform the node. In the reduction analysis phase only the last
3559 element of the chain is marked as reduction. */
3560 STMT_VINFO_DEF_TYPE (stmt_info)
3561 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3562 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3563 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3565 else if (kind == slp_inst_kind_reduc_group)
3567 /* Collect reduction statements. */
3568 const vec<stmt_vec_info> &reductions
3569 = as_a <loop_vec_info> (vinfo)->reductions;
3570 scalar_stmts.create (reductions.length ());
3571 for (i = 0; reductions.iterate (i, &next_info); i++)
3572 if ((STMT_VINFO_RELEVANT_P (next_info)
3573 || STMT_VINFO_LIVE_P (next_info))
3574 /* ??? Make sure we didn't skip a conversion around a reduction
3575 path. In that case we'd have to reverse engineer that conversion
3576 stmt following the chain using reduc_idx and from the PHI
3577 using reduc_def. */
3578 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3579 scalar_stmts.quick_push (next_info);
3580 /* If less than two were relevant/live there's nothing to SLP. */
3581 if (scalar_stmts.length () < 2)
3582 return false;
3584 else
3585 gcc_unreachable ();
3587 vec<stmt_vec_info> roots = vNULL;
3588 vec<tree> remain = vNULL;
3589 /* Build the tree for the SLP instance. */
3590 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3591 roots, remain,
3592 max_tree_size, limit, bst_map,
3593 kind == slp_inst_kind_store
3594 ? stmt_info : NULL);
3596 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3597 where we should do store group splitting. */
3599 return res;
3602 /* Check if there are stmts in the loop that can be vectorized using SLP.
3603 Build SLP trees of packed scalar stmts if SLP is possible. */
3605 opt_result
3606 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3608 unsigned int i;
3609 stmt_vec_info first_element;
3610 slp_instance instance;
3612 DUMP_VECT_SCOPE ("vect_analyze_slp");
3614 unsigned limit = max_tree_size;
3616 scalar_stmts_to_slp_tree_map_t *bst_map
3617 = new scalar_stmts_to_slp_tree_map_t ();
3619 /* Find SLP sequences starting from groups of grouped stores. */
3620 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3621 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3622 slp_inst_kind_store, max_tree_size, &limit);
3624 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3626 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3628 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3629 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3630 bb_vinfo->roots[i].stmts,
3631 bb_vinfo->roots[i].roots,
3632 bb_vinfo->roots[i].remain,
3633 max_tree_size, &limit, bst_map, NULL))
3635 bb_vinfo->roots[i].stmts = vNULL;
3636 bb_vinfo->roots[i].roots = vNULL;
3637 bb_vinfo->roots[i].remain = vNULL;
3642 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3644 /* Find SLP sequences starting from reduction chains. */
3645 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3646 if (! STMT_VINFO_RELEVANT_P (first_element)
3647 && ! STMT_VINFO_LIVE_P (first_element))
3649 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3650 slp_inst_kind_reduc_chain,
3651 max_tree_size, &limit))
3653 /* Dissolve reduction chain group. */
3654 stmt_vec_info vinfo = first_element;
3655 stmt_vec_info last = NULL;
3656 while (vinfo)
3658 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3659 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3660 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3661 last = vinfo;
3662 vinfo = next;
3664 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3665 /* It can still be vectorized as part of an SLP reduction. */
3666 loop_vinfo->reductions.safe_push (last);
3669 /* Find SLP sequences starting from groups of reductions. */
3670 if (loop_vinfo->reductions.length () > 1)
3671 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3672 slp_inst_kind_reduc_group, max_tree_size,
3673 &limit);
3676 hash_set<slp_tree> visited_patterns;
3677 slp_tree_to_load_perm_map_t perm_cache;
3678 slp_compat_nodes_map_t compat_cache;
3680 /* See if any patterns can be found in the SLP tree. */
3681 bool pattern_found = false;
3682 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3683 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3684 &visited_patterns, &perm_cache,
3685 &compat_cache);
3687 /* If any were found optimize permutations of loads. */
3688 if (pattern_found)
3690 hash_map<slp_tree, slp_tree> load_map;
3691 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3693 slp_tree root = SLP_INSTANCE_TREE (instance);
3694 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3695 &load_map, root);
3701 /* The map keeps a reference on the SLP nodes built; release that. */
3702 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3703 it != bst_map->end (); ++it)
3704 if ((*it).second)
3705 vect_free_slp_tree ((*it).second);
3706 delete bst_map;
3708 if (pattern_found && dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "Pattern matched SLP tree\n");
3712 hash_set<slp_tree> visited;
3713 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3714 vect_print_slp_graph (MSG_NOTE, vect_location,
3715 SLP_INSTANCE_TREE (instance), visited);
3718 return opt_result::success ();
3721 /* Estimates the cost of inserting layout changes into the SLP graph.
3722 It can also say that the insertion is impossible. */
3724 struct slpg_layout_cost
3726 slpg_layout_cost () = default;
3727 slpg_layout_cost (sreal, bool);
3729 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3730 bool is_possible () const { return depth != sreal::max (); }
3732 bool operator== (const slpg_layout_cost &) const;
3733 bool operator!= (const slpg_layout_cost &) const;
3735 bool is_better_than (const slpg_layout_cost &, bool) const;
3737 void add_parallel_cost (const slpg_layout_cost &);
3738 void add_serial_cost (const slpg_layout_cost &);
3739 void split (unsigned int);
3741 /* The longest sequence of layout changes needed during any traversal
3742 of the partition dag, weighted by execution frequency.
3744 This is the most important metric when optimizing for speed, since
3745 it helps to ensure that we keep the number of operations on
3746 critical paths to a minimum. */
3747 sreal depth = 0;
3749 /* An estimate of the total number of operations needed. It is weighted by
3750 execution frequency when optimizing for speed but not when optimizing for
3751 size. In order to avoid double-counting, a node with a fanout of N will
3752 distribute 1/N of its total cost to each successor.
3754 This is the most important metric when optimizing for size, since
3755 it helps to keep the total number of operations to a minimum. */
3756 sreal total = 0;
3759 /* Construct costs for a node with weight WEIGHT. A higher weight
3760 indicates more frequent execution. IS_FOR_SIZE is true if we are
3761 optimizing for size rather than speed. */
3763 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3764 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3768 bool
3769 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3771 return depth == other.depth && total == other.total;
3774 bool
3775 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3777 return !operator== (other);
3780 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3781 true if we are optimizing for size rather than speed. */
3783 bool
3784 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3785 bool is_for_size) const
3787 if (is_for_size)
3789 if (total != other.total)
3790 return total < other.total;
3791 return depth < other.depth;
3793 else
3795 if (depth != other.depth)
3796 return depth < other.depth;
3797 return total < other.total;
3801 /* Increase the costs to account for something with cost INPUT_COST
3802 happening in parallel with the current costs. */
3804 void
3805 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3807 depth = std::max (depth, input_cost.depth);
3808 total += input_cost.total;
3811 /* Increase the costs to account for something with cost INPUT_COST
3812 happening in series with the current costs. */
3814 void
3815 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3817 depth += other.depth;
3818 total += other.total;
3821 /* Split the total cost among TIMES successors or predecessors. */
3823 void
3824 slpg_layout_cost::split (unsigned int times)
3826 if (times > 1)
3827 total /= times;
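/* To illustrate how the two metrics combine, below is a minimal standalone
   sketch of the same cost algebra, using double instead of sreal and
   hypothetical "toy_" names: serial composition adds both metrics, parallel
   composition adds the totals but takes the maximum depth, and splitting
   divides the total among several consumers.  */

#include <algorithm>
#include <cassert>

struct toy_layout_cost
{
  double depth, total;

  void add_serial (const toy_layout_cost &c)
  { depth += c.depth; total += c.total; }

  void add_parallel (const toy_layout_cost &c)
  { depth = std::max (depth, c.depth); total += c.total; }

  void split (unsigned int times)
  { if (times > 1) total /= times; }
};

/* Two layout changes of weight 4 happening in parallel, followed by one of
   weight 2 in series, give depth 4 + 2 = 6 but total 4 + 4 + 2 = 10.  */

static void
toy_layout_cost_example (void)
{
  toy_layout_cost a = { 4, 4 }, b = { 4, 4 }, c = { 2, 2 }, acc = { 0, 0 };
  acc.add_parallel (a);
  acc.add_parallel (b);
  acc.add_serial (c);
  assert (acc.depth == 6 && acc.total == 10);
}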
3830 /* Information about one node in the SLP graph, for use during
3831 vect_optimize_slp_pass. */
3833 struct slpg_vertex
3835 slpg_vertex (slp_tree node_) : node (node_) {}
3837 /* The node itself. */
3838 slp_tree node;
3840 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3841 partitions are flexible; they can have whichever layout consumers
3842 want them to have. */
3843 int partition = -1;
3845 /* The number of nodes that directly use the result of this one
3846 (i.e. the number of nodes that count this one as a child). */
3847 unsigned int out_degree = 0;
3849 /* The execution frequency of the node. */
3850 sreal weight = 0;
3852 /* The total execution frequency of all nodes that directly use the
3853 result of this one. */
3854 sreal out_weight = 0;
3857 /* Information about one partition of the SLP graph, for use during
3858 vect_optimize_slp_pass. */
3860 struct slpg_partition_info
3862 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3863 of m_partitioned_nodes. */
3864 unsigned int node_begin = 0;
3865 unsigned int node_end = 0;
3867 /* Which layout we've chosen to use for this partition, or -1 if
3868 we haven't picked one yet. */
3869 int layout = -1;
3871 /* The number of predecessors and successors in the partition dag.
3872 The predecessors always have lower partition numbers and the
3873 successors always have higher partition numbers.
3875 Note that the directions of these edges are not necessarily the
3876 same as in the data flow graph. For example, if an SCC has separate
3877 partitions for an inner loop and an outer loop, the inner loop's
3878 partition will have at least two incoming edges from the outer loop's
3879 partition: one for a live-in value and one for a live-out value.
3880 In data flow terms, one of these edges would also be from the outer loop
3881 to the inner loop, but the other would be in the opposite direction. */
3882 unsigned int in_degree = 0;
3883 unsigned int out_degree = 0;
3886 /* Information about the costs of using a particular layout for a
3887 particular partition. It can also say that the combination is
3888 impossible. */
3890 struct slpg_partition_layout_costs
3892 bool is_possible () const { return internal_cost.is_possible (); }
3893 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3895 /* The costs inherited from predecessor partitions. */
3896 slpg_layout_cost in_cost;
3898 /* The inherent cost of the layout within the node itself. For example,
3899 this is nonzero for a load if choosing a particular layout would require
3900 the load to permute the loaded elements. It is nonzero for a
3901 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3902 to full-vector moves. */
3903 slpg_layout_cost internal_cost;
3905 /* The costs inherited from successor partitions. */
3906 slpg_layout_cost out_cost;
3909 /* This class tries to optimize the layout of vectors in order to avoid
3910 unnecessary shuffling. At the moment, the set of possible layouts are
3911 restricted to bijective permutations.
3913 The goal of the pass depends on whether we're optimizing for size or
3914 for speed. When optimizing for size, the goal is to reduce the overall
3915 number of layout changes (including layout changes implied by things
3916 like load permutations). When optimizing for speed, the goal is to
3917 reduce the maximum latency attributable to layout changes on any
3918 non-cyclical path through the data flow graph.
3920 For example, when optimizing a loop nest for speed, we will prefer
3921 to make layout changes outside of a loop rather than inside of a loop,
3922 and will prefer to make layout changes in parallel rather than serially,
3923 even if that increases the overall number of layout changes.
3925 The high-level procedure is:
3927 (1) Build a graph in which edges go from uses (parents) to definitions
3928 (children).
3930 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3932 (3) When optimizing for speed, partition the nodes in each SCC based
3933 on their containing cfg loop. When optimizing for size, treat
3934 each SCC as a single partition.
3936 This gives us a dag of partitions. The goal is now to assign a
3937 layout to each partition.
3939 (4) Construct a set of vector layouts that are worth considering.
3940 Record which nodes must keep their current layout.
3942 (5) Perform a forward walk over the partition dag (from loads to stores)
3943 accumulating the "forward" cost of using each layout. When visiting
3944 each partition, assign a tentative choice of layout to the partition
3945 and use that choice when calculating the cost of using a different
3946 layout in successor partitions.
3948 (6) Perform a backward walk over the partition dag (from stores to loads),
3949 accumulating the "backward" cost of using each layout. When visiting
3950 each partition, make a final choice of layout for that partition based
3951 on the accumulated forward costs (from (5)) and backward costs
3952 (from (6)).
3954 (7) Apply the chosen layouts to the SLP graph.
3956 For example, consider the SLP statements:
3958 S1: a_1 = load
3959 loop:
3960 S2: a_2 = PHI<a_1, a_3>
3961 S3: b_1 = load
3962 S4: a_3 = a_2 + b_1
3963 exit:
3964 S5: a_4 = PHI<a_3>
3965 S6: store a_4
3967 S2 and S4 form an SCC and are part of the same loop. Every other
3968 statement is in a singleton SCC. In this example there is a one-to-one
3969 mapping between SCCs and partitions and the partition dag looks like this:
3971       S1     S3
3972        \     /
3973        S2+S4
3974          |
3975         S5
3976          |
3977         S6
3979 S2, S3 and S4 will have a higher execution frequency than the other
3980 statements, so when optimizing for speed, the goal is to avoid any
3981 layout changes:
3983 - within S3
3984 - within S2+S4
3985 - on the S3->S2+S4 edge
3987 For example, if S3 was originally a reversing load, the goal of the
3988 pass is to make it an unreversed load and change the layout on the
3989 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3990 on S1->S2+S4 and S5->S6 would also be acceptable.)
3992 The difference between SCCs and partitions becomes important if we
3993 add an outer loop:
3995 S1: a_1 = ...
3996 loop1:
3997 S2: a_2 = PHI<a_1, a_6>
3998 S3: b_1 = load
3999 S4: a_3 = a_2 + b_1
4000 loop2:
4001 S5: a_4 = PHI<a_3, a_5>
4002 S6: c_1 = load
4003 S7: a_5 = a_4 + c_1
4004 exit2:
4005 S8: a_6 = PHI<a_5>
4006 S9: store a_6
4007 exit1:
4009 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4010 for speed, we usually do not want restrictions in the outer loop to "infect"
4011 the decision for the inner loop. For example, if an outer-loop node
4012 in the SCC contains a statement with a fixed layout, that should not
4013 prevent the inner loop from using a different layout. Conversely,
4014 the inner loop should not dictate a layout to the outer loop: if the
4015 outer loop does a lot of computation, then it may not be efficient to
4016 do all of that computation in the inner loop's preferred layout.
4018 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4019 and S5+S7 (inner). We also try to arrange partitions so that:
4021 - the partition for an outer loop comes before the partition for
4022 an inner loop
4024 - if a sibling loop A dominates a sibling loop B, A's partition
4025 comes before B's
4027 This gives the following partition dag for the example above:
4029       S1        S3
4030        \        /
4031       S2+S4+S8   S6
4032        |   \\    /
4033        |   S5+S7
4034        |
4035       S9
4037 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4038 one for a reversal of the edge S7->S8.
4040 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4041 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4042 preferred layout against the cost of changing the layout on entry to the
4043 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4045 Although this works well when optimizing for speed, it has the downside
4046 when optimizing for size that the choice of layout for S5+S7 is completely
4047 independent of S9, which lessens the chance of reducing the overall number
4048 of permutations. We therefore do not partition SCCs when optimizing
4049 for size.
4051 To give a concrete example of the difference between optimizing
4052 for size and speed, consider:
4054 a[0] = (b[1] << c[3]) - d[1];
4055 a[1] = (b[0] << c[2]) - d[0];
4056 a[2] = (b[3] << c[1]) - d[3];
4057 a[3] = (b[2] << c[0]) - d[2];
4059 There are three different layouts here: one for a, one for b and d,
4060 and one for c. When optimizing for speed it is better to permute each
4061 of b, c and d into the order required by a, since those permutations
4062 happen in parallel. But when optimizing for size, it is better to:
4064 - permute c into the same order as b
4065 - do the arithmetic
4066 - permute the result into the order required by a
4068 This gives 2 permutations rather than 3. */
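/* As a concrete illustration of the "2 permutations rather than 3" choice
   above, here is a minimal standalone scalar sketch, using std::array and
   hypothetical "toy_" names rather than anything in this file.  The "size"
   strategy permutes c into b's (and d's) order, does the arithmetic in that
   order, and then permutes the result once into the order required by a.  */

#include <array>

typedef std::array<int, 4> toy_vec4;

/* Apply PERM to V, i.e. result[i] = V[PERM[i]].  */

static toy_vec4
toy_permute (const toy_vec4 &v, const toy_vec4 &perm)
{
  toy_vec4 r;
  for (int i = 0; i < 4; ++i)
    r[i] = v[perm[i]];
  return r;
}

static toy_vec4
toy_size_strategy (toy_vec4 b, toy_vec4 c, toy_vec4 d)
{
  /* One input permute: bring c into the order in which b and d are
     already laid out.  */
  c = toy_permute (c, { 2, 3, 0, 1 });
  toy_vec4 r;
  for (int i = 0; i < 4; ++i)
    r[i] = (b[i] << c[i]) - d[i];
  /* One output permute: produce the order required by a.  */
  return toy_permute (r, { 1, 0, 3, 2 });
}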
4070 class vect_optimize_slp_pass
4072 public:
4073 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4074 void run ();
4076 private:
4077 /* Graph building. */
4078 struct loop *containing_loop (slp_tree);
4079 bool is_cfg_latch_edge (graph_edge *);
4080 void build_vertices (hash_set<slp_tree> &, slp_tree);
4081 void build_vertices ();
4082 void build_graph ();
4084 /* Partitioning. */
4085 void create_partitions ();
4086 template<typename T> void for_each_partition_edge (unsigned int, T);
4088 /* Layout selection. */
4089 bool is_compatible_layout (slp_tree, unsigned int);
4090 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4091 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4092 unsigned int);
4093 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4094 int, unsigned int);
4095 int internal_node_cost (slp_tree, int, unsigned int);
4096 void start_choosing_layouts ();
4098 /* Cost propagation. */
4099 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4100 unsigned int, unsigned int);
4101 slpg_layout_cost total_in_cost (unsigned int);
4102 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4103 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4104 void forward_pass ();
4105 void backward_pass ();
4107 /* Rematerialization. */
4108 slp_tree get_result_with_layout (slp_tree, unsigned int);
4109 void materialize ();
4111 /* Clean-up. */
4112 void remove_redundant_permutations ();
4114 void dump ();
4116 vec_info *m_vinfo;
4118 /* True if we should optimize the graph for size, false if we should
4119 optimize it for speed. (It wouldn't be easy to make this decision
4120 more locally.) */
4121 bool m_optimize_size;
4123 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4124 In other words, a node's predecessors are its slp_tree parents and
4125 a node's successors are its slp_tree children. */
4126 graph *m_slpg = nullptr;
4128 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4129 auto_vec<slpg_vertex> m_vertices;
4131 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4132 and loads. */
4133 auto_vec<int> m_leafs;
4135 /* This array has one entry for every vector layout that we're considering.
4136 Element 0 is null and indicates "no change". Other entries describe
4137 permutations that are inherent in the current graph and that we would
4138 like to reverse if possible.
4140 For example, a permutation { 1, 2, 3, 0 } means that something has
4141 effectively been permuted in that way, such as a load group
4142 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4143 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4144 in order to put things "back" in order. */
4145 auto_vec<vec<unsigned> > m_perms;
4147 /* A partitioning of the nodes for which a layout must be chosen.
4148 Each partition represents an <SCC, cfg loop> pair; that is,
4149 nodes in different SCCs belong to different partitions, and nodes
4150 within an SCC can be further partitioned according to a containing
4151 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4153 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4154 from leaves (such as loads) to roots (such as stores).
4156 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4157 auto_vec<slpg_partition_info> m_partitions;
4159 /* The list of all nodes for which a layout must be chosen. Nodes for
4160 partition P come before the nodes for partition P+1. Nodes within a
4161 partition are in reverse postorder. */
4162 auto_vec<unsigned int> m_partitioned_nodes;
4164 /* Index P * num-layouts + L contains the cost of using layout L
4165 for partition P. */
4166 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4168 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4169 original output of node N adjusted to have layout L. */
4170 auto_vec<slp_tree> m_node_layouts;
4173 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4174 Also record whether we should optimize anything for speed rather
4175 than size. */
4177 void
4178 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4179 slp_tree node)
4181 unsigned i;
4182 slp_tree child;
4184 if (visited.add (node))
4185 return;
4187 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4189 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4190 if (optimize_bb_for_speed_p (bb))
4191 m_optimize_size = false;
4194 node->vertex = m_vertices.length ();
4195 m_vertices.safe_push (slpg_vertex (node));
4197 bool leaf = true;
4198 bool force_leaf = false;
4199 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4200 if (child)
4202 leaf = false;
4203 build_vertices (visited, child);
4205 else
4206 force_leaf = true;
4207 /* Since SLP discovery works along use-def edges all cycles have an
4208 entry - but there's the exception of cycles where we do not handle
4209 the entry explicitly (but with a NULL SLP node), like some reductions
4210 and inductions. Force those SLP PHIs to act as leafs to make them
4211 backwards reachable. */
4212 if (leaf || force_leaf)
4213 m_leafs.safe_push (node->vertex);
4216 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4218 void
4219 vect_optimize_slp_pass::build_vertices ()
4221 hash_set<slp_tree> visited;
4222 unsigned i;
4223 slp_instance instance;
4224 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4225 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4228 /* Apply (reverse) bijective PERM to VEC. */
4230 template <class T>
4231 static void
4232 vect_slp_permute (vec<unsigned> perm,
4233 vec<T> &vec, bool reverse)
4235 auto_vec<T, 64> saved;
4236 saved.create (vec.length ());
4237 for (unsigned i = 0; i < vec.length (); ++i)
4238 saved.quick_push (vec[i]);
4240 if (reverse)
4242 for (unsigned i = 0; i < vec.length (); ++i)
4243 vec[perm[i]] = saved[i];
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 gcc_assert (vec[perm[i]] == saved[i]);
4247 else
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[i] = saved[perm[i]];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[i] == saved[perm[i]]);
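/* For example, given the layout permutation { 1, 2, 3, 0 } described for
   m_perms above, a forward application reorders { a, b, c, d } into
   { b, c, d, a } and a reverse application undoes that.  Below is a minimal
   standalone sketch of the same indexing, using std::vector and a
   hypothetical "toy_" name instead of the in-place update above.  */

#include <vector>

static std::vector<int>
toy_apply_perm (const std::vector<int> &v,
		const std::vector<unsigned> &perm, bool reverse)
{
  std::vector<int> r (v.size ());
  for (unsigned int i = 0; i < v.size (); ++i)
    if (reverse)
      r[perm[i]] = v[i];	/* Undo the permutation.  */
    else
      r[i] = v[perm[i]];	/* Apply the permutation.  */
  return r;
}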
4256 /* Return the cfg loop that contains NODE. */
4258 struct loop *
4259 vect_optimize_slp_pass::containing_loop (slp_tree node)
4261 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4262 if (!rep)
4263 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4264 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4267 /* Return true if UD (an edge from a use to a definition) is associated
4268 with a loop latch edge in the cfg. */
4270 bool
4271 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4273 slp_tree use = m_vertices[ud->src].node;
4274 slp_tree def = m_vertices[ud->dest].node;
4275 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4276 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4277 return false;
4279 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4280 return (is_a<gphi *> (use_rep->stmt)
4281 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4282 && containing_loop (def) == containing_loop (use));
4285 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4286 a nonnull data field. */
4288 void
4289 vect_optimize_slp_pass::build_graph ()
4291 m_optimize_size = true;
4292 build_vertices ();
4294 m_slpg = new_graph (m_vertices.length ());
4295 for (slpg_vertex &v : m_vertices)
4296 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4297 if (child)
4299 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4300 if (is_cfg_latch_edge (ud))
4301 ud->data = this;
4305 /* Return true if E corresponds to a loop latch edge in the cfg. */
4307 static bool
4308 skip_cfg_latch_edges (graph_edge *e)
4310 return e->data;
4313 /* Create the node partitions. */
4315 void
4316 vect_optimize_slp_pass::create_partitions ()
4318 /* Calculate a postorder of the graph, ignoring edges that correspond
4319 to natural latch edges in the cfg. Reading the vector from the end
4320 to the beginning gives the reverse postorder. */
4321 auto_vec<int> initial_rpo;
4322 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4323 false, NULL, skip_cfg_latch_edges);
4324 gcc_assert (initial_rpo.length () == m_vertices.length ());
4326 /* Calculate the strongly connected components of the graph. */
4327 auto_vec<int> scc_grouping;
4328 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4330 /* Create a new index order in which all nodes from the same SCC are
4331 consecutive. Use scc_pos to record the index of the first node in
4332 each SCC. */
4333 auto_vec<unsigned int> scc_pos (num_sccs);
4334 int last_component = -1;
4335 unsigned int node_count = 0;
4336 for (unsigned int node_i : scc_grouping)
4338 if (last_component != m_slpg->vertices[node_i].component)
4340 last_component = m_slpg->vertices[node_i].component;
4341 gcc_assert (last_component == int (scc_pos.length ()));
4342 scc_pos.quick_push (node_count);
4344 node_count += 1;
4346 gcc_assert (node_count == initial_rpo.length ()
4347 && last_component + 1 == int (num_sccs));
4349 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4350 inside each SCC following the RPO we calculated above. The fact that
4351 we ignored natural latch edges when calculating the RPO should ensure
4352 that, for natural loop nests:
4354 - the first node that we encounter in a cfg loop is the loop header phi
4355 - the loop header phis are in dominance order
4357 Arranging for this is an optimization (see below) rather than a
4358 correctness issue. Unnatural loops with a tangled mess of backedges
4359 will still work correctly, but might give poorer results.
4361 Also update scc_pos so that it gives 1 + the index of the last node
4362 in the SCC. */
4363 m_partitioned_nodes.safe_grow (node_count);
4364 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4366 unsigned int node_i = initial_rpo[old_i];
4367 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4368 m_partitioned_nodes[new_i] = node_i;
4371 /* When optimizing for speed, partition each SCC based on the containing
4372 cfg loop. The order we constructed above should ensure that, for natural
4373 cfg loops, we'll create sub-SCC partitions for outer loops before
4374 the corresponding sub-SCC partitions for inner loops. Similarly,
4375 when one sibling loop A dominates another sibling loop B, we should
4376 create a sub-SCC partition for A before a sub-SCC partition for B.
4378 As above, nothing depends for correctness on whether this achieves
4379 a natural nesting, but we should get better results when it does. */
4380 m_partitions.reserve (m_vertices.length ());
4381 unsigned int next_partition_i = 0;
4382 hash_map<struct loop *, int> loop_partitions;
4383 unsigned int rpo_begin = 0;
4384 unsigned int num_partitioned_nodes = 0;
4385 for (unsigned int rpo_end : scc_pos)
4387 loop_partitions.empty ();
4388 unsigned int partition_i = next_partition_i;
4389 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4391 /* Handle externals and constants optimistically throughout.
4392 But treat existing vectors as fixed since we do not handle
4393 permuting them. */
4394 unsigned int node_i = m_partitioned_nodes[rpo_i];
4395 auto &vertex = m_vertices[node_i];
4396 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4397 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4398 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4399 vertex.partition = -1;
4400 else
4402 bool existed;
4403 if (m_optimize_size)
4404 existed = next_partition_i > partition_i;
4405 else
4407 struct loop *loop = containing_loop (vertex.node);
4408 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4409 if (!existed)
4410 entry = next_partition_i;
4411 partition_i = entry;
4413 if (!existed)
4415 m_partitions.quick_push (slpg_partition_info ());
4416 next_partition_i += 1;
4418 vertex.partition = partition_i;
4419 num_partitioned_nodes += 1;
4420 m_partitions[partition_i].node_end += 1;
4423 rpo_begin = rpo_end;
4426 /* Assign ranges of consecutive node indices to each partition,
4427 in partition order. Start with node_end being the same as
4428 node_begin so that the next loop can use it as a counter. */
4429 unsigned int node_begin = 0;
4430 for (auto &partition : m_partitions)
4432 partition.node_begin = node_begin;
4433 node_begin += partition.node_end;
4434 partition.node_end = partition.node_begin;
4436 gcc_assert (node_begin == num_partitioned_nodes);
4438 /* Finally build the list of nodes in partition order. */
4439 m_partitioned_nodes.truncate (num_partitioned_nodes);
4440 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4442 int partition_i = m_vertices[node_i].partition;
4443 if (partition_i >= 0)
4445 unsigned int order_i = m_partitions[partition_i].node_end++;
4446 m_partitioned_nodes[order_i] = node_i;
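/* The scc_pos scheme above is in effect a counting sort by SCC: it first
   records the start offset of each group and then bumps the offset as nodes
   are dropped into place.  Below is a minimal standalone sketch of that
   bucketing idea, using std::vector and hypothetical "toy_" names; the real
   code additionally walks the RPO backwards so that nodes within each SCC
   end up in reverse postorder.  */

#include <vector>

static std::vector<int>
toy_group_by_component (const std::vector<int> &order,
			const std::vector<int> &component,
			unsigned int num_groups)
{
  /* Start offsets: count the members of each group, then prefix-sum.  */
  std::vector<unsigned int> pos (num_groups + 1, 0);
  for (int n : order)
    pos[component[n] + 1] += 1;
  for (unsigned int g = 1; g <= num_groups; ++g)
    pos[g] += pos[g - 1];

  /* Place each node at the next free slot of its group, keeping the
     relative order of ORDER within each group.  */
  std::vector<int> grouped (order.size ());
  for (int n : order)
    grouped[pos[component[n]]++] = n;
  return grouped;
}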
4451 /* Look for edges from earlier partitions into node NODE_I and edges from
4452 node NODE_I into later partitions. Call:
4454 FN (ud, other_node_i)
4456 for each such use-to-def edge ud, where other_node_i is the node at the
4457 other end of the edge. */
4459 template<typename T>
4460 void
4461 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4463 int partition_i = m_vertices[node_i].partition;
4464 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4465 pred; pred = pred->pred_next)
4467 int src_partition_i = m_vertices[pred->src].partition;
4468 if (src_partition_i >= 0 && src_partition_i != partition_i)
4469 fn (pred, pred->src);
4471 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4472 succ; succ = succ->succ_next)
4474 int dest_partition_i = m_vertices[succ->dest].partition;
4475 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4476 fn (succ, succ->dest);
4480 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4481 that NODE would operate on. This test is independent of NODE's actual
4482 operation. */
4484 bool
4485 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4486 unsigned int layout_i)
4488 if (layout_i == 0)
4489 return true;
4491 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4492 return false;
4494 return true;
4497 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4498 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4499 layouts is incompatible with NODE or if the change is not possible for
4500 some other reason.
4502 The properties taken from NODE include the number of lanes and the
4503 vector type. The actual operation doesn't matter. */
4506 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4507 unsigned int from_layout_i,
4508 unsigned int to_layout_i)
4510 if (!is_compatible_layout (node, from_layout_i)
4511 || !is_compatible_layout (node, to_layout_i))
4512 return -1;
4514 if (from_layout_i == to_layout_i)
4515 return 0;
4517 auto_vec<slp_tree, 1> children (1);
4518 children.quick_push (node);
4519 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4520 if (from_layout_i > 0)
4521 for (unsigned int i : m_perms[from_layout_i])
4522 perm.quick_push ({ 0, i });
4523 else
4524 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4525 perm.quick_push ({ 0, i });
4526 if (to_layout_i > 0)
4527 vect_slp_permute (m_perms[to_layout_i], perm, true);
4528 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4529 children, false);
4530 if (count >= 0)
4531 return MAX (count, 1);
4533 /* ??? In principle we could try changing via layout 0, giving two
4534 layout changes rather than 1. Doing that would require
4535 corresponding support in get_result_with_layout. */
4536 return -1;
4539 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4541 inline slpg_partition_layout_costs &
4542 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4543 unsigned int layout_i)
4545 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4548 /* Change PERM in one of two ways:
4550 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4551 chosen for child I of NODE.
4553 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4555 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4557 void
4558 vect_optimize_slp_pass::
4559 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4560 int in_layout_i, unsigned int out_layout_i)
4562 for (auto &entry : perm)
4564 int this_in_layout_i = in_layout_i;
4565 if (this_in_layout_i < 0)
4567 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4568 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4569 this_in_layout_i = m_partitions[in_partition_i].layout;
4571 if (this_in_layout_i > 0)
4572 entry.second = m_perms[this_in_layout_i][entry.second];
4574 if (out_layout_i > 0)
4575 vect_slp_permute (m_perms[out_layout_i], perm, true);
4578 /* Check whether the target allows NODE to be rearranged so that the node's
4579 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4580 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4582 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4583 NODE can adapt to the layout changes that have (perhaps provisionally)
4584 been chosen for NODE's children, so that no extra permutations are
4585 needed on either the input or the output of NODE.
4587 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4588 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4590 IN_LAYOUT_I has no meaning for other types of node.
4592 Keeping the node as-is is always valid. If the target doesn't appear
4593 to support the node as-is, but might realistically support other layouts,
4594 then layout 0 instead has the cost of a worst-case permutation. On the
4595 one hand, this ensures that every node has at least one valid layout,
4596 avoiding what would otherwise be an awkward special case. On the other hand,
4597 it still encourages the pass to change an invalid pre-existing layout
4598 choice into a valid one. */
4601 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4602 unsigned int out_layout_i)
4604 const int fallback_cost = 1;
4606 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4608 auto_lane_permutation_t tmp_perm;
4609 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4611 /* Check that the child nodes support the chosen layout. Checking
4612 the first child is enough, since any second child would have the
4613 same shape. */
4614 auto first_child = SLP_TREE_CHILDREN (node)[0];
4615 if (in_layout_i > 0
4616 && !is_compatible_layout (first_child, in_layout_i))
4617 return -1;
4619 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4620 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4621 node, tmp_perm,
4622 SLP_TREE_CHILDREN (node),
4623 false);
4624 if (count < 0)
4626 if (in_layout_i == 0 && out_layout_i == 0)
4628 /* Use the fallback cost if the node could in principle support
4629 some nonzero layout for both the inputs and the outputs.
4630 Otherwise assume that the node will be rejected later
4631 and rebuilt from scalars. */
4632 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4633 return fallback_cost;
4634 return 0;
4636 return -1;
4639 /* We currently have no way of telling whether the new layout is cheaper
4640 or more expensive than the old one. But at least in principle,
4641 it should be worth making zero permutations (whole-vector shuffles)
4642 cheaper than real permutations, in case the pass is able to remove
4643 the latter. */
4644 return count == 0 ? 0 : 1;
4647 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4648 if (rep
4649 && STMT_VINFO_DATA_REF (rep)
4650 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4651 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4653 auto_load_permutation_t tmp_perm;
4654 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4655 if (out_layout_i > 0)
4656 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4658 poly_uint64 vf = 1;
4659 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4660 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4661 unsigned int n_perms;
4662 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4663 nullptr, vf, true, false, &n_perms))
4665 auto rep = SLP_TREE_REPRESENTATIVE (node);
4666 if (out_layout_i == 0)
4668 /* Use the fallback cost if the load is an N-to-N permutation.
4669 Otherwise assume that the node will be rejected later
4670 and rebuilt from scalars. */
4671 if (STMT_VINFO_GROUPED_ACCESS (rep)
4672 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4673 == SLP_TREE_LANES (node)))
4674 return fallback_cost;
4675 return 0;
4677 return -1;
4680 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4681 return n_perms == 0 ? 0 : 1;
4684 return 0;
4687 /* Decide which element layouts we should consider using. Calculate the
4688 weights associated with inserting layout changes on partition edges.
4689 Also mark partitions that cannot change layout, by setting their
4690 layout to zero. */
4692 void
4693 vect_optimize_slp_pass::start_choosing_layouts ()
4695 /* Used to assign unique permutation indices. */
4696 using perm_hash = unbounded_hashmap_traits<
4697 vec_free_hash_base<int_hash_base<unsigned>>,
4698 int_hash<int, -1, -2>
4700 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4702 /* Layout 0 is "no change". */
4703 m_perms.safe_push (vNULL);
4705 /* Create layouts from existing permutations. */
4706 auto_load_permutation_t tmp_perm;
4707 for (unsigned int node_i : m_partitioned_nodes)
4709 /* Leafs also double as entries to the reverse graph. Allow the
4710 layout of those to be changed. */
4711 auto &vertex = m_vertices[node_i];
4712 auto &partition = m_partitions[vertex.partition];
4713 if (!m_slpg->vertices[node_i].succ)
4714 partition.layout = 0;
4716 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4717 slp_tree node = vertex.node;
4718 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4719 slp_tree child;
4720 unsigned HOST_WIDE_INT imin, imax = 0;
4721 bool any_permute = false;
4722 tmp_perm.truncate (0);
4723 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4725 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4726 unpermuted, record a layout that reverses this permutation.
4728 We would need more work to cope with loads that are internally
4729 permuted and also have inputs (such as masks for
4730 IFN_MASK_LOADs). */
4731 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4732 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4734 partition.layout = -1;
4735 continue;
4737 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4738 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4739 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4741 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4742 && SLP_TREE_CHILDREN (node).length () == 1
4743 && (child = SLP_TREE_CHILDREN (node)[0])
4744 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4745 .is_constant (&imin)))
4747 /* If the child has the same vector size as this node,
4748 reversing the permutation can make the permutation a no-op.
4749 In other cases it can change a true permutation into a
4750 full-vector extract. */
4751 tmp_perm.reserve (SLP_TREE_LANES (node));
4752 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4753 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4755 else
4756 continue;
4758 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 unsigned idx = tmp_perm[j];
4761 imin = MIN (imin, idx);
4762 imax = MAX (imax, idx);
4763 if (idx - tmp_perm[0] != j)
4764 any_permute = true;
4766 /* If the span doesn't match we'd disrupt VF computation, avoid
4767 that for now. */
4768 if (imax - imin + 1 != SLP_TREE_LANES (node))
4769 continue;
4770 /* If there's no permute there's no need to split one out. In this case
4771 we can consider turning a load into a permuted load, if that
4772 turns out to be cheaper than alternatives. */
4773 if (!any_permute)
4775 partition.layout = -1;
4776 continue;
4779 /* For now only handle true permutes, like
4780 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4781 when permuting constants and invariants keeping the permute
4782 bijective. */
4783 auto_sbitmap load_index (SLP_TREE_LANES (node));
4784 bitmap_clear (load_index);
4785 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4786 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4787 unsigned j;
4788 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4789 if (!bitmap_bit_p (load_index, j))
4790 break;
4791 if (j != SLP_TREE_LANES (node))
4792 continue;
4794 vec<unsigned> perm = vNULL;
4795 perm.safe_grow (SLP_TREE_LANES (node), true);
4796 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4797 perm[j] = tmp_perm[j] - imin;
4799 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4801 /* Continue to use existing layouts, but don't add any more. */
4802 int *entry = layout_ids.get (perm);
4803 partition.layout = entry ? *entry : 0;
4804 perm.release ();
4806 else
4808 bool existed;
4809 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4810 if (existed)
4811 perm.release ();
4812 else
4814 layout_i = m_perms.length ();
4815 m_perms.safe_push (perm);
4817 partition.layout = layout_i;
4821 /* Initially assume that every layout is possible and has zero cost
4822 in every partition. */
4823 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4824 * m_perms.length ());
4826 /* We have to mark outgoing permutations facing non-associating-reduction
4827 graph entries that are not represented as to be materialized.
4828 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4829 for (slp_instance instance : m_vinfo->slp_instances)
4830 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4832 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4833 m_partitions[m_vertices[node_i].partition].layout = 0;
4835 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4837 stmt_vec_info stmt_info
4838 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4839 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4840 if (needs_fold_left_reduction_p (TREE_TYPE
4841 (gimple_get_lhs (stmt_info->stmt)),
4842 STMT_VINFO_REDUC_CODE (reduc_info)))
4844 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4845 m_partitions[m_vertices[node_i].partition].layout = 0;
4849 /* Check which layouts each node and partition can handle. Calculate the
4850 weights associated with inserting layout changes on edges. */
4851 for (unsigned int node_i : m_partitioned_nodes)
4853 auto &vertex = m_vertices[node_i];
4854 auto &partition = m_partitions[vertex.partition];
4855 slp_tree node = vertex.node;
4857 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4859 vertex.weight = vect_slp_node_weight (node);
4861 /* We do not handle stores with a permutation, so all
4862 incoming permutations must have been materialized.
4864 We also don't handle masked grouped loads, which lack a
4865 permutation vector. In this case the memory locations
4866 form an implicit second input to the loads, on top of the
4867 explicit mask input, and the memory input's layout cannot
4868 be changed.
4870 On the other hand, we do support permuting gather loads and
4871 masked gather loads, where each scalar load is independent
4872 of the others. This can be useful if the address/index input
4873 benefits from permutation. */
4874 if (STMT_VINFO_DATA_REF (rep)
4875 && STMT_VINFO_GROUPED_ACCESS (rep)
4876 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4877 partition.layout = 0;
4879 /* We cannot change the layout of an operation that does not
4880 work on each lane independently. Note this is an explicit
4881 negative list since that's much shorter than the respective
4882 positive one, but it's critical to keep maintaining it. */
4883 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4884 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4886 case CFN_COMPLEX_ADD_ROT90:
4887 case CFN_COMPLEX_ADD_ROT270:
4888 case CFN_COMPLEX_MUL:
4889 case CFN_COMPLEX_MUL_CONJ:
4890 case CFN_VEC_ADDSUB:
4891 case CFN_VEC_FMADDSUB:
4892 case CFN_VEC_FMSUBADD:
4893 partition.layout = 0;
4894 default:;
4898 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4900 auto &other_vertex = m_vertices[other_node_i];
4902 /* Count the number of edges from earlier partitions and the number
4903 of edges to later partitions. */
4904 if (other_vertex.partition < vertex.partition)
4905 partition.in_degree += 1;
4906 else
4907 partition.out_degree += 1;
4909 /* If the current node uses the result of OTHER_NODE_I, accumulate
4910 the effects of that. */
4911 if (ud->src == int (node_i))
4913 other_vertex.out_weight += vertex.weight;
4914 other_vertex.out_degree += 1;
4917 for_each_partition_edge (node_i, process_edge);
4921 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4922 its current (provisional) choice of layout. The inputs do not necessarily
4923 have the same layout as each other. */
4925 slpg_layout_cost
4926 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4928 auto &vertex = m_vertices[node_i];
4929 slpg_layout_cost cost;
4930 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4932 auto &other_vertex = m_vertices[other_node_i];
4933 if (other_vertex.partition < vertex.partition)
4935 auto &other_partition = m_partitions[other_vertex.partition];
4936 auto &other_costs = partition_layout_costs (other_vertex.partition,
4937 other_partition.layout);
4938 slpg_layout_cost this_cost = other_costs.in_cost;
4939 this_cost.add_serial_cost (other_costs.internal_cost);
4940 this_cost.split (other_partition.out_degree);
4941 cost.add_parallel_cost (this_cost);
4944 for_each_partition_edge (node_i, add_cost);
4945 return cost;
4948 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4949 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4950 slpg_layout_cost::impossible () if the change isn't possible. */
4952 slpg_layout_cost
4953 vect_optimize_slp_pass::
4954 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4955 unsigned int layout2_i)
4957 auto &def_vertex = m_vertices[ud->dest];
4958 auto &use_vertex = m_vertices[ud->src];
4959 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4960 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4961 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4962 use_layout_i);
4963 if (factor < 0)
4964 return slpg_layout_cost::impossible ();
4966 /* We have a choice of putting the layout change at the site of the
4967 definition or at the site of the use. Prefer the former when
4968 optimizing for size or when the execution frequency of the
4969 definition is no greater than the combined execution frequencies of
4970 the uses. When putting the layout change at the site of the definition,
4971 divvy up the cost among all consumers. */
4972 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4974 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4975 cost.split (def_vertex.out_degree);
4976 return cost;
4978 return { use_vertex.weight * factor, m_optimize_size };
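/* For example, if a definition executed 10 times feeds two uses executed 50
   times each, changing the layout once at the definition (weight 10, split
   two ways) is cheaper than changing it at either use (weight 50).  Below is
   a minimal standalone sketch of that placement choice for the "total"
   metric, using double weights and hypothetical "toy_" names and ignoring
   the optimize-for-size normalization.  */

static double
toy_edge_change_cost (double def_weight, double def_out_weight,
		      unsigned int def_out_degree, double use_weight,
		      double factor)
{
  if (def_weight <= def_out_weight)
    {
      /* Change the layout at the definition and share the cost among
	 all of its consumers.  */
      double cost = def_weight * factor;
      if (def_out_degree > 1)
	cost /= def_out_degree;
      return cost;
    }
  /* Otherwise change the layout at this use only.  */
  return use_weight * factor;
}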
4981 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4982 partition; FROM_NODE_I could be the definition node or the use node.
4983 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4984 Return the cost of any necessary fix-ups on edge UD, or return
4985 slpg_layout_cost::impossible () if the change isn't possible.
4987 At this point, FROM_NODE_I's partition has chosen the cheapest
4988 layout based on the information available so far, but this choice
4989 is only provisional. */
4991 slpg_layout_cost
4992 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4993 unsigned int to_layout_i)
4995 auto &from_vertex = m_vertices[from_node_i];
4996 unsigned int from_partition_i = from_vertex.partition;
4997 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4998 gcc_assert (from_partition.layout >= 0);
5000 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5001 with its current layout preference. */
5002 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5003 auto edge_cost = edge_layout_cost (ud, from_node_i,
5004 from_partition.layout, to_layout_i);
5005 if (edge_cost.is_possible ())
5007 auto &from_costs = partition_layout_costs (from_partition_i,
5008 from_partition.layout);
5009 cost = from_costs.in_cost;
5010 cost.add_serial_cost (from_costs.internal_cost);
5011 cost.split (from_partition.out_degree);
5012 cost.add_serial_cost (edge_cost);
5015 /* Take the minimum of that cost and the cost that applies if
5016 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5017 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5018 to_layout_i);
5019 if (direct_layout_costs.is_possible ())
5021 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5022 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5023 direct_cost.split (from_partition.out_degree);
5024 if (!cost.is_possible ()
5025 || direct_cost.is_better_than (cost, m_optimize_size))
5026 cost = direct_cost;
5029 return cost;
5032 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5033 partition; TO_NODE_I could be the definition node or the use node.
5034 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5035 return the cost of any necessary fix-ups on edge UD, or
5036 slpg_layout_cost::impossible () if the choice cannot be made.
5038 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5040 slpg_layout_cost
5041 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5042 unsigned int from_layout_i)
5044 auto &to_vertex = m_vertices[to_node_i];
5045 unsigned int to_partition_i = to_vertex.partition;
5046 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5047 gcc_assert (to_partition.layout >= 0);
5049 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5050 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5051 any other inputs keep their current choice of layout. */
5052 auto &to_costs = partition_layout_costs (to_partition_i,
5053 to_partition.layout);
5054 if (ud->src == int (to_node_i)
5055 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5057 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5058 auto old_layout = from_partition.layout;
5059 from_partition.layout = from_layout_i;
5060 int factor = internal_node_cost (to_vertex.node, -1,
5061 to_partition.layout);
5062 from_partition.layout = old_layout;
5063 if (factor >= 0)
5065 slpg_layout_cost cost = to_costs.out_cost;
5066 cost.add_serial_cost ({ to_vertex.weight * factor,
5067 m_optimize_size });
5068 cost.split (to_partition.in_degree);
5069 return cost;
5073 /* Compute the cost if we insert any necessary layout change on edge UD. */
5074 auto edge_cost = edge_layout_cost (ud, to_node_i,
5075 to_partition.layout, from_layout_i);
5076 if (edge_cost.is_possible ())
5078 slpg_layout_cost cost = to_costs.out_cost;
5079 cost.add_serial_cost (to_costs.internal_cost);
5080 cost.split (to_partition.in_degree);
5081 cost.add_serial_cost (edge_cost);
5082 return cost;
5085 return slpg_layout_cost::impossible ();
5088 /* Make a forward pass through the partitions, accumulating input costs.
5089 Make a tentative (provisional) choice of layout for each partition,
5090 ensuring that this choice still allows later partitions to keep
5091 their original layout. */
5093 void
5094 vect_optimize_slp_pass::forward_pass ()
5096 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5097 ++partition_i)
5099 auto &partition = m_partitions[partition_i];
5101 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5102 the incoming cost that would apply if every predecessor partition
5103 keeps its current layout. This is used within the loop below. */
5104 slpg_layout_cost in_cost;
5105 slp_tree single_node = nullptr;
5106 if (partition.node_end == partition.node_begin + 1)
5108 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5109 single_node = m_vertices[node_i].node;
5110 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5111 in_cost = total_in_cost (node_i);
5114 /* Go through the possible layouts. Decide which ones are valid
5115 for this partition and record which of the valid layouts has
5116 the lowest cost. */
5117 unsigned int min_layout_i = 0;
5118 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5119 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5121 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5122 if (!layout_costs.is_possible ())
5123 continue;
5125 /* If the recorded layout is already 0 then the layout cannot
5126 change. */
5127 if (partition.layout == 0 && layout_i != 0)
5129 layout_costs.mark_impossible ();
5130 continue;
5133 bool is_possible = true;
5134 for (unsigned int order_i = partition.node_begin;
5135 order_i < partition.node_end; ++order_i)
5137 unsigned int node_i = m_partitioned_nodes[order_i];
5138 auto &vertex = m_vertices[node_i];
5140 /* Reject the layout if it is individually incompatible
5141 with any node in the partition. */
5142 if (!is_compatible_layout (vertex.node, layout_i))
5144 is_possible = false;
5145 break;
5148 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5150 auto &other_vertex = m_vertices[other_node_i];
5151 if (other_vertex.partition < vertex.partition)
5153 /* Accumulate the incoming costs from earlier
5154 partitions, plus the cost of any layout changes
5155 on UD itself. */
5156 auto cost = forward_cost (ud, other_node_i, layout_i);
5157 if (!cost.is_possible ())
5158 is_possible = false;
5159 else
5160 layout_costs.in_cost.add_parallel_cost (cost);
5162 else
5163 /* Reject the layout if it would make layout 0 impossible
5164 for later partitions. This amounts to testing that the
5165 target supports reversing the layout change on edges
5166 to later partitions.
5168 In principle, it might be possible to push a layout
5169 change all the way down a graph, so that it never
5170 needs to be reversed and so that the target doesn't
5171 need to support the reverse operation. But it would
5172 be awkward to bail out if we hit a partition that
5173 does not support the new layout, especially since
5174 we are not dealing with a lattice. */
5175 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5176 layout_i).is_possible ();
5178 for_each_partition_edge (node_i, add_cost);
5180 /* Accumulate the cost of using LAYOUT_I within NODE,
5181 both for the inputs and the outputs. */
5182 int factor = internal_node_cost (vertex.node, layout_i,
5183 layout_i);
5184 if (factor < 0)
5186 is_possible = false;
5187 break;
5189 else if (factor)
5190 layout_costs.internal_cost.add_serial_cost
5191 ({ vertex.weight * factor, m_optimize_size });
5193 if (!is_possible)
5195 layout_costs.mark_impossible ();
5196 continue;
5199 /* Combine the incoming and partition-internal costs. */
5200 slpg_layout_cost combined_cost = layout_costs.in_cost;
5201 combined_cost.add_serial_cost (layout_costs.internal_cost);
5203 /* If this partition consists of a single VEC_PERM_EXPR, see
5204 if the VEC_PERM_EXPR can be changed to support output layout
5205 LAYOUT_I while keeping all the provisional choices of input
5206 layout. */
5207 if (single_node
5208 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5210 int factor = internal_node_cost (single_node, -1, layout_i);
5211 if (factor >= 0)
5213 auto weight = m_vertices[single_node->vertex].weight;
5214 slpg_layout_cost internal_cost
5215 = { weight * factor, m_optimize_size };
5217 slpg_layout_cost alt_cost = in_cost;
5218 alt_cost.add_serial_cost (internal_cost);
5219 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5221 combined_cost = alt_cost;
5222 layout_costs.in_cost = in_cost;
5223 layout_costs.internal_cost = internal_cost;
5228 /* Record the layout with the lowest cost. Prefer layout 0 in
5229 the event of a tie between it and another layout. */
5230 if (!min_layout_cost.is_possible ()
5231 || combined_cost.is_better_than (min_layout_cost,
5232 m_optimize_size))
5234 min_layout_i = layout_i;
5235 min_layout_cost = combined_cost;
5239 /* This loop's handling of earlier partitions should ensure that
5240 choosing the original layout for the current partition is no
5241 less valid than it was in the original graph, even with the
5242 provisional layout choices for those earlier partitions. */
5243 gcc_assert (min_layout_cost.is_possible ());
5244 partition.layout = min_layout_i;
5248 /* Make a backward pass through the partitions, accumulating output costs.
5249 Make a final choice of layout for each partition. */
5251 void
5252 vect_optimize_slp_pass::backward_pass ()
5254 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5256 auto &partition = m_partitions[partition_i];
5258 unsigned int min_layout_i = 0;
5259 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5260 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5262 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5263 if (!layout_costs.is_possible ())
5264 continue;
5266 /* Accumulate the costs from successor partitions. */
5267 bool is_possible = true;
5268 for (unsigned int order_i = partition.node_begin;
5269 order_i < partition.node_end; ++order_i)
5271 unsigned int node_i = m_partitioned_nodes[order_i];
5272 auto &vertex = m_vertices[node_i];
5273 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5275 auto &other_vertex = m_vertices[other_node_i];
5276 auto &other_partition = m_partitions[other_vertex.partition];
5277 if (other_vertex.partition > vertex.partition)
5279 /* Accumulate the incoming costs from later
5280 partitions, plus the cost of any layout changes
5281 on UD itself. */
5282 auto cost = backward_cost (ud, other_node_i, layout_i);
5283 if (!cost.is_possible ())
5284 is_possible = false;
5285 else
5286 layout_costs.out_cost.add_parallel_cost (cost);
5288 else
5289 /* Make sure that earlier partitions can (if necessary
5290 or beneficial) keep the layout that they chose in
5291 the forward pass. This ensures that there is at
5292 least one valid choice of layout. */
5293 is_possible &= edge_layout_cost (ud, other_node_i,
5294 other_partition.layout,
5295 layout_i).is_possible ();
5297 for_each_partition_edge (node_i, add_cost);
5299 if (!is_possible)
5301 layout_costs.mark_impossible ();
5302 continue;
5305 /* Locally combine the costs from the forward and backward passes.
5306 (This combined cost is not passed on, since that would lead
5307 to double counting.) */
5308 slpg_layout_cost combined_cost = layout_costs.in_cost;
5309 combined_cost.add_serial_cost (layout_costs.internal_cost);
5310 combined_cost.add_serial_cost (layout_costs.out_cost);
5312 /* Record the layout with the lowest cost. Prefer layout 0 in
5313 the event of a tie between it and another layout. */
5314 if (!min_layout_cost.is_possible ()
5315 || combined_cost.is_better_than (min_layout_cost,
5316 m_optimize_size))
5318 min_layout_i = layout_i;
5319 min_layout_cost = combined_cost;
5323 gcc_assert (min_layout_cost.is_possible ());
5324 partition.layout = min_layout_i;
5328 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5329 NODE already has the layout that was selected for its partition. */
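/* Results are cached in m_node_layouts, indexed by the node's vertex number
   and TO_LAYOUT_I, so each layout of a given node is materialized at most
   once. */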
5331 slp_tree
5332 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5333 unsigned int to_layout_i)
5335 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5336 slp_tree result = m_node_layouts[result_i];
5337 if (result)
5338 return result;
5340 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5341 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5342 /* We can't permute vector defs in place. */
5343 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5345 /* If the vector is uniform or unchanged, there's nothing to do. */
5346 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5347 result = node;
5348 else
5350 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5351 result = vect_create_new_slp_node (scalar_ops);
5352 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5355 else
5357 unsigned int partition_i = m_vertices[node->vertex].partition;
5358 unsigned int from_layout_i = m_partitions[partition_i].layout;
5359 if (from_layout_i == to_layout_i)
5360 return node;
5362 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5363 permutation instead of a serial one. Leave the new permutation
5364 in TMP_PERM on success. */
5365 auto_lane_permutation_t tmp_perm;
5366 unsigned int num_inputs = 1;
5367 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5369 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5370 if (from_layout_i != 0)
5371 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5372 if (to_layout_i != 0)
5373 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5374 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5375 tmp_perm,
5376 SLP_TREE_CHILDREN (node),
5377 false) >= 0)
5378 num_inputs = SLP_TREE_CHILDREN (node).length ();
5379 else
5380 tmp_perm.truncate (0);
5383 if (dump_enabled_p ())
5385 if (tmp_perm.length () > 0)
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "duplicating permutation node %p with"
5388 " layout %d\n",
5389 (void *) node, to_layout_i);
5390 else
5391 dump_printf_loc (MSG_NOTE, vect_location,
5392 "inserting permutation node in place of %p\n",
5393 (void *) node);
5396 unsigned int num_lanes = SLP_TREE_LANES (node);
5397 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5398 if (SLP_TREE_SCALAR_STMTS (node).length ())
5400 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5401 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5402 if (from_layout_i != 0)
5403 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5404 if (to_layout_i != 0)
5405 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5407 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5408 SLP_TREE_LANES (result) = num_lanes;
5409 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5410 result->vertex = -1;
5412 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5413 if (tmp_perm.length ())
5415 lane_perm.safe_splice (tmp_perm);
5416 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5418 else
5420 lane_perm.create (num_lanes);
5421 for (unsigned j = 0; j < num_lanes; ++j)
5422 lane_perm.quick_push ({ 0, j });
5423 if (from_layout_i != 0)
5424 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5425 if (to_layout_i != 0)
5426 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5427 SLP_TREE_CHILDREN (result).safe_push (node);
5429 for (slp_tree child : SLP_TREE_CHILDREN (result))
5430 child->refcnt++;
5432 m_node_layouts[result_i] = result;
5433 return result;
5436 /* Apply the chosen vector layouts to the SLP graph. */
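/* This works in three steps: first the scalar stmts and the load and lane
   permutations of every partitioned node are rewritten to match the layout
   chosen for its partition, then load permutations that are no longer
   necessary are elided, and finally each child is replaced with a variant
   that has the required layout via get_result_with_layout. */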
5438 void
5439 vect_optimize_slp_pass::materialize ()
5441 /* We no longer need the costs, so avoid having two O(N * P) arrays
5442 live at the same time. */
5443 m_partition_layout_costs.release ();
5444 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5446 auto_sbitmap fully_folded (m_vertices.length ());
5447 bitmap_clear (fully_folded);
5448 for (unsigned int node_i : m_partitioned_nodes)
5450 auto &vertex = m_vertices[node_i];
5451 slp_tree node = vertex.node;
5452 int layout_i = m_partitions[vertex.partition].layout;
5453 gcc_assert (layout_i >= 0);
5455 /* Rearrange the scalar statements to match the chosen layout. */
5456 if (layout_i > 0)
5457 vect_slp_permute (m_perms[layout_i],
5458 SLP_TREE_SCALAR_STMTS (node), true);
5460 /* Update load and lane permutations. */
5461 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5463 /* First try to absorb the input vector layouts. If that fails,
5464 force the inputs to have layout LAYOUT_I too. We checked that
5465 that was possible before deciding to use nonzero output layouts.
5466 (Note that at this stage we don't really have any guarantee that
5467 the target supports the original VEC_PERM_EXPR.) */
5468 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5469 auto_lane_permutation_t tmp_perm;
5470 tmp_perm.safe_splice (perm);
5471 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5472 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5473 tmp_perm,
5474 SLP_TREE_CHILDREN (node),
5475 false) >= 0)
5477 if (dump_enabled_p ()
5478 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5479 perm.begin ()))
5480 dump_printf_loc (MSG_NOTE, vect_location,
5481 "absorbing input layouts into %p\n",
5482 (void *) node);
5483 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5484 bitmap_set_bit (fully_folded, node_i);
5486 else
5488 /* Not MSG_MISSED because it would make no sense to users. */
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_NOTE, vect_location,
5491 "failed to absorb input layouts into %p\n",
5492 (void *) node);
5493 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5496 else
5498 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5499 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5500 if (layout_i > 0)
5501 /* ??? When we handle non-bijective permutes the idea
5502 is that we can force the load-permutation to be
5503 { min, min + 1, min + 2, ... max }. But then the
5504 scalar defs might no longer match the lane content
5505 which means wrong-code with live lane vectorization.
5506 So we possibly have to have NULL entries for those. */
5507 vect_slp_permute (m_perms[layout_i], load_perm, true);
5511 /* Do this before any nodes disappear, since it involves a walk
5512 over the leaves. */
5513 remove_redundant_permutations ();
5515 /* Replace each child with a correctly laid-out version. */
5516 for (unsigned int node_i : m_partitioned_nodes)
5518 /* Skip nodes that have already been handled above. */
5519 if (bitmap_bit_p (fully_folded, node_i))
5520 continue;
5522 auto &vertex = m_vertices[node_i];
5523 int in_layout_i = m_partitions[vertex.partition].layout;
5524 gcc_assert (in_layout_i >= 0);
5526 unsigned j;
5527 slp_tree child;
5528 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5530 if (!child)
5531 continue;
5533 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5534 if (new_child != child)
5536 vect_free_slp_tree (child);
5537 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5538 new_child->refcnt += 1;
5544 /* Elide load permutations that are not necessary. Such permutations might
5545 be pre-existing, rather than created by the layout optimizations. */
5547 void
5548 vect_optimize_slp_pass::remove_redundant_permutations ()
5550 for (unsigned int node_i : m_leafs)
5552 slp_tree node = m_vertices[node_i].node;
5553 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5554 continue;
5556 /* In basic block vectorization we allow any subchain of an interleaving
5557 chain.
5558 FORNOW: not in loop SLP because of realignment complications. */
5559 if (is_a <bb_vec_info> (m_vinfo))
5561 bool subchain_p = true;
5562 stmt_vec_info next_load_info = NULL;
5563 stmt_vec_info load_info;
5564 unsigned j;
5565 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5567 if (j != 0
5568 && (next_load_info != load_info
5569 || DR_GROUP_GAP (load_info) != 1))
5571 subchain_p = false;
5572 break;
5574 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5576 if (subchain_p)
5578 SLP_TREE_LOAD_PERMUTATION (node).release ();
5579 continue;
5582 else
5584 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5585 stmt_vec_info load_info;
5586 bool this_load_permuted = false;
5587 unsigned j;
5588 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5589 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5591 this_load_permuted = true;
5592 break;
5594 /* When this isn't a grouped access we know it's a single-element,
5595 contiguous access. */
5596 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5598 if (!this_load_permuted
5599 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5600 || SLP_TREE_LANES (node) == 1))
5601 SLP_TREE_LOAD_PERMUTATION (node).release ();
5602 continue;
5604 stmt_vec_info first_stmt_info
5605 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5606 if (!this_load_permuted
5607 /* The load requires permutation when unrolling exposes
5608 a gap either because the group is larger than the SLP
5609 group-size or because there is a gap between the groups. */
5610 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5611 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5612 && DR_GROUP_GAP (first_stmt_info) == 0)))
5614 SLP_TREE_LOAD_PERMUTATION (node).release ();
5615 continue;
5621 /* Print the partition graph and layout information to the dump file. */
5623 void
5624 vect_optimize_slp_pass::dump ()
5626 dump_printf_loc (MSG_NOTE, vect_location,
5627 "SLP optimize permutations:\n");
5628 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5630 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5631 const char *sep = "";
5632 for (unsigned int idx : m_perms[layout_i])
5634 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5635 sep = ", ";
5637 dump_printf (MSG_NOTE, " }\n");
5639 dump_printf_loc (MSG_NOTE, vect_location,
5640 "SLP optimize partitions:\n");
5641 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5642 ++partition_i)
5644 auto &partition = m_partitions[partition_i];
5645 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 " partition %d (layout %d):\n",
5648 partition_i, partition.layout);
5649 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5650 for (unsigned int order_i = partition.node_begin;
5651 order_i < partition.node_end; ++order_i)
5653 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5654 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5655 (void *) vertex.node);
5656 dump_printf_loc (MSG_NOTE, vect_location,
5657 " weight: %f\n",
5658 vertex.weight.to_double ());
5659 if (vertex.out_degree)
5660 dump_printf_loc (MSG_NOTE, vect_location,
5661 " out weight: %f (degree %d)\n",
5662 vertex.out_weight.to_double (),
5663 vertex.out_degree);
5664 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5665 dump_printf_loc (MSG_NOTE, vect_location,
5666 " op: VEC_PERM_EXPR\n");
5667 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5668 dump_printf_loc (MSG_NOTE, vect_location,
5669 " op template: %G", rep->stmt);
5671 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5672 for (unsigned int order_i = partition.node_begin;
5673 order_i < partition.node_end; ++order_i)
5675 unsigned int node_i = m_partitioned_nodes[order_i];
5676 auto &vertex = m_vertices[node_i];
5677 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5679 auto &other_vertex = m_vertices[other_node_i];
5680 if (other_vertex.partition < vertex.partition)
5681 dump_printf_loc (MSG_NOTE, vect_location,
5682 " - %p [%d] --> %p\n",
5683 (void *) other_vertex.node,
5684 other_vertex.partition,
5685 (void *) vertex.node);
5686 else
5687 dump_printf_loc (MSG_NOTE, vect_location,
5688 " - %p --> [%d] %p\n",
5689 (void *) vertex.node,
5690 other_vertex.partition,
5691 (void *) other_vertex.node);
5693 for_each_partition_edge (node_i, print_edge);
5696 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5698 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5699 if (layout_costs.is_possible ())
5701 dump_printf_loc (MSG_NOTE, vect_location,
5702 " layout %d:%s\n", layout_i,
5703 partition.layout == int (layout_i)
5704 ? " (*)" : "");
5705 slpg_layout_cost combined_cost = layout_costs.in_cost;
5706 combined_cost.add_serial_cost (layout_costs.internal_cost);
5707 combined_cost.add_serial_cost (layout_costs.out_cost);
5708 #define TEMPLATE "{depth: %f, total: %f}"
5709 dump_printf_loc (MSG_NOTE, vect_location,
5710 " " TEMPLATE "\n",
5711 layout_costs.in_cost.depth.to_double (),
5712 layout_costs.in_cost.total.to_double ());
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 " + " TEMPLATE "\n",
5715 layout_costs.internal_cost.depth.to_double (),
5716 layout_costs.internal_cost.total.to_double ());
5717 dump_printf_loc (MSG_NOTE, vect_location,
5718 " + " TEMPLATE "\n",
5719 layout_costs.out_cost.depth.to_double (),
5720 layout_costs.out_cost.total.to_double ());
5721 dump_printf_loc (MSG_NOTE, vect_location,
5722 " = " TEMPLATE "\n",
5723 combined_cost.depth.to_double (),
5724 combined_cost.total.to_double ());
5725 #undef TEMPLATE
5727 else
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " layout %d: rejected\n", layout_i);
5734 /* Main entry point for the SLP graph optimization pass. */
5736 void
5737 vect_optimize_slp_pass::run ()
5739 build_graph ();
5740 create_partitions ();
5741 start_choosing_layouts ();
5742 if (m_perms.length () > 1)
5744 forward_pass ();
5745 backward_pass ();
5746 if (dump_enabled_p ())
5747 dump ();
5748 materialize ();
5749 while (!m_perms.is_empty ())
5750 m_perms.pop ().release ();
5752 else
5753 remove_redundant_permutations ();
5754 free_graph (m_slpg);
5757 /* Optimize the SLP graph of VINFO. */
5759 void
5760 vect_optimize_slp (vec_info *vinfo)
5762 if (vinfo->slp_instances.is_empty ())
5763 return;
5764 vect_optimize_slp_pass (vinfo).run ();
5767 /* Gather loads reachable from the individual SLP graph entries. */
5769 void
5770 vect_gather_slp_loads (vec_info *vinfo)
5772 unsigned i;
5773 slp_instance instance;
5774 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5776 hash_set<slp_tree> visited;
5777 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5778 SLP_INSTANCE_TREE (instance), visited);
5783 /* For each possible SLP instance decide whether to SLP it and calculate overall
5784 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5785 least one instance. */
5787 bool
5788 vect_make_slp_decision (loop_vec_info loop_vinfo)
5790 unsigned int i;
5791 poly_uint64 unrolling_factor = 1;
5792 const vec<slp_instance> &slp_instances
5793 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5794 slp_instance instance;
5795 int decided_to_slp = 0;
5797 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5799 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5801 /* FORNOW: SLP if you can. */
5802 /* All unroll factors have the form:
5804 GET_MODE_SIZE (vinfo->vector_mode) * X
5806 for some rational X, so they must have a common multiple. */
5807 unrolling_factor
5808 = force_common_multiple (unrolling_factor,
5809 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5811 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5812 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5813 loop-based vectorization. Such stmts will be marked as HYBRID. */
5814 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5815 decided_to_slp++;
5818 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5820 if (decided_to_slp && dump_enabled_p ())
5822 dump_printf_loc (MSG_NOTE, vect_location,
5823 "Decided to SLP %d instances. Unrolling factor ",
5824 decided_to_slp);
5825 dump_dec (MSG_NOTE, unrolling_factor);
5826 dump_printf (MSG_NOTE, "\n");
5829 return (decided_to_slp > 0);
5832 /* Private data for vect_detect_hybrid_slp. */
5833 struct vdhs_data
5835 loop_vec_info loop_vinfo;
5836 vec<stmt_vec_info> *worklist;
5839 /* Walker for walk_gimple_op. */
5841 static tree
5842 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5844 walk_stmt_info *wi = (walk_stmt_info *)data;
5845 vdhs_data *dat = (vdhs_data *)wi->info;
5847 if (wi->is_lhs)
5848 return NULL_TREE;
5850 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5851 if (!def_stmt_info)
5852 return NULL_TREE;
5853 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5854 if (PURE_SLP_STMT (def_stmt_info))
5856 if (dump_enabled_p ())
5857 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5858 def_stmt_info->stmt);
5859 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5860 dat->worklist->safe_push (def_stmt_info);
5863 return NULL_TREE;
5866 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5867 if so, otherwise push it to WORKLIST. */
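/* Concretely, the stmt is kept loop_vect (pushed to WORKLIST) if any of its
   defs is used outside the loop or by a stmt that is not pure SLP, or if it
   has no defs at all (e.g. a store); only when every use is covered by SLP
   is it marked pure_slp here. */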
5869 static void
5870 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5871 vec<stmt_vec_info> &worklist,
5872 stmt_vec_info stmt_info)
5874 if (dump_enabled_p ())
5875 dump_printf_loc (MSG_NOTE, vect_location,
5876 "Processing hybrid candidate : %G", stmt_info->stmt);
5877 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5878 imm_use_iterator iter2;
5879 ssa_op_iter iter1;
5880 use_operand_p use_p;
5881 def_operand_p def_p;
5882 bool any_def = false;
5883 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5885 any_def = true;
5886 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5888 if (is_gimple_debug (USE_STMT (use_p)))
5889 continue;
5890 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5892 /* An out-of-loop use means this is a loop_vect sink. */
5892 if (!use_info)
5894 if (dump_enabled_p ())
5895 dump_printf_loc (MSG_NOTE, vect_location,
5896 "Found loop_vect sink: %G", stmt_info->stmt);
5897 worklist.safe_push (stmt_info);
5898 return;
5900 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5902 if (dump_enabled_p ())
5903 dump_printf_loc (MSG_NOTE, vect_location,
5904 "Found loop_vect use: %G", use_info->stmt);
5905 worklist.safe_push (stmt_info);
5906 return;
5910 /* No def means this is a loop_vect sink. */
5911 if (!any_def)
5913 if (dump_enabled_p ())
5914 dump_printf_loc (MSG_NOTE, vect_location,
5915 "Found loop_vect sink: %G", stmt_info->stmt);
5916 worklist.safe_push (stmt_info);
5917 return;
5919 if (dump_enabled_p ())
5920 dump_printf_loc (MSG_NOTE, vect_location,
5921 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5922 STMT_SLP_TYPE (stmt_info) = pure_slp;
5925 /* Find stmts that must be both vectorized and SLPed. */
5927 void
5928 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5930 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5932 /* All stmts participating in SLP are marked pure_slp, all other
5933 stmts are loop_vect.
5934 First collect all loop_vect stmts into a worklist.
5935 SLP patterns cause not all original scalar stmts to appear in
5936 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5937 Rectify this here by doing a backward walk over the IL, considering
5938 stmts as loop_vect only when they are used by a loop_vect stmt, and
5939 otherwise marking them as pure_slp. */
5940 auto_vec<stmt_vec_info> worklist;
5941 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5943 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5944 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5945 gsi_next (&gsi))
5947 gphi *phi = gsi.phi ();
5948 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5949 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5950 maybe_push_to_hybrid_worklist (loop_vinfo,
5951 worklist, stmt_info);
5953 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5954 gsi_prev (&gsi))
5956 gimple *stmt = gsi_stmt (gsi);
5957 if (is_gimple_debug (stmt))
5958 continue;
5959 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5960 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5962 for (gimple_stmt_iterator gsi2
5963 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5964 !gsi_end_p (gsi2); gsi_next (&gsi2))
5966 stmt_vec_info patt_info
5967 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5968 if (!STMT_SLP_TYPE (patt_info)
5969 && STMT_VINFO_RELEVANT (patt_info))
5970 maybe_push_to_hybrid_worklist (loop_vinfo,
5971 worklist, patt_info);
5973 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5975 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5976 maybe_push_to_hybrid_worklist (loop_vinfo,
5977 worklist, stmt_info);
5981 /* Now that we have a worklist of non-SLP stmts, follow use->def chains and
5982 mark any SLP vectorized stmt as hybrid.
5983 ??? We're visiting def stmts N times (once for each non-SLP and
5984 once for each hybrid-SLP use). */
5985 walk_stmt_info wi;
5986 vdhs_data dat;
5987 dat.worklist = &worklist;
5988 dat.loop_vinfo = loop_vinfo;
5989 memset (&wi, 0, sizeof (wi));
5990 wi.info = (void *)&dat;
5991 while (!worklist.is_empty ())
5993 stmt_vec_info stmt_info = worklist.pop ();
5994 /* Since SSA operands are not set up for pattern stmts we need
5995 to use walk_gimple_op. */
5996 wi.is_lhs = 0;
5997 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5998 /* For gather/scatter make sure to walk the offset operand, that
5999 can be a scaling and conversion away. */
6000 gather_scatter_info gs_info;
6001 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6002 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6004 int dummy;
6005 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6011 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6013 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6014 : vec_info (vec_info::bb, shared),
6015 bbs (_bbs),
6016 roots (vNULL)
6018 for (unsigned i = 0; i < bbs.length (); ++i)
6020 if (i != 0)
6021 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6022 gsi_next (&si))
6024 gphi *phi = si.phi ();
6025 gimple_set_uid (phi, 0);
6026 add_stmt (phi);
6028 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6029 !gsi_end_p (gsi); gsi_next (&gsi))
6031 gimple *stmt = gsi_stmt (gsi);
6032 gimple_set_uid (stmt, 0);
6033 if (is_gimple_debug (stmt))
6034 continue;
6035 add_stmt (stmt);
6041 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6042 stmts in the basic blocks. */
6044 _bb_vec_info::~_bb_vec_info ()
6046 /* Reset region marker. */
6047 for (unsigned i = 0; i < bbs.length (); ++i)
6049 if (i != 0)
6050 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6051 gsi_next (&si))
6053 gphi *phi = si.phi ();
6054 gimple_set_uid (phi, -1);
6056 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6057 !gsi_end_p (gsi); gsi_next (&gsi))
6059 gimple *stmt = gsi_stmt (gsi);
6060 gimple_set_uid (stmt, -1);
6064 for (unsigned i = 0; i < roots.length (); ++i)
6066 roots[i].stmts.release ();
6067 roots[i].roots.release ();
6068 roots[i].remain.release ();
6070 roots.release ();
6073 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6074 given that child nodes have already been processed, and that
6075 their def types currently match their SLP node's def type. */
6077 static bool
6078 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6079 slp_instance node_instance,
6080 stmt_vector_for_cost *cost_vec)
6082 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6084 /* Calculate the number of vector statements to be created for the
6085 scalar stmts in this node. For SLP reductions it is equal to the
6086 number of vector statements in the children (which has already been
6087 calculated by the recursive call). Otherwise it is the number of
6088 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6089 VF divided by the number of elements in a vector. */
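/* For example, with a group size of 4, a vectorization factor of 8 and a
   4-element vector type the latter case gives (4 * 8) / 4 = 8 vector
   statements. */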
6090 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6091 && !STMT_VINFO_DATA_REF (stmt_info)
6092 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6094 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6095 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6097 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6098 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6099 break;
6102 else
6104 poly_uint64 vf;
6105 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6106 vf = loop_vinfo->vectorization_factor;
6107 else
6108 vf = 1;
6109 unsigned int group_size = SLP_TREE_LANES (node);
6110 tree vectype = SLP_TREE_VECTYPE (node);
6111 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6112 = vect_get_num_vectors (vf * group_size, vectype);
6115 /* Handle purely internal nodes. */
6116 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6118 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6119 return false;
6121 stmt_vec_info slp_stmt_info;
6122 unsigned int i;
6123 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6125 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6126 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6127 node_instance, i,
6128 false, cost_vec))
6129 return false;
6131 return true;
6134 bool dummy;
6135 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6136 node, node_instance, cost_vec);
6139 /* Try to build NODE from scalars, returning true on success.
6140 NODE_INSTANCE is the SLP instance that contains NODE. */
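/* "Building from scalars" means demoting NODE to vect_external_def and
   recording the scalar lhs of each of its stmts as SLP_TREE_SCALAR_OPS, so
   that referring nodes construct the vector from those defs instead. */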
6142 static bool
6143 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6144 slp_instance node_instance)
6146 stmt_vec_info stmt_info;
6147 unsigned int i;
6149 if (!is_a <bb_vec_info> (vinfo)
6150 || node == SLP_INSTANCE_TREE (node_instance)
6151 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6152 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6153 /* Force the mask use to be built from scalars instead. */
6154 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6155 return false;
6157 if (dump_enabled_p ())
6158 dump_printf_loc (MSG_NOTE, vect_location,
6159 "Building vector operands of %p from scalars instead\n",
6160 (void *) node);
6162 /* Don't remove and free the child nodes here, since they could be
6163 referenced by other structures. The analysis and scheduling phases
6164 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6165 unsigned int group_size = SLP_TREE_LANES (node);
6166 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6167 /* Invariants get their vector type from the uses. */
6168 SLP_TREE_VECTYPE (node) = NULL_TREE;
6169 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6170 SLP_TREE_LOAD_PERMUTATION (node).release ();
6171 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6173 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6174 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6176 return true;
6179 /* Return true if all elements of the slice are the same. */
6180 bool
6181 vect_scalar_ops_slice::all_same_p () const
6183 for (unsigned int i = 1; i < length; ++i)
6184 if (!operand_equal_p (op (0), op (i)))
6185 return false;
6186 return true;
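/* Hash a slice of scalar operands by iteratively hashing each element. */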
6189 hashval_t
6190 vect_scalar_ops_slice_hash::hash (const value_type &s)
6192 hashval_t hash = 0;
6193 for (unsigned i = 0; i < s.length; ++i)
6194 hash = iterative_hash_expr (s.op (i), hash);
6195 return hash;
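/* Return true if two slices have the same length and pairwise equal
   operands. */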
6198 bool
6199 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6200 const compare_type &s2)
6202 if (s1.length != s2.length)
6203 return false;
6204 for (unsigned i = 0; i < s1.length; ++i)
6205 if (!operand_equal_p (s1.op (i), s2.op (i)))
6206 return false;
6207 return true;
6210 /* Compute the prologue cost for invariant or constant operands represented
6211 by NODE. */
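/* Roughly: a vector of constants is costed as a load from the constant
   pool, a uniform invariant as a single scalar_to_vec splat and any other
   invariant vector as a vec_construct, once for each distinct vector that
   needs to be built. */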
6213 static void
6214 vect_prologue_cost_for_slp (slp_tree node,
6215 stmt_vector_for_cost *cost_vec)
6217 /* There's a special case of an existing vector, which costs nothing. */
6218 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6219 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6220 return;
6221 /* Without looking at the actual initializer a vector of
6222 constants can be implemented as load from the constant pool.
6223 When all elements are the same we can use a splat. */
6224 tree vectype = SLP_TREE_VECTYPE (node);
6225 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6226 unsigned HOST_WIDE_INT const_nunits;
6227 unsigned nelt_limit;
6228 auto ops = &SLP_TREE_SCALAR_OPS (node);
6229 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6230 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6231 && ! multiple_p (const_nunits, group_size))
6233 nelt_limit = const_nunits;
6234 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6235 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6236 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6237 starts.quick_push (i * const_nunits);
6239 else
6241 /* If either the vector has variable length or the vectors
6242 are composed of repeated whole groups we only need to
6243 cost construction once. All vectors will be the same. */
6244 nelt_limit = group_size;
6245 starts.quick_push (0);
6247 /* ??? We're just tracking whether vectors in a single node are the same.
6248 Ideally we'd do something more global. */
6249 bool passed = false;
6250 for (unsigned int start : starts)
6252 vect_cost_for_stmt kind;
6253 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6254 kind = vector_load;
6255 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6256 kind = scalar_to_vec;
6257 else
6258 kind = vec_construct;
6259 /* The target cost hook has no idea which part of the SLP node
6260 we are costing so avoid passing it down more than once. Pass
6261 it to the first vec_construct or scalar_to_vec part since for those
6262 the x86 backend tries to account for GPR to XMM register moves. */
6263 record_stmt_cost (cost_vec, 1, kind,
6264 (kind != vector_load && !passed) ? node : nullptr,
6265 vectype, 0, vect_prologue);
6266 if (kind != vector_load)
6267 passed = true;
6271 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6272 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6274 Return true if the operations are supported. */
6276 static bool
6277 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6278 slp_instance node_instance,
6279 hash_set<slp_tree> &visited_set,
6280 vec<slp_tree> &visited_vec,
6281 stmt_vector_for_cost *cost_vec)
6283 int i, j;
6284 slp_tree child;
6286 /* Assume we can code-generate all invariants. */
6287 if (!node
6288 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6289 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6290 return true;
6292 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6294 if (dump_enabled_p ())
6295 dump_printf_loc (MSG_NOTE, vect_location,
6296 "Failed cyclic SLP reference in %p\n", (void *) node);
6297 return false;
6299 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6301 /* If we already analyzed the exact same set of scalar stmts we're done.
6302 We share the generated vector stmts for those. */
6303 if (visited_set.add (node))
6304 return true;
6305 visited_vec.safe_push (node);
6307 bool res = true;
6308 unsigned visited_rec_start = visited_vec.length ();
6309 unsigned cost_vec_rec_start = cost_vec->length ();
6310 bool seen_non_constant_child = false;
6311 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6313 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6314 visited_set, visited_vec,
6315 cost_vec);
6316 if (!res)
6317 break;
6318 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6319 seen_non_constant_child = true;
6321 /* We're having difficulties scheduling nodes with just constant
6322 operands and no scalar stmts since we then cannot compute a stmt
6323 insertion place. */
6324 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6326 if (dump_enabled_p ())
6327 dump_printf_loc (MSG_NOTE, vect_location,
6328 "Cannot vectorize all-constant op node %p\n",
6329 (void *) node);
6330 res = false;
6333 if (res)
6334 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6335 cost_vec);
6336 /* If analysis failed we have to pop all recursive visited nodes
6337 plus ourselves. */
6338 if (!res)
6340 while (visited_vec.length () >= visited_rec_start)
6341 visited_set.remove (visited_vec.pop ());
6342 cost_vec->truncate (cost_vec_rec_start);
6345 /* When the node can be vectorized, cost invariant nodes it references.
6346 This is not done in DFS order to allow the referring node's
6347 vectorizable_* calls to nail down the invariant nodes' vector type
6348 and possibly unshare it if it needs a different vector type than
6349 other referrers. */
6350 if (res)
6351 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6352 if (child
6353 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6354 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6355 /* Perform usual caching; note that code generation still
6356 code-gens these nodes multiple times, but we expect
6357 to CSE them later. */
6358 && !visited_set.add (child))
6360 visited_vec.safe_push (child);
6361 /* ??? After auditing more code paths make a "default"
6362 and push the vector type from NODE to all children
6363 if it is not already set. */
6364 /* Compute the number of vectors to be generated. */
6365 tree vector_type = SLP_TREE_VECTYPE (child);
6366 if (!vector_type)
6368 /* For shifts with a scalar argument we don't need
6369 to cost or code-generate anything.
6370 ??? Represent this more explicitly. */
6371 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6372 == shift_vec_info_type)
6373 && j == 1);
6374 continue;
6376 unsigned group_size = SLP_TREE_LANES (child);
6377 poly_uint64 vf = 1;
6378 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6379 vf = loop_vinfo->vectorization_factor;
6380 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6381 = vect_get_num_vectors (vf * group_size, vector_type);
6382 /* And cost them. */
6383 vect_prologue_cost_for_slp (child, cost_vec);
6386 /* If this node or any of its children can't be vectorized, try pruning
6387 the tree here rather than felling the whole thing. */
6388 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6390 /* We'll need to revisit this for invariant costing and for
6391 setting the number of vectorized stmts. */
6392 res = true;
6395 return res;
6398 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6399 region and that can be vectorized using vectorizable_live_operation
6400 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6401 the scalar code computing them to be retained. */
6403 static void
6404 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6405 slp_instance instance,
6406 stmt_vector_for_cost *cost_vec,
6407 hash_set<stmt_vec_info> &svisited,
6408 hash_set<slp_tree> &visited)
6410 if (visited.add (node))
6411 return;
6413 unsigned i;
6414 stmt_vec_info stmt_info;
6415 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6418 if (svisited.contains (stmt_info))
6419 continue;
6420 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6421 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6422 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6423 /* Only the pattern root stmt computes the original scalar value. */
6424 continue;
6425 bool mark_visited = true;
6426 gimple *orig_stmt = orig_stmt_info->stmt;
6427 ssa_op_iter op_iter;
6428 def_operand_p def_p;
6429 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6431 imm_use_iterator use_iter;
6432 gimple *use_stmt;
6433 stmt_vec_info use_stmt_info;
6434 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6435 if (!is_gimple_debug (use_stmt))
6437 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6438 if (!use_stmt_info
6439 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6441 STMT_VINFO_LIVE_P (stmt_info) = true;
6442 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6443 node, instance, i,
6444 false, cost_vec))
6445 /* ??? So we know we can vectorize the live stmt
6446 from one SLP node. If we cannot do so from all
6447 or none consistently we'd have to record which
6448 SLP node (and lane) we want to use for the live
6449 operation. So make sure we can code-generate
6450 from all nodes. */
6451 mark_visited = false;
6452 else
6453 STMT_VINFO_LIVE_P (stmt_info) = false;
6454 break;
6457 /* We have to verify whether we can insert the lane extract
6458 before all uses. The following is a conservative approximation.
6459 We cannot put this into vectorizable_live_operation because
6460 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6461 doesn't work.
6462 Note that while the fact that we emit code for loads at the
6463 first load should make this a non-problem, leaves we construct
6464 from scalars are vectorized after the last scalar def.
6465 ??? If we'd actually compute the insert location during
6466 analysis we could use something less conservative than the last
6467 scalar stmt in the node for the dominance check. */
6468 /* ??? What remains is "live" uses in vector CTORs in the same
6469 SLP graph which is where those uses can end up code-generated
6470 right after their definition instead of close to their original
6471 use. But that would restrict us to code-generate lane-extracts
6472 from the latest stmt in a node. So we compensate for this
6473 during code-generation, simply not replacing uses for those
6474 hopefully rare cases. */
6475 if (STMT_VINFO_LIVE_P (stmt_info))
6476 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6477 if (!is_gimple_debug (use_stmt)
6478 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6479 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6480 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6482 if (dump_enabled_p ())
6483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6484 "Cannot determine insertion place for "
6485 "lane extract\n");
6486 STMT_VINFO_LIVE_P (stmt_info) = false;
6487 mark_visited = true;
6490 if (mark_visited)
6491 svisited.add (stmt_info);
6494 slp_tree child;
6495 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6496 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6497 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6498 cost_vec, svisited, visited);
6501 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6503 static bool
6504 vectorizable_bb_reduc_epilogue (slp_instance instance,
6505 stmt_vector_for_cost *cost_vec)
6507 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6508 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6509 if (reduc_code == MINUS_EXPR)
6510 reduc_code = PLUS_EXPR;
6511 internal_fn reduc_fn;
6512 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6513 if (!vectype
6514 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6515 || reduc_fn == IFN_LAST
6516 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6517 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6518 TREE_TYPE (vectype)))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "not vectorized: basic block reduction epilogue "
6523 "operation unsupported.\n");
6524 return false;
6527 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6528 cost log2 vector operations plus shuffles and one extraction. */
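/* For example, for an 8-element vector this records 3 vector_stmt and
   3 vec_perm costs plus one vec_to_scalar cost for the final extraction. */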
6529 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6530 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6531 vectype, 0, vect_body);
6532 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6533 vectype, 0, vect_body);
6534 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6535 vectype, 0, vect_body);
6537 /* Since we replace all stmts of a possibly longer scalar reduction
6538 chain, account for the extra scalar stmts for that. */
6539 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6540 instance->root_stmts[0], 0, vect_body);
6541 return true;
6544 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6545 and recurse to children. */
6547 static void
6548 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6549 hash_set<slp_tree> &visited)
6551 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6552 || visited.add (node))
6553 return;
6555 stmt_vec_info stmt;
6556 unsigned i;
6557 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6558 roots.remove (vect_orig_stmt (stmt));
6560 slp_tree child;
6561 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6562 if (child)
6563 vect_slp_prune_covered_roots (child, roots, visited);
6566 /* Analyze statements in SLP instances of VINFO. Return true if the
6567 operations are supported. */
6569 bool
6570 vect_slp_analyze_operations (vec_info *vinfo)
6572 slp_instance instance;
6573 int i;
6575 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6577 hash_set<slp_tree> visited;
6578 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6580 auto_vec<slp_tree> visited_vec;
6581 stmt_vector_for_cost cost_vec;
6582 cost_vec.create (2);
6583 if (is_a <bb_vec_info> (vinfo))
6584 vect_location = instance->location ();
6585 if (!vect_slp_analyze_node_operations (vinfo,
6586 SLP_INSTANCE_TREE (instance),
6587 instance, visited, visited_vec,
6588 &cost_vec)
6589 /* CTOR instances require vectorized defs for the SLP tree root. */
6590 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6591 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6592 != vect_internal_def
6593 /* Make sure we vectorized with the expected type. */
6594 || !useless_type_conversion_p
6595 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6596 (instance->root_stmts[0]->stmt))),
6597 TREE_TYPE (SLP_TREE_VECTYPE
6598 (SLP_INSTANCE_TREE (instance))))))
6599 /* Check we can vectorize the reduction. */
6600 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6601 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6603 slp_tree node = SLP_INSTANCE_TREE (instance);
6604 stmt_vec_info stmt_info;
6605 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6606 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6607 else
6608 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6609 if (dump_enabled_p ())
6610 dump_printf_loc (MSG_NOTE, vect_location,
6611 "removing SLP instance operations starting from: %G",
6612 stmt_info->stmt);
6613 vect_free_slp_instance (instance);
6614 vinfo->slp_instances.ordered_remove (i);
6615 cost_vec.release ();
6616 while (!visited_vec.is_empty ())
6617 visited.remove (visited_vec.pop ());
6619 else
6621 i++;
6622 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6624 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6625 cost_vec.release ();
6627 else
6628 /* For BB vectorization remember the SLP graph entry
6629 cost for later. */
6630 instance->cost_vec = cost_vec;
6634 /* Now look for SLP instances with a root that are covered by other
6635 instances and remove them. */
6636 hash_set<stmt_vec_info> roots;
6637 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6638 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6639 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6640 if (!roots.is_empty ())
6642 visited.empty ();
6643 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6644 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6645 visited);
6646 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6647 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6648 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6650 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6651 if (dump_enabled_p ())
6652 dump_printf_loc (MSG_NOTE, vect_location,
6653 "removing SLP instance operations starting "
6654 "from: %G", root->stmt);
6655 vect_free_slp_instance (instance);
6656 vinfo->slp_instances.ordered_remove (i);
6658 else
6659 ++i;
6662 /* Compute vectorizable live stmts. */
6663 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6665 hash_set<stmt_vec_info> svisited;
6666 hash_set<slp_tree> visited;
6667 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6669 vect_location = instance->location ();
6670 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6671 instance, &instance->cost_vec, svisited,
6672 visited);
6676 return !vinfo->slp_instances.is_empty ();
6679 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6680 closing the eventual chain. */
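/* This is effectively a union-find "find" with path compression: the
   entries visited along the chain are rewritten to point directly at the
   ultimate leader before it is returned. */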
6682 static slp_instance
6683 get_ultimate_leader (slp_instance instance,
6684 hash_map<slp_instance, slp_instance> &instance_leader)
6686 auto_vec<slp_instance *, 8> chain;
6687 slp_instance *tem;
6688 while (*(tem = instance_leader.get (instance)) != instance)
6690 chain.safe_push (tem);
6691 instance = *tem;
6693 while (!chain.is_empty ())
6694 *chain.pop () = instance;
6695 return instance;
6698 namespace {
6699 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6700 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6701 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6703 INSTANCE_LEADER is as for get_ultimate_leader. */
6705 template<typename T>
6706 bool
6707 vect_map_to_instance (slp_instance instance, T key,
6708 hash_map<T, slp_instance> &key_to_instance,
6709 hash_map<slp_instance, slp_instance> &instance_leader)
6711 bool existed_p;
6712 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6713 if (!existed_p)
6715 else if (key_instance != instance)
6717 /* If we're running into a previously marked key make us the
6718 leader of the current ultimate leader. This keeps the
6719 leader chain acyclic and works even when the current instance
6720 connects two previously independent graph parts. */
6721 slp_instance key_leader
6722 = get_ultimate_leader (key_instance, instance_leader);
6723 if (key_leader != instance)
6724 instance_leader.put (key_leader, instance);
6726 key_instance = instance;
6727 return existed_p;
6731 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6733 static void
6734 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6735 slp_instance instance, slp_tree node,
6736 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6737 hash_map<slp_tree, slp_instance> &node_to_instance,
6738 hash_map<slp_instance, slp_instance> &instance_leader)
6740 stmt_vec_info stmt_info;
6741 unsigned i;
6743 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6744 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6745 instance_leader);
6747 if (vect_map_to_instance (instance, node, node_to_instance,
6748 instance_leader))
6749 return;
6751 slp_tree child;
6752 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6753 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6754 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6755 node_to_instance, instance_leader);
6758 /* Partition the SLP graph into pieces that can be costed independently. */
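/* Each scalar stmt and each SLP node is mapped to the instance that first
   reaches it; when two instances turn out to share a stmt or node, their
   leader entries are merged so that connected instances end up in the same
   subgraph. */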
6760 static void
6761 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6763 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6765 /* First walk the SLP graph assigning each involved scalar stmt a
6766 corresponding SLP graph entry and upon visiting a previously
6767 marked stmt, make the stmt's leader the current SLP graph entry. */
6768 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6769 hash_map<slp_tree, slp_instance> node_to_instance;
6770 hash_map<slp_instance, slp_instance> instance_leader;
6771 slp_instance instance;
6772 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6774 instance_leader.put (instance, instance);
6775 vect_bb_partition_graph_r (bb_vinfo,
6776 instance, SLP_INSTANCE_TREE (instance),
6777 stmt_to_instance, node_to_instance,
6778 instance_leader);
6781 /* Then collect entries to each independent subgraph. */
6782 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6784 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6785 leader->subgraph_entries.safe_push (instance);
6786 if (dump_enabled_p ()
6787 && leader != instance)
6788 dump_printf_loc (MSG_NOTE, vect_location,
6789 "instance %p is leader of %p\n",
6790 (void *) leader, (void *) instance);
6794 /* Compute the set of scalar stmts participating in internal and external
6795 nodes. */
6797 static void
6798 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6799 hash_set<slp_tree> &visited,
6800 hash_set<stmt_vec_info> &vstmts,
6801 hash_set<stmt_vec_info> &estmts)
6803 int i;
6804 stmt_vec_info stmt_info;
6805 slp_tree child;
6807 if (visited.add (node))
6808 return;
6810 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6812 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6813 vstmts.add (stmt_info);
6815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6816 if (child)
6817 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6818 vstmts, estmts);
6820 else
6821 for (tree def : SLP_TREE_SCALAR_OPS (node))
6823 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6824 if (def_stmt)
6825 estmts.add (def_stmt);
6830 /* Compute the scalar cost of the SLP node NODE and its children
6831 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6832 update LIFE according to uses of NODE. */
6834 static void
6835 vect_bb_slp_scalar_cost (vec_info *vinfo,
6836 slp_tree node, vec<bool, va_heap> *life,
6837 stmt_vector_for_cost *cost_vec,
6838 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6839 hash_set<slp_tree> &visited)
6841 unsigned i;
6842 stmt_vec_info stmt_info;
6843 slp_tree child;
6845 if (visited.add (node))
6846 return;
6848 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6850 ssa_op_iter op_iter;
6851 def_operand_p def_p;
6853 if ((*life)[i])
6854 continue;
6856 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6857 gimple *orig_stmt = orig_stmt_info->stmt;
6859 /* If there is a non-vectorized use of the defs then the scalar
6860 stmt is kept live, in which case we do not account it or any
6861 required defs in the SLP children in the scalar cost. This
6862 way we make the vectorization more costly when compared to
6863 the scalar cost. */
6864 if (!STMT_VINFO_LIVE_P (stmt_info))
6866 auto_vec<gimple *, 8> worklist;
6867 hash_set<gimple *> *worklist_visited = NULL;
6868 worklist.quick_push (orig_stmt);
6871 gimple *work_stmt = worklist.pop ();
6872 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6874 imm_use_iterator use_iter;
6875 gimple *use_stmt;
6876 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6877 DEF_FROM_PTR (def_p))
6878 if (!is_gimple_debug (use_stmt))
6880 stmt_vec_info use_stmt_info
6881 = vinfo->lookup_stmt (use_stmt);
6882 if (!use_stmt_info
6883 || !vectorized_scalar_stmts.contains (use_stmt_info))
6885 if (use_stmt_info
6886 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6888 /* For stmts participating in patterns we have
6889 to check their uses recursively. */
6890 if (!worklist_visited)
6891 worklist_visited = new hash_set<gimple *> ();
6892 if (!worklist_visited->add (use_stmt))
6893 worklist.safe_push (use_stmt);
6894 continue;
6896 (*life)[i] = true;
6897 goto next_lane;
6902 while (!worklist.is_empty ());
6903 next_lane:
6904 if (worklist_visited)
6905 delete worklist_visited;
6906 if ((*life)[i])
6907 continue;
6910 /* Count scalar stmts only once. */
6911 if (gimple_visited_p (orig_stmt))
6912 continue;
6913 gimple_set_visited (orig_stmt, true);
6915 vect_cost_for_stmt kind;
6916 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6918 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6919 kind = scalar_load;
6920 else
6921 kind = scalar_store;
6923 else if (vect_nop_conversion_p (orig_stmt_info))
6924 continue;
6925 /* For single-argument PHIs assume coalescing which means zero cost
6926 for the scalar and the vector PHIs. This avoids artificially
6927 favoring the vector path (but may pessimize it in some cases). */
6928 else if (is_a <gphi *> (orig_stmt_info->stmt)
6929 && gimple_phi_num_args
6930 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6931 continue;
6932 else
6933 kind = scalar_stmt;
6934 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6935 SLP_TREE_VECTYPE (node), 0, vect_body);
6938 auto_vec<bool, 20> subtree_life;
6939 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6941 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6943 /* Do not directly pass LIFE to the recursive call, copy it to
6944 confine changes in the callee to the current child/subtree. */
6945 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6947 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6948 for (unsigned j = 0;
6949 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6951 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6952 if (perm.first == i)
6953 subtree_life[perm.second] = (*life)[j];
6956 else
6958 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6959 subtree_life.safe_splice (*life);
6961 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6962 vectorized_scalar_stmts, visited);
6963 subtree_life.truncate (0);
6968 /* Comparator for the loop-index sorted cost vectors. */
6970 static int
6971 li_cost_vec_cmp (const void *a_, const void *b_)
6973 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6974 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6975 if (a->first < b->first)
6976 return -1;
6977 else if (a->first == b->first)
6978 return 0;
6979 return 1;
6982 /* Check if vectorization of the basic block is profitable for the
6983 subgraph denoted by SLP_INSTANCES. */
6985 static bool
6986 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6987 vec<slp_instance> slp_instances,
6988 loop_p orig_loop)
6990 slp_instance instance;
6991 int i;
6992 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6993 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6995 if (dump_enabled_p ())
6997 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6998 hash_set<slp_tree> visited;
6999 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7000 vect_print_slp_graph (MSG_NOTE, vect_location,
7001 SLP_INSTANCE_TREE (instance), visited);
7004 /* Compute the set of scalar stmts we know will go away 'locally' when
7005 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7006 not accurate for nodes promoted extern late or for scalar stmts that
7007 are used both in extern defs and in vectorized defs. */
7008 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7009 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7010 hash_set<slp_tree> visited;
7011 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7013 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7014 SLP_INSTANCE_TREE (instance),
7015 visited,
7016 vectorized_scalar_stmts,
7017 scalar_stmts_in_externs);
7018 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7019 vectorized_scalar_stmts.add (rstmt);
7021 /* Scalar stmts used as defs in external nodes need to be preserved, so
7022 remove them from vectorized_scalar_stmts. */
7023 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7024 vectorized_scalar_stmts.remove (stmt);
7026 /* Calculate scalar cost and sum the cost for the vector stmts
7027 previously collected. */
7028 stmt_vector_for_cost scalar_costs = vNULL;
7029 stmt_vector_for_cost vector_costs = vNULL;
7030 visited.empty ();
7031 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7033 auto_vec<bool, 20> life;
7034 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7035 true);
7036 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7037 record_stmt_cost (&scalar_costs,
7038 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7039 scalar_stmt,
7040 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7041 vect_bb_slp_scalar_cost (bb_vinfo,
7042 SLP_INSTANCE_TREE (instance),
7043 &life, &scalar_costs, vectorized_scalar_stmts,
7044 visited);
7045 vector_costs.safe_splice (instance->cost_vec);
7046 instance->cost_vec.release ();
7049 if (dump_enabled_p ())
7050 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7052 /* When costing non-loop vectorization we need to consider each covered
7053 loop independently and make sure vectorization is profitable. For
7054 now we assume a loop may not be entered or may be executed an arbitrary
7055 number of iterations (??? static information can provide more
7056 precise info here) which means we can simply cost each containing
7057 loop's stmts separately. */
7059 /* First produce cost vectors sorted by loop index. */
7060 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7061 li_scalar_costs (scalar_costs.length ());
7062 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7063 li_vector_costs (vector_costs.length ());
7064 stmt_info_for_cost *cost;
7065 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7067 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7068 li_scalar_costs.quick_push (std::make_pair (l, cost));
7070 /* Use an arbitrary used loop as a fallback in case the first vector_costs
7071 entry does not have a stmt_info associated with it. */
7072 unsigned l = li_scalar_costs[0].first;
7073 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7075 /* We inherit the loop number from the previous COST; invariants, externals
7076 and extracts immediately follow the cost for the related stmt. */
7077 if (cost->stmt_info)
7078 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7079 li_vector_costs.quick_push (std::make_pair (l, cost));
7081 li_scalar_costs.qsort (li_cost_vec_cmp);
7082 li_vector_costs.qsort (li_cost_vec_cmp);
7084 /* Now cost the portions individually. */
7085 unsigned vi = 0;
7086 unsigned si = 0;
7087 bool profitable = true;
7088 while (si < li_scalar_costs.length ()
7089 && vi < li_vector_costs.length ())
7091 unsigned sl = li_scalar_costs[si].first;
7092 unsigned vl = li_vector_costs[vi].first;
7093 if (sl != vl)
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_NOTE, vect_location,
7097 "Scalar %d and vector %d loop part do not "
7098 "match up, skipping scalar part\n", sl, vl);
7099 /* Skip the scalar part, assuming zero cost on the vector side. */
7102 si++;
7104 while (si < li_scalar_costs.length ()
7105 && li_scalar_costs[si].first == sl);
7106 continue;
7109 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7112 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7113 si++;
7115 while (si < li_scalar_costs.length ()
7116 && li_scalar_costs[si].first == sl);
7117 unsigned dummy;
7118 finish_cost (scalar_target_cost_data, nullptr,
7119 &dummy, &scalar_cost, &dummy);
7121 /* Complete the target-specific vector cost calculation. */
7122 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7125 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7126 vi++;
7128 while (vi < li_vector_costs.length ()
7129 && li_vector_costs[vi].first == vl);
7130 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7131 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7132 delete scalar_target_cost_data;
7133 delete vect_target_cost_data;
7135 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7137 if (dump_enabled_p ())
7139 dump_printf_loc (MSG_NOTE, vect_location,
7140 "Cost model analysis for part in loop %d:\n", sl);
7141 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7142 vec_inside_cost + vec_outside_cost);
7143 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7146 /* Vectorization is profitable if its cost is no more than the cost of the
7147 scalar version. Note that we err on the vector side for equal cost because
7148 the cost estimate is otherwise quite pessimistic (constant uses are
7149 free on the scalar side but cost a load on the vector side for
7150 example). */
7151 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7153 profitable = false;
7154 break;
7157 if (profitable && vi < li_vector_costs.length ())
7159 if (dump_enabled_p ())
7160 dump_printf_loc (MSG_NOTE, vect_location,
7161 "Excess vector cost for part in loop %d:\n",
7162 li_vector_costs[vi].first);
7163 profitable = false;
7166 /* Unset visited flag. This is delayed when the subgraph is profitable
7167 and we process the loop for remaining unvectorized if-converted code. */
7168 if (!orig_loop || !profitable)
7169 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7170 gimple_set_visited (cost->stmt_info->stmt, false);
7172 scalar_costs.release ();
7173 vector_costs.release ();
7175 return profitable;
7178 /* qsort comparator for lane defs. */
7180 static int
7181 vld_cmp (const void *a_, const void *b_)
7183 auto *a = (const std::pair<unsigned, tree> *)a_;
7184 auto *b = (const std::pair<unsigned, tree> *)b_;
7185 return a->first - b->first;
7188 /* Return true if USE_STMT is a vector lane insert into VEC and set
7189 *THIS_LANE to the lane number that is set. */
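/* For illustration (schematic GIMPLE, made-up SSA names): with 32-bit
   vector elements, inserting s_2 into lane 1 of vec_1 looks like

     vec_3 = BIT_INSERT_EXPR <vec_1, s_2, 32>;

   and *THIS_LANE is recovered as bit position / element size,
   here 32 / 32 == 1. */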
7191 static bool
7192 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7194 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7195 if (!use_ass
7196 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7197 || (vec
7198 ? gimple_assign_rhs1 (use_ass) != vec
7199 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7200 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7201 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7202 || !constant_multiple_p
7203 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7204 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7205 this_lane))
7206 return false;
7207 return true;
7210 /* Find any vectorizable constructors, lane-insert chains and reduction
7211 chains in the region and record them as SLP roots. */
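/* For illustration (schematic GIMPLE, made-up SSA names), the shapes
   recognized below are

     _5 = {a_1, b_2, c_3, d_4};                    <- CONSTRUCTOR

     v_6 = BIT_INSERT_EXPR <v_5, a_1, 0>;
     v_7 = BIT_INSERT_EXPR <v_6, b_2, 32>;         <- lane-insert chain

     t_8 = a_1 + b_2;
     s_9 = t_8 + c_3;                              <- reduction chain

   each recorded as an SLP root in bb_vinfo->roots. */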
7213 static void
7214 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7216 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7217 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7218 !gsi_end_p (gsi); gsi_next (&gsi))
7220 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7221 if (!assign)
7222 continue;
7224 tree rhs = gimple_assign_rhs1 (assign);
7225 enum tree_code code = gimple_assign_rhs_code (assign);
7226 use_operand_p use_p;
7227 gimple *use_stmt;
7228 if (code == CONSTRUCTOR)
7230 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7231 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7232 CONSTRUCTOR_NELTS (rhs))
7233 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7234 || uniform_vector_p (rhs))
7235 continue;
7237 unsigned j;
7238 tree val;
7239 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7240 if (TREE_CODE (val) != SSA_NAME
7241 || !bb_vinfo->lookup_def (val))
7242 break;
7243 if (j != CONSTRUCTOR_NELTS (rhs))
7244 continue;
7246 vec<stmt_vec_info> roots = vNULL;
7247 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7248 vec<stmt_vec_info> stmts;
7249 stmts.create (CONSTRUCTOR_NELTS (rhs));
7250 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7251 stmts.quick_push
7252 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7253 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7254 stmts, roots));
7256 else if (code == BIT_INSERT_EXPR
7257 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7258 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7259 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7260 && integer_zerop (gimple_assign_rhs3 (assign))
7261 && useless_type_conversion_p
7262 (TREE_TYPE (TREE_TYPE (rhs)),
7263 TREE_TYPE (gimple_assign_rhs2 (assign)))
7264 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7266 /* We start matching at an insert to lane zero, but since the
7267 inserts need not be ordered we have to search both
7268 the def and the use chains. */
7269 tree vectype = TREE_TYPE (rhs);
7270 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7271 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7272 auto_sbitmap lanes (nlanes);
7273 bitmap_clear (lanes);
7274 bitmap_set_bit (lanes, 0);
7275 tree def = gimple_assign_lhs (assign);
7276 lane_defs.quick_push
7277 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7278 unsigned lanes_found = 1;
7279 /* Start with the use chains, the last stmt will be the root. */
7280 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7281 vec<stmt_vec_info> roots = vNULL;
7282 roots.safe_push (last);
7285 use_operand_p use_p;
7286 gimple *use_stmt;
7287 if (!single_imm_use (def, &use_p, &use_stmt))
7288 break;
7289 unsigned this_lane;
7290 if (!bb_vinfo->lookup_stmt (use_stmt)
7291 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7292 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7293 break;
7294 if (bitmap_bit_p (lanes, this_lane))
7295 break;
7296 lanes_found++;
7297 bitmap_set_bit (lanes, this_lane);
7298 gassign *use_ass = as_a <gassign *> (use_stmt);
7299 lane_defs.quick_push (std::make_pair
7300 (this_lane, gimple_assign_rhs2 (use_ass)));
7301 last = bb_vinfo->lookup_stmt (use_ass);
7302 roots.safe_push (last);
7303 def = gimple_assign_lhs (use_ass);
7305 while (lanes_found < nlanes);
7306 if (roots.length () > 1)
7307 std::swap(roots[0], roots[roots.length () - 1]);
7308 if (lanes_found < nlanes)
7310 /* Now search the def chain. */
7311 def = gimple_assign_rhs1 (assign);
7314 if (TREE_CODE (def) != SSA_NAME
7315 || !has_single_use (def))
7316 break;
7317 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7318 unsigned this_lane;
7319 if (!bb_vinfo->lookup_stmt (def_stmt)
7320 || !vect_slp_is_lane_insert (def_stmt,
7321 NULL_TREE, &this_lane)
7322 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7323 break;
7324 if (bitmap_bit_p (lanes, this_lane))
7325 break;
7326 lanes_found++;
7327 bitmap_set_bit (lanes, this_lane);
7328 lane_defs.quick_push (std::make_pair
7329 (this_lane,
7330 gimple_assign_rhs2 (def_stmt)));
7331 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7332 def = gimple_assign_rhs1 (def_stmt);
7334 while (lanes_found < nlanes);
7336 if (lanes_found == nlanes)
7338 /* Sort lane_defs by lane index and register the root. */
7339 lane_defs.qsort (vld_cmp);
7340 vec<stmt_vec_info> stmts;
7341 stmts.create (nlanes);
7342 for (unsigned i = 0; i < nlanes; ++i)
7343 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7344 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7345 stmts, roots));
7347 else
7348 roots.release ();
7350 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7351 && (associative_tree_code (code) || code == MINUS_EXPR)
7352 /* ??? This pessimizes a two-element reduction. PR54400.
7353 ??? In-order reduction could be handled if we only
7354 traverse one operand chain in vect_slp_linearize_chain. */
7355 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7356 /* Ops with constants at the tail can be stripped here. */
7357 && TREE_CODE (rhs) == SSA_NAME
7358 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7359 /* Should be the chain end. */
7360 && (!single_imm_use (gimple_assign_lhs (assign),
7361 &use_p, &use_stmt)
7362 || !is_gimple_assign (use_stmt)
7363 || (gimple_assign_rhs_code (use_stmt) != code
7364 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7365 || (gimple_assign_rhs_code (use_stmt)
7366 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7368 /* We start the match at the end of a possible association
7369 chain. */
7370 auto_vec<chain_op_t> chain;
7371 auto_vec<std::pair<tree_code, gimple *> > worklist;
7372 auto_vec<gimple *> chain_stmts;
7373 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7374 if (code == MINUS_EXPR)
7375 code = PLUS_EXPR;
7376 internal_fn reduc_fn;
7377 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7378 || reduc_fn == IFN_LAST)
7379 continue;
7380 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7381 /* ??? */
7382 code_stmt, alt_code_stmt, &chain_stmts);
7383 if (chain.length () > 1)
7385 /* Sort the chain according to def_type and operation. */
7386 chain.sort (dt_sort_cmp, bb_vinfo);
7387 /* ??? Now we'd want to strip externals and constants
7388 but record those to be handled in the epilogue. */
7389 /* ??? For now do not allow mixing ops or externs/constants. */
7390 bool invalid = false;
7391 unsigned remain_cnt = 0;
7392 for (unsigned i = 0; i < chain.length (); ++i)
7394 if (chain[i].code != code)
7396 invalid = true;
7397 break;
7399 if (chain[i].dt != vect_internal_def)
7400 remain_cnt++;
7402 if (!invalid && chain.length () - remain_cnt > 1)
7404 vec<stmt_vec_info> stmts;
7405 vec<tree> remain = vNULL;
7406 stmts.create (chain.length ());
7407 if (remain_cnt > 0)
7408 remain.create (remain_cnt);
7409 for (unsigned i = 0; i < chain.length (); ++i)
7411 if (chain[i].dt == vect_internal_def)
7412 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7413 else
7414 remain.quick_push (chain[i].op);
7416 vec<stmt_vec_info> roots;
7417 roots.create (chain_stmts.length ());
7418 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7419 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7420 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7421 stmts, roots, remain));
7428 /* Walk the grouped store chains and replace entries with their
7429 pattern variant if any. */
7431 static void
7432 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7434 stmt_vec_info first_element;
7435 unsigned i;
7437 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7439 /* We also have CTORs in this array. */
7440 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7441 continue;
7442 if (STMT_VINFO_IN_PATTERN_P (first_element))
7444 stmt_vec_info orig = first_element;
7445 first_element = STMT_VINFO_RELATED_STMT (first_element);
7446 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7447 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7448 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7449 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7450 vinfo->grouped_stores[i] = first_element;
7452 stmt_vec_info prev = first_element;
7453 while (DR_GROUP_NEXT_ELEMENT (prev))
7455 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7456 if (STMT_VINFO_IN_PATTERN_P (elt))
7458 stmt_vec_info orig = elt;
7459 elt = STMT_VINFO_RELATED_STMT (elt);
7460 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7461 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7462 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7464 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7465 prev = elt;
7470 /* Check if the region described by BB_VINFO can be vectorized, returning
7471 true if so. When returning false, set FATAL to true if the same failure
7472 would prevent vectorization at other vector sizes, false if it is still
7473 worth trying other sizes. N_STMTS is the number of statements in the
7474 region. */
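/* Descriptive outline of the steps below: analyze the data references
   and their accesses, detect SLP roots, run pattern recognition, build
   and optimize the SLP trees, verify alignment and dependences per
   instance, analyze the operations and finally partition the SLP graph
   into independently costable subgraphs. */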
7476 static bool
7477 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7478 vec<int> *dataref_groups)
7480 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7482 slp_instance instance;
7483 int i;
7484 poly_uint64 min_vf = 2;
7486 /* The first group of checks is independent of the vector size. */
7487 fatal = true;
7489 /* Analyze the data references. */
7491 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7493 if (dump_enabled_p ())
7494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7495 "not vectorized: unhandled data-ref in basic "
7496 "block.\n");
7497 return false;
7500 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7502 if (dump_enabled_p ())
7503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7504 "not vectorized: unhandled data access in "
7505 "basic block.\n");
7506 return false;
7509 vect_slp_check_for_roots (bb_vinfo);
7511 /* If there are no grouped stores and no constructors in the region
7512 there is no need to continue with pattern recog as vect_analyze_slp
7513 will fail anyway. */
7514 if (bb_vinfo->grouped_stores.is_empty ()
7515 && bb_vinfo->roots.is_empty ())
7517 if (dump_enabled_p ())
7518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7519 "not vectorized: no grouped stores in "
7520 "basic block.\n");
7521 return false;
7524 /* The rest of the analysis below depends on the vector size in some way. */
7525 fatal = false;
7527 vect_pattern_recog (bb_vinfo);
7529 /* Update store groups from pattern processing. */
7530 vect_fixup_store_groups_with_patterns (bb_vinfo);
7532 /* Check the SLP opportunities in the basic block, analyze and build SLP
7533 trees. */
7534 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7536 if (dump_enabled_p ())
7538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7539 "Failed to SLP the basic block.\n");
7540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7541 "not vectorized: failed to find SLP opportunities "
7542 "in basic block.\n");
7544 return false;
7547 /* Optimize permutations. */
7548 vect_optimize_slp (bb_vinfo);
7550 /* Gather the loads reachable from the SLP graph entries. */
7551 vect_gather_slp_loads (bb_vinfo);
7553 vect_record_base_alignments (bb_vinfo);
7555 /* Analyze and verify the alignment of data references and the
7556 dependence in the SLP instances. */
7557 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7559 vect_location = instance->location ();
7560 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7561 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7563 slp_tree node = SLP_INSTANCE_TREE (instance);
7564 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7565 if (dump_enabled_p ())
7566 dump_printf_loc (MSG_NOTE, vect_location,
7567 "removing SLP instance operations starting from: %G",
7568 stmt_info->stmt);
7569 vect_free_slp_instance (instance);
7570 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7571 continue;
7574 /* Mark all the statements that we want to vectorize as pure SLP and
7575 relevant. */
7576 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7577 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7578 unsigned j;
7579 stmt_vec_info root;
7580 /* Likewise consider instance root stmts as vectorized. */
7581 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7582 STMT_SLP_TYPE (root) = pure_slp;
7584 i++;
7586 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7587 return false;
7589 if (!vect_slp_analyze_operations (bb_vinfo))
7591 if (dump_enabled_p ())
7592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7593 "not vectorized: bad operation in basic block.\n");
7594 return false;
7597 vect_bb_partition_graph (bb_vinfo);
7599 return true;
7602 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7603 basic blocks in BBS, returning true on success.
7604 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
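/* Descriptive note: the analysis below is first done with an
   autodetected vector mode (VOIDmode) and, as long as the failure was
   not fatal, re-tried with each mode from
   targetm.vectorize.autovectorize_vector_modes, skipping modes that
   would merely repeat an earlier analysis. */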
7606 static bool
7607 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7608 vec<int> *dataref_groups, unsigned int n_stmts,
7609 loop_p orig_loop)
7611 bb_vec_info bb_vinfo;
7612 auto_vector_modes vector_modes;
7614 /* Autodetect first vector size we try. */
7615 machine_mode next_vector_mode = VOIDmode;
7616 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7617 unsigned int mode_i = 0;
7619 vec_info_shared shared;
7621 machine_mode autodetected_vector_mode = VOIDmode;
7622 while (1)
7624 bool vectorized = false;
7625 bool fatal = false;
7626 bb_vinfo = new _bb_vec_info (bbs, &shared);
7628 bool first_time_p = shared.datarefs.is_empty ();
7629 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7630 if (first_time_p)
7631 bb_vinfo->shared->save_datarefs ();
7632 else
7633 bb_vinfo->shared->check_datarefs ();
7634 bb_vinfo->vector_mode = next_vector_mode;
7636 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7638 if (dump_enabled_p ())
7640 dump_printf_loc (MSG_NOTE, vect_location,
7641 "***** Analysis succeeded with vector mode"
7642 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7643 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7646 bb_vinfo->shared->check_datarefs ();
7648 auto_vec<slp_instance> profitable_subgraphs;
7649 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7651 if (instance->subgraph_entries.is_empty ())
7652 continue;
7654 dump_user_location_t saved_vect_location = vect_location;
7655 vect_location = instance->location ();
7656 if (!unlimited_cost_model (NULL)
7657 && !vect_bb_vectorization_profitable_p
7658 (bb_vinfo, instance->subgraph_entries, orig_loop))
7660 if (dump_enabled_p ())
7661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7662 "not vectorized: vectorization is not "
7663 "profitable.\n");
7664 vect_location = saved_vect_location;
7665 continue;
7668 vect_location = saved_vect_location;
7669 if (!dbg_cnt (vect_slp))
7670 continue;
7672 profitable_subgraphs.safe_push (instance);
7675 /* When we're vectorizing an if-converted loop body make sure
7676 we vectorized all if-converted code. */
7677 if (!profitable_subgraphs.is_empty ()
7678 && orig_loop)
7680 gcc_assert (bb_vinfo->bbs.length () == 1);
7681 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7682 !gsi_end_p (gsi); gsi_next (&gsi))
7684 /* The costing above left us with DCEable vectorized scalar
7685 stmts having the visited flag set on profitable
7686 subgraphs. Do the delayed clearing of the flag here. */
7687 if (gimple_visited_p (gsi_stmt (gsi)))
7689 gimple_set_visited (gsi_stmt (gsi), false);
7690 continue;
7692 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7693 continue;
7695 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7696 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7698 if (!profitable_subgraphs.is_empty ()
7699 && dump_enabled_p ())
7700 dump_printf_loc (MSG_NOTE, vect_location,
7701 "not profitable because of "
7702 "unprofitable if-converted scalar "
7703 "code\n");
7704 profitable_subgraphs.truncate (0);
7709 /* Finally schedule the profitable subgraphs. */
7710 for (slp_instance instance : profitable_subgraphs)
7712 if (!vectorized && dump_enabled_p ())
7713 dump_printf_loc (MSG_NOTE, vect_location,
7714 "Basic block will be vectorized "
7715 "using SLP\n");
7716 vectorized = true;
7718 /* Dump before scheduling as store vectorization will remove
7719 the original stores and mess with the instance tree
7720 so querying its location will eventually ICE. */
7721 if (flag_checking)
7722 for (slp_instance sub : instance->subgraph_entries)
7723 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7724 unsigned HOST_WIDE_INT bytes;
7725 if (dump_enabled_p ())
7726 for (slp_instance sub : instance->subgraph_entries)
7728 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7729 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7730 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7731 sub->location (),
7732 "basic block part vectorized using %wu "
7733 "byte vectors\n", bytes);
7734 else
7735 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7736 sub->location (),
7737 "basic block part vectorized using "
7738 "variable length vectors\n");
7741 dump_user_location_t saved_vect_location = vect_location;
7742 vect_location = instance->location ();
7744 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7746 vect_location = saved_vect_location;
7749 else
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_NOTE, vect_location,
7753 "***** Analysis failed with vector mode %s\n",
7754 GET_MODE_NAME (bb_vinfo->vector_mode));
7757 if (mode_i == 0)
7758 autodetected_vector_mode = bb_vinfo->vector_mode;
7760 if (!fatal)
7761 while (mode_i < vector_modes.length ()
7762 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_NOTE, vect_location,
7766 "***** The result for vector mode %s would"
7767 " be the same\n",
7768 GET_MODE_NAME (vector_modes[mode_i]));
7769 mode_i += 1;
7772 delete bb_vinfo;
7774 if (mode_i < vector_modes.length ()
7775 && VECTOR_MODE_P (autodetected_vector_mode)
7776 && (related_vector_mode (vector_modes[mode_i],
7777 GET_MODE_INNER (autodetected_vector_mode))
7778 == autodetected_vector_mode)
7779 && (related_vector_mode (autodetected_vector_mode,
7780 GET_MODE_INNER (vector_modes[mode_i]))
7781 == vector_modes[mode_i]))
7783 if (dump_enabled_p ())
7784 dump_printf_loc (MSG_NOTE, vect_location,
7785 "***** Skipping vector mode %s, which would"
7786 " repeat the analysis for %s\n",
7787 GET_MODE_NAME (vector_modes[mode_i]),
7788 GET_MODE_NAME (autodetected_vector_mode));
7789 mode_i += 1;
7792 if (vectorized
7793 || mode_i == vector_modes.length ()
7794 || autodetected_vector_mode == VOIDmode
7795 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7796 vector sizes will fail do not bother iterating. */
7797 || fatal)
7798 return vectorized;
7800 /* Try the next biggest vector size. */
7801 next_vector_mode = vector_modes[mode_i++];
7802 if (dump_enabled_p ())
7803 dump_printf_loc (MSG_NOTE, vect_location,
7804 "***** Re-trying analysis with vector mode %s\n",
7805 GET_MODE_NAME (next_vector_mode));
7810 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7811 true if anything in the basic-block was vectorized. */
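/* Descriptive note: the loop below collects the data references of the
   region and assigns them group ids; the id is bumped whenever a stmt
   without a recognized data reference intervenes and whenever a new BB
   starts, so only datarefs not separated by such a boundary share a
   group. */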
7813 static bool
7814 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7816 vec<data_reference_p> datarefs = vNULL;
7817 auto_vec<int> dataref_groups;
7818 int insns = 0;
7819 int current_group = 0;
7821 for (unsigned i = 0; i < bbs.length (); i++)
7823 basic_block bb = bbs[i];
7824 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7825 gsi_next (&gsi))
7827 gimple *stmt = gsi_stmt (gsi);
7828 if (is_gimple_debug (stmt))
7829 continue;
7831 insns++;
7833 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7834 vect_location = stmt;
7836 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7837 &dataref_groups, current_group))
7838 ++current_group;
7840 /* New BBs always start a new DR group. */
7841 ++current_group;
7844 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7847 /* Special entry for the BB vectorizer. Analyze and transform a single
7848 if-converted BB with ORIG_LOOP's body being the not-if-converted
7849 representation. Returns true if anything in the basic-block was
7850 vectorized. */
7852 bool
7853 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7855 auto_vec<basic_block> bbs;
7856 bbs.safe_push (bb);
7857 return vect_slp_bbs (bbs, orig_loop);
7860 /* Main entry for the BB vectorizer. Analyze and transform the function FUN,
7861 returning true if anything in any of its basic blocks was vectorized. */
7863 bool
7864 vect_slp_function (function *fun)
7866 bool r = false;
7867 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7868 auto_bitmap exit_bbs;
7869 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7870 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7871 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7872 true, rpo, NULL);
7874 /* For the moment split the function into pieces to avoid making
7875 the iteration on the vector mode moot. Split at points we know
7876 to not handle well, which are CFG merges (SLP discovery doesn't
7877 handle non-loop-header PHIs) and loop exits. Since pattern
7878 recog requires reverse iteration to visit uses before defs,
7879 simply chop the RPO into pieces. */
7880 auto_vec<basic_block> bbs;
7881 for (unsigned i = 0; i < n; i++)
7883 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7884 bool split = false;
7886 /* Split when a BB is not dominated by the first block. */
7887 if (!bbs.is_empty ()
7888 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7890 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7892 "splitting region at dominance boundary bb%d\n",
7893 bb->index);
7894 split = true;
7896 /* Split when the loop determined by the first block
7897 is exited. This is because we eventually insert
7898 invariants at region begin. */
7899 else if (!bbs.is_empty ()
7900 && bbs[0]->loop_father != bb->loop_father
7901 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7905 "splitting region at loop %d exit at bb%d\n",
7906 bbs[0]->loop_father->num, bb->index);
7907 split = true;
7909 else if (!bbs.is_empty ()
7910 && bb->loop_father->header == bb
7911 && bb->loop_father->dont_vectorize)
7913 if (dump_enabled_p ())
7914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915 "splitting region at dont-vectorize loop %d "
7916 "entry at bb%d\n",
7917 bb->loop_father->num, bb->index);
7918 split = true;
7921 if (split && !bbs.is_empty ())
7923 r |= vect_slp_bbs (bbs, NULL);
7924 bbs.truncate (0);
7927 if (bbs.is_empty ())
7929 /* We need to be able to insert at the head of the region, which
7930 we cannot do for a region starting with a returns-twice call. */
7931 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7932 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7934 if (dump_enabled_p ())
7935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7936 "skipping bb%d as start of region as it "
7937 "starts with returns-twice call\n",
7938 bb->index);
7939 continue;
7941 /* If the loop this BB belongs to is marked as not to be vectorized
7942 honor that also for BB vectorization. */
7943 if (bb->loop_father->dont_vectorize)
7944 continue;
7947 bbs.safe_push (bb);
7949 /* When a stmt ends this block and defines a value, inserting
7950 after it (for a vector containing its definition) would require
7951 inserting on edges. Avoid this for now. */
7952 if (gimple *last = *gsi_last_bb (bb))
7953 if (gimple_get_lhs (last)
7954 && is_ctrl_altering_stmt (last))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7958 "splitting region at control altering "
7959 "definition %G", last);
7960 r |= vect_slp_bbs (bbs, NULL);
7961 bbs.truncate (0);
7965 if (!bbs.is_empty ())
7966 r |= vect_slp_bbs (bbs, NULL);
7968 free (rpo);
7970 return r;
7973 /* Build a variable-length vector in which the elements in ELTS are repeated
7974 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7975 RESULTS and add any new instructions to SEQ.
7977 The approach we use is:
7979 (1) Find a vector mode VM with integer elements of mode IM.
7981 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7982 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7983 from small vectors to IM.
7985 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7987 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7988 correct byte contents.
7990 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7992 We try to find the largest IM for which this sequence works, in order
7993 to cut down on the number of interleaves. */
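/* A concrete example (assuming a target where the chosen modes exist):
   for ELTS = { a, b, c, d } of 32-bit ints, can_duplicate_and_interleave_p
   may pick IM = DImode and NVECTORS = 2. { a, b } and { c, d } are then
   view-converted to two DImode values ab and cd, each is duplicated
   across a vector of DImode elements, and one layer of interleaving
   VEC_PERM_EXPRs yields vectors with byte contents ab cd ab cd ...,
   which view-convert to the required { a, b, c, d, a, b, c, d, ... }. */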
7995 void
7996 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7997 const vec<tree> &elts, unsigned int nresults,
7998 vec<tree> &results)
8000 unsigned int nelts = elts.length ();
8001 tree element_type = TREE_TYPE (vector_type);
8003 /* (1) Find a vector mode VM with integer elements of mode IM. */
8004 unsigned int nvectors = 1;
8005 tree new_vector_type;
8006 tree permutes[2];
8007 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8008 &nvectors, &new_vector_type,
8009 permutes))
8010 gcc_unreachable ();
8012 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8013 unsigned int partial_nelts = nelts / nvectors;
8014 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8016 tree_vector_builder partial_elts;
8017 auto_vec<tree, 32> pieces (nvectors * 2);
8018 pieces.quick_grow_cleared (nvectors * 2);
8019 for (unsigned int i = 0; i < nvectors; ++i)
8021 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8022 ELTS' has mode IM. */
8023 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8024 for (unsigned int j = 0; j < partial_nelts; ++j)
8025 partial_elts.quick_push (elts[i * partial_nelts + j]);
8026 tree t = gimple_build_vector (seq, &partial_elts);
8027 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8028 TREE_TYPE (new_vector_type), t);
8030 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8031 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8034 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8035 correct byte contents.
8037 Conceptually, we need to repeat the following operation log2(nvectors)
8038 times, where hi_start = nvectors / 2:
8040 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8041 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8043 However, if each input repeats every N elements and the VF is
8044 a multiple of N * 2, the HI result is the same as the LO result.
8045 This will be true for the first N1 iterations of the outer loop,
8046 followed by N2 iterations for which both the LO and HI results
8047 are needed. I.e.:
8049 N1 + N2 = log2(nvectors)
8051 Each "N1 iteration" doubles the number of redundant vectors and the
8052 effect of the process as a whole is to have a sequence of nvectors/2**N1
8053 vectors that repeats 2**N1 times. Rather than generate these redundant
8054 vectors, we halve the number of vectors for each N1 iteration. */
8055 unsigned int in_start = 0;
8056 unsigned int out_start = nvectors;
8057 unsigned int new_nvectors = nvectors;
8058 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8060 unsigned int hi_start = new_nvectors / 2;
8061 unsigned int out_i = 0;
8062 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8064 if ((in_i & 1) != 0
8065 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8066 2 * in_repeat))
8067 continue;
8069 tree output = make_ssa_name (new_vector_type);
8070 tree input1 = pieces[in_start + (in_i / 2)];
8071 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8072 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8073 input1, input2,
8074 permutes[in_i & 1]);
8075 gimple_seq_add_stmt (seq, stmt);
8076 pieces[out_start + out_i] = output;
8077 out_i += 1;
8079 std::swap (in_start, out_start);
8080 new_nvectors = out_i;
8083 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8084 results.reserve (nresults);
8085 for (unsigned int i = 0; i < nresults; ++i)
8086 if (i < new_nvectors)
8087 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8088 pieces[in_start + i]));
8089 else
8090 results.quick_push (results[i - new_nvectors]);
8094 /* For constant and loop invariant defs in OP_NODE this function creates
8095 vector defs that will be used in the vectorized stmts and stores them
8096 to SLP_TREE_VEC_DEFS of OP_NODE. */
8098 static void
8099 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8101 unsigned HOST_WIDE_INT nunits;
8102 tree vec_cst;
8103 unsigned j, number_of_places_left_in_vector;
8104 tree vector_type;
8105 tree vop;
8106 int group_size = op_node->ops.length ();
8107 unsigned int vec_num, i;
8108 unsigned number_of_copies = 1;
8109 bool constant_p;
8110 gimple_seq ctor_seq = NULL;
8111 auto_vec<tree, 16> permute_results;
8113 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8114 vector_type = SLP_TREE_VECTYPE (op_node);
8116 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8117 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8118 auto_vec<tree> voprnds (number_of_vectors);
8120 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8121 created vectors. It is greater than 1 if unrolling is performed.
8123 For example, we have two scalar operands, s1 and s2 (e.g., group of
8124 strided accesses of size two), while NUNITS is four (i.e., four scalars
8125 of this type can be packed in a vector). The output vector will contain
8126 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8127 will be 2).
8129 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8130 containing the operands.
8132 For example, NUNITS is four as before, and the group size is 8
8133 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8134 {s5, s6, s7, s8}. */
8136 /* When using duplicate_and_interleave, we just need one element for
8137 each scalar statement. */
8138 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8139 nunits = group_size;
8141 number_of_copies = nunits * number_of_vectors / group_size;
8143 number_of_places_left_in_vector = nunits;
8144 constant_p = true;
8145 tree_vector_builder elts (vector_type, nunits, 1);
8146 elts.quick_grow (nunits);
8147 stmt_vec_info insert_after = NULL;
8148 for (j = 0; j < number_of_copies; j++)
8150 tree op;
8151 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8153 /* Create 'vect_ = {op0,op1,...,opn}'. */
8154 number_of_places_left_in_vector--;
8155 tree orig_op = op;
8156 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8158 if (CONSTANT_CLASS_P (op))
8160 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8162 /* Can't use VIEW_CONVERT_EXPR for booleans because
8163 of possibly different sizes of scalar value and
8164 vector element. */
8165 if (integer_zerop (op))
8166 op = build_int_cst (TREE_TYPE (vector_type), 0);
8167 else if (integer_onep (op))
8168 op = build_all_ones_cst (TREE_TYPE (vector_type));
8169 else
8170 gcc_unreachable ();
8172 else
8173 op = fold_unary (VIEW_CONVERT_EXPR,
8174 TREE_TYPE (vector_type), op);
8175 gcc_assert (op && CONSTANT_CLASS_P (op));
8177 else
8179 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8180 gimple *init_stmt;
8181 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8183 tree true_val
8184 = build_all_ones_cst (TREE_TYPE (vector_type));
8185 tree false_val
8186 = build_zero_cst (TREE_TYPE (vector_type));
8187 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8188 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8189 op, true_val,
8190 false_val);
8192 else
8194 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8195 op);
8196 init_stmt
8197 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8198 op);
8200 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8201 op = new_temp;
8204 elts[number_of_places_left_in_vector] = op;
8205 if (!CONSTANT_CLASS_P (op))
8206 constant_p = false;
8207 /* For BB vectorization we have to compute an insert location
8208 when a def is inside the analyzed region since we cannot
8209 simply insert at the BB start in this case. */
8210 stmt_vec_info opdef;
8211 if (TREE_CODE (orig_op) == SSA_NAME
8212 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8213 && is_a <bb_vec_info> (vinfo)
8214 && (opdef = vinfo->lookup_def (orig_op)))
8216 if (!insert_after)
8217 insert_after = opdef;
8218 else
8219 insert_after = get_later_stmt (insert_after, opdef);
8222 if (number_of_places_left_in_vector == 0)
8224 if (constant_p
8225 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8226 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8227 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8228 else
8230 if (permute_results.is_empty ())
8231 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8232 elts, number_of_vectors,
8233 permute_results);
8234 vec_cst = permute_results[number_of_vectors - j - 1];
8236 if (!gimple_seq_empty_p (ctor_seq))
8238 if (insert_after)
8240 gimple_stmt_iterator gsi;
8241 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8243 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8244 gsi_insert_seq_before (&gsi, ctor_seq,
8245 GSI_CONTINUE_LINKING);
8247 else if (!stmt_ends_bb_p (insert_after->stmt))
8249 gsi = gsi_for_stmt (insert_after->stmt);
8250 gsi_insert_seq_after (&gsi, ctor_seq,
8251 GSI_CONTINUE_LINKING);
8253 else
8255 /* When we want to insert after a def whose
8256 defining stmt throws, insert on the fallthru
8257 edge. */
8258 edge e = find_fallthru_edge
8259 (gimple_bb (insert_after->stmt)->succs);
8260 basic_block new_bb
8261 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8262 gcc_assert (!new_bb);
8265 else
8266 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8267 ctor_seq = NULL;
8269 voprnds.quick_push (vec_cst);
8270 insert_after = NULL;
8271 number_of_places_left_in_vector = nunits;
8272 constant_p = true;
8273 elts.new_vector (vector_type, nunits, 1);
8274 elts.quick_grow (nunits);
8279 /* Since the vectors are created in the reverse order, we should invert
8280 them. */
8281 vec_num = voprnds.length ();
8282 for (j = vec_num; j != 0; j--)
8284 vop = voprnds[j - 1];
8285 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8288 /* In case that VF is greater than the unrolling factor needed for the SLP
8289 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8290 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8291 to replicate the vectors. */
8292 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8293 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8294 i++)
8295 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8298 /* Get the Ith vectorized definition from SLP_NODE. */
8300 tree
8301 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8303 return SLP_TREE_VEC_DEFS (slp_node)[i];
8306 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8308 void
8309 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8311 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8312 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8315 /* Get N vectorized definitions for SLP_NODE. */
8317 void
8318 vect_get_slp_defs (vec_info *,
8319 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8321 if (n == -1U)
8322 n = SLP_TREE_CHILDREN (slp_node).length ();
8324 for (unsigned i = 0; i < n; ++i)
8326 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8327 vec<tree> vec_defs = vNULL;
8328 vect_get_slp_defs (child, &vec_defs);
8329 vec_oprnds->quick_push (vec_defs);
8333 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8334 - PERM gives the permutation that the caller wants to use for NODE,
8335 which might be different from SLP_LOAD_PERMUTATION.
8336 - DUMP_P controls whether the function dumps information. */
8338 static bool
8339 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8340 load_permutation_t &perm,
8341 const vec<tree> &dr_chain,
8342 gimple_stmt_iterator *gsi, poly_uint64 vf,
8343 bool analyze_only, bool dump_p,
8344 unsigned *n_perms, unsigned int *n_loads,
8345 bool dce_chain)
8347 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8348 int vec_index = 0;
8349 tree vectype = SLP_TREE_VECTYPE (node);
8350 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8351 unsigned int mask_element;
8352 unsigned dr_group_size;
8353 machine_mode mode;
8355 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8356 dr_group_size = 1;
8357 else
8359 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8360 dr_group_size = DR_GROUP_SIZE (stmt_info);
8363 mode = TYPE_MODE (vectype);
8364 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8365 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8367 /* Initialize the vect stmts of NODE to properly insert the generated
8368 stmts later. */
8369 if (! analyze_only)
8370 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8371 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8373 /* Generate permutation masks for every NODE. Number of masks for each NODE
8374 is equal to GROUP_SIZE.
8375 E.g., we have a group of three nodes with three loads from the same
8376 location in each node, and the vector size is 4. I.e., we have an
8377 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8378 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8379 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8382 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8383 The last mask is illegal since we assume two operands for permute
8384 operation, and the mask element values can't be outside that range.
8385 Hence, the last mask must be converted into {2,5,5,5}.
8386 For the first two permutations we need the first and the second input
8387 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8388 we need the second and the third vectors: {b1,c1,a2,b2} and
8389 {c2,a3,b3,c3}. */
8391 int vect_stmts_counter = 0;
8392 unsigned int index = 0;
8393 int first_vec_index = -1;
8394 int second_vec_index = -1;
8395 bool noop_p = true;
8396 *n_perms = 0;
8398 vec_perm_builder mask;
8399 unsigned int nelts_to_build;
8400 unsigned int nvectors_per_build;
8401 unsigned int in_nlanes;
8402 bool repeating_p = (group_size == dr_group_size
8403 && multiple_p (nunits, group_size));
8404 if (repeating_p)
8406 /* A single vector contains a whole number of copies of the node, so:
8407 (a) all permutes can use the same mask; and
8408 (b) the permutes only need a single vector input. */
8409 mask.new_vector (nunits, group_size, 3);
8410 nelts_to_build = mask.encoded_nelts ();
8411 /* It's possible to obtain zero nstmts during analyze_only, so make
8412 it at least one to ensure the later computation for n_perms
8413 proceeds. */
8414 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8415 in_nlanes = dr_group_size * 3;
8417 else
8419 /* We need to construct a separate mask for each vector statement. */
8420 unsigned HOST_WIDE_INT const_nunits, const_vf;
8421 if (!nunits.is_constant (&const_nunits)
8422 || !vf.is_constant (&const_vf))
8423 return false;
8424 mask.new_vector (const_nunits, const_nunits, 1);
8425 nelts_to_build = const_vf * group_size;
8426 nvectors_per_build = 1;
8427 in_nlanes = const_vf * dr_group_size;
8429 auto_sbitmap used_in_lanes (in_nlanes);
8430 bitmap_clear (used_in_lanes);
8431 auto_bitmap used_defs;
8433 unsigned int count = mask.encoded_nelts ();
8434 mask.quick_grow (count);
8435 vec_perm_indices indices;
8437 for (unsigned int j = 0; j < nelts_to_build; j++)
8439 unsigned int iter_num = j / group_size;
8440 unsigned int stmt_num = j % group_size;
8441 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8442 bitmap_set_bit (used_in_lanes, i);
8443 if (repeating_p)
8445 first_vec_index = 0;
8446 mask_element = i;
8448 else
8450 /* Enforced before the loop when !repeating_p. */
8451 unsigned int const_nunits = nunits.to_constant ();
8452 vec_index = i / const_nunits;
8453 mask_element = i % const_nunits;
8454 if (vec_index == first_vec_index
8455 || first_vec_index == -1)
8457 first_vec_index = vec_index;
8459 else if (vec_index == second_vec_index
8460 || second_vec_index == -1)
8462 second_vec_index = vec_index;
8463 mask_element += const_nunits;
8465 else
8467 if (dump_p)
8468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8469 "permutation requires at "
8470 "least three vectors %G",
8471 stmt_info->stmt);
8472 gcc_assert (analyze_only);
8473 return false;
8476 gcc_assert (mask_element < 2 * const_nunits);
8479 if (mask_element != index)
8480 noop_p = false;
8481 mask[index++] = mask_element;
8483 if (index == count)
8485 if (!noop_p)
8487 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8488 if (!can_vec_perm_const_p (mode, mode, indices))
8490 if (dump_p)
8492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8493 "unsupported vect permute { ");
8494 for (i = 0; i < count; ++i)
8496 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8497 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8499 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8501 gcc_assert (analyze_only);
8502 return false;
8505 tree mask_vec = NULL_TREE;
8506 if (!analyze_only)
8507 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8509 if (second_vec_index == -1)
8510 second_vec_index = first_vec_index;
8512 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8514 ++*n_perms;
8515 if (analyze_only)
8516 continue;
8517 /* Generate the permute statement if necessary. */
8518 tree first_vec = dr_chain[first_vec_index + ri];
8519 tree second_vec = dr_chain[second_vec_index + ri];
8520 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8521 tree perm_dest
8522 = vect_create_destination_var (gimple_assign_lhs (stmt),
8523 vectype);
8524 perm_dest = make_ssa_name (perm_dest);
8525 gimple *perm_stmt
8526 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8527 second_vec, mask_vec);
8528 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8529 gsi);
8530 if (dce_chain)
8532 bitmap_set_bit (used_defs, first_vec_index + ri);
8533 bitmap_set_bit (used_defs, second_vec_index + ri);
8536 /* Store the vector statement in NODE. */
8537 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8540 else if (!analyze_only)
8542 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8544 tree first_vec = dr_chain[first_vec_index + ri];
8545 /* If mask was NULL_TREE generate the requested
8546 identity transform. */
8547 if (dce_chain)
8548 bitmap_set_bit (used_defs, first_vec_index + ri);
8550 /* Store the vector statement in NODE. */
8551 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8555 index = 0;
8556 first_vec_index = -1;
8557 second_vec_index = -1;
8558 noop_p = true;
8562 if (n_loads)
8564 if (repeating_p)
8565 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8566 else
8568 /* Enforced above when !repeating_p. */
8569 unsigned int const_nunits = nunits.to_constant ();
8570 *n_loads = 0;
8571 bool load_seen = false;
8572 for (unsigned i = 0; i < in_nlanes; ++i)
8574 if (i % const_nunits == 0)
8576 if (load_seen)
8577 *n_loads += 1;
8578 load_seen = false;
8580 if (bitmap_bit_p (used_in_lanes, i))
8581 load_seen = true;
8583 if (load_seen)
8584 *n_loads += 1;
8588 if (dce_chain)
8589 for (unsigned i = 0; i < dr_chain.length (); ++i)
8590 if (!bitmap_bit_p (used_defs, i))
8592 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8593 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8594 gsi_remove (&rgsi, true);
8595 release_defs (stmt);
8598 return true;
8601 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8602 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8603 permute statements for the SLP node NODE. Store the number of vector
8604 permute instructions in *N_PERMS and the number of vector load
8605 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8606 that were not needed. */
8608 bool
8609 vect_transform_slp_perm_load (vec_info *vinfo,
8610 slp_tree node, const vec<tree> &dr_chain,
8611 gimple_stmt_iterator *gsi, poly_uint64 vf,
8612 bool analyze_only, unsigned *n_perms,
8613 unsigned int *n_loads, bool dce_chain)
8615 return vect_transform_slp_perm_load_1 (vinfo, node,
8616 SLP_TREE_LOAD_PERMUTATION (node),
8617 dr_chain, gsi, vf, analyze_only,
8618 dump_enabled_p (), n_perms, n_loads,
8619 dce_chain);
8622 /* Produce the next vector result for SLP permutation NODE by adding a vector
8623 statement at GSI. If MASK_VEC is nonnull, add:
8625 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8627 otherwise add:
8629 <new SSA name> = FIRST_DEF. */
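/* Descriptive note on IDENTITY_OFFSET (not covered by the comment
   above): it is only used in the no-mask case when the type of
   FIRST_DEF differs from the node's vector type and gives the element
   offset of the BIT_FIELD_REF lowpart extract generated instead. */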
8631 static void
8632 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8633 slp_tree node, tree first_def, tree second_def,
8634 tree mask_vec, poly_uint64 identity_offset)
8636 tree vectype = SLP_TREE_VECTYPE (node);
8638 /* ??? We SLP match existing vector element extracts but
8639 allow punning which we need to re-instantiate at uses
8640 but have no good way of explicitly representing. */
8641 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8642 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8644 gassign *conv_stmt
8645 = gimple_build_assign (make_ssa_name (vectype),
8646 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8647 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8648 first_def = gimple_assign_lhs (conv_stmt);
8650 gassign *perm_stmt;
8651 tree perm_dest = make_ssa_name (vectype);
8652 if (mask_vec)
8654 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8655 TYPE_SIZE (vectype))
8656 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8658 gassign *conv_stmt
8659 = gimple_build_assign (make_ssa_name (vectype),
8660 build1 (VIEW_CONVERT_EXPR,
8661 vectype, second_def));
8662 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8663 second_def = gimple_assign_lhs (conv_stmt);
8665 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8666 first_def, second_def,
8667 mask_vec);
8669 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8671 /* For identity permutes we still need to handle the case
8672 of offsetted extracts or concats. */
8673 unsigned HOST_WIDE_INT c;
8674 auto first_def_nunits
8675 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8676 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8678 unsigned HOST_WIDE_INT elsz
8679 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8680 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8681 TYPE_SIZE (vectype),
8682 bitsize_int (identity_offset * elsz));
8683 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8685 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8686 first_def_nunits, &c) && c == 2)
8688 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8689 NULL_TREE, second_def);
8690 perm_stmt = gimple_build_assign (perm_dest, ctor);
8692 else
8693 gcc_unreachable ();
8695 else
8697 /* We need a copy here in case the def was external. */
8698 perm_stmt = gimple_build_assign (perm_dest, first_def);
8700 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8701 /* Store the vector statement in NODE. */
8702 node->push_vec_def (perm_stmt);
8705 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8706 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8707 If GSI is nonnull, emit the permutation there.
8709 When GSI is null, the only purpose of NODE is to give properties
8710 of the result, such as the vector type and number of SLP lanes.
8711 The node does not need to be a VEC_PERM_EXPR.
8713 If the target supports the operation, return the number of individual
8714 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8715 dump file if DUMP_P is true. */
8717 static int
8718 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8719 slp_tree node, lane_permutation_t &perm,
8720 vec<slp_tree> &children, bool dump_p)
8722 tree vectype = SLP_TREE_VECTYPE (node);
8724 /* ??? We currently only support all same vector input types
8725 while the SLP IL should really do a concat + select and thus accept
8726 arbitrary mismatches. */
8727 slp_tree child;
8728 unsigned i;
8729 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8730 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8731 tree op_vectype = NULL_TREE;
8732 FOR_EACH_VEC_ELT (children, i, child)
8733 if (SLP_TREE_VECTYPE (child))
8735 op_vectype = SLP_TREE_VECTYPE (child);
8736 break;
8738 if (!op_vectype)
8739 op_vectype = vectype;
8740 FOR_EACH_VEC_ELT (children, i, child)
8742 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8743 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8744 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8745 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8747 if (dump_p)
8748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8749 "Unsupported vector types in lane permutation\n");
8750 return -1;
8752 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8753 repeating_p = false;
8756 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8757 if (dump_p)
8759 dump_printf_loc (MSG_NOTE, vect_location,
8760 "vectorizing permutation");
8761 for (unsigned i = 0; i < perm.length (); ++i)
8762 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8763 if (repeating_p)
8764 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8765 dump_printf (MSG_NOTE, "\n");
8768 /* REPEATING_P is true if every output vector is guaranteed to use the
8769 same permute vector. We can handle that case for both variable-length
8770 and constant-length vectors, but we only handle other cases for
8771 constant-length vectors.
8773 Set:
8775 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8776 mask vector that we want to build.
8778 - NCOPIES to the number of copies of PERM that we need in order
8779 to build the necessary permute mask vectors.
8781 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8782 for each permute mask vector. This is only relevant when GSI is
8783 nonnull. */
8784 uint64_t npatterns;
8785 unsigned nelts_per_pattern;
8786 uint64_t ncopies;
8787 unsigned noutputs_per_mask;
8788 if (repeating_p)
8790 /* We need a single permute mask vector that has the form:
8792 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8794 In other words, the original n-element permute in PERM is
8795 "unrolled" to fill a full vector. The stepped vector encoding
8796 that we use for permutes requires 3n elements. */
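/* Illustrative instance: with n == 2 lanes and the swap { op0[1], op0[0] }
   the mask is encoded with 2 patterns of 3 elements each as
   { 1, 0, 3, 2, 5, 4 }, which the stepped encoding extends with a step
   of 2 to fill vectors of any length.  */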
8797 npatterns = SLP_TREE_LANES (node);
8798 nelts_per_pattern = ncopies = 3;
8799 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8801 else
8803 /* Calculate every element of every permute mask vector explicitly,
8804 instead of relying on the pattern described above. */
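/* For instance (illustrative), with a constant V4SI output type and a
   loop vectorization factor of 2 this yields NPATTERNS = 4,
   NELTS_PER_PATTERN = 1, NCOPIES = 2 and NOUTPUTS_PER_MASK = 1.  */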
8805 if (!nunits.is_constant (&npatterns))
8806 return -1;
8807 nelts_per_pattern = ncopies = 1;
8808 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8809 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8810 return -1;
8811 noutputs_per_mask = 1;
8813 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8814 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8816 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8817 from the { SLP operand, scalar lane } permutation as recorded in the
8818 SLP node as intermediate step. This part should already work
8819 with SLP children with arbitrary number of lanes. */
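/* For example (illustrative), with V4SI children an accumulated scalar
   lane 6 of operand 0 maps to { { 0, 6 / 4 }, 6 % 4 }, i.e. vector 1,
   lane 2 of that operand.  */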
8820 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8821 auto_vec<unsigned> active_lane;
8822 vperm.create (olanes);
8823 active_lane.safe_grow_cleared (children.length (), true);
8824 for (unsigned i = 0; i < ncopies; ++i)
8826 for (unsigned pi = 0; pi < perm.length (); ++pi)
8828 std::pair<unsigned, unsigned> p = perm[pi];
8829 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8830 if (repeating_p)
8831 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8832 else
8834 /* We checked above that the vectors are constant-length. */
8835 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8836 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8837 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8838 vperm.quick_push ({{p.first, vi}, vl});
8841 /* Advance to the next group. */
8842 for (unsigned j = 0; j < children.length (); ++j)
8843 active_lane[j] += SLP_TREE_LANES (children[j]);
8846 if (dump_p)
8848 dump_printf_loc (MSG_NOTE, vect_location,
8849 "vectorizing permutation");
8850 for (unsigned i = 0; i < perm.length (); ++i)
8851 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8852 if (repeating_p)
8853 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8854 dump_printf (MSG_NOTE, "\n");
8855 dump_printf_loc (MSG_NOTE, vect_location, "as");
8856 for (unsigned i = 0; i < vperm.length (); ++i)
8858 if (i != 0
8859 && (repeating_p
8860 ? multiple_p (i, npatterns)
8861 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8862 dump_printf (MSG_NOTE, ",");
8863 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8864 vperm[i].first.first, vperm[i].first.second,
8865 vperm[i].second);
8867 dump_printf (MSG_NOTE, "\n");
8870 /* We can only handle two-vector permutes, everything else should
8871 be lowered on the SLP level. The following is closely inspired
8872 by vect_transform_slp_perm_load and is supposed to eventually
8873 replace it.
8874 ??? As intermediate step do code-gen in the SLP tree representation
8875 somehow? */
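/* The loop below gathers mask elements until a full encoded mask of
   COUNT elements is built, remembers the (at most two) input vectors it
   refers to, queries the target via can_vec_perm_const_p (identity
   permutes are handled separately) and, when GSI is nonnull, emits the
   VEC_PERM_EXPRs for it.  */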
8876 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8877 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8878 unsigned int index = 0;
8879 poly_uint64 mask_element;
8880 vec_perm_builder mask;
8881 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8882 unsigned int count = mask.encoded_nelts ();
8883 mask.quick_grow (count);
8884 vec_perm_indices indices;
8885 unsigned nperms = 0;
8886 for (unsigned i = 0; i < vperm.length (); ++i)
8888 mask_element = vperm[i].second;
8889 if (first_vec.first == -1U
8890 || first_vec == vperm[i].first)
8891 first_vec = vperm[i].first;
8892 else if (second_vec.first == -1U
8893 || second_vec == vperm[i].first)
8895 second_vec = vperm[i].first;
8896 mask_element += nunits;
8898 else
8900 if (dump_p)
8901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8902 "permutation requires at "
8903 "least three vectors\n");
8904 gcc_assert (!gsi);
8905 return -1;
8908 mask[index++] = mask_element;
8910 if (index == count)
8912 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8913 TYPE_VECTOR_SUBPARTS (op_vectype));
8914 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8915 && constant_multiple_p (mask[0], nunits));
8916 machine_mode vmode = TYPE_MODE (vectype);
8917 machine_mode op_vmode = TYPE_MODE (op_vectype);
8918 unsigned HOST_WIDE_INT c;
8919 if ((!identity_p
8920 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8921 || (identity_p
8922 && !known_le (nunits,
8923 TYPE_VECTOR_SUBPARTS (op_vectype))
8924 && (!constant_multiple_p (nunits,
8925 TYPE_VECTOR_SUBPARTS (op_vectype),
8926 &c) || c != 2)))
8928 if (dump_p)
8930 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8931 vect_location,
8932 "unsupported vect permute { ");
8933 for (i = 0; i < count; ++i)
8935 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8936 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8938 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8940 gcc_assert (!gsi);
8941 return -1;
8944 if (!identity_p)
8945 nperms++;
8946 if (gsi)
8948 if (second_vec.first == -1U)
8949 second_vec = first_vec;
8951 slp_tree
8952 first_node = children[first_vec.first],
8953 second_node = children[second_vec.first];
8955 tree mask_vec = NULL_TREE;
8956 if (!identity_p)
8957 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8959 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8961 tree first_def
8962 = vect_get_slp_vect_def (first_node,
8963 first_vec.second + vi);
8964 tree second_def
8965 = vect_get_slp_vect_def (second_node,
8966 second_vec.second + vi);
8967 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8968 second_def, mask_vec, mask[0]);
8972 index = 0;
8973 first_vec = std::make_pair (-1U, -1U);
8974 second_vec = std::make_pair (-1U, -1U);
8978 return nperms;
8981 /* Vectorize the SLP permutations in NODE as specified
8982 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8983 child number and lane number.
8984 Interleaving of two two-lane two-child SLP subtrees (not supported):
8985 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8986 A blend of two four-lane two-child SLP subtrees:
8987 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8988 Highpart of a four-lane one-child SLP subtree (not supported):
8989 [ { 0, 2 }, { 0, 3 } ]
8990 Currently only a subset of these is supported by the code generation below. */
8992 static bool
8993 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8994 slp_tree node, stmt_vector_for_cost *cost_vec)
8996 tree vectype = SLP_TREE_VECTYPE (node);
8997 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8998 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8999 SLP_TREE_CHILDREN (node),
9000 dump_enabled_p ());
9001 if (nperms < 0)
9002 return false;
9004 if (!gsi)
9005 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9007 return true;
9010 /* Vectorize SLP NODE. */
9012 static void
9013 vect_schedule_slp_node (vec_info *vinfo,
9014 slp_tree node, slp_instance instance)
9016 gimple_stmt_iterator si;
9017 int i;
9018 slp_tree child;
9020 /* For existing vectors there's nothing to do. */
9021 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9022 && SLP_TREE_VEC_DEFS (node).exists ())
9023 return;
9025 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9027 /* Vectorize externals and constants. */
9028 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9029 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9031 /* ??? vectorizable_shift can end up using a scalar operand which is
9032 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9033 node in this case. */
9034 if (!SLP_TREE_VECTYPE (node))
9035 return;
9037 vect_create_constant_vectors (vinfo, node);
9038 return;
9041 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9043 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9044 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9046 if (dump_enabled_p ())
9047 dump_printf_loc (MSG_NOTE, vect_location,
9048 "------>vectorizing SLP node starting from: %G",
9049 stmt_info->stmt);
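/* Pick the insertion point: loads and stores are anchored at their
   scalar stmts, PHIs need no insertion iterator, and everything else is
   emitted right after the latest of its children's vectorized defs.  */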
9051 if (STMT_VINFO_DATA_REF (stmt_info)
9052 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9054 /* Vectorized loads go before the first scalar load to make the
9055 result ready early; vectorized stores go before the last scalar
9056 stmt, which is where all uses are ready. */
9057 stmt_vec_info last_stmt_info = NULL;
9058 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9059 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9060 else /* DR_IS_WRITE */
9061 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9062 si = gsi_for_stmt (last_stmt_info->stmt);
9064 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9065 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9066 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9067 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9069 /* For PHI node vectorization we do not use the insertion iterator. */
9070 si = gsi_none ();
9072 else
9074 /* Emit other stmts after the children's vectorized defs, which is
9075 the earliest position possible. */
9076 gimple *last_stmt = NULL;
9077 bool seen_vector_def = false;
9078 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9079 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9081 /* For fold-left reductions we are retaining the scalar
9082 reduction PHI, but we still have SLP_TREE_NUM_VEC_STMTS
9083 set, so the representation isn't perfect. Resort to the
9084 last scalar def here. */
9085 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9087 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9088 == cycle_phi_info_type);
9089 gphi *phi = as_a <gphi *>
9090 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9091 if (!last_stmt
9092 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9093 last_stmt = phi;
9095 /* We emit all vectorized stmts of a child at the same place, so
9096 the last def pushed is also the last one in the IL.
9097 ??? Unless we have a load permutation applied and that
9098 happens to re-use an earlier generated load. */
9099 unsigned j;
9100 tree vdef;
9101 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9103 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9104 if (!last_stmt
9105 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9106 last_stmt = vstmt;
9109 else if (!SLP_TREE_VECTYPE (child))
9111 /* For externals used unvectorized we look at all their scalar defs. */
9112 unsigned j;
9113 tree def;
9114 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9115 if (TREE_CODE (def) == SSA_NAME
9116 && !SSA_NAME_IS_DEFAULT_DEF (def))
9118 gimple *stmt = SSA_NAME_DEF_STMT (def);
9119 if (!last_stmt
9120 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9121 last_stmt = stmt;
9124 else
9126 /* For externals we have to look at all defs since their
9127 insertion place is decided per vector. But beware
9128 of pre-existing vectors where we need to make sure
9129 we do not insert before the region boundary. */
9130 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9131 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9132 seen_vector_def = true;
9133 else
9135 unsigned j;
9136 tree vdef;
9137 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9138 if (TREE_CODE (vdef) == SSA_NAME
9139 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9141 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9142 if (!last_stmt
9143 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9144 last_stmt = vstmt;
9148 /* This can happen when all children are pre-existing vectors or
9149 constants. */
9150 if (!last_stmt)
9151 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9152 if (!last_stmt)
9154 gcc_assert (seen_vector_def);
9155 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9157 else if (is_ctrl_altering_stmt (last_stmt))
9159 /* We split regions to vectorize at control altering stmts
9160 with a definition so this must be an external which
9161 we can insert at the start of the region. */
9162 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9164 else if (is_a <bb_vec_info> (vinfo)
9165 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9166 && gimple_could_trap_p (stmt_info->stmt))
9168 /* We've constrained possibly trapping operations to all come
9169 from the same basic block; even if vectorized defs would allow
9170 earlier scheduling, still force the vectorized stmts into the
9171 original block. This is only necessary for BB vectorization
9172 since for loop vectorization all operations are in a single BB
9173 and scalar-stmt-based placement doesn't play well with epilogue vectorization. */
9174 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9175 gimple_bb (stmt_info->stmt),
9176 gimple_bb (last_stmt)));
9177 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9179 else if (is_a <gphi *> (last_stmt))
9180 si = gsi_after_labels (gimple_bb (last_stmt));
9181 else
9183 si = gsi_for_stmt (last_stmt);
9184 gsi_next (&si);
9188 /* Handle purely internal nodes. */
9189 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9191 /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
9192 be shared with different SLP nodes (but usually it's the same
9193 operation, apart from the case where the stmt is only there to denote
9194 the actual scalar lane defs ...). So do not call vect_transform_stmt
9195 but open-code it here (partly). */
9196 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9197 gcc_assert (done);
9198 stmt_vec_info slp_stmt_info;
9199 unsigned int i;
9200 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9201 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9203 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9204 instance, i, true, NULL);
9205 gcc_assert (done);
9208 else
9209 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9212 /* Replace scalar calls from SLP node NODE with assignments setting their
9213 lhs to zero. For loop vectorization this is done in vectorizable_call,
9214 but for SLP it needs to be deferred until the end of vect_schedule_slp,
9215 because multiple SLP instances may refer to the same scalar stmt. */
9217 static void
9218 vect_remove_slp_scalar_calls (vec_info *vinfo,
9219 slp_tree node, hash_set<slp_tree> &visited)
9221 gimple *new_stmt;
9222 gimple_stmt_iterator gsi;
9223 int i;
9224 slp_tree child;
9225 tree lhs;
9226 stmt_vec_info stmt_info;
9228 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9229 return;
9231 if (visited.add (node))
9232 return;
9234 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9235 vect_remove_slp_scalar_calls (vinfo, child, visited);
9237 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9239 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9240 if (!stmt || gimple_bb (stmt) == NULL)
9241 continue;
9242 if (is_pattern_stmt_p (stmt_info)
9243 || !PURE_SLP_STMT (stmt_info))
9244 continue;
9245 lhs = gimple_call_lhs (stmt);
9246 if (lhs)
9247 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9248 else
9250 new_stmt = gimple_build_nop ();
9251 unlink_stmt_vdef (stmt_info->stmt);
9253 gsi = gsi_for_stmt (stmt);
9254 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9255 if (lhs)
9256 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9260 static void
9261 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9263 hash_set<slp_tree> visited;
9264 vect_remove_slp_scalar_calls (vinfo, node, visited);
9267 /* Vectorize the instance root. */
9269 void
9270 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9272 gassign *rstmt = NULL;
9274 if (instance->kind == slp_inst_kind_ctor)
9276 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9278 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9279 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9280 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9281 TREE_TYPE (vect_lhs)))
9282 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9283 vect_lhs);
9284 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9286 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9288 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9289 tree child_def;
9290 int j;
9291 vec<constructor_elt, va_gc> *v;
9292 vec_alloc (v, nelts);
9294 /* A CTOR can handle V16HI composition from VNx8HI so we
9295 do not need to convert vector elements if the types
9296 do not match. */
9297 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9298 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9299 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9300 tree rtype
9301 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9302 tree r_constructor = build_constructor (rtype, v);
9303 rstmt = gimple_build_assign (lhs, r_constructor);
9306 else if (instance->kind == slp_inst_kind_bb_reduc)
9308 /* Largely inspired by reduction chain epilogue handling in
9309 vect_create_epilog_for_reduction. */
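/* In short: the vector defs are accumulated with the reduction
   operation (possibly punned to an unsigned type to avoid introducing
   undefined overflow), reduced to a scalar with the matching reduction
   internal function, and any remaining scalar defs of the instance are
   folded in at the end.  */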
9310 vec<tree> vec_defs = vNULL;
9311 vect_get_slp_defs (node, &vec_defs);
9312 enum tree_code reduc_code
9313 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9314 /* ??? We actually have to reflect signs somewhere. */
9315 if (reduc_code == MINUS_EXPR)
9316 reduc_code = PLUS_EXPR;
9317 gimple_seq epilogue = NULL;
9318 /* We may end up with more than one vector result; reduce them
9319 to a single vector. */
9320 tree vec_def = vec_defs[0];
9321 tree vectype = TREE_TYPE (vec_def);
9322 tree compute_vectype = vectype;
9323 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9324 && TYPE_OVERFLOW_UNDEFINED (vectype)
9325 && operation_can_overflow (reduc_code));
9326 if (pun_for_overflow_p)
9328 compute_vectype = unsigned_type_for (vectype);
9329 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9330 compute_vectype, vec_def);
9332 for (unsigned i = 1; i < vec_defs.length (); ++i)
9334 tree def = vec_defs[i];
9335 if (pun_for_overflow_p)
9336 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9337 compute_vectype, def);
9338 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9339 vec_def, def);
9341 vec_defs.release ();
9342 /* ??? Support other schemes than direct internal fn. */
9343 internal_fn reduc_fn;
9344 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9345 || reduc_fn == IFN_LAST)
9346 gcc_unreachable ();
9347 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9348 TREE_TYPE (compute_vectype), vec_def);
9349 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9351 tree rem_def = NULL_TREE;
9352 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9354 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9355 if (!rem_def)
9356 rem_def = def;
9357 else
9358 rem_def = gimple_build (&epilogue, reduc_code,
9359 TREE_TYPE (scalar_def),
9360 rem_def, def);
9362 scalar_def = gimple_build (&epilogue, reduc_code,
9363 TREE_TYPE (scalar_def),
9364 scalar_def, rem_def);
9366 scalar_def = gimple_convert (&epilogue,
9367 TREE_TYPE (vectype), scalar_def);
9368 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9369 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9370 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9371 update_stmt (gsi_stmt (rgsi));
9372 return;
9374 else
9375 gcc_unreachable ();
9377 gcc_assert (rstmt);
9379 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9380 gsi_replace (&rgsi, rstmt, true);
9383 struct slp_scc_info
9385 bool on_stack;
9386 int dfs;
9387 int lowlink;
9390 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
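/* This is a Tarjan-style walk: DFS numbers and lowlinks live in SCC_INFO,
   nodes stay on STACK while their SCC is still open, and a node whose
   lowlink equals its own DFS number closes and schedules the SCC it
   roots.  */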
9392 static void
9393 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9394 hash_map<slp_tree, slp_scc_info> &scc_info,
9395 int &maxdfs, vec<slp_tree> &stack)
9397 bool existed_p;
9398 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9399 gcc_assert (!existed_p);
9400 info->dfs = maxdfs;
9401 info->lowlink = maxdfs;
9402 maxdfs++;
9404 /* Leaf. */
9405 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9407 info->on_stack = false;
9408 vect_schedule_slp_node (vinfo, node, instance);
9409 return;
9412 info->on_stack = true;
9413 stack.safe_push (node);
9415 unsigned i;
9416 slp_tree child;
9417 /* DFS recurse. */
9418 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9420 if (!child)
9421 continue;
9422 slp_scc_info *child_info = scc_info.get (child);
9423 if (!child_info)
9425 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9426 /* Recursion might have re-allocated the SCC info entries, so re-fetch them. */
9427 info = scc_info.get (node);
9428 child_info = scc_info.get (child);
9429 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9431 else if (child_info->on_stack)
9432 info->lowlink = MIN (info->lowlink, child_info->dfs);
9434 if (info->lowlink != info->dfs)
9435 return;
9437 auto_vec<slp_tree, 4> phis_to_fixup;
9439 /* Singleton. */
9440 if (stack.last () == node)
9442 stack.pop ();
9443 info->on_stack = false;
9444 vect_schedule_slp_node (vinfo, node, instance);
9445 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9446 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9447 phis_to_fixup.quick_push (node);
9449 else
9451 /* SCC. */
9452 int last_idx = stack.length () - 1;
9453 while (stack[last_idx] != node)
9454 last_idx--;
9455 /* We can break the cycle at PHIs that have at least one
9456 code-generated child. Then we could re-start the DFS walk until
9457 all nodes in the SCC are covered (we might have new entries
9458 for only back-reachable nodes). But it's simpler to just
9459 iterate and schedule those that are ready. */
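/* A PHI counts as ready as soon as one of its children is already
   scheduled (or missing), while a non-PHI is ready only once none of
   its children remain on the stack.  */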
9460 unsigned todo = stack.length () - last_idx;
9463 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9465 slp_tree entry = stack[idx];
9466 if (!entry)
9467 continue;
9468 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9469 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9470 bool ready = !phi;
9471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9472 if (!child)
9474 gcc_assert (phi);
9475 ready = true;
9476 break;
9478 else if (scc_info.get (child)->on_stack)
9480 if (!phi)
9482 ready = false;
9483 break;
9486 else
9488 if (phi)
9490 ready = true;
9491 break;
9494 if (ready)
9496 vect_schedule_slp_node (vinfo, entry, instance);
9497 scc_info.get (entry)->on_stack = false;
9498 stack[idx] = NULL;
9499 todo--;
9500 if (phi)
9501 phis_to_fixup.safe_push (entry);
9505 while (todo != 0);
9507 /* Pop the SCC. */
9508 stack.truncate (last_idx);
9511 /* Now fix up the backedge defs of the vectorized PHIs in this SCC. */
9512 slp_tree phi_node;
9513 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9515 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9516 edge_iterator ei;
9517 edge e;
9518 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9520 unsigned dest_idx = e->dest_idx;
9521 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9522 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9523 continue;
9524 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9525 /* Simply fill all args. */
9526 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9527 != vect_first_order_recurrence)
9528 for (unsigned i = 0; i < n; ++i)
9530 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9531 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9532 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9533 e, gimple_phi_arg_location (phi, dest_idx));
9535 else
9537 /* Unless it is a first-order recurrence, which needs
9538 args filled in for both the PHI node and the permutes. */
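/* The vectorized PHI receives the last vector def of the child on this
   edge, while each permute stmt gets the previous and the current
   vector def as its two operands.  */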
9539 gimple *perm
9540 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9541 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9542 add_phi_arg (as_a <gphi *> (rphi),
9543 vect_get_slp_vect_def (child, n - 1),
9544 e, gimple_phi_arg_location (phi, dest_idx));
9545 for (unsigned i = 0; i < n; ++i)
9547 gimple *perm
9548 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9549 if (i > 0)
9550 gimple_assign_set_rhs1 (perm,
9551 vect_get_slp_vect_def (child, i - 1));
9552 gimple_assign_set_rhs2 (perm,
9553 vect_get_slp_vect_def (child, i));
9554 update_stmt (perm);
9561 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9563 void
9564 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9566 slp_instance instance;
9567 unsigned int i;
9569 hash_map<slp_tree, slp_scc_info> scc_info;
9570 int maxdfs = 0;
9571 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9573 slp_tree node = SLP_INSTANCE_TREE (instance);
9574 if (dump_enabled_p ())
9576 dump_printf_loc (MSG_NOTE, vect_location,
9577 "Vectorizing SLP tree:\n");
9578 /* ??? Dump all? */
9579 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9580 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9581 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9582 vect_print_slp_graph (MSG_NOTE, vect_location,
9583 SLP_INSTANCE_TREE (instance));
9585 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9586 has a PHI be the node breaking the cycle. */
9587 auto_vec<slp_tree> stack;
9588 if (!scc_info.get (node))
9589 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9591 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9592 vectorize_slp_instance_root_stmt (node, instance);
9594 if (dump_enabled_p ())
9595 dump_printf_loc (MSG_NOTE, vect_location,
9596 "vectorizing stmts using SLP.\n");
9599 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9601 slp_tree root = SLP_INSTANCE_TREE (instance);
9602 stmt_vec_info store_info;
9603 unsigned int j;
9605 /* Remove scalar call stmts. Do not do this for basic-block
9606 vectorization as not all uses may be vectorized.
9607 ??? Why should this be necessary? DCE should be able to
9608 remove the stmts itself.
9609 ??? For BB vectorization we can as well remove scalar
9610 stmts starting from the SLP tree root if they have no
9611 uses. */
9612 if (is_a <loop_vec_info> (vinfo))
9613 vect_remove_slp_scalar_calls (vinfo, root);
9615 /* Remove the original scalar stmts of vectorized stores. */
9616 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9618 if (!STMT_VINFO_DATA_REF (store_info)
9619 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9620 break;
9622 store_info = vect_orig_stmt (store_info);
9623 /* Free the attached stmt_vec_info and remove the stmt. */
9624 vinfo->remove_stmt (store_info);
9626 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9627 so we do not crash in vect_free_slp_tree later. */
9628 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9629 SLP_TREE_REPRESENTATIVE (root) = NULL;